spidy 0.0.24 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 11b932ab32a92fe7cca2a4c7fb91204098204d2b60d3226137fc41c783f50997
4
- data.tar.gz: 34fcddd2519415792d0222bf0c8176dd06a734cc48db3025ccbadd6d10a80582
3
+ metadata.gz: 2418b53cec63dbd0cebe2dd7b0d10e9be1ea1177cd4be3584d3f04e2adbf6c10
4
+ data.tar.gz: bc8b78146e0f1f08b9b6fb052093ba92ca03aecf546a8ca9b079eab5991b8acf
5
5
  SHA512:
6
- metadata.gz: 64fe7aa6a460a4c13d90749827f85904c1cf8ca14391cc2e12f22e2ec4b5b8a3fd7776ad282e178cadddaa6db197a83f49b5870b27aaa8356b79d20565992383
7
- data.tar.gz: a8405babf239188f933a13b5213bbc67efe5610a7e7f68bfa49c646a755ecebba049cdab4dbe6b52b085c71f6a0ffd82609ea5aadf9ef355bd8bf6200bdfe5f0
6
+ metadata.gz: fb04e182f956b07c7bfc2290dabacdca588a940ba7aacfea1b417be94e7332bb5d9793e698150455b7fd484c2e745015c3cd4efe66f697f92710275d322376f1
7
+ data.tar.gz: 5cef25c2ca15ace3137cedb5181da48a95c67390304a4ab28c92f57de05801c4492296578e275cbcea51a597fa501a0ef661c20f4c0e67075dc492be57e558fa
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.2
1
+ 2.7.0
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.0.21)
4
+ spidy (0.0.24)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -21,6 +21,7 @@ GEM
21
21
  diff-lcs (1.3)
22
22
  domain_name (0.5.20190701)
23
23
  unf (>= 0.0.5, < 1.0.0)
24
+ ffaker (2.10.0)
24
25
  http-cookie (1.0.3)
25
26
  domain_name (~> 0.5)
26
27
  i18n (1.7.0)
@@ -77,6 +78,7 @@ PLATFORMS
77
78
 
78
79
  DEPENDENCIES
79
80
  bundler (~> 2.0)
81
+ ffaker
80
82
  pry
81
83
  rake (~> 10.0)
82
84
  rspec (~> 3.0)
@@ -5,11 +5,11 @@
5
5
  #
6
6
  class Spidy::Binder::Html
7
7
  class << self
8
- attr_reader :names
8
+ attr_reader :attribute_names
9
9
 
10
- @names = []
11
10
  def let(name, query = nil, &block)
12
- @names << name
11
+ @attribute_names ||= []
12
+ @attribute_names << name
13
13
  define_method(name) do
14
14
  return html.at(query)&.text if block.nil?
15
15
  return instance_exec(&block) if query.blank?
@@ -21,12 +21,11 @@ class Spidy::Binder::Html
21
21
  end
22
22
  end
23
23
 
24
- attr_reader :html, :source, :url
24
+ attr_reader :html, :url
25
25
 
26
26
  def initialize(html, url: nil)
27
- @html = html
28
27
  @url = url
29
- @source = html.body
28
+ @html = html
30
29
  end
31
30
 
32
31
  def to_s
@@ -34,6 +33,6 @@ class Spidy::Binder::Html
34
33
  end
35
34
 
36
35
  def to_h
37
- self.class.names.map { |name| [name, send(name)] }.to_h
36
+ self.class.attribute_names.map { |name| [name, send(name)] }.to_h
38
37
  end
39
38
  end
@@ -20,12 +20,11 @@ class Spidy::Binder::Json
20
20
  end
21
21
  end
22
22
 
23
- attr_reader :json, :source, :url
23
+ attr_reader :json, :url
24
24
 
25
25
  def initialize(json, url: nil)
26
26
  @json = json
27
27
  @url = url
28
- @source = json.to_json
29
28
  end
30
29
 
31
30
  def to_s
@@ -21,12 +21,11 @@ class Spidy::Binder::Xml
21
21
  end
22
22
  end
23
23
 
24
- attr_reader :xml, :source, :url
24
+ attr_reader :xml, :url
25
25
 
26
26
  def initialize(xml, url: nil)
27
27
  @xml = xml
28
28
  @url = url
29
- @source = xml.to_s
30
29
  end
31
30
 
32
31
  def to_s
@@ -5,10 +5,20 @@
5
5
  #
6
6
  module Spidy::Connector
7
7
  extend ActiveSupport::Autoload
8
+ autoload :Direct
8
9
  autoload :Html
9
10
  autoload :Json
10
11
  autoload :Xml
11
12
 
13
+ USER_AGENT = [
14
+ 'Mozilla/5.0',
15
+ '(Macintosh; Intel Mac OS X 10_12_6)',
16
+ 'AppleWebKit/537.36',
17
+ '(KHTML, like Gecko)',
18
+ 'Chrome/64.0.3282.186',
19
+ 'Safari/537.36'
20
+ ].join(' ')
21
+
12
22
  def self.get(value)
13
23
  return const_get(value.to_s.classify) if value.is_a?(String) || value.is_a?(Symbol)
14
24
 
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # Direct resource ( not network resource )
5
+ #
6
+ module Spidy::Connector::Direct
7
+ def self.call(resource, &yielder)
8
+ yielder.call(resource)
9
+ end
10
+ end
@@ -18,18 +18,10 @@ module Spidy::Connector::Html
18
18
  @response_code = error.try(:response_code) || page.try(:response_code)
19
19
  end
20
20
  end
21
- USER_AGENT = [
22
- 'Mozilla/5.0',
23
- '(Macintosh; Intel Mac OS X 10_12_6)',
24
- 'AppleWebKit/537.36',
25
- '(KHTML, like Gecko)',
26
- 'Chrome/64.0.3282.186',
27
- 'Safari/537.36'
28
- ].join(' ')
29
21
 
30
- @logger = proc { |values| STDERR.puts(values.map { |k, v| "#{k}:#{v}" }.join("\t")) }
22
+ @logger = proc { |values| STDERR.puts(values.to_json) }
31
23
  @agent = Mechanize.new
32
- @agent.user_agent = USER_AGENT
24
+ @agent.user_agent = Spidy::Connector::USER_AGENT
33
25
 
34
26
  class << self
35
27
  attr_reader :agent
@@ -56,7 +48,7 @@ module Spidy::Connector::Html
56
48
  'retry.rest_count': retry_count)
57
49
 
58
50
  @agent = Mechanize.new
59
- @agent.user_agent = USER_AGENT
51
+ @agent.user_agent = Spidy::Connector::USER_AGENT
60
52
 
61
53
  retry_count -= 1
62
54
  if retry_count.positive?
@@ -4,8 +4,8 @@
4
4
  # OpenURI to JSON.parse
5
5
  #
6
6
  module Spidy::Connector::Json
7
- def self.call(url, &yielder)
7
+ def self.call(url)
8
8
  fail 'url is not specified' if url.blank?
9
- OpenURI.open_uri(url) { |body| yielder.call(JSON.parse(body.read, symbolize_names: true)) }
9
+ OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) { |body| yield JSON.parse(body.read, symbolize_names: true) }
10
10
  end
11
11
  end
@@ -7,7 +7,8 @@ module Spidy::Connector::Xml
7
7
  def self.call(url)
8
8
  fail 'URL is undefined' if url.blank?
9
9
 
10
- xml = OpenURI.open_uri(url).read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, '')
11
- yield Nokogiri::XML(xml, url)
10
+ OpenURI.open_uri(url, "User-Agent" => Spidy::Connector::USER_AGENT) do |body|
11
+ yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
+ end
12
13
  end
13
14
  end
data/lib/spidy/console.rb CHANGED
@@ -19,11 +19,11 @@ class Spidy::Console
19
19
  @definition_file&.eval_definition
20
20
  end
21
21
 
22
- def call(name = nil, url: nil, &block)
23
- spidy.call(name, url: url, &block)
22
+ def call(name = :default, url: nil, &block)
23
+ spidy.call(name: name, url: url, &block)
24
24
  end
25
25
 
26
- def each(name = nil, url: nil, &block)
27
- spidy.each(name, url: url, &block)
26
+ def each(name = :default, url: nil, &block)
27
+ spidy.each(name: name, url: url, &block)
28
28
  end
29
29
  end
@@ -8,27 +8,27 @@ module Spidy::Definition
8
8
  @namespace
9
9
  end
10
10
 
11
- def call(name = :default, url: nil, stream: nil, err: nil, &output)
11
+ def call(source = nil, name: :default, &yielder)
12
12
  name = name.presence || :default
13
13
  spidy = @namespace[:"#{name}_scraper"]
14
14
  fail "undefined spidy [#{name}]" if spidy.nil?
15
15
 
16
- exec(spidy, url: url, stream: stream, err: err, &output)
16
+ spidy.call(source, &yielder)
17
17
  end
18
18
 
19
- def each(name = :default, url: nil, stream: nil, err: nil, &output)
19
+ def each(source = nil, name: :default, &yielder)
20
20
  name = name.presence || :default
21
21
  spidy = @namespace[:"#{name}_spider"]
22
22
  fail "undefined spidy [#{name}]" if spidy.nil?
23
23
 
24
- exec(spidy, url: url, stream: stream, err: err, &output)
24
+ spidy.call(source, &yielder)
25
25
  end
26
26
 
27
- def spider(name = :default, connector: nil, as: nil)
27
+ def spider(name = :default, connector: nil, as: nil, &define_block)
28
28
  @namespace ||= {}
29
- connector = Spidy::Connector.get(as || connector) || connector
30
- @namespace[:"#{name}_spider"] = proc do |url, &yielder|
31
- yield(yielder, connector, url)
29
+ connector = Spidy::Connector.get(connector || as)
30
+ @namespace[:"#{name}_spider"] = proc do |source, &yielder|
31
+ define_block.call(yielder, connector, source)
32
32
  end
33
33
  end
34
34
 
@@ -41,26 +41,14 @@ module Spidy::Definition
41
41
 
42
42
  private
43
43
 
44
- def exec(spidy, url: nil, stream: nil, err: nil, &output)
45
- return spidy.call(url, &output) if stream.nil?
46
-
47
- stream.each do |value|
48
- spidy.call(value.strip, &output)
49
- rescue StandardError => e
50
- raise e if err.nil?
51
-
52
- err.call(e, value.strip)
53
- end
54
- end
55
-
56
44
  def define_proc(connector, binder, define_block)
57
- proc do |url, &yielder|
45
+ proc do |source, &yielder|
58
46
  fail 'block is not specified' if yielder.nil?
59
47
 
60
- connection_yielder = lambda do |resource|
61
- binder.call(resource, url: url, define: define_block) { |object| yielder.call(object) }
48
+ connection_yielder = lambda do |page|
49
+ binder.call(page, url: source, define: define_block) { |object| yielder.call(object) }
62
50
  end
63
- connector.call(url, &connection_yielder)
51
+ connector.call(source, &connection_yielder)
64
52
  end
65
53
  end
66
54
  end
data/lib/spidy/shell.rb CHANGED
@@ -8,23 +8,45 @@ require 'pry'
8
8
  class Spidy::Shell
9
9
  attr_reader :definition_file, :spidy
10
10
  class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
11
- class_attribute :error_handler, default: (proc { |e, url| STDERR.puts("#{url}\n #{e.message}") })
11
+ class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
12
12
  delegate :spidy, to: :definition_file
13
13
 
14
14
  def initialize(definition_file)
15
15
  @definition_file = definition_file
16
16
  end
17
17
 
18
- def call(name)
19
- return spidy.call(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
18
+ def each_stdin_lines(name)
19
+ STDIN.each_line do |url|
20
+ begin
21
+ spidy.each(url.strip, name: name, &output)
22
+ rescue => e
23
+ error_handler.call(e, url)
24
+ end
25
+ end
26
+ end
27
+
28
+ def call_stdin_lines(name)
29
+ STDIN.each_line do |url|
30
+ begin
31
+ spidy.call(url.strip, name: name, &output)
32
+ rescue => e
33
+ error_handler.call(e, url)
34
+ end
35
+ end
36
+ end
20
37
 
21
- spidy.call(name, err: error_handler, &output)
38
+ def call(name)
39
+ return call_stdin_lines(name) if FileTest.pipe?(STDIN)
40
+ spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
41
+ rescue => e
42
+ error_handler.call(e, nil)
22
43
  end
23
44
 
24
45
  def each(name)
25
- return spidy.each(name, stream: STDIN, err: error_handler, &output) if FileTest.pipe?(STDIN)
26
-
27
- spidy.each(name, err: error_handler, &output)
46
+ return each_stdin_lines(name) if FileTest.pipe?(STDIN)
47
+ spidy.each(name: name, &output)
48
+ rescue => e
49
+ error_handler.call(e, nil)
28
50
  end
29
51
 
30
52
  def function
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.0.24'
4
+ VERSION = '0.1.0'
5
5
  end
data/spidy.gemspec CHANGED
@@ -28,6 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency 'pry'
29
29
  spec.add_development_dependency 'rake', '~> 10.0'
30
30
  spec.add_development_dependency 'rspec', '~> 3.0'
31
+ spec.add_development_dependency 'ffaker'
31
32
 
32
33
  spec.add_runtime_dependency 'activesupport'
33
34
  spec.add_runtime_dependency 'mechanize'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.24
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-26 00:00:00.000000000 Z
11
+ date: 2020-01-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: ffaker
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: activesupport
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -138,6 +152,7 @@ files:
138
152
  - lib/spidy/binder/json.rb
139
153
  - lib/spidy/binder/xml.rb
140
154
  - lib/spidy/connector.rb
155
+ - lib/spidy/connector/direct.rb
141
156
  - lib/spidy/connector/html.rb
142
157
  - lib/spidy/connector/json.rb
143
158
  - lib/spidy/connector/xml.rb
@@ -168,7 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
183
  - !ruby/object:Gem::Version
169
184
  version: '0'
170
185
  requirements: []
171
- rubygems_version: 3.0.3
186
+ rubygems_version: 3.1.2
172
187
  signing_key:
173
188
  specification_version: 4
174
189
  summary: web spider dsl