spidy 0.3.6 → 0.3.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38936d0a025edde8a0ecb15a164c36fbccf1346f9bee9ec7d01a43931eaeb163
4
- data.tar.gz: 71c02d424da0b18cfad383035c78465b1c81ccdfe41399baa511a7bae3675662
3
+ metadata.gz: 76cb60ea985d1a663f24b7b024198d222756376bd9dd979a032c46ba39b16548
4
+ data.tar.gz: ff2e7f056f7ad5afe06df90adf0bb2e438c696472cde50c8d5758b2f9801684e
5
5
  SHA512:
6
- metadata.gz: d97c61f76fb9dde1d5693c7d33830c6df7605930408c42666ffe31a6d46262d1203cfed710d2112f2c396cdcbc3e40f1149d9fc005c5947af3851946bd6f487e
7
- data.tar.gz: ffe6d6362f6bb50c11e54cb398730b0a649ac832b5fdd048d61675dce7a17ec58993900cbaa250238c0cc702cbe649e901cab660c265adb2ef8207e2704ea48c
6
+ metadata.gz: a721848978135752ddcfe3da30a293317a4852b41dc99209019ae71960538fe448ec4ad54da661e0b99edef3fcb85a84b095b99ddbbba9b628fdd4ac1be2f23c
7
+ data.tar.gz: a156f47f317cd4f1f0a66a13ac5102073723f139c5b797c8dc56d7dbdd41e342cb1ad1a6814812563033c68f448c4d57775c0edf300456dd164b90211632737e
data/.rubocop.yml CHANGED
@@ -1,7 +1,8 @@
1
1
  inherit_from: .rubocop_todo.yml
2
2
  AllCops:
3
+ TargetRubyVersion: 3.0.2
4
+ NewCops: enable
3
5
  DisplayCopNames: true
4
- TargetRubyVersion: 2.6
5
6
 
6
7
  Style/ClassAndModuleChildren:
7
8
  Enabled: false
@@ -9,7 +10,7 @@ Style/ClassAndModuleChildren:
9
10
  Style/SignalException:
10
11
  EnforcedStyle: semantic
11
12
 
12
- Naming/UncommunicativeMethodParamName:
13
+ Naming/MethodParameterName:
13
14
  AllowedNames:
14
15
  - as
15
16
 
@@ -17,8 +18,11 @@ Metrics/AbcSize:
17
18
  Max: 21
18
19
  Exclude:
19
20
 
21
+ Metrics/MethodLength:
22
+ Max: 15
23
+
20
24
  Metrics/LineLength:
21
- Max: 120
25
+ Max: 130
22
26
 
23
27
  Metrics/BlockLength:
24
28
  Max: 120
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.6
1
+ 3.0.2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.3.4)
4
+ spidy (0.3.10)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -11,124 +11,126 @@ PATH
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.0.3.3)
14
+ activesupport (7.0.0)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
- i18n (>= 0.7, < 2)
17
- minitest (~> 5.1)
18
- tzinfo (~> 1.1)
19
- zeitwerk (~> 2.2, >= 2.2.2)
20
- addressable (2.7.0)
16
+ i18n (>= 1.6, < 2)
17
+ minitest (>= 5.1)
18
+ tzinfo (~> 2.0)
19
+ addressable (2.8.0)
21
20
  public_suffix (>= 2.0.2, < 5.0)
22
- capybara (3.33.0)
21
+ capybara (3.36.0)
23
22
  addressable
23
+ matrix
24
24
  mini_mime (>= 0.1.3)
25
25
  nokogiri (~> 1.8)
26
26
  rack (>= 1.6.0)
27
27
  rack-test (>= 0.6.3)
28
- regexp_parser (~> 1.5)
28
+ regexp_parser (>= 1.5, < 3.0)
29
29
  xpath (~> 3.2)
30
30
  capybara_discoball (0.1.0)
31
31
  capybara (>= 2.7, < 4)
32
- coderay (1.1.2)
33
- concurrent-ruby (1.1.7)
34
- connection_pool (2.2.3)
35
- diff-lcs (1.3)
32
+ coderay (1.1.3)
33
+ concurrent-ruby (1.1.9)
34
+ connection_pool (2.2.5)
35
+ diff-lcs (1.5.0)
36
36
  domain_name (0.5.20190701)
37
37
  unf (>= 0.0.5, < 1.0.0)
38
- ffaker (2.10.0)
39
- http-cookie (1.0.3)
38
+ ffaker (2.20.0)
39
+ http-cookie (1.0.4)
40
40
  domain_name (~> 0.5)
41
- i18n (1.8.5)
41
+ i18n (1.8.11)
42
42
  concurrent-ruby (~> 1.0)
43
- mechanize (2.7.6)
44
- domain_name (~> 0.5, >= 0.5.1)
45
- http-cookie (~> 1.0)
46
- mime-types (>= 1.17.2)
47
- net-http-digest_auth (~> 1.1, >= 1.1.1)
48
- net-http-persistent (>= 2.5.2)
49
- nokogiri (~> 1.6)
50
- ntlm-http (~> 0.1, >= 0.1.1)
51
- webrobots (>= 0.0.9, < 0.2)
52
- method_source (0.9.2)
53
- mime-types (3.3.1)
43
+ matrix (0.4.2)
44
+ mechanize (2.8.3)
45
+ addressable (~> 2.8)
46
+ domain_name (~> 0.5, >= 0.5.20190701)
47
+ http-cookie (~> 1.0, >= 1.0.3)
48
+ mime-types (~> 3.0)
49
+ net-http-digest_auth (~> 1.4, >= 1.4.1)
50
+ net-http-persistent (>= 2.5.2, < 5.0.dev)
51
+ nokogiri (~> 1.11, >= 1.11.2)
52
+ rubyntlm (~> 0.6, >= 0.6.3)
53
+ webrick (~> 1.7)
54
+ webrobots (~> 0.1.2)
55
+ method_source (1.0.0)
56
+ mime-types (3.4.1)
54
57
  mime-types-data (~> 3.2015)
55
- mime-types-data (3.2020.0512)
56
- mini_mime (1.0.2)
57
- mini_portile2 (2.4.0)
58
- minitest (5.14.2)
58
+ mime-types-data (3.2021.1115)
59
+ mini_mime (1.1.2)
60
+ minitest (5.15.0)
59
61
  mixlib-shellout (2.4.4)
60
62
  mustermann (1.1.1)
61
63
  ruby2_keywords (~> 0.0.1)
62
64
  net-http-digest_auth (1.4.1)
63
- net-http-persistent (4.0.0)
65
+ net-http-persistent (4.0.1)
64
66
  connection_pool (~> 2.2)
65
- nokogiri (1.10.10)
66
- mini_portile2 (~> 2.4.0)
67
- ntlm-http (0.1.1)
68
- pry (0.12.2)
69
- coderay (~> 1.1.0)
70
- method_source (~> 0.9.0)
67
+ nokogiri (1.12.5-arm64-darwin)
68
+ racc (~> 1.4)
69
+ pry (0.14.1)
70
+ coderay (~> 1.1)
71
+ method_source (~> 1.0)
71
72
  public_suffix (4.0.6)
73
+ racc (1.6.0)
72
74
  rack (2.2.3)
73
- rack-protection (2.0.8.1)
75
+ rack-protection (2.1.0)
74
76
  rack
75
77
  rack-test (1.1.0)
76
78
  rack (>= 1.0, < 3)
77
- rake (10.5.0)
78
- regexp_parser (1.8.1)
79
- rspec (3.8.0)
80
- rspec-core (~> 3.8.0)
81
- rspec-expectations (~> 3.8.0)
82
- rspec-mocks (~> 3.8.0)
79
+ rake (13.0.6)
80
+ regexp_parser (2.2.0)
81
+ rspec (3.10.0)
82
+ rspec-core (~> 3.10.0)
83
+ rspec-expectations (~> 3.10.0)
84
+ rspec-mocks (~> 3.10.0)
83
85
  rspec-command (1.0.3)
84
86
  mixlib-shellout (~> 2.0)
85
87
  rspec (~> 3.2)
86
88
  rspec-its (~> 1.2)
87
- rspec-core (3.8.2)
88
- rspec-support (~> 3.8.0)
89
- rspec-expectations (3.8.4)
89
+ rspec-core (3.10.1)
90
+ rspec-support (~> 3.10.0)
91
+ rspec-expectations (3.10.1)
90
92
  diff-lcs (>= 1.2.0, < 2.0)
91
- rspec-support (~> 3.8.0)
93
+ rspec-support (~> 3.10.0)
92
94
  rspec-its (1.3.0)
93
95
  rspec-core (>= 3.0.0)
94
96
  rspec-expectations (>= 3.0.0)
95
- rspec-mocks (3.8.1)
97
+ rspec-mocks (3.10.2)
96
98
  diff-lcs (>= 1.2.0, < 2.0)
97
- rspec-support (~> 3.8.0)
98
- rspec-support (3.8.2)
99
- ruby2_keywords (0.0.2)
100
- sinatra (2.0.8.1)
99
+ rspec-support (~> 3.10.0)
100
+ rspec-support (3.10.3)
101
+ ruby2_keywords (0.0.5)
102
+ rubyntlm (0.6.3)
103
+ sinatra (2.1.0)
101
104
  mustermann (~> 1.0)
102
- rack (~> 2.0)
103
- rack-protection (= 2.0.8.1)
105
+ rack (~> 2.2)
106
+ rack-protection (= 2.1.0)
104
107
  tilt (~> 2.0)
105
108
  socksify (1.7.1)
106
- thread_safe (0.3.6)
107
109
  tilt (2.0.10)
108
- tor (0.1.4)
109
- tzinfo (1.2.7)
110
- thread_safe (~> 0.1)
110
+ tor (0.1.5)
111
+ tzinfo (2.0.4)
112
+ concurrent-ruby (~> 1.0)
111
113
  unf (0.1.4)
112
114
  unf_ext
113
- unf_ext (0.0.7.7)
115
+ unf_ext (0.0.8)
116
+ webrick (1.7.0)
114
117
  webrobots (0.1.2)
115
118
  xpath (3.2.0)
116
119
  nokogiri (~> 1.8)
117
- zeitwerk (2.4.0)
118
120
 
119
121
  PLATFORMS
120
- ruby
122
+ arm64-darwin-20
121
123
 
122
124
  DEPENDENCIES
123
125
  bundler (~> 2.0)
124
126
  capybara_discoball
125
127
  ffaker
126
128
  pry
127
- rake (~> 10.0)
129
+ rake (~> 13.0)
128
130
  rspec (~> 3.0)
129
131
  rspec-command
130
132
  sinatra
131
133
  spidy!
132
134
 
133
135
  BUNDLED WITH
134
- 2.1.4
136
+ 2.2.22
@@ -1,7 +1,7 @@
1
-
1
+ # frozen_string_literal: true
2
2
 
3
3
  Spidy.define do
4
- url_to_params = ->(url) {
4
+ url_to_params = lambda { |url|
5
5
  uri = URI.parse(url)
6
6
  params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
7
  params if params.present?
@@ -13,41 +13,41 @@ Spidy.define do
13
13
 
14
14
  limit_page = 3
15
15
  per_page = 25
16
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
- doc.html {
18
- doc.body {
19
- doc.span.bold {
20
- doc.text "Hello world"
21
- }
22
- doc.main {
23
- (page * per_page + 1).upto((page + 1) * per_page).each do |i|
16
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
17
+ doc.html do
18
+ doc.body do
19
+ doc.span.bold do
20
+ doc.text 'Hello world'
21
+ end
22
+ doc.main do
23
+ ((page * per_page) + 1).upto((page + 1) * per_page).each do |i|
24
24
  doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
25
  end
26
- }
26
+ end
27
27
  doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
- }
29
- }
30
- }.doc)
28
+ end
29
+ end
30
+ end.doc)
31
31
  }
32
32
 
33
33
  detail_page = proc { |url, &yielder|
34
34
  params = url_to_params.call(url)
35
35
  id = params['id']
36
36
 
37
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
- doc.html {
39
- doc.body {
40
- doc.span.bold {
41
- doc.text "Hello world"
42
- }
37
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
38
+ doc.html do
39
+ doc.body do
40
+ doc.span.bold do
41
+ doc.text 'Hello world'
42
+ end
43
43
  doc.h1("title_#{id}", id: 'title')
44
44
  doc.main("body_#{id}", id: 'body')
45
45
  doc.div.sub do
46
46
  doc.span.name('testtest')
47
47
  end
48
- }
49
- }
50
- }.doc)
48
+ end
49
+ end
50
+ end.doc)
51
51
  }
52
52
 
53
53
  define(as: :html, connector: detail_page) do
data/example/proxy.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
5
  socks_proxy '127.0.0.1', 9050
data/example/retry.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  spider(as: :json) do |yielder, connector|
3
5
  connector.call('https://httpbin.org/status/500') do |json|
data/example/wikip.rb CHANGED
@@ -11,11 +11,8 @@ Spidy.define do
11
11
 
12
12
  define(:infobox, as: :html, connector: :direct) do
13
13
  let(:columns) do
14
- html.search('tr').each do |tr|
15
- {
16
- name: tr.at('th')&.text,
17
- value: tr.at('td')&.text
18
- }
14
+ html.search('tr').map do |tr|
15
+ { name: tr.at('th')&.text, value: tr.at('td')&.text }
19
16
  end
20
17
  end
21
18
  end
data/exe/spidy CHANGED
@@ -6,10 +6,10 @@ require 'pry'
6
6
 
7
7
  if ARGV[1].blank?
8
8
  case ARGV[0]
9
- when 'version' then STDOUT.puts(Spidy::VERSION)
9
+ when 'version' then $stdout.puts(Spidy::VERSION)
10
10
  when 'console' then Spidy.shell.interactive
11
11
  else
12
- STDOUT.puts 'usage: spidy [version console]'
12
+ $stdout.puts 'usage: spidy [version console]'
13
13
  end
14
14
  else
15
15
  case ARGV[0]
@@ -17,7 +17,8 @@ else
17
17
  when 'function' then Spidy.shell(ARGV[1]).function
18
18
  when 'call' then Spidy.shell(ARGV[1]).call(ARGV[2])
19
19
  when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
20
+ when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
20
21
  else
21
- STDOUT.puts 'usage: spidy [console function call each] [file]'
22
+ $stdout.puts 'usage: spidy [console function call each run] [file]'
22
23
  end
23
24
  end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Spidy::Binder::Error < StandardError
4
+ end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind html and convert to object
5
5
  #
6
- class Spidy::Binder::Html < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Html
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { html.at(query)&.text } if block.nil?
11
+ return define_method(name) { html.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -17,9 +17,11 @@ class Spidy::Binder::Html < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :html, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :html, :resource
26
+ end
25
27
  end
@@ -3,8 +3,8 @@
3
3
  #
4
4
  # Bind json and convert to object
5
5
  #
6
- class Spidy::Binder::Json < Spidy::Binder::Base
7
- def self.let(name, *query, &block)
6
+ module Spidy::Binder::Json
7
+ def let(name, *query, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
@@ -17,9 +17,11 @@ class Spidy::Binder::Json < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :json, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :json, :resource
26
+ end
25
27
  end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind xml and convert to object
5
5
  #
6
- class Spidy::Binder::Xml < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Xml
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { xml.at(query)&.text } if block.nil?
11
+ return define_method(name) { xml.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -17,9 +17,11 @@ class Spidy::Binder::Xml < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :xml, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :xml, :resource
26
+ end
25
27
  end
data/lib/spidy/binder.rb CHANGED
@@ -5,51 +5,8 @@
5
5
  #
6
6
  module Spidy::Binder
7
7
  extend ActiveSupport::Autoload
8
+ autoload :Error
8
9
  autoload :Json
9
10
  autoload :Html
10
11
  autoload :Xml
11
-
12
- class Error < StandardError
13
- end
14
-
15
- class Caller
16
- def initialize(spidy, binder)
17
- @spidy = spidy
18
- @binder = binder
19
- end
20
-
21
- def call(source, url: nil, define: nil, define_name: nil)
22
- yield Class.new(@binder, &define).new(define_name, @spidy, source, url)
23
- end
24
- end
25
-
26
- class Base
27
- class << self
28
- attr_reader :attribute_names
29
- end
30
-
31
- attr_reader :resource, :url
32
-
33
- def initialize(define_name, spidy, resource, url)
34
- @define_name = define_name
35
- @spidy = spidy
36
- @resource = resource
37
- @url = url
38
- end
39
-
40
- def to_s
41
- to_h.to_json
42
- end
43
-
44
- def to_h
45
- self.class.attribute_names.map { |name| [name, send(name)] }.to_h
46
- end
47
- end
48
-
49
-
50
- def self.get(spidy, value)
51
- return Caller.new(spidy, const_get(value.to_s.classify)) if name.is_a?(String) || name.is_a?(Symbol)
52
-
53
- value
54
- end
55
12
  end
@@ -5,45 +5,47 @@
5
5
  #
6
6
  class Spidy::CommandLine
7
7
  delegate :spidy, to: :@definition_file
8
- class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
9
- class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
8
+ class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
9
+ class_attribute :error_handler, default: (proc { |e, url|
10
+ warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
11
+ })
12
+
13
+ def eval_call(script)
14
+ @definition_file.spidy.instance_eval(script)
15
+ end
10
16
 
11
17
  def initialize(definition_file)
12
18
  @definition_file = definition_file
13
- raise 'unloaded spidy' if definition_file.spidy.nil?
19
+ fail 'unloaded spidy' if definition_file.spidy.nil?
14
20
  end
15
21
 
16
22
  def each_stdin_lines(name)
17
- STDIN.each_line do |url|
18
- begin
19
- spidy.each(url.strip, name: name, &output)
20
- rescue => e
21
- error_handler.call(e, url)
22
- end
23
+ $stdin.each_line do |url|
24
+ spidy.each(url.strip, name: name, &output)
25
+ rescue StandardError => e
26
+ error_handler.call(e, url)
23
27
  end
24
28
  end
25
29
 
26
30
  def call_stdin_lines(name)
27
- STDIN.each_line do |url|
28
- begin
29
- spidy.call(url.strip, name: name, &output)
30
- rescue => e
31
- error_handler.call(e, url)
32
- end
31
+ $stdin.each_line do |url|
32
+ spidy.call(url.strip, name: name, &output)
33
+ rescue StandardError => e
34
+ error_handler.call(e, url)
33
35
  end
34
36
  end
35
37
 
36
38
  def call(name)
37
- return call_stdin_lines(name) if FileTest.pipe?(STDIN)
38
- spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
39
- rescue => e
39
+ return call_stdin_lines(name) if FileTest.pipe?($stdin)
40
+ spidy.call(name: name, &output) unless FileTest.pipe?($stdin)
41
+ rescue StandardError => e
40
42
  error_handler.call(e, nil)
41
43
  end
42
44
 
43
45
  def each(name)
44
- return each_stdin_lines(name) if FileTest.pipe?(STDIN)
46
+ return each_stdin_lines(name) if FileTest.pipe?($stdin)
45
47
  spidy.each(name: name, &output)
46
- rescue => e
48
+ rescue StandardError => e
47
49
  error_handler.call(e, nil)
48
50
  end
49
51
 
@@ -59,36 +61,32 @@ class Spidy::CommandLine
59
61
  end
60
62
 
61
63
  def build(name)
62
- build_shell(name)
63
- build_ruby(name)
64
+ File.write("#{name}.sh", build_shell_script(name))
65
+ File.write("#{name}.rb", build_ruby_script)
64
66
  end
65
67
 
66
68
  def build_shell(name)
67
- File.open("#{name}.sh", 'w') do |f|
68
- f.write <<~SHELL
69
- #!/bin/bash
70
- eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
71
- spider example
72
- SHELL
73
- end
69
+ <<~SHELL
70
+ #!/bin/bash
71
+ eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
72
+ spider
73
+ SHELL
74
74
  end
75
75
 
76
- def build_ruby(name)
77
- File.open("#{name}.rb", 'w') do |f|
78
- f.write <<~RUBY
79
- # frozen_string_literal: true
76
+ def build_ruby
77
+ <<~RUBY
78
+ # frozen_string_literal: true
80
79
 
81
- Spidy.define do
82
- spider(:example) do |yielder, connector|
83
- # connector.call(url) do |resource|
84
- # yielder.call(url or resource)
85
- # end
86
- end
80
+ Spidy.define do
81
+ spider(as: :html) do |yielder, connector|
82
+ # connector.call(url) do |resource|
83
+ # yielder.call(url or resource)
84
+ # end
85
+ end
87
86
 
88
- define(:example) do
89
- end
87
+ define(as: :html) do
90
88
  end
91
- RUBY
92
- end
89
+ end
90
+ RUBY
93
91
  end
94
92
  end
@@ -4,7 +4,7 @@
4
4
  # Direct resource ( not network resource )
5
5
  #
6
6
  class Spidy::Connector::Direct
7
- def call(resource, &yielder)
7
+ def call(resource)
8
8
  if block_given?
9
9
  yield resource
10
10
  else
@@ -12,6 +12,5 @@ class Spidy::Connector::Direct
12
12
  end
13
13
  end
14
14
 
15
- def initialize(user_agent:)
16
- end
15
+ def initialize(user_agent:); end
17
16
  end
@@ -14,13 +14,13 @@ class Spidy::Connector::Html
14
14
 
15
15
  attr_reader :agent
16
16
 
17
- def call(url, encoding: nil, retry_count: 5, &yielder)
17
+ def call(url, encoding: nil, &yielder)
18
18
  fail 'url is not specified' if url.blank?
19
19
  if encoding
20
20
  agent.default_encoding = encoding
21
21
  agent.force_default_encoding = true
22
22
  end
23
- connect(url, retry_count, yielder)
23
+ connect(url, yielder)
24
24
  end
25
25
 
26
26
  def refresh!
@@ -30,17 +30,19 @@ class Spidy::Connector::Html
30
30
 
31
31
  private
32
32
 
33
- def connect(url, retry_count, yielder)
33
+ def connect(url, yielder)
34
34
  result = nil
35
35
  agent.get(url) do |page|
36
- fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
36
+ if page.title == 'Sorry, unable to access page...'
37
+ fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
38
+ end
37
39
 
38
40
  result = yielder.call(page)
39
41
  end
40
42
  result
41
43
  rescue Mechanize::ResponseCodeError => e
42
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
44
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '429'
45
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '502'
46
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code))
45
47
  end
46
48
  end
@@ -17,9 +17,9 @@ class Spidy::Connector::Json
17
17
  connect(url, &block)
18
18
  end
19
19
 
20
- def connect(url, retry_count: 5)
21
- OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
20
+ def connect(url)
21
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
22
  rescue OpenURI::HTTPError => e
23
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
23
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
24
24
  end
25
25
  end
@@ -13,11 +13,11 @@ class Spidy::Connector::Xml
13
13
  end
14
14
 
15
15
  def connect(url, &block)
16
- OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
16
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
17
17
  block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
18
18
  end
19
19
  rescue OpenURI::HTTPError => e
20
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
20
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
21
21
  end
22
22
 
23
23
  def initialize(user_agent:)
@@ -27,7 +27,7 @@ module Spidy::Connector
27
27
  #
28
28
  # error output logger
29
29
  #
30
- DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
30
+ DEFAULT_LOGGER = proc { |values| warn(values.to_json) }
31
31
 
32
32
  #
33
33
  # static method
@@ -36,7 +36,9 @@ module Spidy::Connector
36
36
  extend ActiveSupport::Concern
37
37
  class_methods do
38
38
  def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
39
- ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(url, &block)
39
+ ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
40
+ url, &block
41
+ )
40
42
  end
41
43
  end
42
44
  end
@@ -51,6 +53,7 @@ module Spidy::Connector
51
53
  @object = object
52
54
  @response_code = response_code
53
55
  @error = error
56
+ super(error)
54
57
  end
55
58
  end
56
59
 
@@ -58,13 +61,13 @@ module Spidy::Connector
58
61
  # retry
59
62
  #
60
63
  class RetryableCaller
61
- attr_reader :origin_connector
64
+ attr_reader :origin_connector, :logger, :wait_time
62
65
 
63
- def initialize(connector, logger:, wait_time:)
66
+ def initialize(connector, logger:, wait_time:, retry_attempt_count: 5)
64
67
  @origin_connector = connector
65
68
  @logger = logger
66
69
  @wait_time = wait_time
67
- @retry_attempt_count = 5
70
+ @retry_attempt_count = retry_attempt_count
68
71
  end
69
72
 
70
73
  def call(url, &block)
@@ -73,18 +76,18 @@ module Spidy::Connector
73
76
  end
74
77
 
75
78
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
76
- @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
77
- @origin_connector.call(url, &block)
79
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
80
+ origin_connector.call(url, &block)
78
81
  rescue Spidy::Connector::Retry => e
79
- @logger.call('retry.accessed': Time.current,
80
- 'retry.uri': url,
81
- 'retry.response_code': e.response_code,
82
- 'retry.attempt_count': retry_attempt_count)
82
+ logger.call('retry.accessed': Time.current,
83
+ 'retry.uri': url,
84
+ 'retry.response_code': e.response_code,
85
+ 'retry.attempt_count': retry_attempt_count)
83
86
 
84
87
  retry_attempt_count -= 1
85
88
  if retry_attempt_count.positive?
86
- sleep @wait_time
87
- @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
89
+ sleep wait_time
90
+ origin_connector.refresh! if origin_connector.respond_to?(:refresh!)
88
91
  retry
89
92
  end
90
93
  raise e.error
@@ -103,7 +106,7 @@ module Spidy::Connector
103
106
  end
104
107
 
105
108
  def call(url, &block)
106
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
109
+ Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
107
110
  connector.call(url, &block)
108
111
  end
109
112
  end
@@ -141,7 +144,6 @@ module Spidy::Connector
141
144
  fail "Not defined connnector[#{value}]" if connector.nil?
142
145
  return connector if socks_proxy.nil?
143
146
 
144
- tor = TorConnector.new(connector, socks_proxy)
145
- tor
147
+ TorConnector.new(connector, socks_proxy)
146
148
  end
147
149
  end
@@ -33,33 +33,39 @@ module Spidy::Definition
33
33
  spidy = @namespace[:"#{name}_spider"]
34
34
  fail "undefined spidy [#{name}]" if spidy.nil?
35
35
 
36
- spidy.call(source, &yielder)
36
+ if yielder
37
+ spidy.call(source, &yielder)
38
+ else
39
+ Enumerator.new do |enumerate_yielder|
40
+ spidy.call(source, &enumerate_yielder)
41
+ end
42
+ end
37
43
  end
38
44
 
39
45
  def spider(name = :default, connector: nil, as: nil, &define_block)
40
46
  @namespace ||= {}
41
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
47
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
48
+ socks_proxy: @socks_proxy)
42
49
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
43
50
  define_block.call(yielder, connector, source)
44
51
  end
45
52
  end
46
53
 
47
- def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
54
+ def define(name = :default, connector: nil, as: nil, &define_block)
55
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
56
+ socks_proxy: @socks_proxy)
57
+ binder_base = Spidy::Binder.const_get(as.to_s.classify)
48
58
  @namespace ||= {}
49
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
50
- binder = Spidy::Binder.get(self, binder || as)
51
- @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
52
- end
53
-
54
- private
55
-
56
- def define_proc(name, connector, binder, define_block)
57
- proc do |source, &yielder|
58
- yielder = lambda { |result| break result } if yielder.nil?
59
- connection_yielder = lambda do |page|
60
- binder.call(page, url: source, define: define_block, define_name: name) { |object| yielder.call(object) }
59
+ @namespace[:"#{name}_scraper"] = Class.new(Spidy::DefinitionObject) do
60
+ extend binder_base
61
+ class_eval(&define_block)
62
+ define_singleton_method(:call) do |source, &yielder|
63
+ yielder = ->(result) { break result } if yielder.nil?
64
+ connection_yielder = lambda do |page|
65
+ yielder.call(new(page, source))
66
+ end
67
+ connector.call(source, &connection_yielder)
61
68
  end
62
- connector.call(source, &connection_yielder)
63
69
  end
64
70
  end
65
71
  end
@@ -4,8 +4,7 @@
4
4
  # spidy interface binding
5
5
  #
6
6
  class Spidy::DefinitionFile
7
- attr_reader :path
8
- attr_reader :spidy
7
+ attr_reader :path, :spidy
9
8
 
10
9
  def self.open(filepath)
11
10
  object = new(filepath)
@@ -15,7 +14,7 @@ class Spidy::DefinitionFile
15
14
 
16
15
  # rubocop:disable Security/Eval
17
16
  def eval_definition
18
- @spidy = eval(File.open(path).read) if path
17
+ @spidy = eval(File.read(path)) if path
19
18
  end
20
19
  # rubocop:enable Security/Eval
21
20
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # An object that represents the scraper defined by define block.
5
+ #
6
+ class Spidy::DefinitionObject
7
+ class << self
8
+ attr_reader :attribute_names
9
+ end
10
+ attr_reader :resource, :url
11
+
12
+ def initialize(resource, url)
13
+ @resource = resource
14
+ @url = url
15
+ end
16
+
17
+ def to_s
18
+ to_h.to_json
19
+ end
20
+
21
+ def to_h
22
+ self.class.attribute_names.to_h { |name| [name, send(name)] }
23
+ end
24
+ end
data/lib/spidy/shell.rb CHANGED
@@ -16,5 +16,5 @@ class Spidy::Shell
16
16
  Spidy::CommandLine.new(@definition_file)
17
17
  end
18
18
 
19
- delegate :function, :each, :call, to: :command_line
19
+ delegate :function, :each, :call, :eval_call, to: :command_line
20
20
  end
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.3.6'
4
+ VERSION = '0.3.12'
5
5
  end
data/lib/spidy.rb CHANGED
@@ -17,6 +17,7 @@ module Spidy
17
17
  autoload :Console
18
18
  autoload :Definition
19
19
  autoload :DefinitionFile
20
+ autoload :DefinitionObject
20
21
  autoload :Binder
21
22
  autoload :Connector
22
23
 
data/spidy.gemspec CHANGED
@@ -25,17 +25,20 @@ Gem::Specification.new do |spec|
25
25
  spec.require_paths = ['lib']
26
26
 
27
27
  spec.add_development_dependency 'bundler', '~> 2.0'
28
+ spec.add_development_dependency 'capybara_discoball'
29
+ spec.add_development_dependency 'ffaker'
28
30
  spec.add_development_dependency 'pry'
29
- spec.add_development_dependency 'rake', '~> 10.0'
31
+ spec.add_development_dependency 'rake', '~> 13.0'
30
32
  spec.add_development_dependency 'rspec', '~> 3.0'
31
- spec.add_development_dependency 'ffaker'
32
33
  spec.add_development_dependency 'rspec-command'
33
- spec.add_development_dependency 'capybara_discoball'
34
34
  spec.add_development_dependency 'sinatra'
35
35
 
36
- spec.add_runtime_dependency 'tor'
37
36
  spec.add_runtime_dependency 'activesupport'
38
37
  spec.add_runtime_dependency 'mechanize'
39
- spec.add_runtime_dependency 'socksify'
40
38
  spec.add_runtime_dependency 'pry'
39
+ spec.add_runtime_dependency 'socksify'
40
+ spec.add_runtime_dependency 'tor'
41
+ spec.metadata = {
42
+ 'rubygems_mfa_required' => 'true'
43
+ }
41
44
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-09 00:00:00.000000000 Z
11
+ date: 2022-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -25,7 +25,7 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: pry
28
+ name: capybara_discoball
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -39,63 +39,63 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rake
42
+ name: ffaker
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: rspec
56
+ name: pry
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '3.0'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '3.0'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: ffaker
70
+ name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rspec-command
84
+ name: rspec
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ">="
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '0'
89
+ version: '3.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ">="
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '0'
96
+ version: '3.0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: capybara_discoball
98
+ name: rspec-command
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - ">="
@@ -123,7 +123,7 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: tor
126
+ name: activesupport
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - ">="
@@ -137,7 +137,7 @@ dependencies:
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
- name: activesupport
140
+ name: mechanize
141
141
  requirement: !ruby/object:Gem::Requirement
142
142
  requirements:
143
143
  - - ">="
@@ -151,7 +151,7 @@ dependencies:
151
151
  - !ruby/object:Gem::Version
152
152
  version: '0'
153
153
  - !ruby/object:Gem::Dependency
154
- name: mechanize
154
+ name: pry
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
157
  - - ">="
@@ -179,7 +179,7 @@ dependencies:
179
179
  - !ruby/object:Gem::Version
180
180
  version: '0'
181
181
  - !ruby/object:Gem::Dependency
182
- name: pry
182
+ name: tor
183
183
  requirement: !ruby/object:Gem::Requirement
184
184
  requirements:
185
185
  - - ">="
@@ -222,6 +222,7 @@ files:
222
222
  - exe/spidy
223
223
  - lib/spidy.rb
224
224
  - lib/spidy/binder.rb
225
+ - lib/spidy/binder/error.rb
225
226
  - lib/spidy/binder/html.rb
226
227
  - lib/spidy/binder/json.rb
227
228
  - lib/spidy/binder/xml.rb
@@ -234,6 +235,7 @@ files:
234
235
  - lib/spidy/console.rb
235
236
  - lib/spidy/definition.rb
236
237
  - lib/spidy/definition_file.rb
238
+ - lib/spidy/definition_object.rb
237
239
  - lib/spidy/shell.rb
238
240
  - lib/spidy/spider.rb
239
241
  - lib/spidy/version.rb
@@ -242,7 +244,8 @@ files:
242
244
  homepage: https://github.com/aileron-inc/spidy
243
245
  licenses:
244
246
  - MIT
245
- metadata: {}
247
+ metadata:
248
+ rubygems_mfa_required: 'true'
246
249
  post_install_message:
247
250
  rdoc_options: []
248
251
  require_paths:
@@ -258,7 +261,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
258
261
  - !ruby/object:Gem::Version
259
262
  version: '0'
260
263
  requirements: []
261
- rubygems_version: 3.1.4
264
+ rubygems_version: 3.2.22
262
265
  signing_key:
263
266
  specification_version: 4
264
267
  summary: web spider dsl