spidy 0.3.6 → 0.3.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 38936d0a025edde8a0ecb15a164c36fbccf1346f9bee9ec7d01a43931eaeb163
4
- data.tar.gz: 71c02d424da0b18cfad383035c78465b1c81ccdfe41399baa511a7bae3675662
3
+ metadata.gz: 76cb60ea985d1a663f24b7b024198d222756376bd9dd979a032c46ba39b16548
4
+ data.tar.gz: ff2e7f056f7ad5afe06df90adf0bb2e438c696472cde50c8d5758b2f9801684e
5
5
  SHA512:
6
- metadata.gz: d97c61f76fb9dde1d5693c7d33830c6df7605930408c42666ffe31a6d46262d1203cfed710d2112f2c396cdcbc3e40f1149d9fc005c5947af3851946bd6f487e
7
- data.tar.gz: ffe6d6362f6bb50c11e54cb398730b0a649ac832b5fdd048d61675dce7a17ec58993900cbaa250238c0cc702cbe649e901cab660c265adb2ef8207e2704ea48c
6
+ metadata.gz: a721848978135752ddcfe3da30a293317a4852b41dc99209019ae71960538fe448ec4ad54da661e0b99edef3fcb85a84b095b99ddbbba9b628fdd4ac1be2f23c
7
+ data.tar.gz: a156f47f317cd4f1f0a66a13ac5102073723f139c5b797c8dc56d7dbdd41e342cb1ad1a6814812563033c68f448c4d57775c0edf300456dd164b90211632737e
data/.rubocop.yml CHANGED
@@ -1,7 +1,8 @@
1
1
  inherit_from: .rubocop_todo.yml
2
2
  AllCops:
3
+ TargetRubyVersion: 3.0.2
4
+ NewCops: enable
3
5
  DisplayCopNames: true
4
- TargetRubyVersion: 2.6
5
6
 
6
7
  Style/ClassAndModuleChildren:
7
8
  Enabled: false
@@ -9,7 +10,7 @@ Style/ClassAndModuleChildren:
9
10
  Style/SignalException:
10
11
  EnforcedStyle: semantic
11
12
 
12
- Naming/UncommunicativeMethodParamName:
13
+ Naming/MethodParameterName:
13
14
  AllowedNames:
14
15
  - as
15
16
 
@@ -17,8 +18,11 @@ Metrics/AbcSize:
17
18
  Max: 21
18
19
  Exclude:
19
20
 
21
+ Metrics/MethodLength:
22
+ Max: 15
23
+
20
24
  Metrics/LineLength:
21
- Max: 120
25
+ Max: 130
22
26
 
23
27
  Metrics/BlockLength:
24
28
  Max: 120
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.6.6
1
+ 3.0.2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.3.4)
4
+ spidy (0.3.10)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -11,124 +11,126 @@ PATH
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.0.3.3)
14
+ activesupport (7.0.0)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
- i18n (>= 0.7, < 2)
17
- minitest (~> 5.1)
18
- tzinfo (~> 1.1)
19
- zeitwerk (~> 2.2, >= 2.2.2)
20
- addressable (2.7.0)
16
+ i18n (>= 1.6, < 2)
17
+ minitest (>= 5.1)
18
+ tzinfo (~> 2.0)
19
+ addressable (2.8.0)
21
20
  public_suffix (>= 2.0.2, < 5.0)
22
- capybara (3.33.0)
21
+ capybara (3.36.0)
23
22
  addressable
23
+ matrix
24
24
  mini_mime (>= 0.1.3)
25
25
  nokogiri (~> 1.8)
26
26
  rack (>= 1.6.0)
27
27
  rack-test (>= 0.6.3)
28
- regexp_parser (~> 1.5)
28
+ regexp_parser (>= 1.5, < 3.0)
29
29
  xpath (~> 3.2)
30
30
  capybara_discoball (0.1.0)
31
31
  capybara (>= 2.7, < 4)
32
- coderay (1.1.2)
33
- concurrent-ruby (1.1.7)
34
- connection_pool (2.2.3)
35
- diff-lcs (1.3)
32
+ coderay (1.1.3)
33
+ concurrent-ruby (1.1.9)
34
+ connection_pool (2.2.5)
35
+ diff-lcs (1.5.0)
36
36
  domain_name (0.5.20190701)
37
37
  unf (>= 0.0.5, < 1.0.0)
38
- ffaker (2.10.0)
39
- http-cookie (1.0.3)
38
+ ffaker (2.20.0)
39
+ http-cookie (1.0.4)
40
40
  domain_name (~> 0.5)
41
- i18n (1.8.5)
41
+ i18n (1.8.11)
42
42
  concurrent-ruby (~> 1.0)
43
- mechanize (2.7.6)
44
- domain_name (~> 0.5, >= 0.5.1)
45
- http-cookie (~> 1.0)
46
- mime-types (>= 1.17.2)
47
- net-http-digest_auth (~> 1.1, >= 1.1.1)
48
- net-http-persistent (>= 2.5.2)
49
- nokogiri (~> 1.6)
50
- ntlm-http (~> 0.1, >= 0.1.1)
51
- webrobots (>= 0.0.9, < 0.2)
52
- method_source (0.9.2)
53
- mime-types (3.3.1)
43
+ matrix (0.4.2)
44
+ mechanize (2.8.3)
45
+ addressable (~> 2.8)
46
+ domain_name (~> 0.5, >= 0.5.20190701)
47
+ http-cookie (~> 1.0, >= 1.0.3)
48
+ mime-types (~> 3.0)
49
+ net-http-digest_auth (~> 1.4, >= 1.4.1)
50
+ net-http-persistent (>= 2.5.2, < 5.0.dev)
51
+ nokogiri (~> 1.11, >= 1.11.2)
52
+ rubyntlm (~> 0.6, >= 0.6.3)
53
+ webrick (~> 1.7)
54
+ webrobots (~> 0.1.2)
55
+ method_source (1.0.0)
56
+ mime-types (3.4.1)
54
57
  mime-types-data (~> 3.2015)
55
- mime-types-data (3.2020.0512)
56
- mini_mime (1.0.2)
57
- mini_portile2 (2.4.0)
58
- minitest (5.14.2)
58
+ mime-types-data (3.2021.1115)
59
+ mini_mime (1.1.2)
60
+ minitest (5.15.0)
59
61
  mixlib-shellout (2.4.4)
60
62
  mustermann (1.1.1)
61
63
  ruby2_keywords (~> 0.0.1)
62
64
  net-http-digest_auth (1.4.1)
63
- net-http-persistent (4.0.0)
65
+ net-http-persistent (4.0.1)
64
66
  connection_pool (~> 2.2)
65
- nokogiri (1.10.10)
66
- mini_portile2 (~> 2.4.0)
67
- ntlm-http (0.1.1)
68
- pry (0.12.2)
69
- coderay (~> 1.1.0)
70
- method_source (~> 0.9.0)
67
+ nokogiri (1.12.5-arm64-darwin)
68
+ racc (~> 1.4)
69
+ pry (0.14.1)
70
+ coderay (~> 1.1)
71
+ method_source (~> 1.0)
71
72
  public_suffix (4.0.6)
73
+ racc (1.6.0)
72
74
  rack (2.2.3)
73
- rack-protection (2.0.8.1)
75
+ rack-protection (2.1.0)
74
76
  rack
75
77
  rack-test (1.1.0)
76
78
  rack (>= 1.0, < 3)
77
- rake (10.5.0)
78
- regexp_parser (1.8.1)
79
- rspec (3.8.0)
80
- rspec-core (~> 3.8.0)
81
- rspec-expectations (~> 3.8.0)
82
- rspec-mocks (~> 3.8.0)
79
+ rake (13.0.6)
80
+ regexp_parser (2.2.0)
81
+ rspec (3.10.0)
82
+ rspec-core (~> 3.10.0)
83
+ rspec-expectations (~> 3.10.0)
84
+ rspec-mocks (~> 3.10.0)
83
85
  rspec-command (1.0.3)
84
86
  mixlib-shellout (~> 2.0)
85
87
  rspec (~> 3.2)
86
88
  rspec-its (~> 1.2)
87
- rspec-core (3.8.2)
88
- rspec-support (~> 3.8.0)
89
- rspec-expectations (3.8.4)
89
+ rspec-core (3.10.1)
90
+ rspec-support (~> 3.10.0)
91
+ rspec-expectations (3.10.1)
90
92
  diff-lcs (>= 1.2.0, < 2.0)
91
- rspec-support (~> 3.8.0)
93
+ rspec-support (~> 3.10.0)
92
94
  rspec-its (1.3.0)
93
95
  rspec-core (>= 3.0.0)
94
96
  rspec-expectations (>= 3.0.0)
95
- rspec-mocks (3.8.1)
97
+ rspec-mocks (3.10.2)
96
98
  diff-lcs (>= 1.2.0, < 2.0)
97
- rspec-support (~> 3.8.0)
98
- rspec-support (3.8.2)
99
- ruby2_keywords (0.0.2)
100
- sinatra (2.0.8.1)
99
+ rspec-support (~> 3.10.0)
100
+ rspec-support (3.10.3)
101
+ ruby2_keywords (0.0.5)
102
+ rubyntlm (0.6.3)
103
+ sinatra (2.1.0)
101
104
  mustermann (~> 1.0)
102
- rack (~> 2.0)
103
- rack-protection (= 2.0.8.1)
105
+ rack (~> 2.2)
106
+ rack-protection (= 2.1.0)
104
107
  tilt (~> 2.0)
105
108
  socksify (1.7.1)
106
- thread_safe (0.3.6)
107
109
  tilt (2.0.10)
108
- tor (0.1.4)
109
- tzinfo (1.2.7)
110
- thread_safe (~> 0.1)
110
+ tor (0.1.5)
111
+ tzinfo (2.0.4)
112
+ concurrent-ruby (~> 1.0)
111
113
  unf (0.1.4)
112
114
  unf_ext
113
- unf_ext (0.0.7.7)
115
+ unf_ext (0.0.8)
116
+ webrick (1.7.0)
114
117
  webrobots (0.1.2)
115
118
  xpath (3.2.0)
116
119
  nokogiri (~> 1.8)
117
- zeitwerk (2.4.0)
118
120
 
119
121
  PLATFORMS
120
- ruby
122
+ arm64-darwin-20
121
123
 
122
124
  DEPENDENCIES
123
125
  bundler (~> 2.0)
124
126
  capybara_discoball
125
127
  ffaker
126
128
  pry
127
- rake (~> 10.0)
129
+ rake (~> 13.0)
128
130
  rspec (~> 3.0)
129
131
  rspec-command
130
132
  sinatra
131
133
  spidy!
132
134
 
133
135
  BUNDLED WITH
134
- 2.1.4
136
+ 2.2.22
@@ -1,7 +1,7 @@
1
-
1
+ # frozen_string_literal: true
2
2
 
3
3
  Spidy.define do
4
- url_to_params = ->(url) {
4
+ url_to_params = lambda { |url|
5
5
  uri = URI.parse(url)
6
6
  params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
7
  params if params.present?
@@ -13,41 +13,41 @@ Spidy.define do
13
13
 
14
14
  limit_page = 3
15
15
  per_page = 25
16
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
- doc.html {
18
- doc.body {
19
- doc.span.bold {
20
- doc.text "Hello world"
21
- }
22
- doc.main {
23
- (page * per_page + 1).upto((page + 1) * per_page).each do |i|
16
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
17
+ doc.html do
18
+ doc.body do
19
+ doc.span.bold do
20
+ doc.text 'Hello world'
21
+ end
22
+ doc.main do
23
+ ((page * per_page) + 1).upto((page + 1) * per_page).each do |i|
24
24
  doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
25
  end
26
- }
26
+ end
27
27
  doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
- }
29
- }
30
- }.doc)
28
+ end
29
+ end
30
+ end.doc)
31
31
  }
32
32
 
33
33
  detail_page = proc { |url, &yielder|
34
34
  params = url_to_params.call(url)
35
35
  id = params['id']
36
36
 
37
- yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
- doc.html {
39
- doc.body {
40
- doc.span.bold {
41
- doc.text "Hello world"
42
- }
37
+ yielder.call(Nokogiri::HTML::Builder.new do |doc|
38
+ doc.html do
39
+ doc.body do
40
+ doc.span.bold do
41
+ doc.text 'Hello world'
42
+ end
43
43
  doc.h1("title_#{id}", id: 'title')
44
44
  doc.main("body_#{id}", id: 'body')
45
45
  doc.div.sub do
46
46
  doc.span.name('testtest')
47
47
  end
48
- }
49
- }
50
- }.doc)
48
+ end
49
+ end
50
+ end.doc)
51
51
  }
52
52
 
53
53
  define(as: :html, connector: detail_page) do
data/example/proxy.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
5
  socks_proxy '127.0.0.1', 9050
data/example/retry.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Spidy.define do
2
4
  spider(as: :json) do |yielder, connector|
3
5
  connector.call('https://httpbin.org/status/500') do |json|
data/example/wikip.rb CHANGED
@@ -11,11 +11,8 @@ Spidy.define do
11
11
 
12
12
  define(:infobox, as: :html, connector: :direct) do
13
13
  let(:columns) do
14
- html.search('tr').each do |tr|
15
- {
16
- name: tr.at('th')&.text,
17
- value: tr.at('td')&.text
18
- }
14
+ html.search('tr').map do |tr|
15
+ { name: tr.at('th')&.text, value: tr.at('td')&.text }
19
16
  end
20
17
  end
21
18
  end
data/exe/spidy CHANGED
@@ -6,10 +6,10 @@ require 'pry'
6
6
 
7
7
  if ARGV[1].blank?
8
8
  case ARGV[0]
9
- when 'version' then STDOUT.puts(Spidy::VERSION)
9
+ when 'version' then $stdout.puts(Spidy::VERSION)
10
10
  when 'console' then Spidy.shell.interactive
11
11
  else
12
- STDOUT.puts 'usage: spidy [version console]'
12
+ $stdout.puts 'usage: spidy [version console]'
13
13
  end
14
14
  else
15
15
  case ARGV[0]
@@ -17,7 +17,8 @@ else
17
17
  when 'function' then Spidy.shell(ARGV[1]).function
18
18
  when 'call' then Spidy.shell(ARGV[1]).call(ARGV[2])
19
19
  when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
20
+ when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
20
21
  else
21
- STDOUT.puts 'usage: spidy [console function call each] [file]'
22
+ $stdout.puts 'usage: spidy [console function call each run] [file]'
22
23
  end
23
24
  end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Spidy::Binder::Error < StandardError
4
+ end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind html and convert to object
5
5
  #
6
- class Spidy::Binder::Html < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Html
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { html.at(query)&.text } if block.nil?
11
+ return define_method(name) { html.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -17,9 +17,11 @@ class Spidy::Binder::Html < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :html, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :html, :resource
26
+ end
25
27
  end
@@ -3,8 +3,8 @@
3
3
  #
4
4
  # Bind json and convert to object
5
5
  #
6
- class Spidy::Binder::Json < Spidy::Binder::Base
7
- def self.let(name, *query, &block)
6
+ module Spidy::Binder::Json
7
+ def let(name, *query, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
@@ -17,9 +17,11 @@ class Spidy::Binder::Json < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :json, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :json, :resource
26
+ end
25
27
  end
@@ -3,12 +3,12 @@
3
3
  #
4
4
  # Bind xml and convert to object
5
5
  #
6
- class Spidy::Binder::Xml < Spidy::Binder::Base
7
- def self.let(name, query = nil, &block)
6
+ module Spidy::Binder::Xml
7
+ def let(name, query = nil, &block)
8
8
  @attribute_names ||= []
9
9
  @attribute_names << name
10
10
 
11
- return define_method(name) { xml.at(query)&.text } if block.nil?
11
+ return define_method(name) { xml.at(query)&.text&.strip } if block.nil?
12
12
 
13
13
  define_method(name) do
14
14
  if query.present?
@@ -17,9 +17,11 @@ class Spidy::Binder::Xml < Spidy::Binder::Base
17
17
  instance_exec(&block)
18
18
  end
19
19
  rescue StandardError => e
20
- fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
20
+ raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
21
21
  end
22
22
  end
23
23
 
24
- alias_method :xml, :resource
24
+ def self.extended(obj)
25
+ obj.alias_method :xml, :resource
26
+ end
25
27
  end
data/lib/spidy/binder.rb CHANGED
@@ -5,51 +5,8 @@
5
5
  #
6
6
  module Spidy::Binder
7
7
  extend ActiveSupport::Autoload
8
+ autoload :Error
8
9
  autoload :Json
9
10
  autoload :Html
10
11
  autoload :Xml
11
-
12
- class Error < StandardError
13
- end
14
-
15
- class Caller
16
- def initialize(spidy, binder)
17
- @spidy = spidy
18
- @binder = binder
19
- end
20
-
21
- def call(source, url: nil, define: nil, define_name: nil)
22
- yield Class.new(@binder, &define).new(define_name, @spidy, source, url)
23
- end
24
- end
25
-
26
- class Base
27
- class << self
28
- attr_reader :attribute_names
29
- end
30
-
31
- attr_reader :resource, :url
32
-
33
- def initialize(define_name, spidy, resource, url)
34
- @define_name = define_name
35
- @spidy = spidy
36
- @resource = resource
37
- @url = url
38
- end
39
-
40
- def to_s
41
- to_h.to_json
42
- end
43
-
44
- def to_h
45
- self.class.attribute_names.map { |name| [name, send(name)] }.to_h
46
- end
47
- end
48
-
49
-
50
- def self.get(spidy, value)
51
- return Caller.new(spidy, const_get(value.to_s.classify)) if name.is_a?(String) || name.is_a?(Symbol)
52
-
53
- value
54
- end
55
12
  end
@@ -5,45 +5,47 @@
5
5
  #
6
6
  class Spidy::CommandLine
7
7
  delegate :spidy, to: :@definition_file
8
- class_attribute :output, default: (proc { |result| STDOUT.puts(result.to_s) })
9
- class_attribute :error_handler, default: (proc { |e, url| STDERR.puts({ url: url, message: e.message, backtrace: e.backtrace }.to_json) })
8
+ class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
9
+ class_attribute :error_handler, default: (proc { |e, url|
10
+ warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
11
+ })
12
+
13
+ def eval_call(script)
14
+ @definition_file.spidy.instance_eval(script)
15
+ end
10
16
 
11
17
  def initialize(definition_file)
12
18
  @definition_file = definition_file
13
- raise 'unloaded spidy' if definition_file.spidy.nil?
19
+ fail 'unloaded spidy' if definition_file.spidy.nil?
14
20
  end
15
21
 
16
22
  def each_stdin_lines(name)
17
- STDIN.each_line do |url|
18
- begin
19
- spidy.each(url.strip, name: name, &output)
20
- rescue => e
21
- error_handler.call(e, url)
22
- end
23
+ $stdin.each_line do |url|
24
+ spidy.each(url.strip, name: name, &output)
25
+ rescue StandardError => e
26
+ error_handler.call(e, url)
23
27
  end
24
28
  end
25
29
 
26
30
  def call_stdin_lines(name)
27
- STDIN.each_line do |url|
28
- begin
29
- spidy.call(url.strip, name: name, &output)
30
- rescue => e
31
- error_handler.call(e, url)
32
- end
31
+ $stdin.each_line do |url|
32
+ spidy.call(url.strip, name: name, &output)
33
+ rescue StandardError => e
34
+ error_handler.call(e, url)
33
35
  end
34
36
  end
35
37
 
36
38
  def call(name)
37
- return call_stdin_lines(name) if FileTest.pipe?(STDIN)
38
- spidy.call(name: name, &output) unless FileTest.pipe?(STDIN)
39
- rescue => e
39
+ return call_stdin_lines(name) if FileTest.pipe?($stdin)
40
+ spidy.call(name: name, &output) unless FileTest.pipe?($stdin)
41
+ rescue StandardError => e
40
42
  error_handler.call(e, nil)
41
43
  end
42
44
 
43
45
  def each(name)
44
- return each_stdin_lines(name) if FileTest.pipe?(STDIN)
46
+ return each_stdin_lines(name) if FileTest.pipe?($stdin)
45
47
  spidy.each(name: name, &output)
46
- rescue => e
48
+ rescue StandardError => e
47
49
  error_handler.call(e, nil)
48
50
  end
49
51
 
@@ -59,36 +61,32 @@ class Spidy::CommandLine
59
61
  end
60
62
 
61
63
  def build(name)
62
- build_shell(name)
63
- build_ruby(name)
64
+ File.write("#{name}.sh", build_shell_script(name))
65
+ File.write("#{name}.rb", build_ruby_script)
64
66
  end
65
67
 
66
68
  def build_shell(name)
67
- File.open("#{name}.sh", 'w') do |f|
68
- f.write <<~SHELL
69
- #!/bin/bash
70
- eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
71
- spider example
72
- SHELL
73
- end
69
+ <<~SHELL
70
+ #!/bin/bash
71
+ eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
72
+ spider
73
+ SHELL
74
74
  end
75
75
 
76
- def build_ruby(name)
77
- File.open("#{name}.rb", 'w') do |f|
78
- f.write <<~RUBY
79
- # frozen_string_literal: true
76
+ def build_ruby
77
+ <<~RUBY
78
+ # frozen_string_literal: true
80
79
 
81
- Spidy.define do
82
- spider(:example) do |yielder, connector|
83
- # connector.call(url) do |resource|
84
- # yielder.call(url or resource)
85
- # end
86
- end
80
+ Spidy.define do
81
+ spider(as: :html) do |yielder, connector|
82
+ # connector.call(url) do |resource|
83
+ # yielder.call(url or resource)
84
+ # end
85
+ end
87
86
 
88
- define(:example) do
89
- end
87
+ define(as: :html) do
90
88
  end
91
- RUBY
92
- end
89
+ end
90
+ RUBY
93
91
  end
94
92
  end
@@ -4,7 +4,7 @@
4
4
  # Direct resource ( not network resource )
5
5
  #
6
6
  class Spidy::Connector::Direct
7
- def call(resource, &yielder)
7
+ def call(resource)
8
8
  if block_given?
9
9
  yield resource
10
10
  else
@@ -12,6 +12,5 @@ class Spidy::Connector::Direct
12
12
  end
13
13
  end
14
14
 
15
- def initialize(user_agent:)
16
- end
15
+ def initialize(user_agent:); end
17
16
  end
@@ -14,13 +14,13 @@ class Spidy::Connector::Html
14
14
 
15
15
  attr_reader :agent
16
16
 
17
- def call(url, encoding: nil, retry_count: 5, &yielder)
17
+ def call(url, encoding: nil, &yielder)
18
18
  fail 'url is not specified' if url.blank?
19
19
  if encoding
20
20
  agent.default_encoding = encoding
21
21
  agent.force_default_encoding = true
22
22
  end
23
- connect(url, retry_count, yielder)
23
+ connect(url, yielder)
24
24
  end
25
25
 
26
26
  def refresh!
@@ -30,17 +30,19 @@ class Spidy::Connector::Html
30
30
 
31
31
  private
32
32
 
33
- def connect(url, retry_count, yielder)
33
+ def connect(url, yielder)
34
34
  result = nil
35
35
  agent.get(url) do |page|
36
- fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
36
+ if page.title == 'Sorry, unable to access page...'
37
+ fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
38
+ end
37
39
 
38
40
  result = yielder.call(page)
39
41
  end
40
42
  result
41
43
  rescue Mechanize::ResponseCodeError => e
42
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
- raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
44
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '429'
45
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '502'
46
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code))
45
47
  end
46
48
  end
@@ -17,9 +17,9 @@ class Spidy::Connector::Json
17
17
  connect(url, &block)
18
18
  end
19
19
 
20
- def connect(url, retry_count: 5)
21
- OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
20
+ def connect(url)
21
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
22
  rescue OpenURI::HTTPError => e
23
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
23
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
24
24
  end
25
25
  end
@@ -13,11 +13,11 @@ class Spidy::Connector::Xml
13
13
  end
14
14
 
15
15
  def connect(url, &block)
16
- OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
16
+ OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
17
17
  block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
18
18
  end
19
19
  rescue OpenURI::HTTPError => e
20
- raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
20
+ raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
21
21
  end
22
22
 
23
23
  def initialize(user_agent:)
@@ -27,7 +27,7 @@ module Spidy::Connector
27
27
  #
28
28
  # error output logger
29
29
  #
30
- DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
30
+ DEFAULT_LOGGER = proc { |values| warn(values.to_json) }
31
31
 
32
32
  #
33
33
  # static method
@@ -36,7 +36,9 @@ module Spidy::Connector
36
36
  extend ActiveSupport::Concern
37
37
  class_methods do
38
38
  def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
39
- ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(url, &block)
39
+ ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
40
+ url, &block
41
+ )
40
42
  end
41
43
  end
42
44
  end
@@ -51,6 +53,7 @@ module Spidy::Connector
51
53
  @object = object
52
54
  @response_code = response_code
53
55
  @error = error
56
+ super(error)
54
57
  end
55
58
  end
56
59
 
@@ -58,13 +61,13 @@ module Spidy::Connector
58
61
  # retry
59
62
  #
60
63
  class RetryableCaller
61
- attr_reader :origin_connector
64
+ attr_reader :origin_connector, :logger, :wait_time
62
65
 
63
- def initialize(connector, logger:, wait_time:)
66
+ def initialize(connector, logger:, wait_time:, retry_attempt_count: 5)
64
67
  @origin_connector = connector
65
68
  @logger = logger
66
69
  @wait_time = wait_time
67
- @retry_attempt_count = 5
70
+ @retry_attempt_count = retry_attempt_count
68
71
  end
69
72
 
70
73
  def call(url, &block)
@@ -73,18 +76,18 @@ module Spidy::Connector
73
76
  end
74
77
 
75
78
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
76
- @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
77
- @origin_connector.call(url, &block)
79
+ logger.call('connnector.get': url, 'connnector.accessed': Time.current)
80
+ origin_connector.call(url, &block)
78
81
  rescue Spidy::Connector::Retry => e
79
- @logger.call('retry.accessed': Time.current,
80
- 'retry.uri': url,
81
- 'retry.response_code': e.response_code,
82
- 'retry.attempt_count': retry_attempt_count)
82
+ logger.call('retry.accessed': Time.current,
83
+ 'retry.uri': url,
84
+ 'retry.response_code': e.response_code,
85
+ 'retry.attempt_count': retry_attempt_count)
83
86
 
84
87
  retry_attempt_count -= 1
85
88
  if retry_attempt_count.positive?
86
- sleep @wait_time
87
- @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
89
+ sleep wait_time
90
+ origin_connector.refresh! if origin_connector.respond_to?(:refresh!)
88
91
  retry
89
92
  end
90
93
  raise e.error
@@ -103,7 +106,7 @@ module Spidy::Connector
103
106
  end
104
107
 
105
108
  def call(url, &block)
106
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
109
+ Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
107
110
  connector.call(url, &block)
108
111
  end
109
112
  end
@@ -141,7 +144,6 @@ module Spidy::Connector
141
144
  fail "Not defined connnector[#{value}]" if connector.nil?
142
145
  return connector if socks_proxy.nil?
143
146
 
144
- tor = TorConnector.new(connector, socks_proxy)
145
- tor
147
+ TorConnector.new(connector, socks_proxy)
146
148
  end
147
149
  end
@@ -33,33 +33,39 @@ module Spidy::Definition
33
33
  spidy = @namespace[:"#{name}_spider"]
34
34
  fail "undefined spidy [#{name}]" if spidy.nil?
35
35
 
36
- spidy.call(source, &yielder)
36
+ if yielder
37
+ spidy.call(source, &yielder)
38
+ else
39
+ Enumerator.new do |enumerate_yielder|
40
+ spidy.call(source, &enumerate_yielder)
41
+ end
42
+ end
37
43
  end
38
44
 
39
45
  def spider(name = :default, connector: nil, as: nil, &define_block)
40
46
  @namespace ||= {}
41
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
47
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
48
+ socks_proxy: @socks_proxy)
42
49
  @namespace[:"#{name}_spider"] = proc do |source, &yielder|
43
50
  define_block.call(yielder, connector, source)
44
51
  end
45
52
  end
46
53
 
47
- def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
54
+ def define(name = :default, connector: nil, as: nil, &define_block)
55
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
56
+ socks_proxy: @socks_proxy)
57
+ binder_base = Spidy::Binder.const_get(as.to_s.classify)
48
58
  @namespace ||= {}
49
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
50
- binder = Spidy::Binder.get(self, binder || as)
51
- @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
52
- end
53
-
54
- private
55
-
56
- def define_proc(name, connector, binder, define_block)
57
- proc do |source, &yielder|
58
- yielder = lambda { |result| break result } if yielder.nil?
59
- connection_yielder = lambda do |page|
60
- binder.call(page, url: source, define: define_block, define_name: name) { |object| yielder.call(object) }
59
+ @namespace[:"#{name}_scraper"] = Class.new(Spidy::DefinitionObject) do
60
+ extend binder_base
61
+ class_eval(&define_block)
62
+ define_singleton_method(:call) do |source, &yielder|
63
+ yielder = ->(result) { break result } if yielder.nil?
64
+ connection_yielder = lambda do |page|
65
+ yielder.call(new(page, source))
66
+ end
67
+ connector.call(source, &connection_yielder)
61
68
  end
62
- connector.call(source, &connection_yielder)
63
69
  end
64
70
  end
65
71
  end
@@ -4,8 +4,7 @@
4
4
  # spidy interface binding
5
5
  #
6
6
  class Spidy::DefinitionFile
7
- attr_reader :path
8
- attr_reader :spidy
7
+ attr_reader :path, :spidy
9
8
 
10
9
  def self.open(filepath)
11
10
  object = new(filepath)
@@ -15,7 +14,7 @@ class Spidy::DefinitionFile
15
14
 
16
15
  # rubocop:disable Security/Eval
17
16
  def eval_definition
18
- @spidy = eval(File.open(path).read) if path
17
+ @spidy = eval(File.read(path)) if path
19
18
  end
20
19
  # rubocop:enable Security/Eval
21
20
 
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ #
4
+ # An object that represents the scraper defined by define block.
5
+ #
6
+ class Spidy::DefinitionObject
7
+ class << self
8
+ attr_reader :attribute_names
9
+ end
10
+ attr_reader :resource, :url
11
+
12
+ def initialize(resource, url)
13
+ @resource = resource
14
+ @url = url
15
+ end
16
+
17
+ def to_s
18
+ to_h.to_json
19
+ end
20
+
21
+ def to_h
22
+ self.class.attribute_names.to_h { |name| [name, send(name)] }
23
+ end
24
+ end
data/lib/spidy/shell.rb CHANGED
@@ -16,5 +16,5 @@ class Spidy::Shell
16
16
  Spidy::CommandLine.new(@definition_file)
17
17
  end
18
18
 
19
- delegate :function, :each, :call, to: :command_line
19
+ delegate :function, :each, :call, :eval_call, to: :command_line
20
20
  end
data/lib/spidy/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.3.6'
4
+ VERSION = '0.3.12'
5
5
  end
data/lib/spidy.rb CHANGED
@@ -17,6 +17,7 @@ module Spidy
17
17
  autoload :Console
18
18
  autoload :Definition
19
19
  autoload :DefinitionFile
20
+ autoload :DefinitionObject
20
21
  autoload :Binder
21
22
  autoload :Connector
22
23
 
data/spidy.gemspec CHANGED
@@ -25,17 +25,20 @@ Gem::Specification.new do |spec|
25
25
  spec.require_paths = ['lib']
26
26
 
27
27
  spec.add_development_dependency 'bundler', '~> 2.0'
28
+ spec.add_development_dependency 'capybara_discoball'
29
+ spec.add_development_dependency 'ffaker'
28
30
  spec.add_development_dependency 'pry'
29
- spec.add_development_dependency 'rake', '~> 10.0'
31
+ spec.add_development_dependency 'rake', '~> 13.0'
30
32
  spec.add_development_dependency 'rspec', '~> 3.0'
31
- spec.add_development_dependency 'ffaker'
32
33
  spec.add_development_dependency 'rspec-command'
33
- spec.add_development_dependency 'capybara_discoball'
34
34
  spec.add_development_dependency 'sinatra'
35
35
 
36
- spec.add_runtime_dependency 'tor'
37
36
  spec.add_runtime_dependency 'activesupport'
38
37
  spec.add_runtime_dependency 'mechanize'
39
- spec.add_runtime_dependency 'socksify'
40
38
  spec.add_runtime_dependency 'pry'
39
+ spec.add_runtime_dependency 'socksify'
40
+ spec.add_runtime_dependency 'tor'
41
+ spec.metadata = {
42
+ 'rubygems_mfa_required' => 'true'
43
+ }
41
44
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-09 00:00:00.000000000 Z
11
+ date: 2022-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -25,7 +25,7 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: pry
28
+ name: capybara_discoball
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -39,63 +39,63 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rake
42
+ name: ffaker
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: rspec
56
+ name: pry
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '3.0'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '3.0'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: ffaker
70
+ name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0'
75
+ version: '13.0'
76
76
  type: :development
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0'
82
+ version: '13.0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rspec-command
84
+ name: rspec
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ">="
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '0'
89
+ version: '3.0'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ">="
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '0'
96
+ version: '3.0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: capybara_discoball
98
+ name: rspec-command
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - ">="
@@ -123,7 +123,7 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: tor
126
+ name: activesupport
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - ">="
@@ -137,7 +137,7 @@ dependencies:
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
- name: activesupport
140
+ name: mechanize
141
141
  requirement: !ruby/object:Gem::Requirement
142
142
  requirements:
143
143
  - - ">="
@@ -151,7 +151,7 @@ dependencies:
151
151
  - !ruby/object:Gem::Version
152
152
  version: '0'
153
153
  - !ruby/object:Gem::Dependency
154
- name: mechanize
154
+ name: pry
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
157
  - - ">="
@@ -179,7 +179,7 @@ dependencies:
179
179
  - !ruby/object:Gem::Version
180
180
  version: '0'
181
181
  - !ruby/object:Gem::Dependency
182
- name: pry
182
+ name: tor
183
183
  requirement: !ruby/object:Gem::Requirement
184
184
  requirements:
185
185
  - - ">="
@@ -222,6 +222,7 @@ files:
222
222
  - exe/spidy
223
223
  - lib/spidy.rb
224
224
  - lib/spidy/binder.rb
225
+ - lib/spidy/binder/error.rb
225
226
  - lib/spidy/binder/html.rb
226
227
  - lib/spidy/binder/json.rb
227
228
  - lib/spidy/binder/xml.rb
@@ -234,6 +235,7 @@ files:
234
235
  - lib/spidy/console.rb
235
236
  - lib/spidy/definition.rb
236
237
  - lib/spidy/definition_file.rb
238
+ - lib/spidy/definition_object.rb
237
239
  - lib/spidy/shell.rb
238
240
  - lib/spidy/spider.rb
239
241
  - lib/spidy/version.rb
@@ -242,7 +244,8 @@ files:
242
244
  homepage: https://github.com/aileron-inc/spidy
243
245
  licenses:
244
246
  - MIT
245
- metadata: {}
247
+ metadata:
248
+ rubygems_mfa_required: 'true'
246
249
  post_install_message:
247
250
  rdoc_options: []
248
251
  require_paths:
@@ -258,7 +261,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
258
261
  - !ruby/object:Gem::Version
259
262
  version: '0'
260
263
  requirements: []
261
- rubygems_version: 3.1.4
264
+ rubygems_version: 3.2.22
262
265
  signing_key:
263
266
  specification_version: 4
264
267
  summary: web spider dsl