spidy 0.2.8 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 43e5538d56f4d4115a1bb2cb662f0722d688ad44d8212506436492f0ee3e0570
4
- data.tar.gz: b0db640263836c7244876a50219ed4f916c6c2f7777e11c7e13d11c2142714dc
3
+ metadata.gz: a70bf5d610f60d0b71c719cf870995a7e93b6e9abd71ef9823a71e2ed506f190
4
+ data.tar.gz: 203bc7721020e244b9ad3ecc526b27e1c8d83a807cc2798bcb8930b1e17d5277
5
5
  SHA512:
6
- metadata.gz: 3d6dcfea624a7710eec941e55238334596edced66d60a09b046c5b057de06c2fb7d279c3d912a8e054ef206d9d5c148ee1e43d3394d6d8c30d364fa17fcc69fb
7
- data.tar.gz: fe42ba4880e255aa2a09d360cfec59aaf199ec60407d045641d8ac3cad8319fb3277f2b3e91760e24ab879ec224860628c30fce2282e8c4467a6bfc75826fee5
6
+ metadata.gz: 9977aeb13ff786bd8fbeb7d8ca0ee3ef7b67dfb577739c9aba0a310c46741fa48d86596de6f69ad05eaf2d64f9de4259bea84d4939ee2c7288781106fb25f2b3
7
+ data.tar.gz: d64e9e66b25d8985f2c009abddd6f6b862aa8f533b67e125fa35008d308424e766d3ad4a8a64c16a0c9ed3448cdad14a09c80f6f77437e604826808286270d4f
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.2.6)
4
+ spidy (0.3.3)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -11,12 +11,24 @@ PATH
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- activesupport (6.0.3.2)
14
+ activesupport (6.0.3.3)
15
15
  concurrent-ruby (~> 1.0, >= 1.0.2)
16
16
  i18n (>= 0.7, < 2)
17
17
  minitest (~> 5.1)
18
18
  tzinfo (~> 1.1)
19
19
  zeitwerk (~> 2.2, >= 2.2.2)
20
+ addressable (2.7.0)
21
+ public_suffix (>= 2.0.2, < 5.0)
22
+ capybara (3.33.0)
23
+ addressable
24
+ mini_mime (>= 0.1.3)
25
+ nokogiri (~> 1.8)
26
+ rack (>= 1.6.0)
27
+ rack-test (>= 0.6.3)
28
+ regexp_parser (~> 1.5)
29
+ xpath (~> 3.2)
30
+ capybara_discoball (0.1.0)
31
+ capybara (>= 2.7, < 4)
20
32
  coderay (1.1.2)
21
33
  concurrent-ruby (1.1.7)
22
34
  connection_pool (2.2.3)
@@ -41,9 +53,12 @@ GEM
41
53
  mime-types (3.3.1)
42
54
  mime-types-data (~> 3.2015)
43
55
  mime-types-data (3.2020.0512)
56
+ mini_mime (1.0.2)
44
57
  mini_portile2 (2.4.0)
45
58
  minitest (5.14.2)
46
59
  mixlib-shellout (2.4.4)
60
+ mustermann (1.1.1)
61
+ ruby2_keywords (~> 0.0.1)
47
62
  net-http-digest_auth (1.4.1)
48
63
  net-http-persistent (4.0.0)
49
64
  connection_pool (~> 2.2)
@@ -53,7 +68,14 @@ GEM
53
68
  pry (0.12.2)
54
69
  coderay (~> 1.1.0)
55
70
  method_source (~> 0.9.0)
71
+ public_suffix (4.0.6)
72
+ rack (2.2.3)
73
+ rack-protection (2.0.8.1)
74
+ rack
75
+ rack-test (1.1.0)
76
+ rack (>= 1.0, < 3)
56
77
  rake (10.5.0)
78
+ regexp_parser (1.8.1)
57
79
  rspec (3.8.0)
58
80
  rspec-core (~> 3.8.0)
59
81
  rspec-expectations (~> 3.8.0)
@@ -74,8 +96,15 @@ GEM
74
96
  diff-lcs (>= 1.2.0, < 2.0)
75
97
  rspec-support (~> 3.8.0)
76
98
  rspec-support (3.8.2)
99
+ ruby2_keywords (0.0.2)
100
+ sinatra (2.0.8.1)
101
+ mustermann (~> 1.0)
102
+ rack (~> 2.0)
103
+ rack-protection (= 2.0.8.1)
104
+ tilt (~> 2.0)
77
105
  socksify (1.7.1)
78
106
  thread_safe (0.3.6)
107
+ tilt (2.0.10)
79
108
  tor (0.1.4)
80
109
  tzinfo (1.2.7)
81
110
  thread_safe (~> 0.1)
@@ -83,6 +112,8 @@ GEM
83
112
  unf_ext
84
113
  unf_ext (0.0.7.7)
85
114
  webrobots (0.1.2)
115
+ xpath (3.2.0)
116
+ nokogiri (~> 1.8)
86
117
  zeitwerk (2.4.0)
87
118
 
88
119
  PLATFORMS
@@ -90,11 +121,13 @@ PLATFORMS
90
121
 
91
122
  DEPENDENCIES
92
123
  bundler (~> 2.0)
124
+ capybara_discoball
93
125
  ffaker
94
126
  pry
95
127
  rake (~> 10.0)
96
128
  rspec (~> 3.0)
97
129
  rspec-command
130
+ sinatra
98
131
  spidy!
99
132
 
100
133
  BUNDLED WITH
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Spidy.define do
4
+ url_to_params = ->(url) {
5
+ uri = URI.parse(url)
6
+ params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
+ params if params.present?
8
+ }
9
+
10
+ master_page = proc { |url, &yielder|
11
+ params = url_to_params.call(url)
12
+ page = params&.dig('page')&.to_i || 0
13
+
14
+ limit_page = 3
15
+ per_page = 25
16
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
+ doc.html {
18
+ doc.body {
19
+ doc.span.bold {
20
+ doc.text "Hello world"
21
+ }
22
+ doc.main {
23
+ (page * per_page + 1).upto((page + 1) * per_page).each do |i|
24
+ doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
+ end
26
+ }
27
+ doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
+ }
29
+ }
30
+ }.doc)
31
+ }
32
+
33
+ detail_page = proc { |url, &yielder|
34
+ params = url_to_params.call(url)
35
+ id = params['id']
36
+
37
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
+ doc.html {
39
+ doc.body {
40
+ doc.span.bold {
41
+ doc.text "Hello world"
42
+ }
43
+ doc.h1("title_#{id}", id: 'title')
44
+ doc.main("body_#{id}", id: 'body')
45
+ doc.div.sub do
46
+ doc.span.name('testtest')
47
+ end
48
+ }
49
+ }
50
+ }.doc)
51
+ }
52
+
53
+ define(as: :html, connector: detail_page) do
54
+ let(:title, '#title')
55
+ let(:body, '#body')
56
+ end
57
+
58
+ define(:sub, as: :html, connector: :direct) do
59
+ let(:name, '.name')
60
+ end
61
+
62
+ spider(as: :html, connector: master_page) do |yielder, connector|
63
+ next_url = 'http://localhost'
64
+ while next_url.present?
65
+ connector.call(next_url) do |page|
66
+ page.search('main a').each do |a|
67
+ yielder.call(a.attr('href'))
68
+ end
69
+ next_url = page.at('a.next')&.attr('href')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ Spidy.define do
2
+ user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
+ socks_proxy '127.0.0.1', 9050
4
+
5
+ spider(as: :json) do |yielder, connector|
6
+ connector.call('https://httpbin.org/ip') do |json|
7
+ yielder.call(json[:origin])
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ Spidy.define do
2
+ spider(as: :json) do |yielder, connector|
3
+ connector.call('https://httpbin.org/status/500') do |json|
4
+ yielder.call(json[:origin])
5
+ end
6
+ end
7
+ end
@@ -10,6 +10,8 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ DEFAULT_WAIT_TIME = 5
14
+
13
15
  #
14
16
  # default user agent
15
17
  #
@@ -22,11 +24,19 @@ module Spidy::Connector
22
24
  'Safari/537.36'
23
25
  ].join(' ')
24
26
 
27
+ #
28
+ # error output logger
29
+ #
30
+ DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
31
+
32
+ #
33
+ # static method
34
+ #
25
35
  module StaticAccessor
26
36
  extend ActiveSupport::Concern
27
37
  class_methods do
28
- def call(url, wait_time: nil, user_agent: Spidy::Connector::USER_AGENT, &block)
29
- new(wait_time: wait_time, user_agent: user_agent).call(url, &block)
38
+ def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
39
+ ::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(url, &block)
30
40
  end
31
41
  end
32
42
  end
@@ -35,52 +45,101 @@ module Spidy::Connector
35
45
  # retry class
36
46
  #
37
47
  class Retry < StandardError
38
- attr_reader :page
39
- attr_reader :response_code
40
- attr_reader :wait_time
48
+ attr_reader :object, :response_code, :error
41
49
 
42
- def initialize(wait_time: 2, page: nil, error: nil)
43
- @page = page
50
+ def initialize(object: nil, error: nil, response_code: nil)
51
+ @object = object
52
+ @response_code = response_code
53
+ @error = error
54
+ end
55
+ end
56
+
57
+ #
58
+ # retry
59
+ #
60
+ class RetryableCaller
61
+ attr_reader :origin_connector
62
+
63
+ def initialize(connector, logger:, wait_time:)
64
+ @origin_connector = connector
65
+ @logger = logger
44
66
  @wait_time = wait_time
45
- @response_code = error.try(:response_code) || page.try(:response_code)
67
+ @retry_attempt_count = 5
68
+ end
69
+
70
+ def call(url, &block)
71
+ block ||= ->(result) { break result }
72
+ connect(url, &block)
73
+ end
74
+
75
+ def connect(url, retry_attempt_count: @retry_attempt_count, &block)
76
+ @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
77
+ @origin_connector.call(url, &block)
78
+ rescue Spidy::Connector::Retry => e
79
+ @logger.call('retry.accessed': Time.current,
80
+ 'retry.uri': url,
81
+ 'retry.response_code': e.response_code,
82
+ 'retry.attempt_count': retry_attempt_count)
83
+
84
+ retry_attempt_count -= 1
85
+ if retry_attempt_count.positive?
86
+ sleep @wait_time
87
+ @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
88
+ retry
89
+ end
90
+ raise e.error
46
91
  end
47
92
  end
48
93
 
49
- class Builder
50
- attr_reader :origin_connector, :proxy_connector
94
+ #
95
+ # tor proxy
96
+ #
97
+ class TorConnector
98
+ attr_reader :connector, :socks_proxy
51
99
 
52
100
  def initialize(connector, socks_proxy)
101
+ @connector = connector
53
102
  @socks_proxy = socks_proxy
54
- @origin_connector = connector
55
- @proxy_connector =
56
- lambda do |url, &block|
57
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
58
- connector.call(url, &block)
59
- end
60
- end
61
103
  end
62
104
 
63
- def proxy_disabled?
64
- !tor?
105
+ def call(url, &block)
106
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
107
+ connector.call(url, &block)
108
+ end
65
109
  end
66
110
 
67
- def tor?
68
- Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
111
+ def try_connection?
112
+ try_connection!
69
113
  true
70
114
  rescue Errno::ECONNREFUSED
71
115
  false
72
116
  end
117
+
118
+ def try_connection!
119
+ Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
120
+ end
121
+ end
122
+
123
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
124
+ user_agent ||= USER_AGENT
125
+ logger ||= DEFAULT_LOGGER
126
+ wait_time ||= DEFAULT_WAIT_TIME
127
+
128
+ connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
129
+ RetryableCaller.new(connector, wait_time: wait_time, logger: logger)
73
130
  end
74
131
 
75
132
  #
76
133
  # get connection handller
77
134
  #
78
- def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
135
+ def self.get_connector(value, user_agent: nil, socks_proxy: nil)
79
136
  return value if value.respond_to?(:call)
80
137
 
81
- builder = Builder.new(const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT), socks_proxy)
82
- return builder.origin_connector if socks_proxy.nil? || builder.proxy_disabled?
138
+ connector = const_get(value.to_s.classify).new(user_agent: user_agent)
139
+ fail "Not defined connnector[#{value}]" if connector.nil?
140
+ return connector if socks_proxy.nil?
83
141
 
84
- builder.proxy_connector
142
+ tor = TorConnector.new(connector, socks_proxy)
143
+ tor
85
144
  end
86
145
  end
@@ -8,6 +8,6 @@ class Spidy::Connector::Direct
8
8
  yielder.call(resource)
9
9
  end
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil)
11
+ def initialize(user_agent:)
12
12
  end
13
13
  end
@@ -6,16 +6,13 @@
6
6
  class Spidy::Connector::Html
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time:, user_agent:, logger: nil)
10
- @wait_time = wait_time
11
- @logger = logger || proc { |values| STDERR.puts(values.to_json) }
9
+ def initialize(user_agent:)
12
10
  @agent = Mechanize.new
13
11
  @user_agent = user_agent
14
12
  @agent.user_agent = user_agent
15
13
  end
16
14
 
17
15
  attr_reader :agent
18
- attr_reader :logger
19
16
 
20
17
  def call(url, encoding: nil, retry_count: 5, &yielder)
21
18
  fail 'url is not specified' if url.blank?
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
23
20
  agent.default_encoding = encoding
24
21
  agent.force_default_encoding = true
25
22
  end
26
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
- get(url, retry_count, yielder)
28
- end
29
-
30
- private
31
-
32
- def get(url, retry_count, yielder)
33
23
  connect(url, retry_count, yielder)
34
- rescue Spidy::Connector::Retry => e
35
- logger.call('retry.accessed': Time.current,
36
- 'retry.uri': url,
37
- 'retry.response_code': e.response_code,
38
- 'retry.rest_count': retry_count)
24
+ end
39
25
 
26
+ def refresh!
40
27
  @agent = Mechanize.new
41
28
  @agent.user_agent = @user_agent
42
-
43
- retry_count -= 1
44
- if retry_count.positive?
45
- sleep e.wait_time
46
- retry
47
- end
48
- raise e
49
29
  end
50
30
 
31
+ private
32
+
51
33
  def connect(url, retry_count, yielder)
52
34
  result = nil
53
35
  agent.get(url) do |page|
54
- fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
36
+ fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
55
37
 
56
38
  result = yielder.call(page)
57
39
  end
58
40
  result
59
41
  rescue Mechanize::ResponseCodeError => e
60
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
- raise e
42
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
63
45
  end
64
46
  end
@@ -6,7 +6,9 @@
6
6
  class Spidy::Connector::Json
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time: nil, user_agent: nil)
9
+ attr_reader :logger
10
+
11
+ def initialize(user_agent:)
10
12
  @user_agent = user_agent
11
13
  end
12
14
 
@@ -15,7 +17,9 @@ class Spidy::Connector::Json
15
17
  connect(url, &block)
16
18
  end
17
19
 
18
- def connect(url)
20
+ def connect(url, retry_count: 5)
19
21
  OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
+ rescue OpenURI::HTTPError => e
23
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
20
24
  end
21
25
  end
@@ -6,15 +6,21 @@
6
6
  class Spidy::Connector::Xml
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def call(url)
9
+ def call(url, &block)
10
10
  fail 'URL is undefined' if url.blank?
11
11
 
12
+ connect(url, &block)
13
+ end
14
+
15
+ def connect(url, &block)
12
16
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
13
- yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
17
+ block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
14
18
  end
19
+ rescue OpenURI::HTTPError => e
20
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
15
21
  end
16
22
 
17
- def initialize(wait_time: nil, user_agent: nil)
23
+ def initialize(user_agent:)
18
24
  @user_agent = user_agent
19
25
  end
20
26
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.8'
4
+ VERSION = '0.3.3'
5
5
  end
@@ -30,6 +30,8 @@ Gem::Specification.new do |spec|
30
30
  spec.add_development_dependency 'rspec', '~> 3.0'
31
31
  spec.add_development_dependency 'ffaker'
32
32
  spec.add_development_dependency 'rspec-command'
33
+ spec.add_development_dependency 'capybara_discoball'
34
+ spec.add_development_dependency 'sinatra'
33
35
 
34
36
  spec.add_runtime_dependency 'tor'
35
37
  spec.add_runtime_dependency 'activesupport'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-07 00:00:00.000000000 Z
11
+ date: 2020-10-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,6 +94,34 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: capybara_discoball
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: sinatra
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: tor
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -187,6 +215,9 @@ files:
187
215
  - Rakefile
188
216
  - bin/console
189
217
  - bin/setup
218
+ - example/master_detail.rb
219
+ - example/proxy.rb
220
+ - example/retry.rb
190
221
  - exe/spidy
191
222
  - lib/spidy.rb
192
223
  - lib/spidy/binder.rb