spidy 0.2.9 → 0.3.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44d16509821e1779d821effb7c571d2a11cbb79166168943242628b2e53457ec
4
- data.tar.gz: 5b52389a8042c9e69aaeead0d96b81553da2e5f5e51ad16696f87249e38067bd
3
+ metadata.gz: f87cda14101ec7c184d3e3134f25601cf3c0d2ef2d22b9766f5ce2b417734212
4
+ data.tar.gz: 803f7012304d475280949f742fb5da7af588dc9f2ab85e3991973b4c60a81a46
5
5
  SHA512:
6
- metadata.gz: 3475b05ea1235b388960416a03ab96adb64608ce4046d1812b4d80f038a91b99343882e45413ee6a2f74e29626869aca8a48d34f8e57c64b480c8893efcda123
7
- data.tar.gz: 7e4fd1abf5898a7a7273d4eaf53021e4c1898d995e8e0d23abfcbf80c3e1b45b2f059c95e48707577bae72a4f158d7cba7f9be097e7e8a277d89e683386c32d6
6
+ metadata.gz: cd725d2f1a697d4f912e5b37113de561bb002423db61a666cf06404a4bcb3c46765ef1b54dd024f6dd4760f73dc75e285e828e2672348d70e3cb8d71861f5c5f
7
+ data.tar.gz: ba4550f6871bad2c10f8ad1964e6fd30534f66e02d6f034f916e1afa5247a0f23249fa91842b60477f6f2cbbe2829d8f7edf279655b020eeeb4d242b26d81970
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Spidy.define do
4
+ url_to_params = ->(url) {
5
+ uri = URI.parse(url)
6
+ params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
+ params if params.present?
8
+ }
9
+
10
+ master_page = proc { |url, &yielder|
11
+ params = url_to_params.call(url)
12
+ page = params&.dig('page')&.to_i || 0
13
+
14
+ limit_page = 3
15
+ per_page = 25
16
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
+ doc.html {
18
+ doc.body {
19
+ doc.span.bold {
20
+ doc.text "Hello world"
21
+ }
22
+ doc.main {
23
+ (page * per_page + 1).upto((page + 1) * per_page).each do |i|
24
+ doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
+ end
26
+ }
27
+ doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
+ }
29
+ }
30
+ }.doc)
31
+ }
32
+
33
+ detail_page = proc { |url, &yielder|
34
+ params = url_to_params.call(url)
35
+ id = params['id']
36
+
37
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
+ doc.html {
39
+ doc.body {
40
+ doc.span.bold {
41
+ doc.text "Hello world"
42
+ }
43
+ doc.h1("title_#{id}", id: 'title')
44
+ doc.main("body_#{id}", id: 'body')
45
+ doc.div.sub do
46
+ doc.span.name('testtest')
47
+ end
48
+ }
49
+ }
50
+ }.doc)
51
+ }
52
+
53
+ define(as: :html, connector: detail_page) do
54
+ let(:title, '#title')
55
+ let(:body, '#body')
56
+ end
57
+
58
+ define(:sub, as: :html, connector: :direct) do
59
+ let(:name, '.name')
60
+ end
61
+
62
+ spider(as: :html, connector: master_page) do |yielder, connector|
63
+ next_url = 'http://localhost'
64
+ while next_url.present?
65
+ connector.call(next_url) do |page|
66
+ page.search('main a').each do |a|
67
+ yielder.call(a.attr('href'))
68
+ end
69
+ next_url = page.at('a.next')&.attr('href')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ Spidy.define do
2
+ user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
+ socks_proxy '127.0.0.1', 9050
4
+
5
+ spider(as: :json) do |yielder, connector|
6
+ connector.call('https://httpbin.org/ip') do |json|
7
+ yielder.call(json[:origin])
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ Spidy.define do
2
+ spider(as: :json) do |yielder, connector|
3
+ connector.call('https://httpbin.org/status/500') do |json|
4
+ yielder.call(json[:origin])
5
+ end
6
+ end
7
+ end
@@ -10,10 +10,12 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ DEFAULT_WAIT_TIME = 5
14
+
13
15
  #
14
16
  # default user agent
15
17
  #
16
- USER_AGENT = [
18
+ DEFAULT_USER_AGENT = [
17
19
  'Mozilla/5.0',
18
20
  '(Macintosh; Intel Mac OS X 10_12_6)',
19
21
  'AppleWebKit/537.36',
@@ -43,56 +45,101 @@ module Spidy::Connector
43
45
  # retry class
44
46
  #
45
47
  class Retry < StandardError
46
- attr_reader :page
47
- attr_reader :response_code
48
- attr_reader :wait_time
48
+ attr_reader :object, :response_code, :error
49
+
50
+ def initialize(object: nil, error: nil, response_code: nil)
51
+ @object = object
52
+ @response_code = response_code
53
+ @error = error
54
+ end
55
+ end
56
+
57
+ #
58
+ # retry
59
+ #
60
+ class Retryable
61
+ attr_reader :origin_connector
49
62
 
50
- def initialize(wait_time: 2, page: nil, error: nil)
51
- @page = page
63
+ def initialize(connector, logger:, wait_time:)
64
+ @origin_connector = connector
65
+ @logger = logger
52
66
  @wait_time = wait_time
53
- @response_code = error.try(:response_code) || page.try(:response_code)
67
+ @retry_attempt_count = 5
68
+ end
69
+
70
+ def call(url, &block)
71
+ connect(url, &block)
72
+ end
73
+
74
+ def connect(url, retry_attempt_count: @retry_attempt_count, &block)
75
+ @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
76
+ @origin_connector.call(url, &block)
77
+ rescue Spidy::Connector::Retry => e
78
+ @logger.call('retry.accessed': Time.current,
79
+ 'retry.uri': url,
80
+ 'retry.response_code': e.response_code,
81
+ 'retry.attempt_count': retry_attempt_count)
82
+
83
+ retry_attempt_count -= 1
84
+ if retry_attempt_count.positive?
85
+ sleep @wait_time
86
+ @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
87
+ retry
88
+ end
89
+ raise e.error
54
90
  end
55
91
  end
56
92
 
57
- class Builder
58
- attr_reader :origin_connector, :proxy_connector
93
+ #
94
+ # tor proxy
95
+ #
96
+ class TorConnector
97
+ attr_reader :connector, :socks_proxy
59
98
 
60
99
  def initialize(connector, socks_proxy)
100
+ @connector = connector
61
101
  @socks_proxy = socks_proxy
62
- @origin_connector = connector
63
- @proxy_connector =
64
- lambda do |url, &block|
65
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
66
- connector.call(url, &block)
67
- end
68
- end
69
102
  end
70
103
 
71
- def proxy_disabled?
72
- !tor?
104
+ def call(url, &block)
105
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
106
+ connector.call(url, &block)
107
+ end
73
108
  end
74
109
 
75
- def tor?
76
- Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
110
+ def try_connection?
111
+ try_connection!
77
112
  true
78
113
  rescue Errno::ECONNREFUSED
79
114
  false
80
115
  end
116
+
117
+ def try_connection!
118
+ Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
119
+ end
120
+ end
121
+
122
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
123
+ logger ||= DEFAULT_LOGGER
124
+ user_agent ||= DEFAULT_USER_AGENT
125
+ wait_time ||= DEFAULT_WAIT_TIME
126
+
127
+ connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
128
+ Retryable.new(connector, wait_time: wait_time, logger: logger)
81
129
  end
82
130
 
83
131
  #
84
132
  # get connection handller
85
133
  #
86
- def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
134
+ def self.get_connector(value, user_agent: nil, socks_proxy: nil)
87
135
  return value if value.respond_to?(:call)
88
136
 
89
- builder = Builder.new(const_get(value.to_s.classify).new(
90
- wait_time: wait_time || 5,
91
- user_agent: user_agent || USER_AGENT,
92
- logger: logger || DEFAULT_LOGGER,
93
- ), socks_proxy)
94
- return builder.origin_connector if socks_proxy.nil? || builder.proxy_disabled?
137
+ connector = const_get(value.to_s.classify).new(user_agent: user_agent)
138
+ fail "Not defined connnector[#{value}]" if connector.nil?
139
+ return connector if socks_proxy.nil?
95
140
 
96
- builder.proxy_connector
141
+ tor = TorConnector.new(connnector, socks_proxy)
142
+ tor.try_connection!
143
+ tor
97
144
  end
98
145
  end
@@ -8,6 +8,6 @@ class Spidy::Connector::Direct
8
8
  yielder.call(resource)
9
9
  end
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil, logger: nil)
11
+ def initialize(user_agent:)
12
12
  end
13
13
  end
@@ -6,16 +6,13 @@
6
6
  class Spidy::Connector::Html
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time:, user_agent:, logger: nil)
10
- @wait_time = wait_time
11
- @logger = logger
9
+ def initialize(user_agent:)
12
10
  @agent = Mechanize.new
13
11
  @user_agent = user_agent
14
12
  @agent.user_agent = user_agent
15
13
  end
16
14
 
17
15
  attr_reader :agent
18
- attr_reader :logger
19
16
 
20
17
  def call(url, encoding: nil, retry_count: 5, &yielder)
21
18
  fail 'url is not specified' if url.blank?
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
23
20
  agent.default_encoding = encoding
24
21
  agent.force_default_encoding = true
25
22
  end
26
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
- get(url, retry_count, yielder)
28
- end
29
-
30
- private
31
-
32
- def get(url, retry_count, yielder)
33
23
  connect(url, retry_count, yielder)
34
- rescue Spidy::Connector::Retry => e
35
- logger.call('retry.accessed': Time.current,
36
- 'retry.uri': url,
37
- 'retry.response_code': e.response_code,
38
- 'retry.rest_count': retry_count)
24
+ end
39
25
 
26
+ def refresh!
40
27
  @agent = Mechanize.new
41
28
  @agent.user_agent = @user_agent
42
-
43
- retry_count -= 1
44
- if retry_count.positive?
45
- sleep e.wait_time
46
- retry
47
- end
48
- raise e
49
29
  end
50
30
 
31
+ private
32
+
51
33
  def connect(url, retry_count, yielder)
52
34
  result = nil
53
35
  agent.get(url) do |page|
54
- fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
36
+ fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
55
37
 
56
38
  result = yielder.call(page)
57
39
  end
58
40
  result
59
41
  rescue Mechanize::ResponseCodeError => e
60
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
- raise e
42
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
63
45
  end
64
46
  end
@@ -8,10 +8,8 @@ class Spidy::Connector::Json
8
8
 
9
9
  attr_reader :logger
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil, logger: nil)
12
- @wait_time = wait_time
11
+ def initialize(user_agent: nil)
13
12
  @user_agent = user_agent
14
- @logger = logger
15
13
  end
16
14
 
17
15
  def call(url, &block)
@@ -22,16 +20,6 @@ class Spidy::Connector::Json
22
20
  def connect(url, retry_count: 5)
23
21
  OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
24
22
  rescue OpenURI::HTTPError => e
25
- logger.call('retry.accessed': Time.current,
26
- 'retry.uri': url,
27
- 'retry.response_code': e.message,
28
- 'retry.rest_count': retry_count)
29
-
30
- retry_count -= 1
31
- if retry_count.positive?
32
- sleep @wait_time
33
- retry
34
- end
35
- raise e
23
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
36
24
  end
37
25
  end
@@ -6,15 +6,21 @@
6
6
  class Spidy::Connector::Xml
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def call(url)
9
+ def call(url, &block)
10
10
  fail 'URL is undefined' if url.blank?
11
11
 
12
+ connect(url, &block)
13
+ end
14
+
15
+ def connect(url, &block)
12
16
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
13
- yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
17
+ block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
14
18
  end
19
+ rescue OpenURI::HTTPError => e
20
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
15
21
  end
16
22
 
17
- def initialize(user_agent: nil, logger: nil)
23
+ def initialize(user_agent:)
18
24
  @user_agent = user_agent
19
25
  end
20
26
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.9'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.9
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-08 00:00:00.000000000 Z
11
+ date: 2020-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -187,6 +187,9 @@ files:
187
187
  - Rakefile
188
188
  - bin/console
189
189
  - bin/setup
190
+ - example/master_detail.rb
191
+ - example/proxy.rb
192
+ - example/retry.rb
190
193
  - exe/spidy
191
194
  - lib/spidy.rb
192
195
  - lib/spidy/binder.rb