spidy 0.2.9 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44d16509821e1779d821effb7c571d2a11cbb79166168943242628b2e53457ec
4
- data.tar.gz: 5b52389a8042c9e69aaeead0d96b81553da2e5f5e51ad16696f87249e38067bd
3
+ metadata.gz: f87cda14101ec7c184d3e3134f25601cf3c0d2ef2d22b9766f5ce2b417734212
4
+ data.tar.gz: 803f7012304d475280949f742fb5da7af588dc9f2ab85e3991973b4c60a81a46
5
5
  SHA512:
6
- metadata.gz: 3475b05ea1235b388960416a03ab96adb64608ce4046d1812b4d80f038a91b99343882e45413ee6a2f74e29626869aca8a48d34f8e57c64b480c8893efcda123
7
- data.tar.gz: 7e4fd1abf5898a7a7273d4eaf53021e4c1898d995e8e0d23abfcbf80c3e1b45b2f059c95e48707577bae72a4f158d7cba7f9be097e7e8a277d89e683386c32d6
6
+ metadata.gz: cd725d2f1a697d4f912e5b37113de561bb002423db61a666cf06404a4bcb3c46765ef1b54dd024f6dd4760f73dc75e285e828e2672348d70e3cb8d71861f5c5f
7
+ data.tar.gz: ba4550f6871bad2c10f8ad1964e6fd30534f66e02d6f034f916e1afa5247a0f23249fa91842b60477f6f2cbbe2829d8f7edf279655b020eeeb4d242b26d81970
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Spidy.define do
4
+ url_to_params = ->(url) {
5
+ uri = URI.parse(url)
6
+ params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
+ params if params.present?
8
+ }
9
+
10
+ master_page = proc { |url, &yielder|
11
+ params = url_to_params.call(url)
12
+ page = params&.dig('page')&.to_i || 0
13
+
14
+ limit_page = 3
15
+ per_page = 25
16
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
+ doc.html {
18
+ doc.body {
19
+ doc.span.bold {
20
+ doc.text "Hello world"
21
+ }
22
+ doc.main {
23
+ (page * per_page + 1).upto((page + 1) * per_page).each do |i|
24
+ doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
+ end
26
+ }
27
+ doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
+ }
29
+ }
30
+ }.doc)
31
+ }
32
+
33
+ detail_page = proc { |url, &yielder|
34
+ params = url_to_params.call(url)
35
+ id = params['id']
36
+
37
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
+ doc.html {
39
+ doc.body {
40
+ doc.span.bold {
41
+ doc.text "Hello world"
42
+ }
43
+ doc.h1("title_#{id}", id: 'title')
44
+ doc.main("body_#{id}", id: 'body')
45
+ doc.div.sub do
46
+ doc.span.name('testtest')
47
+ end
48
+ }
49
+ }
50
+ }.doc)
51
+ }
52
+
53
+ define(as: :html, connector: detail_page) do
54
+ let(:title, '#title')
55
+ let(:body, '#body')
56
+ end
57
+
58
+ define(:sub, as: :html, connector: :direct) do
59
+ let(:name, '.name')
60
+ end
61
+
62
+ spider(as: :html, connector: master_page) do |yielder, connector|
63
+ next_url = 'http://localhost'
64
+ while next_url.present?
65
+ connector.call(next_url) do |page|
66
+ page.search('main a').each do |a|
67
+ yielder.call(a.attr('href'))
68
+ end
69
+ next_url = page.at('a.next')&.attr('href')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ Spidy.define do
2
+ user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
+ socks_proxy '127.0.0.1', 9050
4
+
5
+ spider(as: :json) do |yielder, connector|
6
+ connector.call('https://httpbin.org/ip') do |json|
7
+ yielder.call(json[:origin])
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ Spidy.define do
2
+ spider(as: :json) do |yielder, connector|
3
+ connector.call('https://httpbin.org/status/500') do |json|
4
+ yielder.call(json[:origin])
5
+ end
6
+ end
7
+ end
@@ -10,10 +10,12 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ DEFAULT_WAIT_TIME = 5
14
+
13
15
  #
14
16
  # default user agent
15
17
  #
16
- USER_AGENT = [
18
+ DEFAULT_USER_AGENT = [
17
19
  'Mozilla/5.0',
18
20
  '(Macintosh; Intel Mac OS X 10_12_6)',
19
21
  'AppleWebKit/537.36',
@@ -43,56 +45,101 @@ module Spidy::Connector
43
45
  # retry class
44
46
  #
45
47
  class Retry < StandardError
46
- attr_reader :page
47
- attr_reader :response_code
48
- attr_reader :wait_time
48
+ attr_reader :object, :response_code, :error
49
+
50
+ def initialize(object: nil, error: nil, response_code: nil)
51
+ @object = object
52
+ @response_code = response_code
53
+ @error = error
54
+ end
55
+ end
56
+
57
+ #
58
+ # retry
59
+ #
60
+ class Retryable
61
+ attr_reader :origin_connector
49
62
 
50
- def initialize(wait_time: 2, page: nil, error: nil)
51
- @page = page
63
+ def initialize(connector, logger:, wait_time:)
64
+ @origin_connector = connector
65
+ @logger = logger
52
66
  @wait_time = wait_time
53
- @response_code = error.try(:response_code) || page.try(:response_code)
67
+ @retry_attempt_count = 5
68
+ end
69
+
70
+ def call(url, &block)
71
+ connect(url, &block)
72
+ end
73
+
74
+ def connect(url, retry_attempt_count: @retry_attempt_count, &block)
75
+ @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
76
+ @origin_connector.call(url, &block)
77
+ rescue Spidy::Connector::Retry => e
78
+ @logger.call('retry.accessed': Time.current,
79
+ 'retry.uri': url,
80
+ 'retry.response_code': e.response_code,
81
+ 'retry.attempt_count': retry_attempt_count)
82
+
83
+ retry_attempt_count -= 1
84
+ if retry_attempt_count.positive?
85
+ sleep @wait_time
86
+ @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
87
+ retry
88
+ end
89
+ raise e.error
54
90
  end
55
91
  end
56
92
 
57
- class Builder
58
- attr_reader :origin_connector, :proxy_connector
93
+ #
94
+ # tor proxy
95
+ #
96
+ class TorConnector
97
+ attr_reader :connector, :socks_proxy
59
98
 
60
99
  def initialize(connector, socks_proxy)
100
+ @connector = connector
61
101
  @socks_proxy = socks_proxy
62
- @origin_connector = connector
63
- @proxy_connector =
64
- lambda do |url, &block|
65
- Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
66
- connector.call(url, &block)
67
- end
68
- end
69
102
  end
70
103
 
71
- def proxy_disabled?
72
- !tor?
104
+ def call(url, &block)
105
+ Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
106
+ connector.call(url, &block)
107
+ end
73
108
  end
74
109
 
75
- def tor?
76
- Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
110
+ def try_connection?
111
+ try_connection!
77
112
  true
78
113
  rescue Errno::ECONNREFUSED
79
114
  false
80
115
  end
116
+
117
+ def try_connection!
118
+ Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
119
+ end
120
+ end
121
+
122
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
123
+ logger ||= DEFAULT_LOGGER
124
+ user_agent ||= DEFAULT_USER_AGENT
125
+ wait_time ||= DEFAULT_WAIT_TIME
126
+
127
+ connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
128
+ Retryable.new(connector, wait_time: wait_time, logger: logger)
81
129
  end
82
130
 
83
131
  #
84
132
  # get connection handller
85
133
  #
86
- def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
134
+ def self.get_connector(value, user_agent: nil, socks_proxy: nil)
87
135
  return value if value.respond_to?(:call)
88
136
 
89
- builder = Builder.new(const_get(value.to_s.classify).new(
90
- wait_time: wait_time || 5,
91
- user_agent: user_agent || USER_AGENT,
92
- logger: logger || DEFAULT_LOGGER,
93
- ), socks_proxy)
94
- return builder.origin_connector if socks_proxy.nil? || builder.proxy_disabled?
137
+ connector = const_get(value.to_s.classify).new(user_agent: user_agent)
138
+ fail "Not defined connnector[#{value}]" if connector.nil?
139
+ return connector if socks_proxy.nil?
95
140
 
96
- builder.proxy_connector
141
+ tor = TorConnector.new(connnector, socks_proxy)
142
+ tor.try_connection!
143
+ tor
97
144
  end
98
145
  end
@@ -8,6 +8,6 @@ class Spidy::Connector::Direct
8
8
  yielder.call(resource)
9
9
  end
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil, logger: nil)
11
+ def initialize(user_agent:)
12
12
  end
13
13
  end
@@ -6,16 +6,13 @@
6
6
  class Spidy::Connector::Html
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time:, user_agent:, logger: nil)
10
- @wait_time = wait_time
11
- @logger = logger
9
+ def initialize(user_agent:)
12
10
  @agent = Mechanize.new
13
11
  @user_agent = user_agent
14
12
  @agent.user_agent = user_agent
15
13
  end
16
14
 
17
15
  attr_reader :agent
18
- attr_reader :logger
19
16
 
20
17
  def call(url, encoding: nil, retry_count: 5, &yielder)
21
18
  fail 'url is not specified' if url.blank?
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
23
20
  agent.default_encoding = encoding
24
21
  agent.force_default_encoding = true
25
22
  end
26
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
- get(url, retry_count, yielder)
28
- end
29
-
30
- private
31
-
32
- def get(url, retry_count, yielder)
33
23
  connect(url, retry_count, yielder)
34
- rescue Spidy::Connector::Retry => e
35
- logger.call('retry.accessed': Time.current,
36
- 'retry.uri': url,
37
- 'retry.response_code': e.response_code,
38
- 'retry.rest_count': retry_count)
24
+ end
39
25
 
26
+ def refresh!
40
27
  @agent = Mechanize.new
41
28
  @agent.user_agent = @user_agent
42
-
43
- retry_count -= 1
44
- if retry_count.positive?
45
- sleep e.wait_time
46
- retry
47
- end
48
- raise e
49
29
  end
50
30
 
31
+ private
32
+
51
33
  def connect(url, retry_count, yielder)
52
34
  result = nil
53
35
  agent.get(url) do |page|
54
- fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
36
+ fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
55
37
 
56
38
  result = yielder.call(page)
57
39
  end
58
40
  result
59
41
  rescue Mechanize::ResponseCodeError => e
60
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
- raise e
42
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
63
45
  end
64
46
  end
@@ -8,10 +8,8 @@ class Spidy::Connector::Json
8
8
 
9
9
  attr_reader :logger
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil, logger: nil)
12
- @wait_time = wait_time
11
+ def initialize(user_agent: nil)
13
12
  @user_agent = user_agent
14
- @logger = logger
15
13
  end
16
14
 
17
15
  def call(url, &block)
@@ -22,16 +20,6 @@ class Spidy::Connector::Json
22
20
  def connect(url, retry_count: 5)
23
21
  OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
24
22
  rescue OpenURI::HTTPError => e
25
- logger.call('retry.accessed': Time.current,
26
- 'retry.uri': url,
27
- 'retry.response_code': e.message,
28
- 'retry.rest_count': retry_count)
29
-
30
- retry_count -= 1
31
- if retry_count.positive?
32
- sleep @wait_time
33
- retry
34
- end
35
- raise e
23
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
36
24
  end
37
25
  end
@@ -6,15 +6,21 @@
6
6
  class Spidy::Connector::Xml
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def call(url)
9
+ def call(url, &block)
10
10
  fail 'URL is undefined' if url.blank?
11
11
 
12
+ connect(url, &block)
13
+ end
14
+
15
+ def connect(url, &block)
12
16
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
13
- yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
17
+ block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
14
18
  end
19
+ rescue OpenURI::HTTPError => e
20
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
15
21
  end
16
22
 
17
- def initialize(user_agent: nil, logger: nil)
23
+ def initialize(user_agent:)
18
24
  @user_agent = user_agent
19
25
  end
20
26
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.9'
4
+ VERSION = '0.3.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.9
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-08 00:00:00.000000000 Z
11
+ date: 2020-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -187,6 +187,9 @@ files:
187
187
  - Rakefile
188
188
  - bin/console
189
189
  - bin/setup
190
+ - example/master_detail.rb
191
+ - example/proxy.rb
192
+ - example/retry.rb
190
193
  - exe/spidy
191
194
  - lib/spidy.rb
192
195
  - lib/spidy/binder.rb