spidy 0.2.7 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 177e6b5e3b4f864251d7071d23b2b42b219d8193673a5b57ee9b376e393092a9
4
- data.tar.gz: 92b39a277d752eb5b1944f822702eae803356f7f5e3211b1f91e2d9ad635090e
3
+ metadata.gz: 693834c20f838dfc7bfed8bfe3d77f21b519d68840f5f5ce3c18e971de496b8e
4
+ data.tar.gz: b3a43c5ef746fc550fa617157e3a779916cb6f1963b48896c8e6ef919cc5f9ad
5
5
  SHA512:
6
- metadata.gz: b4078e3d1ad9813b0273f267d99bf7a9bf4473bb86b6cfa4fa283e6c5b060fa248e0b6fb517bede448f94e0dcebbb90ee71ef4eeb4bd5a45b1405c50005985fe
7
- data.tar.gz: 00aa2cd512cf3a083082227234f59148d86e8ee0e4f602e07d1c67cbf9f3e8160b9144ebd29f3a67efded607c1705c2ce3008df235325afc7f3a715a6dd7527d
6
+ metadata.gz: 4dd3ffbc9f786b7c7b4d20f4ca07f7873922ca6633730b143f7b071063a8a08a30a83cd4444efc66ca1a5bc820d1e6ea3d5c2c16bd83447ee59454084f82d6ba
7
+ data.tar.gz: 299293fdf0fb701e5adfd49f6a7f91c712bf911ec78ee555eefb07e7f0c0247f48bc63e9ab414fcc900545a506a0ac1bd47625edf1905973ed2e447d376f2b80
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.2.6)
4
+ spidy (0.2.9)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Spidy.define do
4
+ url_to_params = ->(url) {
5
+ uri = URI.parse(url)
6
+ params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
+ params if params.present?
8
+ }
9
+
10
+ master_page = proc { |url, &yielder|
11
+ params = url_to_params.call(url)
12
+ page = params&.dig('page')&.to_i || 0
13
+
14
+ limit_page = 3
15
+ per_page = 25
16
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
+ doc.html {
18
+ doc.body {
19
+ doc.span.bold {
20
+ doc.text "Hello world"
21
+ }
22
+ doc.main {
23
+ (page * per_page + 1).upto((page + 1) * per_page).each do |i|
24
+ doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
+ end
26
+ }
27
+ doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
+ }
29
+ }
30
+ }.doc)
31
+ }
32
+
33
+ detail_page = proc { |url, &yielder|
34
+ params = url_to_params.call(url)
35
+ id = params['id']
36
+
37
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
38
+ doc.html {
39
+ doc.body {
40
+ doc.span.bold {
41
+ doc.text "Hello world"
42
+ }
43
+ doc.h1("title_#{id}", id: 'title')
44
+ doc.main("body_#{id}", id: 'body')
45
+ doc.div.sub do
46
+ doc.span.name('testtest')
47
+ end
48
+ }
49
+ }
50
+ }.doc)
51
+ }
52
+
53
+ define(as: :html, connector: detail_page) do
54
+ let(:title, '#title')
55
+ let(:body, '#body')
56
+ end
57
+
58
+ define(:sub, as: :html, connector: :direct) do
59
+ let(:name, '.name')
60
+ end
61
+
62
+ spider(as: :html, connector: master_page) do |yielder, connector|
63
+ next_url = 'http://localhost'
64
+ while next_url.present?
65
+ connector.call(next_url) do |page|
66
+ page.search('main a').each do |a|
67
+ yielder.call(a.attr('href'))
68
+ end
69
+ next_url = page.at('a.next')&.attr('href')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ Spidy.define do
2
+ user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
+ socks_proxy '127.0.0.1', 9050
4
+
5
+ spider(as: :json) do |yielder, connector|
6
+ connector.call('https://httpbin.org/ip') do |json|
7
+ yielder.call(json[:origin])
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ Spidy.define do
2
+ spider(as: :json) do |yielder, connector|
3
+ connector.call('https://httpbin.org/status/500') do |json|
4
+ yielder.call(json[:origin])
5
+ end
6
+ end
7
+ end
@@ -10,11 +10,33 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
+ DEFAULT_WAIT_TIME = 5
14
+
15
+ #
16
+ # default user agent
17
+ #
18
+ DEFAULT_USER_AGENT = [
19
+ 'Mozilla/5.0',
20
+ '(Macintosh; Intel Mac OS X 10_12_6)',
21
+ 'AppleWebKit/537.36',
22
+ '(KHTML, like Gecko)',
23
+ 'Chrome/64.0.3282.186',
24
+ 'Safari/537.36'
25
+ ].join(' ')
26
+
27
+ #
28
+ # error output logger
29
+ #
30
+ DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
31
+
32
+ #
33
+ # static method
34
+ #
13
35
  module StaticAccessor
14
36
  extend ActiveSupport::Concern
15
37
  class_methods do
16
- def call(url, wait_time: nil, user_agent: Spidy::Connector::USER_AGENT, &block)
17
- new(wait_time: wait_time, user_agent: user_agent).call(url, &block)
38
+ def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
39
+ new(wait_time: wait_time, user_agent: user_agent, logger: logger).call(url, &block)
18
40
  end
19
41
  end
20
42
  end
@@ -23,42 +45,100 @@ module Spidy::Connector
23
45
  # retry class
24
46
  #
25
47
  class Retry < StandardError
26
- attr_reader :page
27
- attr_reader :response_code
28
- attr_reader :wait_time
48
+ attr_reader :object, :response_code, :error
29
49
 
30
- def initialize(wait_time: 2, page: nil, error: nil)
31
- @page = page
32
- @wait_time = wait_time
33
- @response_code = error.try(:response_code) || page.try(:response_code)
50
+ def initialize(object: nil, error: nil, response_code: nil)
51
+ @object = object
52
+ @response_code = response_code
53
+ @error = error
34
54
  end
35
55
  end
36
56
 
37
57
  #
38
- # default user agent
58
+ # retry
39
59
  #
40
- USER_AGENT = [
41
- 'Mozilla/5.0',
42
- '(Macintosh; Intel Mac OS X 10_12_6)',
43
- 'AppleWebKit/537.36',
44
- '(KHTML, like Gecko)',
45
- 'Chrome/64.0.3282.186',
46
- 'Safari/537.36'
47
- ].join(' ')
60
+ class Retryable
61
+ attr_reader :origin_connector
62
+
63
+ def initialize(connector, logger:, wait_time:)
64
+ @origin_connector = connector
65
+ @logger = logger
66
+ @wait_time = wait_time
67
+ @retry_attempt_count = 5
68
+ end
69
+
70
+ def call(url, &block)
71
+ connect(url, &block)
72
+ end
73
+
74
+ def connect(url, retry_attempt_count: @retry_attempt_count, &block)
75
+ @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
76
+ @origin_connector.call(url, &block)
77
+ rescue Spidy::Connector::Retry => e
78
+ @logger.call('retry.accessed': Time.current,
79
+ 'retry.uri': url,
80
+ 'retry.response_code': e.response_code,
81
+ 'retry.attempt_count': retry_attempt_count)
82
+
83
+ retry_attempt_count -= 1
84
+ if retry_attempt_count.positive?
85
+ sleep @wait_time
86
+ @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
87
+ retry
88
+ end
89
+ raise e.error
90
+ end
91
+ end
48
92
 
49
93
  #
50
- # get connection handller
94
+ # tor proxy
51
95
  #
52
- def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
53
- return value if value.respond_to?(:call)
96
+ class TorConnector
97
+ attr_reader :connector, :socks_proxy
54
98
 
55
- connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
56
- return connector if socks_proxy.nil?
99
+ def initialize(connector, socks_proxy)
100
+ @connector = connector
101
+ @socks_proxy = socks_proxy
102
+ end
57
103
 
58
- lambda do |url, &block|
104
+ def call(url, &block)
59
105
  Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
60
106
  connector.call(url, &block)
61
107
  end
62
108
  end
109
+
110
+ def try_connection?
111
+ try_connection!
112
+ true
113
+ rescue Errno::ECONNREFUSED
114
+ false
115
+ end
116
+
117
+ def try_connection!
118
+ Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
119
+ end
120
+ end
121
+
122
+ def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
123
+ logger ||= DEFAULT_LOGGER
124
+ user_agent ||= DEFAULT_USER_AGENT
125
+ wait_time ||= DEFAULT_WAIT_TIME
126
+
127
+ connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
128
+ Retryable.new(connector, wait_time: wait_time, logger: logger)
129
+ end
130
+
131
+ #
132
+ # get connection handller
133
+ #
134
+ def self.get_connector(value, user_agent: nil, socks_proxy: nil)
135
+ return value if value.respond_to?(:call)
136
+
137
+ connector = const_get(value.to_s.classify).new(user_agent: user_agent)
138
+ fail "Not defined connnector[#{value}]" if connector.nil?
139
+ return connector if socks_proxy.nil?
140
+
141
+ tor = TorConnector.new(connector, socks_proxy)
142
+ tor
63
143
  end
64
144
  end
@@ -8,6 +8,6 @@ class Spidy::Connector::Direct
8
8
  yielder.call(resource)
9
9
  end
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil)
11
+ def initialize(user_agent:)
12
12
  end
13
13
  end
@@ -6,16 +6,13 @@
6
6
  class Spidy::Connector::Html
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time:, user_agent:, logger: nil)
10
- @wait_time = wait_time
11
- @logger = logger || proc { |values| STDERR.puts(values.to_json) }
9
+ def initialize(user_agent:)
12
10
  @agent = Mechanize.new
13
11
  @user_agent = user_agent
14
12
  @agent.user_agent = user_agent
15
13
  end
16
14
 
17
15
  attr_reader :agent
18
- attr_reader :logger
19
16
 
20
17
  def call(url, encoding: nil, retry_count: 5, &yielder)
21
18
  fail 'url is not specified' if url.blank?
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
23
20
  agent.default_encoding = encoding
24
21
  agent.force_default_encoding = true
25
22
  end
26
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
27
- get(url, retry_count, yielder)
28
- end
29
-
30
- private
31
-
32
- def get(url, retry_count, yielder)
33
23
  connect(url, retry_count, yielder)
34
- rescue Spidy::Connector::Retry => e
35
- logger.call('retry.accessed': Time.current,
36
- 'retry.uri': url,
37
- 'retry.response_code': e.response_code,
38
- 'retry.rest_count': retry_count)
24
+ end
39
25
 
26
+ def refresh!
40
27
  @agent = Mechanize.new
41
28
  @agent.user_agent = @user_agent
42
-
43
- retry_count -= 1
44
- if retry_count.positive?
45
- sleep e.wait_time
46
- retry
47
- end
48
- raise e
49
29
  end
50
30
 
31
+ private
32
+
51
33
  def connect(url, retry_count, yielder)
52
34
  result = nil
53
35
  agent.get(url) do |page|
54
- fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
36
+ fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
55
37
 
56
38
  result = yielder.call(page)
57
39
  end
58
40
  result
59
41
  rescue Mechanize::ResponseCodeError => e
60
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
61
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
62
- raise e
42
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
63
45
  end
64
46
  end
@@ -6,7 +6,9 @@
6
6
  class Spidy::Connector::Json
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def initialize(wait_time: nil, user_agent: nil)
9
+ attr_reader :logger
10
+
11
+ def initialize(user_agent: nil)
10
12
  @user_agent = user_agent
11
13
  end
12
14
 
@@ -15,7 +17,9 @@ class Spidy::Connector::Json
15
17
  connect(url, &block)
16
18
  end
17
19
 
18
- def connect(url)
20
+ def connect(url, retry_count: 5)
19
21
  OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
+ rescue OpenURI::HTTPError => e
23
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
20
24
  end
21
25
  end
@@ -6,15 +6,21 @@
6
6
  class Spidy::Connector::Xml
7
7
  include Spidy::Connector::StaticAccessor
8
8
 
9
- def call(url)
9
+ def call(url, &block)
10
10
  fail 'URL is undefined' if url.blank?
11
11
 
12
+ connect(url, &block)
13
+ end
14
+
15
+ def connect(url, &block)
12
16
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
13
- yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
17
+ block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
14
18
  end
19
+ rescue OpenURI::HTTPError => e
20
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
15
21
  end
16
22
 
17
- def initialize(wait_time: nil, user_agent: nil)
23
+ def initialize(user_agent:)
18
24
  @user_agent = user_agent
19
25
  end
20
26
  end
@@ -28,10 +28,6 @@ module Spidy::Definition
28
28
  @socks_proxy = { host: host, port: port }
29
29
  end
30
30
 
31
- def tor?
32
- Tor.running?
33
- end
34
-
35
31
  def each(source = nil, name: :default, &yielder)
36
32
  name = name.presence || :default
37
33
  spidy = @namespace[:"#{name}_spider"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.7'
4
+ VERSION = '0.3.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-01 00:00:00.000000000 Z
11
+ date: 2020-09-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -187,6 +187,9 @@ files:
187
187
  - Rakefile
188
188
  - bin/console
189
189
  - bin/setup
190
+ - example/master_detail.rb
191
+ - example/proxy.rb
192
+ - example/retry.rb
190
193
  - exe/spidy
191
194
  - lib/spidy.rb
192
195
  - lib/spidy/binder.rb