spidy 0.2.6 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c6c2e2e85979c5d5492564fec243c23c44ef1d762230cd1a59400fcc46bf42a3
4
- data.tar.gz: d551534240f528923ccb862e17ae521aaccdc5596e45c09e1cee0e90f0c990c5
3
+ metadata.gz: 52bd83a0c2e46c24579454780bc89ea49c9cfab9a38403b117f14727e44b571f
4
+ data.tar.gz: 3d258302ce5a59d48a68b1bc673f7517b2bd25be7129f29ec3e2ea11a8161640
5
5
  SHA512:
6
- metadata.gz: 3348c6243500f90f7157435bf36f1675744f46c0025dc4bab81fad4ccfb65c4e2ac0d029a837fb589c768b4868fbaf8ab7c258ad67aa4e1a9f33f0949701b2da
7
- data.tar.gz: ef8d804eae97e2300747abb1b9e97f84e100b6b151e01b587fff95c98327ce21e0024456c6d8208ecd81ceaead8a0353d251f15a4ef4deff7e3b077e13c872cb
6
+ metadata.gz: 26cdfeefd087471eb5ed62937176014a01b0b083e2ebed8dd046a27c9ec9cd795a36a1bed59fdfdc29fca9cf186a97125ee8db0b255701869c51ca268a0f737f
7
+ data.tar.gz: d796fe650f6fd98a52c9aaad2aed53f73de3efb1a9a7b5147daf3e455a63d38d294a41471e9ac7198f38e7c79f0f308cbb70b71786bbb33f5413bfd73138f242
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- spidy (0.2.6)
4
+ spidy (0.2.9)
5
5
  activesupport
6
6
  mechanize
7
7
  pry
@@ -0,0 +1,73 @@
1
+
2
+
3
+ Spidy.define do
4
+ url_to_params = ->(url) {
5
+ uri = URI.parse(url)
6
+ params = URI.decode_www_form(uri.query).to_h if uri.query.present?
7
+ params if params.present?
8
+ }
9
+
10
+ master_page = proc { |url, &yielder|
11
+ params = url_to_params.call(url)
12
+ page = params&.dig('page')&.to_i || 0
13
+
14
+ limit_page = 3
15
+ per_page = 25
16
+ yielder.call(Nokogiri::HTML::Builder.new { |doc|
17
+ doc.html {
18
+ doc.body {
19
+ doc.span.bold {
20
+ doc.text "Hello world"
21
+ }
22
+ doc.main {
23
+ (page * per_page + 1).upto((page + 1) * per_page).each do |i|
24
+ doc.a("page #{i}", href: "http://localhost/?id=#{i}")
25
+ end
26
+ }
27
+ doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
28
+ }
29
+ }
30
+ }.doc)
31
+ }
32
+
33
# Stub connector for detail pages: builds an in-memory HTML document from the
# `id` query parameter of the given URL and hands it to the block.
detail_page = proc { |url, &yielder|
  # url_to_params returns nil when the URL has no query string, so the id is
  # read nil-safely (master_page reads `page` the same way).
  id = url_to_params.call(url)&.dig('id')

  document = Nokogiri::HTML::Builder.new { |doc|
    doc.html {
      doc.body {
        doc.span.bold {
          doc.text "Hello world"
        }
        doc.h1("title_#{id}", id: 'title')
        doc.main("body_#{id}", id: 'body')
        doc.div.sub do
          doc.span.name('testtest')
        end
      }
    }
  }.doc

  yielder.call(document)
}
52
+
53
+ define(as: :html, connector: detail_page) do
54
+ let(:title, '#title')
55
+ let(:body, '#body')
56
+ end
57
+
58
+ define(:sub, as: :html, connector: :direct) do
59
+ let(:name, '.name')
60
+ end
61
+
62
+ spider(as: :html, connector: master_page) do |yielder, connector|
63
+ next_url = 'http://localhost'
64
+ while next_url.present?
65
+ connector.call(next_url) do |page|
66
+ page.search('main a').each do |a|
67
+ yielder.call(a.attr('href'))
68
+ end
69
+ next_url = page.at('a.next')&.attr('href')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,10 @@
1
+ Spidy.define do
2
+ user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
3
+ socks_proxy '127.0.0.1', 9050
4
+
5
+ spider(as: :json) do |yielder, connector|
6
+ connector.call('https://httpbin.org/ip') do |json|
7
+ yielder.call(json[:origin])
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ Spidy.define do
2
+ spider(as: :json) do |yielder, connector|
3
+ connector.call('https://httpbin.org/status/500') do |json|
4
+ yielder.call(json[:origin])
5
+ end
6
+ end
7
+ end
@@ -10,25 +10,12 @@ module Spidy::Connector
10
10
  autoload :Json
11
11
  autoload :Xml
12
12
 
13
- #
14
- # retry class
15
- #
16
- class Retry < StandardError
17
- attr_reader :page
18
- attr_reader :response_code
19
- attr_reader :wait_time
20
-
21
- def initialize(wait_time: 2, page: nil, error: nil)
22
- @page = page
23
- @wait_time = wait_time
24
- @response_code = error.try(:response_code) || page.try(:response_code)
25
- end
26
- end
13
+ DEFAULT_WAIT_TIME = 5
27
14
 
28
15
  #
29
16
  # default user agent
30
17
  #
31
- USER_AGENT = [
18
+ DEFAULT_USER_AGENT = [
32
19
  'Mozilla/5.0',
33
20
  '(Macintosh; Intel Mac OS X 10_12_6)',
34
21
  'AppleWebKit/537.36',
@@ -38,18 +25,121 @@ module Spidy::Connector
38
25
  ].join(' ')
39
26
 
40
27
  #
41
- # get connection handller
28
+ # error output logger
42
29
  #
43
- def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
44
- return value if value.respond_to?(:call)
30
+ DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
45
31
 
46
- connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
47
- return connector if socks_proxy.nil?
32
+ #
33
+ # static method
34
+ #
35
module StaticAccessor
  extend ActiveSupport::Concern

  class_methods do
    # Class-level shortcut: builds a connector for a single request and runs
    # it with retry handling, yielding the fetched resource to the block.
    #
    # url        - address to fetch
    # wait_time  - seconds to sleep between retry attempts
    # logger     - callable that receives a hash of log values
    # user_agent - User-Agent string handed to the connector
    def call(url,
             wait_time: Spidy::Connector::DEFAULT_WAIT_TIME,
             logger: Spidy::Connector::DEFAULT_LOGGER,
             user_agent: Spidy::Connector::DEFAULT_USER_AGENT,
             &block)
      # Connectors only accept `user_agent:`; waiting and logging are the
      # responsibility of Retryable, so those options are passed there.
      connector = new(user_agent: user_agent)
      Spidy::Connector::Retryable.new(connector, wait_time: wait_time, logger: logger).call(url, &block)
    end
  end
end
43
+
44
+ #
45
+ # retry class
46
+ #
47
class Retry < StandardError
  # object        - the fetched resource that triggered the retry, if any
  # response_code - response code reported by the connector, if any
  # error         - the underlying exception being wrapped, if any
  attr_reader :object, :response_code, :error

  def initialize(object: nil, error: nil, response_code: nil)
    # Give the exception a meaningful message: reuse the wrapped error's
    # message when there is one, otherwise describe the response code.
    super(error ? error.message : "retry requested (response_code: #{response_code.inspect})")
    @object = object
    @response_code = response_code
    @error = error
  end
end
56
+
57
+ #
58
+ # retry
59
+ #
60
class Retryable
  attr_reader :origin_connector

  # connector - any object responding to `call(url, &block)`
  # logger    - callable that receives a hash of log values
  # wait_time - seconds to sleep before each retry
  def initialize(connector, logger:, wait_time:)
    @origin_connector = connector
    @logger = logger
    @wait_time = wait_time
    @retry_attempt_count = 5
  end

  # Fetches +url+ through the wrapped connector, retrying on
  # Spidy::Connector::Retry.
  def call(url, &block)
    connect(url, &block)
  end

  # Runs the wrapped connector and retries while attempts remain.
  #
  # Each attempt is logged. When a Retry is rescued the remaining attempt
  # count is logged and decremented; if attempts remain, it sleeps, refreshes
  # the connector (when it supports `refresh!`) and tries again.
  #
  # Raises the error wrapped by the Retry once attempts are exhausted, or the
  # Retry itself when it wraps no error.
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
    attempts_left = retry_attempt_count
    begin
      @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
      @origin_connector.call(url, &block)
    rescue Spidy::Connector::Retry => e
      @logger.call('retry.accessed': Time.current,
                   'retry.uri': url,
                   'retry.response_code': e.response_code,
                   'retry.attempt_count': attempts_left)

      attempts_left -= 1
      # A Retry may be raised without an underlying error; `raise nil` would
      # be a TypeError, so fall back to the Retry itself.
      raise(e.error || e) unless attempts_left.positive?

      sleep @wait_time
      @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
      retry
    end
  end
end
48
92
 
49
- lambda do |url, &block|
93
+ #
94
+ # tor proxy
95
+ #
96
class TorConnector
  attr_reader :connector, :socks_proxy

  # connector   - the connector whose requests are sent through the proxy
  # socks_proxy - hash with :host and :port keys
  def initialize(connector, socks_proxy)
    @connector = connector
    @socks_proxy = socks_proxy
  end

  # Runs the wrapped connector inside a Socksify proxy block.
  def call(url, &block)
    host = socks_proxy[:host]
    port = socks_proxy[:port]
    Socksify::proxy(host, port) { connector.call(url, &block) }
  end

  # Returns true when try_connection! succeeds, false when the connection
  # is refused.
  def try_connection?
    begin
      try_connection!
    rescue Errno::ECONNREFUSED
      return false
    end
    true
  end

  # Opens a Tor::Controller against the configured host/port and closes it
  # again; any connection error propagates to the caller.
  def try_connection!
    controller = Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port])
    controller.close
  end
end
121
+
122
# Resolves +value+ to a connector and wraps it in Retryable.
# Options left nil fall back to the module defaults.
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
  Retryable.new(
    get_connector(value, user_agent: user_agent || DEFAULT_USER_AGENT, socks_proxy: socks_proxy),
    wait_time: wait_time || DEFAULT_WAIT_TIME,
    logger: logger || DEFAULT_LOGGER
  )
end
130
+
131
+ #
132
+ # get connection handler
133
+ #
134
# Resolves +value+ to a connector.
#
# value       - a callable (returned unchanged) or a connector name such as
#               :html, :json or :xml, looked up as a constant of this module
# user_agent  - User-Agent string passed to the connector's constructor
# socks_proxy - optional hash with :host and :port; when given, the connector
#               is wrapped in TorConnector after a connection check
#
# Raises RuntimeError when no connector is defined for +value+.
def self.get_connector(value, user_agent: nil, socks_proxy: nil)
  return value if value.respond_to?(:call)

  # `new` never returns nil, so an unknown name has to be caught at the
  # constant lookup for the "Not defined" error to be reachable.
  begin
    connector_class = const_get(value.to_s.classify)
  rescue NameError
    fail "Not defined connnector[#{value}]"
  end

  connector = connector_class.new(user_agent: user_agent)
  return connector if socks_proxy.nil?

  tor = TorConnector.new(connector, socks_proxy)
  tor.try_connection!
  tor
end
55
145
  end
@@ -8,6 +8,6 @@ class Spidy::Connector::Direct
8
8
  yielder.call(resource)
9
9
  end
10
10
 
11
- def initialize(wait_time: nil, user_agent: nil)
11
+ def initialize(user_agent:)
12
12
  end
13
13
  end
@@ -4,16 +4,15 @@
4
4
  # Mechanize wrapper
5
5
  #
6
6
  class Spidy::Connector::Html
7
- def initialize(wait_time:, user_agent:, logger: nil)
8
- @wait_time = wait_time
9
- @logger = logger || proc { |values| STDERR.puts(values.to_json) }
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ def initialize(user_agent:)
10
10
  @agent = Mechanize.new
11
11
  @user_agent = user_agent
12
12
  @agent.user_agent = user_agent
13
13
  end
14
14
 
15
15
  attr_reader :agent
16
- attr_reader :logger
17
16
 
18
17
  def call(url, encoding: nil, retry_count: 5, &yielder)
19
18
  fail 'url is not specified' if url.blank?
@@ -21,42 +20,27 @@ class Spidy::Connector::Html
21
20
  agent.default_encoding = encoding
22
21
  agent.force_default_encoding = true
23
22
  end
24
- logger.call('connnector.get': url, 'connnector.accessed': Time.current)
25
- get(url, retry_count, yielder)
26
- end
27
-
28
- private
29
-
30
- def get(url, retry_count, yielder)
31
23
  connect(url, retry_count, yielder)
32
- rescue Spidy::Connector::Retry => e
33
- logger.call('retry.accessed': Time.current,
34
- 'retry.uri': url,
35
- 'retry.response_code': e.response_code,
36
- 'retry.rest_count': retry_count)
24
+ end
37
25
 
26
+ def refresh!
38
27
  @agent = Mechanize.new
39
28
  @agent.user_agent = @user_agent
40
-
41
- retry_count -= 1
42
- if retry_count.positive?
43
- sleep e.wait_time
44
- retry
45
- end
46
- raise e
47
29
  end
48
30
 
31
+ private
32
+
49
33
  def connect(url, retry_count, yielder)
50
34
  result = nil
51
35
  agent.get(url) do |page|
52
- fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
36
+ fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
53
37
 
54
38
  result = yielder.call(page)
55
39
  end
56
40
  result
57
41
  rescue Mechanize::ResponseCodeError => e
58
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
59
- raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
60
- raise e
42
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
43
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
44
+ raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
61
45
  end
62
46
  end
@@ -4,7 +4,11 @@
4
4
  # OpenURI to JSON.parse
5
5
  #
6
6
  class Spidy::Connector::Json
7
- def initialize(wait_time: nil, user_agent: nil)
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ attr_reader :logger
10
+
11
+ def initialize(user_agent: nil)
8
12
  @user_agent = user_agent
9
13
  end
10
14
 
@@ -13,7 +17,9 @@ class Spidy::Connector::Json
13
17
  connect(url, &block)
14
18
  end
15
19
 
16
- def connect(url)
20
+ def connect(url, retry_count: 5)
17
21
  OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
22
+ rescue OpenURI::HTTPError => e
23
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
18
24
  end
19
25
  end
@@ -4,15 +4,23 @@
4
4
  # xml
5
5
  #
6
6
  class Spidy::Connector::Xml
7
- def call(url)
7
+ include Spidy::Connector::StaticAccessor
8
+
9
+ def call(url, &block)
8
10
  fail 'URL is undefined' if url.blank?
9
11
 
12
+ connect(url, &block)
13
+ end
14
+
15
+ def connect(url, &block)
10
16
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
11
- yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
17
+ block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
12
18
  end
19
+ rescue OpenURI::HTTPError => e
20
+ raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
13
21
  end
14
22
 
15
- def initialize(wait_time: nil, user_agent: nil)
23
+ def initialize(user_agent:)
16
24
  @user_agent = user_agent
17
25
  end
18
26
  end
@@ -28,10 +28,6 @@ module Spidy::Definition
28
28
  @socks_proxy = { host: host, port: port }
29
29
  end
30
30
 
31
- def tor?
32
- Tor.running?
33
- end
34
-
35
31
  def each(source = nil, name: :default, &yielder)
36
32
  name = name.presence || :default
37
33
  spidy = @namespace[:"#{name}_spider"]
@@ -50,7 +46,7 @@ module Spidy::Definition
50
46
 
51
47
  def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
52
48
  @namespace ||= {}
53
- connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
49
+ connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
54
50
  binder = Spidy::Binder.get(self, binder || as)
55
51
  @namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
56
52
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Spidy
4
- VERSION = '0.2.6'
4
+ VERSION = '0.3.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - aileron
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-09-01 00:00:00.000000000 Z
11
+ date: 2020-09-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -187,6 +187,9 @@ files:
187
187
  - Rakefile
188
188
  - bin/console
189
189
  - bin/setup
190
+ - example/master_detail.rb
191
+ - example/proxy.rb
192
+ - example/retry.rb
190
193
  - exe/spidy
191
194
  - lib/spidy.rb
192
195
  - lib/spidy/binder.rb