spidy 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/example/master_detail.rb +73 -0
- data/example/proxy.rb +10 -0
- data/example/retry.rb +7 -0
- data/lib/spidy/connector.rb +75 -28
- data/lib/spidy/connector/direct.rb +1 -1
- data/lib/spidy/connector/html.rb +9 -27
- data/lib/spidy/connector/json.rb +2 -14
- data/lib/spidy/connector/xml.rb +9 -3
- data/lib/spidy/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f87cda14101ec7c184d3e3134f25601cf3c0d2ef2d22b9766f5ce2b417734212
|
4
|
+
data.tar.gz: 803f7012304d475280949f742fb5da7af588dc9f2ab85e3991973b4c60a81a46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd725d2f1a697d4f912e5b37113de561bb002423db61a666cf06404a4bcb3c46765ef1b54dd024f6dd4760f73dc75e285e828e2672348d70e3cb8d71861f5c5f
|
7
|
+
data.tar.gz: ba4550f6871bad2c10f8ad1964e6fd30534f66e02d6f034f916e1afa5247a0f23249fa91842b60477f6f2cbbe2829d8f7edf279655b020eeeb4d242b26d81970
|
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
# Example: crawl a paginated "master" listing and scrape every "detail" page it
# links to. Both page kinds are synthesized with Nokogiri::HTML::Builder by the
# custom connectors below instead of being fetched.
Spidy.define do
  # Turns the query string of +url+ into a Hash of parameters.
  # Returns nil when the URL has no query string or it yields no parameters.
  url_to_params = ->(url) {
    query = URI.parse(url).query
    next if query.blank?

    URI.decode_www_form(query).to_h.presence
  }

  # Connector for the listing: yields a generated document containing
  # +per_page+ detail links and, while page < limit_page, a "NEXT" link.
  master_page = proc { |url, &yielder|
    params = url_to_params.call(url)
    page = params&.dig('page')&.to_i || 0

    limit_page = 3
    per_page = 25
    yielder.call(Nokogiri::HTML::Builder.new { |doc|
      doc.html {
        doc.body {
          doc.span.bold {
            doc.text "Hello world"
          }
          doc.main {
            (page * per_page + 1).upto((page + 1) * per_page) do |i|
              doc.a("page #{i}", href: "http://localhost/?id=#{i}")
            end
          }
          doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
        }
      }
    }.doc)
  }

  # Connector for a detail page: yields a generated document whose title and
  # body embed the +id+ query parameter.
  detail_page = proc { |url, &yielder|
    params = url_to_params.call(url)
    # url_to_params returns nil for a URL without a query string, so use safe
    # navigation instead of indexing nil.
    id = params&.dig('id')

    yielder.call(Nokogiri::HTML::Builder.new { |doc|
      doc.html {
        doc.body {
          doc.span.bold {
            doc.text "Hello world"
          }
          doc.h1("title_#{id}", id: 'title')
          doc.main("body_#{id}", id: 'body')
          doc.div.sub do
            doc.span.name('testtest')
          end
        }
      }
    }.doc)
  }

  define(as: :html, connector: detail_page) do
    let(:title, '#title')
    let(:body, '#body')
  end

  define(:sub, as: :html, connector: :direct) do
    let(:name, '.name')
  end

  # Walk the listing: emit every detail link found inside <main>, then follow
  # the "a.next" link until a page no longer has one.
  spider(as: :html, connector: master_page) do |yielder, connector|
    next_url = 'http://localhost'
    while next_url.present?
      connector.call(next_url) do |page|
        page.search('main a').each do |a|
          yielder.call(a.attr('href'))
        end
        next_url = page.at('a.next')&.attr('href')
      end
    end
  end
end
|
data/example/proxy.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# Example: send a JSON request through a SOCKS proxy on 127.0.0.1:9050 and
# emit the `origin` field of the response.
Spidy.define do
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
  socks_proxy '127.0.0.1', 9050

  spider(as: :json) do |emit, fetch|
    fetch.call('https://httpbin.org/ip') { |payload| emit.call(payload[:origin]) }
  end
end
|
data/example/retry.rb
ADDED
data/lib/spidy/connector.rb
CHANGED
@@ -10,10 +10,12 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
+
DEFAULT_WAIT_TIME = 5
|
14
|
+
|
13
15
|
#
|
14
16
|
# default user agent
|
15
17
|
#
|
16
|
-
|
18
|
+
DEFAULT_USER_AGENT = [
|
17
19
|
'Mozilla/5.0',
|
18
20
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
19
21
|
'AppleWebKit/537.36',
|
@@ -43,56 +45,101 @@ module Spidy::Connector
|
|
43
45
|
# retry class
|
44
46
|
#
|
45
47
|
# Raised by a connector to signal that the request may be retried.
#
# Carries optional context about the failure:
#   object        - the value that triggered the retry (e.g. a fetched page)
#   response_code - the response code associated with the failure
#   error         - the underlying exception, if any
class Retry < StandardError
  attr_reader :object, :response_code, :error

  # The attributes are normally given as keywords. A single positional Hash is
  # accepted as well, because `raise Retry, error: e` hands the options to
  # `Retry.exception` as one positional Hash on Ruby 3, which a keyword-only
  # initializer would reject with ArgumentError. A positional String is used
  # as the exception message.
  def initialize(details = nil, object: nil, error: nil, response_code: nil)
    if details.is_a?(Hash)
      object ||= details[:object]
      error ||= details[:error]
      response_code ||= details[:response_code]
    end

    @object = object
    @response_code = response_code
    @error = error
    # Passing nil keeps the default message (the class name).
    super(details.is_a?(String) ? details : nil)
  end
end
|
56
|
+
|
57
|
+
#
|
58
|
+
# retry
|
59
|
+
#
|
60
|
+
# Wraps a connector and re-invokes it when it raises Spidy::Connector::Retry.
class Retryable
  # Number of attempts made before giving up.
  DEFAULT_RETRY_ATTEMPT_COUNT = 5

  attr_reader :origin_connector

  # connector - object responding to `call(url, &block)`
  # logger    - object responding to `call(hash)`; receives access/retry events
  # wait_time - value passed to `sleep` between attempts
  def initialize(connector, logger:, wait_time:)
    @origin_connector = connector
    @logger = logger
    @wait_time = wait_time
    @retry_attempt_count = DEFAULT_RETRY_ATTEMPT_COUNT
  end

  # Fetches +url+ through the wrapped connector, retrying on Retry.
  def call(url, &block)
    connect(url, &block)
  end

  # Calls the wrapped connector; on Spidy::Connector::Retry it logs, sleeps,
  # refreshes the connector (when it supports `refresh!`) and tries again until
  # +retry_attempt_count+ attempts have been used.
  #
  # Raises the wrapped error once the attempts are exhausted, or the Retry
  # itself when it carries no wrapped error.
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
    @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
    @origin_connector.call(url, &block)
  rescue Spidy::Connector::Retry => e
    @logger.call('retry.accessed': Time.current,
                 'retry.uri': url,
                 'retry.response_code': e.response_code,
                 'retry.attempt_count': retry_attempt_count)

    retry_attempt_count -= 1
    if retry_attempt_count.positive?
      sleep @wait_time
      @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
      retry
    end
    # A Retry raised without an underlying error has `error == nil`;
    # `raise nil` would fail with TypeError and hide the real cause.
    raise(e.error || e)
  end
end
|
56
92
|
|
57
|
-
|
58
|
-
|
93
|
+
#
|
94
|
+
# tor proxy
|
95
|
+
#
|
96
|
+
# Routes calls to a wrapped connector through a SOCKS proxy via Socksify.
class TorConnector
  attr_reader :connector, :socks_proxy

  # connector   - object responding to `call(url, &block)`
  # socks_proxy - object indexed with :host and :port
  def initialize(connector, socks_proxy)
    @socks_proxy = socks_proxy
    @connector = connector
  end

  # Invokes the wrapped connector inside a Socksify proxy block.
  def call(url, &block)
    proxy_host = socks_proxy[:host]
    proxy_port = socks_proxy[:port]
    Socksify.proxy(proxy_host, proxy_port) { connector.call(url, &block) }
  end

  # Returns true when try_connection! succeeds, false when the connection is
  # refused. Other errors propagate.
  def try_connection?
    begin
      try_connection!
    rescue Errno::ECONNREFUSED
      return false
    end
    true
  end

  # Opens a Tor::Controller against the configured host/port and closes it.
  def try_connection!
    controller = Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port])
    controller.close
  end
end
|
121
|
+
|
122
|
+
# Builds the connector for +value+ and wraps it in a Retryable.
# Any of wait_time / user_agent / logger left nil falls back to the
# corresponding DEFAULT_* constant.
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
  effective_logger = logger || DEFAULT_LOGGER
  effective_agent = user_agent || DEFAULT_USER_AGENT
  effective_wait = wait_time || DEFAULT_WAIT_TIME

  Retryable.new(
    get_connector(value, user_agent: effective_agent, socks_proxy: socks_proxy),
    wait_time: effective_wait,
    logger: effective_logger
  )
end
|
82
130
|
|
83
131
|
#
|
84
132
|
# get connection handler
|
85
133
|
#
|
86
|
-
def self.
|
134
|
+
# Resolves +value+ to a connector.
#
# - A value that already responds to `call` is returned unchanged.
# - Otherwise the value is classified into a constant name, looked up with
#   `const_get`, and instantiated with the given user agent.
# - When +socks_proxy+ is given, the connector is wrapped in a TorConnector
#   whose connection is verified with `try_connection!` before it is returned.
def self.get_connector(value, user_agent: nil, socks_proxy: nil)
  return value if value.respond_to?(:call)

  connector = const_get(value.to_s.classify).new(user_agent: user_agent)
  fail "Not defined connnector[#{value}]" if connector.nil?
  return connector if socks_proxy.nil?

  # Wrap the connector built above. The previous code referenced a misspelled,
  # undefined local here and raised NameError whenever a proxy was configured.
  TorConnector.new(connector, socks_proxy).tap(&:try_connection!)
end
|
98
145
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -6,16 +6,13 @@
|
|
6
6
|
class Spidy::Connector::Html
|
7
7
|
include Spidy::Connector::StaticAccessor
|
8
8
|
|
9
|
-
def initialize(
|
10
|
-
@wait_time = wait_time
|
11
|
-
@logger = logger
|
9
|
+
# Creates the Mechanize agent and remembers +user_agent+ so the agent can be
# rebuilt later with the same value.
def initialize(user_agent:)
  mechanize = Mechanize.new
  @agent = mechanize
  @user_agent = user_agent
  mechanize.user_agent = user_agent
end
|
16
14
|
|
17
15
|
attr_reader :agent
|
18
|
-
attr_reader :logger
|
19
16
|
|
20
17
|
def call(url, encoding: nil, retry_count: 5, &yielder)
|
21
18
|
fail 'url is not specified' if url.blank?
|
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
|
|
23
20
|
agent.default_encoding = encoding
|
24
21
|
agent.force_default_encoding = true
|
25
22
|
end
|
26
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
27
|
-
get(url, retry_count, yielder)
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def get(url, retry_count, yielder)
|
33
23
|
connect(url, retry_count, yielder)
|
34
|
-
|
35
|
-
logger.call('retry.accessed': Time.current,
|
36
|
-
'retry.uri': url,
|
37
|
-
'retry.response_code': e.response_code,
|
38
|
-
'retry.rest_count': retry_count)
|
24
|
+
end
|
39
25
|
|
26
|
+
# Replaces the current Mechanize agent with a fresh one that uses the stored
# user agent string.
def refresh!
  replacement = Mechanize.new
  @agent = replacement
  replacement.user_agent = @user_agent
end
|
50
30
|
|
31
|
+
private
|
32
|
+
|
51
33
|
# Fetches +url+ with the Mechanize agent and passes the page to +yielder+.
#
# Returns whatever +yielder+ returns.
# Raises Spidy::Connector::Retry when the page title is the
# "Sorry, unable to access page..." placeholder, or when Mechanize reports a
# response-code error of any kind.
#
# +retry_count+ is accepted for interface compatibility and is not used here.
def connect(url, retry_count, yielder)
  result = nil
  agent.get(url) do |page|
    if page.title == 'Sorry, unable to access page...'
      # Build the exception explicitly: `raise Klass, key: value` passes the
      # options as a positional Hash on Ruby 3.
      fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
    end

    result = yielder.call(page)
  end
  result
rescue Mechanize::ResponseCodeError => e
  # The former separate 429 / 502 branches raised exactly the same Retry as
  # the fallback, so a single raise is equivalent.
  raise Spidy::Connector::Retry.new(error: e, response_code: e.response_code)
end
|
64
46
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -8,10 +8,8 @@ class Spidy::Connector::Json
|
|
8
8
|
|
9
9
|
attr_reader :logger
|
10
10
|
|
11
|
-
def initialize(
|
12
|
-
@wait_time = wait_time
|
11
|
+
# Stores the value later sent as the "User-Agent" request header.
# +user_agent+ is optional and defaults to nil.
def initialize(user_agent: nil)
  @user_agent = user_agent
end
|
16
14
|
|
17
15
|
def call(url, &block)
|
@@ -22,16 +20,6 @@ class Spidy::Connector::Json
|
|
22
20
|
# Opens +url+ with OpenURI, parses the body as JSON with symbolized keys and
# yields the parsed value to the caller's block.
#
# Raises Spidy::Connector::Retry (wrapping the original error and carrying the
# HTTP status code) when OpenURI reports an HTTP error.
#
# +retry_count+ is accepted for interface compatibility and is not used here.
def connect(url, retry_count: 5)
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
    yield JSON.parse(body.read, symbolize_names: true)
  end
rescue OpenURI::HTTPError => e
  # Build the exception explicitly: `raise Klass, key: value` passes the
  # options as a positional Hash on Ruby 3.
  raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
end
|
37
25
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -6,15 +6,21 @@
|
|
6
6
|
# Connector that downloads a document with OpenURI and yields it parsed as
# Nokogiri XML.
class Spidy::Connector::Xml
  include Spidy::Connector::StaticAccessor

  # user_agent - value sent as the "User-Agent" request header
  def initialize(user_agent:)
    @user_agent = user_agent
  end

  # Validates +url+ and delegates to #connect.
  # Fails with 'URL is undefined' when +url+ is blank.
  def call(url, &block)
    fail 'URL is undefined' if url.blank?

    connect(url, &block)
  end

  # Downloads +url+, removes the control characters matched below from the
  # body, parses it as XML (using +url+ as the document URL) and passes the
  # document to +block+.
  #
  # Raises Spidy::Connector::Retry (wrapping the original error and carrying
  # the HTTP status code) when OpenURI reports an HTTP error.
  def connect(url, &block)
    OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
      # NOTE(review): the range \x00-\x09 also strips TAB (\x09), which is a
      # legal XML character — confirm whether \x00-\x08 was intended.
      sanitized = body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, '')
      block.call Nokogiri::XML(sanitized, url)
    end
  rescue OpenURI::HTTPError => e
    # Build the exception explicitly: `raise Klass, key: value` passes the
    # options as a positional Hash on Ruby 3.
    raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
  end
end
|
data/lib/spidy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -187,6 +187,9 @@ files:
|
|
187
187
|
- Rakefile
|
188
188
|
- bin/console
|
189
189
|
- bin/setup
|
190
|
+
- example/master_detail.rb
|
191
|
+
- example/proxy.rb
|
192
|
+
- example/retry.rb
|
190
193
|
- exe/spidy
|
191
194
|
- lib/spidy.rb
|
192
195
|
- lib/spidy/binder.rb
|