spidy 0.2.9 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/example/master_detail.rb +73 -0
- data/example/proxy.rb +10 -0
- data/example/retry.rb +7 -0
- data/lib/spidy/connector.rb +75 -28
- data/lib/spidy/connector/direct.rb +1 -1
- data/lib/spidy/connector/html.rb +9 -27
- data/lib/spidy/connector/json.rb +2 -14
- data/lib/spidy/connector/xml.rb +9 -3
- data/lib/spidy/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f87cda14101ec7c184d3e3134f25601cf3c0d2ef2d22b9766f5ce2b417734212
|
4
|
+
data.tar.gz: 803f7012304d475280949f742fb5da7af588dc9f2ab85e3991973b4c60a81a46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd725d2f1a697d4f912e5b37113de561bb002423db61a666cf06404a4bcb3c46765ef1b54dd024f6dd4760f73dc75e285e828e2672348d70e3cb8d71861f5c5f
|
7
|
+
data.tar.gz: ba4550f6871bad2c10f8ad1964e6fd30534f66e02d6f034f916e1afa5247a0f23249fa91842b60477f6f2cbbe2829d8f7edf279655b020eeeb4d242b26d81970
|
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
Spidy.define do
  # Turns the query string of +url+ into a Hash.
  # Returns nil when the URL has no query or the query is empty.
  url_to_params = ->(url) {
    uri = URI.parse(url)
    params = URI.decode_www_form(uri.query).to_h if uri.query.present?
    params if params.present?
  }

  # Stand-in "listing" connector: builds an HTML document in memory instead of
  # fetching +url+. Each page carries 25 detail links inside <main>, and a
  # NEXT link is emitted while the current page number is below limit_page.
  master_page = proc { |url, &yielder|
    params = url_to_params.call(url)
    page = params&.dig('page')&.to_i || 0

    limit_page = 3
    per_page = 25
    yielder.call(Nokogiri::HTML::Builder.new { |doc|
      doc.html {
        doc.body {
          doc.span.bold {
            doc.text "Hello world"
          }
          doc.main {
            (page * per_page + 1).upto((page + 1) * per_page) do |i|
              doc.a("page #{i}", href: "http://localhost/?id=#{i}")
            end
          }
          doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
        }
      }
    }.doc)
  }

  # Stand-in "detail" connector: builds the page for the id found in the query.
  detail_page = proc { |url, &yielder|
    params = url_to_params.call(url)
    # url_to_params returns nil for a URL without a query, so navigate safely
    # (same approach as master_page) instead of indexing nil.
    id = params&.dig('id')

    yielder.call(Nokogiri::HTML::Builder.new { |doc|
      doc.html {
        doc.body {
          doc.span.bold {
            doc.text "Hello world"
          }
          doc.h1("title_#{id}", id: 'title')
          doc.main("body_#{id}", id: 'body')
          doc.div.sub do
            doc.span.name('testtest')
          end
        }
      }
    }.doc)
  }

  define(as: :html, connector: detail_page) do
    let(:title, '#title')
    let(:body, '#body')
  end

  define(:sub, as: :html, connector: :direct) do
    let(:name, '.name')
  end

  # Walk the listing: emit every detail link, then follow a.next until it is absent.
  spider(as: :html, connector: master_page) do |yielder, connector|
    next_url = 'http://localhost'
    while next_url.present?
      connector.call(next_url) do |page|
        page.search('main a').each do |a|
          yielder.call(a.attr('href'))
        end
        next_url = page.at('a.next')&.attr('href')
      end
    end
  end
end
|
data/example/proxy.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
Spidy.define do
  # Requests go out with this User-Agent through the SOCKS proxy on 127.0.0.1:9050
  # (presumably a local Tor daemon - confirm for your setup).
  user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
  socks_proxy '127.0.0.1', 9050

  # Emits the :origin field of the JSON served by httpbin's /ip endpoint.
  spider(as: :json) { |emit, fetch|
    fetch.call('https://httpbin.org/ip') { |payload| emit.call(payload[:origin]) }
  }
end
|
data/example/retry.rb
ADDED
data/lib/spidy/connector.rb
CHANGED
@@ -10,10 +10,12 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
+
DEFAULT_WAIT_TIME = 5
|
14
|
+
|
13
15
|
#
|
14
16
|
# default user agent
|
15
17
|
#
|
16
|
-
|
18
|
+
DEFAULT_USER_AGENT = [
|
17
19
|
'Mozilla/5.0',
|
18
20
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
19
21
|
'AppleWebKit/537.36',
|
@@ -43,56 +45,101 @@ module Spidy::Connector
|
|
43
45
|
# retry class
|
44
46
|
#
|
45
47
|
class Retry < StandardError
  attr_reader :object, :response_code, :error

  # @param options [Hash, nil] the same keys as the keywords below, delivered as a
  #   single positional hash. `raise Retry, error: e` hands its hash to the
  #   constructor this way on Ruby 3+, where it is no longer converted to keywords.
  # @param object [Object, nil] the response object that triggered the retry
  # @param error [Exception, nil] the underlying error, if any
  # @param response_code [String, nil] HTTP status reported by the connector
  def initialize(options = nil, object: nil, error: nil, response_code: nil)
    positional = options.is_a?(Hash) ? options : {}

    @object = object || positional[:object]
    @response_code = response_code || positional[:response_code]
    @error = error || positional[:error]
    super()
  end
end
|
56
|
+
|
57
|
+
#
|
58
|
+
# retry
|
59
|
+
#
|
60
|
+
class Retryable
  attr_reader :origin_connector

  # @param connector [#call] the connector being wrapped
  # @param logger [#call] receives one hash per access and per retry
  # @param wait_time [Numeric] seconds to sleep between attempts
  def initialize(connector, logger:, wait_time:)
    @origin_connector = connector
    @logger = logger
    @wait_time = wait_time
    @retry_attempt_count = 5
  end

  # Same as #connect with the default attempt budget.
  def call(url, &block)
    connect(url, &block)
  end

  # Calls the wrapped connector, re-running it whenever it raises
  # Spidy::Connector::Retry, until the attempt budget is used up.
  #
  # @param url [String] address handed to the wrapped connector
  # @param retry_attempt_count [Integer] total number of attempts allowed
  # @return [Object] whatever the wrapped connector returns
  # @raise [Exception] the error carried by the last Retry, or the Retry itself
  #   when it carries none
  def connect(url, retry_attempt_count: @retry_attempt_count, &block)
    @logger.call('connnector.get': url, 'connnector.accessed': Time.current)
    @origin_connector.call(url, &block)
  rescue Spidy::Connector::Retry => e
    @logger.call('retry.accessed': Time.current,
                 'retry.uri': url,
                 'retry.response_code': e.response_code,
                 'retry.attempt_count': retry_attempt_count)

    retry_attempt_count -= 1
    if retry_attempt_count.positive?
      sleep @wait_time
      # Give stateful connectors a chance to start over with a fresh session.
      @origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
      retry
    end
    # A Retry raised for a bad page carries no underlying error; `raise nil`
    # would be a TypeError, so fall back to the Retry itself.
    raise(e.error || e)
  end
end
|
56
92
|
|
57
|
-
|
58
|
-
|
93
|
+
#
|
94
|
+
# tor proxy
|
95
|
+
#
|
96
|
+
class TorConnector
  attr_reader :connector, :socks_proxy

  # @param connector [#call] connector whose requests are routed through the proxy
  # @param socks_proxy [#[]] responds to [:host] and [:port]
  def initialize(connector, socks_proxy)
    @socks_proxy = socks_proxy
    @connector = connector
  end

  # Runs the wrapped connector inside a Socksify proxy block.
  def call(url, &block)
    Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) { connector.call(url, &block) }
  end

  # @return [Boolean] false when the connection attempt is refused, true otherwise
  def try_connection?
    begin
      try_connection!
    rescue Errno::ECONNREFUSED
      return false
    end
    true
  end

  # Opens a Tor::Controller against the configured host/port and closes it
  # again; any connection error propagates to the caller.
  def try_connection!
    endpoint = { host: @socks_proxy[:host], port: @socks_proxy[:port] }
    Tor::Controller.new(**endpoint).close
  end
end
|
121
|
+
|
122
|
+
# Builds the connector for +value+ and wraps it in a Retryable.
# Any option left nil falls back to the matching DEFAULT_* constant.
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
  inner = get_connector(value, user_agent: user_agent || DEFAULT_USER_AGENT, socks_proxy: socks_proxy)

  Retryable.new(inner, wait_time: wait_time || DEFAULT_WAIT_TIME, logger: logger || DEFAULT_LOGGER)
end
|
82
130
|
|
83
131
|
#
|
84
132
|
# get connection handler
|
85
133
|
#
|
86
|
-
def self.
|
134
|
+
# Resolves +value+ to a connector.
#
# @param value [#call, Symbol, String] a ready-made callable (returned as is) or the
#   name of a connector class looked up with const_get
# @param user_agent [String, nil] passed to the connector class constructor
# @param socks_proxy [#[], nil] when given, the connector is wrapped in a TorConnector
# @return [#call] the connector, wrapped in a TorConnector when a proxy is configured
def self.get_connector(value, user_agent: nil, socks_proxy: nil)
  return value if value.respond_to?(:call)

  connector = const_get(value.to_s.classify).new(user_agent: user_agent)
  # NOTE(review): const_get raises NameError for an unknown name before this
  # guard is reached, so it likely never fires - confirm and consider rescuing instead.
  fail "Not defined connnector[#{value}]" if connector.nil?
  return connector if socks_proxy.nil?

  # Was `connnector` (three n's): an undefined local that raised NameError
  # whenever a socks proxy was configured.
  tor = TorConnector.new(connector, socks_proxy)
  tor.try_connection!
  tor
end
|
98
145
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -6,16 +6,13 @@
|
|
6
6
|
class Spidy::Connector::Html
|
7
7
|
include Spidy::Connector::StaticAccessor
|
8
8
|
|
9
|
-
def initialize(
|
10
|
-
@wait_time = wait_time
|
11
|
-
@logger = logger
|
9
|
+
# @param user_agent [String] User-Agent assigned to the Mechanize agent and
#   remembered so #refresh! can apply it to a new agent
def initialize(user_agent:)
  @user_agent = user_agent
  @agent = Mechanize.new
  @agent.user_agent = user_agent
end
|
16
14
|
|
17
15
|
attr_reader :agent
|
18
|
-
attr_reader :logger
|
19
16
|
|
20
17
|
def call(url, encoding: nil, retry_count: 5, &yielder)
|
21
18
|
fail 'url is not specified' if url.blank?
|
@@ -23,42 +20,27 @@ class Spidy::Connector::Html
|
|
23
20
|
agent.default_encoding = encoding
|
24
21
|
agent.force_default_encoding = true
|
25
22
|
end
|
26
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
27
|
-
get(url, retry_count, yielder)
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def get(url, retry_count, yielder)
|
33
23
|
connect(url, retry_count, yielder)
|
34
|
-
|
35
|
-
logger.call('retry.accessed': Time.current,
|
36
|
-
'retry.uri': url,
|
37
|
-
'retry.response_code': e.response_code,
|
38
|
-
'retry.rest_count': retry_count)
|
24
|
+
end
|
39
25
|
|
26
|
+
# Replaces the Mechanize agent with a new instance carrying the stored User-Agent.
def refresh!
  replacement = Mechanize.new
  @agent = replacement
  replacement.user_agent = @user_agent
end
|
50
30
|
|
31
|
+
private
|
32
|
+
|
51
33
|
# Fetches +url+ with the Mechanize agent and hands the page to +yielder+.
#
# @param url [String] address to fetch
# @param retry_count [Integer] kept for signature compatibility; not used here
# @param yielder [#call] receives the fetched page
# @return [Object] whatever +yielder+ returned
# @raise [Spidy::Connector::Retry] when the "unable to access" page is served
#   or Mechanize reports any response-code error
def connect(url, retry_count, yielder)
  result = nil
  agent.get(url) do |page|
    # Built with .new so the keywords reach the constructor on every Ruby
    # version; `raise Klass, key: value` delivers them as one positional hash.
    if page.title == 'Sorry, unable to access page...'
      raise Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
    end

    result = yielder.call(page)
  end
  result
rescue Mechanize::ResponseCodeError => e
  # The former 429 / 502 special cases raised exactly the same Retry as the
  # fall-through, so a single raise covers every status.
  raise Spidy::Connector::Retry.new(error: e, response_code: e.response_code)
end
|
64
46
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -8,10 +8,8 @@ class Spidy::Connector::Json
|
|
8
8
|
|
9
9
|
attr_reader :logger
|
10
10
|
|
11
|
-
def initialize(
|
12
|
-
@wait_time = wait_time
|
11
|
+
# @param user_agent [String, nil] value sent as the "User-Agent" header by #connect
def initialize(user_agent: nil)
  @user_agent = user_agent
end
|
16
14
|
|
17
15
|
def call(url, &block)
|
@@ -22,16 +20,6 @@ class Spidy::Connector::Json
|
|
22
20
|
# Downloads +url+ and yields the body parsed as JSON with symbolized keys.
#
# @param url [String] address to fetch
# @param retry_count [Integer] kept for signature compatibility; not used here
# @yieldparam json [Object] the parsed document
# @raise [Spidy::Connector::Retry] on OpenURI::HTTPError, carrying the HTTP status code
def connect(url, retry_count: 5)
  OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
    yield JSON.parse(body.read, symbolize_names: true)
  end
rescue OpenURI::HTTPError => e
  # Built with .new so the keywords reach the constructor on every Ruby
  # version; `raise Klass, key: value` delivers them as one positional hash.
  raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
end
|
37
25
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -6,15 +6,21 @@
|
|
6
6
|
class Spidy::Connector::Xml
  include Spidy::Connector::StaticAccessor

  # @param user_agent [String] value sent as the "User-Agent" header
  def initialize(user_agent:)
    @user_agent = user_agent
  end

  # Validates +url+ and delegates to #connect.
  #
  # @raise [RuntimeError] when +url+ is blank
  def call(url, &block)
    fail 'URL is undefined' if url.blank?

    connect(url, &block)
  end

  # Downloads +url+ and passes the parsed XML document to +block+.
  #
  # @raise [Spidy::Connector::Retry] on OpenURI::HTTPError, carrying the HTTP status code
  def connect(url, &block)
    OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
      # Strip control characters (all below 0x20 except LF and CR, plus DEL) before parsing.
      block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
    end
  rescue OpenURI::HTTPError => e
    # Built with .new so the keywords reach the constructor on every Ruby
    # version; `raise Klass, key: value` delivers them as one positional hash.
    raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
  end
end
|
data/lib/spidy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -187,6 +187,9 @@ files:
|
|
187
187
|
- Rakefile
|
188
188
|
- bin/console
|
189
189
|
- bin/setup
|
190
|
+
- example/master_detail.rb
|
191
|
+
- example/proxy.rb
|
192
|
+
- example/retry.rb
|
190
193
|
- exe/spidy
|
191
194
|
- lib/spidy.rb
|
192
195
|
- lib/spidy/binder.rb
|