spidy 0.2.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -3
- data/example/master_detail.rb +73 -0
- data/example/proxy.rb +10 -0
- data/example/retry.rb +7 -0
- data/lib/spidy.rb +1 -0
- data/lib/spidy/connector.rb +111 -21
- data/lib/spidy/connector/direct.rb +1 -1
- data/lib/spidy/connector/html.rb +11 -27
- data/lib/spidy/connector/json.rb +8 -2
- data/lib/spidy/connector/xml.rb +11 -3
- data/lib/spidy/definition.rb +1 -1
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +1 -0
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f87cda14101ec7c184d3e3134f25601cf3c0d2ef2d22b9766f5ce2b417734212
|
4
|
+
data.tar.gz: 803f7012304d475280949f742fb5da7af588dc9f2ab85e3991973b4c60a81a46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd725d2f1a697d4f912e5b37113de561bb002423db61a666cf06404a4bcb3c46765ef1b54dd024f6dd4760f73dc75e285e828e2672348d70e3cb8d71861f5c5f
|
7
|
+
data.tar.gz: ba4550f6871bad2c10f8ad1964e6fd30534f66e02d6f034f916e1afa5247a0f23249fa91842b60477f6f2cbbe2829d8f7edf279655b020eeeb4d242b26d81970
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.2.
|
4
|
+
spidy (0.2.9)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
8
|
+
socksify
|
9
|
+
tor
|
8
10
|
|
9
11
|
GEM
|
10
12
|
remote: https://rubygems.org/
|
@@ -16,7 +18,7 @@ GEM
|
|
16
18
|
tzinfo (~> 1.1)
|
17
19
|
zeitwerk (~> 2.2, >= 2.2.2)
|
18
20
|
coderay (1.1.2)
|
19
|
-
concurrent-ruby (1.1.
|
21
|
+
concurrent-ruby (1.1.7)
|
20
22
|
connection_pool (2.2.3)
|
21
23
|
diff-lcs (1.3)
|
22
24
|
domain_name (0.5.20190701)
|
@@ -40,7 +42,7 @@ GEM
|
|
40
42
|
mime-types-data (~> 3.2015)
|
41
43
|
mime-types-data (3.2020.0512)
|
42
44
|
mini_portile2 (2.4.0)
|
43
|
-
minitest (5.14.
|
45
|
+
minitest (5.14.2)
|
44
46
|
mixlib-shellout (2.4.4)
|
45
47
|
net-http-digest_auth (1.4.1)
|
46
48
|
net-http-persistent (4.0.0)
|
@@ -72,7 +74,9 @@ GEM
|
|
72
74
|
diff-lcs (>= 1.2.0, < 2.0)
|
73
75
|
rspec-support (~> 3.8.0)
|
74
76
|
rspec-support (3.8.2)
|
77
|
+
socksify (1.7.1)
|
75
78
|
thread_safe (0.3.6)
|
79
|
+
tor (0.1.4)
|
76
80
|
tzinfo (1.2.7)
|
77
81
|
thread_safe (~> 0.1)
|
78
82
|
unf (0.1.4)
|
data/example/master_detail.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
Spidy.define do
|
4
|
+
url_to_params = ->(url) {
|
5
|
+
uri = URI.parse(url)
|
6
|
+
params = URI.decode_www_form(uri.query).to_h if uri.query.present?
|
7
|
+
params if params.present?
|
8
|
+
}
|
9
|
+
|
10
|
+
master_page = proc { |url, &yielder|
|
11
|
+
params = url_to_params.call(url)
|
12
|
+
page = params&.dig('page')&.to_i || 0
|
13
|
+
|
14
|
+
limit_page = 3
|
15
|
+
per_page = 25
|
16
|
+
yielder.call(Nokogiri::HTML::Builder.new { |doc|
|
17
|
+
doc.html {
|
18
|
+
doc.body {
|
19
|
+
doc.span.bold {
|
20
|
+
doc.text "Hello world"
|
21
|
+
}
|
22
|
+
doc.main {
|
23
|
+
(page * per_page + 1).upto((page + 1) * per_page).each do |i|
|
24
|
+
doc.a("page #{i}", href: "http://localhost/?id=#{i}")
|
25
|
+
end
|
26
|
+
}
|
27
|
+
doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
|
28
|
+
}
|
29
|
+
}
|
30
|
+
}.doc)
|
31
|
+
}
|
32
|
+
|
33
|
+
detail_page = proc { |url, &yielder|
|
34
|
+
params = url_to_params.call(url)
|
35
|
+
id = params['id']
|
36
|
+
|
37
|
+
yielder.call(Nokogiri::HTML::Builder.new { |doc|
|
38
|
+
doc.html {
|
39
|
+
doc.body {
|
40
|
+
doc.span.bold {
|
41
|
+
doc.text "Hello world"
|
42
|
+
}
|
43
|
+
doc.h1("title_#{id}", id: 'title')
|
44
|
+
doc.main("body_#{id}", id: 'body')
|
45
|
+
doc.div.sub do
|
46
|
+
doc.span.name('testtest')
|
47
|
+
end
|
48
|
+
}
|
49
|
+
}
|
50
|
+
}.doc)
|
51
|
+
}
|
52
|
+
|
53
|
+
define(as: :html, connector: detail_page) do
|
54
|
+
let(:title, '#title')
|
55
|
+
let(:body, '#body')
|
56
|
+
end
|
57
|
+
|
58
|
+
define(:sub, as: :html, connector: :direct) do
|
59
|
+
let(:name, '.name')
|
60
|
+
end
|
61
|
+
|
62
|
+
spider(as: :html, connector: master_page) do |yielder, connector|
|
63
|
+
next_url = 'http://localhost'
|
64
|
+
while next_url.present?
|
65
|
+
connector.call(next_url) do |page|
|
66
|
+
page.search('main a').each do |a|
|
67
|
+
yielder.call(a.attr('href'))
|
68
|
+
end
|
69
|
+
next_url = page.at('a.next')&.attr('href')
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
data/example/proxy.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
Spidy.define do
|
2
|
+
user_agent 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'
|
3
|
+
socks_proxy '127.0.0.1', 9050
|
4
|
+
|
5
|
+
spider(as: :json) do |yielder, connector|
|
6
|
+
connector.call('https://httpbin.org/ip') do |json|
|
7
|
+
yielder.call(json[:origin])
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
data/example/retry.rb
ADDED
data/lib/spidy.rb
CHANGED
data/lib/spidy/connector.rb
CHANGED
@@ -10,25 +10,12 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
-
|
14
|
-
# retry class
|
15
|
-
#
|
16
|
-
class Retry < StandardError
|
17
|
-
attr_reader :page
|
18
|
-
attr_reader :response_code
|
19
|
-
attr_reader :wait_time
|
20
|
-
|
21
|
-
def initialize(wait_time: 2, page: nil, error: nil)
|
22
|
-
@page = page
|
23
|
-
@wait_time = wait_time
|
24
|
-
@response_code = error.try(:response_code) || page.try(:response_code)
|
25
|
-
end
|
26
|
-
end
|
13
|
+
DEFAULT_WAIT_TIME = 5
|
27
14
|
|
28
15
|
#
|
29
16
|
# default user agent
|
30
17
|
#
|
31
|
-
|
18
|
+
DEFAULT_USER_AGENT = [
|
32
19
|
'Mozilla/5.0',
|
33
20
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
34
21
|
'AppleWebKit/537.36',
|
@@ -38,18 +25,121 @@ module Spidy::Connector
|
|
38
25
|
].join(' ')
|
39
26
|
|
40
27
|
#
|
41
|
-
#
|
28
|
+
# error output logger
|
42
29
|
#
|
43
|
-
|
44
|
-
return value if value.respond_to?(:call)
|
30
|
+
DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
|
45
31
|
|
46
|
-
|
47
|
-
|
32
|
+
#
|
33
|
+
# static method
|
34
|
+
#
|
35
|
+
module StaticAccessor
|
36
|
+
extend ActiveSupport::Concern
|
37
|
+
class_methods do
|
38
|
+
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
|
39
|
+
new(wait_time: wait_time, user_agent: user_agent, logger: logger).call(url, &block)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
#
|
45
|
+
# retry class
|
46
|
+
#
|
47
|
+
class Retry < StandardError
|
48
|
+
attr_reader :object, :response_code, :error
|
49
|
+
|
50
|
+
def initialize(object: nil, error: nil, response_code: nil)
|
51
|
+
@object = object
|
52
|
+
@response_code = response_code
|
53
|
+
@error = error
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
#
|
58
|
+
# retry
|
59
|
+
#
|
60
|
+
class Retryable
|
61
|
+
attr_reader :origin_connector
|
62
|
+
|
63
|
+
def initialize(connector, logger:, wait_time:)
|
64
|
+
@origin_connector = connector
|
65
|
+
@logger = logger
|
66
|
+
@wait_time = wait_time
|
67
|
+
@retry_attempt_count = 5
|
68
|
+
end
|
69
|
+
|
70
|
+
def call(url, &block)
|
71
|
+
connect(url, &block)
|
72
|
+
end
|
73
|
+
|
74
|
+
def connect(url, retry_attempt_count: @retry_attempt_count, &block)
|
75
|
+
@logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
76
|
+
@origin_connector.call(url, &block)
|
77
|
+
rescue Spidy::Connector::Retry => e
|
78
|
+
@logger.call('retry.accessed': Time.current,
|
79
|
+
'retry.uri': url,
|
80
|
+
'retry.response_code': e.response_code,
|
81
|
+
'retry.attempt_count': retry_attempt_count)
|
82
|
+
|
83
|
+
retry_attempt_count -= 1
|
84
|
+
if retry_attempt_count.positive?
|
85
|
+
sleep @wait_time
|
86
|
+
@origin_connector.refresh! if @origin_connector.respond_to?(:refresh!)
|
87
|
+
retry
|
88
|
+
end
|
89
|
+
raise e.error
|
90
|
+
end
|
91
|
+
end
|
48
92
|
|
49
|
-
|
93
|
+
#
|
94
|
+
# tor proxy
|
95
|
+
#
|
96
|
+
class TorConnector
|
97
|
+
attr_reader :connector, :socks_proxy
|
98
|
+
|
99
|
+
def initialize(connector, socks_proxy)
|
100
|
+
@connector = connector
|
101
|
+
@socks_proxy = socks_proxy
|
102
|
+
end
|
103
|
+
|
104
|
+
def call(url, &block)
|
50
105
|
Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
|
51
106
|
connector.call(url, &block)
|
52
107
|
end
|
53
108
|
end
|
109
|
+
|
110
|
+
def try_connection?
|
111
|
+
try_connection!
|
112
|
+
true
|
113
|
+
rescue Errno::ECONNREFUSED
|
114
|
+
false
|
115
|
+
end
|
116
|
+
|
117
|
+
def try_connection!
|
118
|
+
Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
|
123
|
+
logger ||= DEFAULT_LOGGER
|
124
|
+
user_agent ||= DEFAULT_USER_AGENT
|
125
|
+
wait_time ||= DEFAULT_WAIT_TIME
|
126
|
+
|
127
|
+
connector = get_connector(value, user_agent: user_agent, socks_proxy: socks_proxy)
|
128
|
+
Retryable.new(connector, wait_time: wait_time, logger: logger)
|
129
|
+
end
|
130
|
+
|
131
|
+
#
|
132
|
+
# get connection handler
|
133
|
+
#
|
134
|
+
def self.get_connector(value, user_agent: nil, socks_proxy: nil)
|
135
|
+
return value if value.respond_to?(:call)
|
136
|
+
|
137
|
+
connector = const_get(value.to_s.classify).new(user_agent: user_agent)
|
138
|
+
fail "Not defined connnector[#{value}]" if connector.nil?
|
139
|
+
return connector if socks_proxy.nil?
|
140
|
+
|
141
|
+
tor = TorConnector.new(connnector, socks_proxy)
|
142
|
+
tor.try_connection!
|
143
|
+
tor
|
54
144
|
end
|
55
145
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -4,16 +4,15 @@
|
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
6
|
class Spidy::Connector::Html
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
include Spidy::Connector::StaticAccessor
|
8
|
+
|
9
|
+
def initialize(user_agent:)
|
10
10
|
@agent = Mechanize.new
|
11
11
|
@user_agent = user_agent
|
12
12
|
@agent.user_agent = user_agent
|
13
13
|
end
|
14
14
|
|
15
15
|
attr_reader :agent
|
16
|
-
attr_reader :logger
|
17
16
|
|
18
17
|
def call(url, encoding: nil, retry_count: 5, &yielder)
|
19
18
|
fail 'url is not specified' if url.blank?
|
@@ -21,42 +20,27 @@ class Spidy::Connector::Html
|
|
21
20
|
agent.default_encoding = encoding
|
22
21
|
agent.force_default_encoding = true
|
23
22
|
end
|
24
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
25
|
-
get(url, retry_count, yielder)
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
|
30
|
-
def get(url, retry_count, yielder)
|
31
23
|
connect(url, retry_count, yielder)
|
32
|
-
|
33
|
-
logger.call('retry.accessed': Time.current,
|
34
|
-
'retry.uri': url,
|
35
|
-
'retry.response_code': e.response_code,
|
36
|
-
'retry.rest_count': retry_count)
|
24
|
+
end
|
37
25
|
|
26
|
+
def refresh!
|
38
27
|
@agent = Mechanize.new
|
39
28
|
@agent.user_agent = @user_agent
|
40
|
-
|
41
|
-
retry_count -= 1
|
42
|
-
if retry_count.positive?
|
43
|
-
sleep e.wait_time
|
44
|
-
retry
|
45
|
-
end
|
46
|
-
raise e
|
47
29
|
end
|
48
30
|
|
31
|
+
private
|
32
|
+
|
49
33
|
def connect(url, retry_count, yielder)
|
50
34
|
result = nil
|
51
35
|
agent.get(url) do |page|
|
52
|
-
fail Spidy::Connector::Retry,
|
36
|
+
fail Spidy::Connector::Retry, object: page, response_code: page.try(:response_code) if page.title == 'Sorry, unable to access page...'
|
53
37
|
|
54
38
|
result = yielder.call(page)
|
55
39
|
end
|
56
40
|
result
|
57
41
|
rescue Mechanize::ResponseCodeError => e
|
58
|
-
raise Spidy::Connector::Retry, error: e,
|
59
|
-
raise Spidy::Connector::Retry, error: e,
|
60
|
-
raise e
|
42
|
+
raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '429'
|
43
|
+
raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code) if e.response_code == '502'
|
44
|
+
raise Spidy::Connector::Retry, error: e, response_code: e.try(:response_code)
|
61
45
|
end
|
62
46
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -4,7 +4,11 @@
|
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
6
|
class Spidy::Connector::Json
|
7
|
-
|
7
|
+
include Spidy::Connector::StaticAccessor
|
8
|
+
|
9
|
+
attr_reader :logger
|
10
|
+
|
11
|
+
def initialize(user_agent: nil)
|
8
12
|
@user_agent = user_agent
|
9
13
|
end
|
10
14
|
|
@@ -13,7 +17,9 @@ class Spidy::Connector::Json
|
|
13
17
|
connect(url, &block)
|
14
18
|
end
|
15
19
|
|
16
|
-
def connect(url)
|
20
|
+
def connect(url, retry_count: 5)
|
17
21
|
OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
22
|
+
rescue OpenURI::HTTPError => e
|
23
|
+
raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
|
18
24
|
end
|
19
25
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -4,15 +4,23 @@
|
|
4
4
|
# xml
|
5
5
|
#
|
6
6
|
class Spidy::Connector::Xml
|
7
|
-
|
7
|
+
include Spidy::Connector::StaticAccessor
|
8
|
+
|
9
|
+
def call(url, &block)
|
8
10
|
fail 'URL is undefined' if url.blank?
|
9
11
|
|
12
|
+
connect(url, &block)
|
13
|
+
end
|
14
|
+
|
15
|
+
def connect(url, &block)
|
10
16
|
OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
|
11
|
-
|
17
|
+
block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
18
|
end
|
19
|
+
rescue OpenURI::HTTPError => e
|
20
|
+
raise Spidy::Connector::Retry, error: e, response_code: e.io.status[0]
|
13
21
|
end
|
14
22
|
|
15
|
-
def initialize(
|
23
|
+
def initialize(user_agent:)
|
16
24
|
@user_agent = user_agent
|
17
25
|
end
|
18
26
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -46,7 +46,7 @@ module Spidy::Definition
|
|
46
46
|
|
47
47
|
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
48
48
|
@namespace ||= {}
|
49
|
-
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
|
49
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
|
50
50
|
binder = Spidy::Binder.get(self, binder || as)
|
51
51
|
@namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
|
52
52
|
end
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_development_dependency 'ffaker'
|
32
32
|
spec.add_development_dependency 'rspec-command'
|
33
33
|
|
34
|
+
spec.add_runtime_dependency 'tor'
|
34
35
|
spec.add_runtime_dependency 'activesupport'
|
35
36
|
spec.add_runtime_dependency 'mechanize'
|
36
37
|
spec.add_runtime_dependency 'socksify'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.5
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tor
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: activesupport
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -173,6 +187,9 @@ files:
|
|
173
187
|
- Rakefile
|
174
188
|
- bin/console
|
175
189
|
- bin/setup
|
190
|
+
- example/master_detail.rb
|
191
|
+
- example/proxy.rb
|
192
|
+
- example/retry.rb
|
176
193
|
- exe/spidy
|
177
194
|
- lib/spidy.rb
|
178
195
|
- lib/spidy/binder.rb
|