spidy 0.2.4 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -3
- data/lib/spidy.rb +5 -0
- data/lib/spidy/binder.rb +0 -4
- data/lib/spidy/connector.rb +74 -3
- data/lib/spidy/connector/direct.rb +5 -2
- data/lib/spidy/connector/html.rb +44 -56
- data/lib/spidy/connector/json.rb +29 -3
- data/lib/spidy/connector/xml.rb +9 -3
- data/lib/spidy/console.rb +5 -0
- data/lib/spidy/definition.rb +14 -2
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +2 -0
- metadata +34 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 44d16509821e1779d821effb7c571d2a11cbb79166168943242628b2e53457ec
|
4
|
+
data.tar.gz: 5b52389a8042c9e69aaeead0d96b81553da2e5f5e51ad16696f87249e38067bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3475b05ea1235b388960416a03ab96adb64608ce4046d1812b4d80f038a91b99343882e45413ee6a2f74e29626869aca8a48d34f8e57c64b480c8893efcda123
|
7
|
+
data.tar.gz: 7e4fd1abf5898a7a7273d4eaf53021e4c1898d995e8e0d23abfcbf80c3e1b45b2f059c95e48707577bae72a4f158d7cba7f9be097e7e8a277d89e683386c32d6
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.2.4)
|
4
|
+
spidy (0.2.9)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
8
|
+
socksify
|
9
|
+
tor
|
8
10
|
|
9
11
|
GEM
|
10
12
|
remote: https://rubygems.org/
|
@@ -16,7 +18,7 @@ GEM
|
|
16
18
|
tzinfo (~> 1.1)
|
17
19
|
zeitwerk (~> 2.2, >= 2.2.2)
|
18
20
|
coderay (1.1.2)
|
19
|
-
concurrent-ruby (1.1.
|
21
|
+
concurrent-ruby (1.1.7)
|
20
22
|
connection_pool (2.2.3)
|
21
23
|
diff-lcs (1.3)
|
22
24
|
domain_name (0.5.20190701)
|
@@ -40,7 +42,7 @@ GEM
|
|
40
42
|
mime-types-data (~> 3.2015)
|
41
43
|
mime-types-data (3.2020.0512)
|
42
44
|
mini_portile2 (2.4.0)
|
43
|
-
minitest (5.14.
|
45
|
+
minitest (5.14.2)
|
44
46
|
mixlib-shellout (2.4.4)
|
45
47
|
net-http-digest_auth (1.4.1)
|
46
48
|
net-http-persistent (4.0.0)
|
@@ -72,7 +74,9 @@ GEM
|
|
72
74
|
diff-lcs (>= 1.2.0, < 2.0)
|
73
75
|
rspec-support (~> 3.8.0)
|
74
76
|
rspec-support (3.8.2)
|
77
|
+
socksify (1.7.1)
|
75
78
|
thread_safe (0.3.6)
|
79
|
+
tor (0.1.4)
|
76
80
|
tzinfo (1.2.7)
|
77
81
|
thread_safe (~> 0.1)
|
78
82
|
unf (0.1.4)
|
data/lib/spidy.rb
CHANGED
@@ -4,6 +4,8 @@ require 'spidy/version'
|
|
4
4
|
require 'active_support/all'
|
5
5
|
require 'mechanize'
|
6
6
|
require 'open-uri'
|
7
|
+
require 'socksify'
|
8
|
+
require 'tor'
|
7
9
|
|
8
10
|
#
|
9
11
|
# web spider dsl engine
|
@@ -36,6 +38,9 @@ module Spidy
|
|
36
38
|
spidy.instance_eval do
|
37
39
|
undef :spider
|
38
40
|
undef :define
|
41
|
+
undef :wait_time
|
42
|
+
undef :user_agent
|
43
|
+
undef :socks_proxy
|
39
44
|
end
|
40
45
|
spidy
|
41
46
|
end
|
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/connector.rb
CHANGED
@@ -10,6 +10,9 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
+
#
|
14
|
+
# default user agent
|
15
|
+
#
|
13
16
|
USER_AGENT = [
|
14
17
|
'Mozilla/5.0',
|
15
18
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
@@ -19,9 +22,77 @@ module Spidy::Connector
|
|
19
22
|
'Safari/537.36'
|
20
23
|
].join(' ')
|
21
24
|
|
22
|
-
|
23
|
-
|
25
|
+
#
|
26
|
+
# error output logger
|
27
|
+
#
|
28
|
+
DEFAULT_LOGGER = proc { |values| STDERR.puts(values.to_json) }
|
24
29
|
|
25
|
-
|
30
|
+
#
|
31
|
+
# static method
|
32
|
+
#
|
33
|
+
module StaticAccessor
|
34
|
+
extend ActiveSupport::Concern
|
35
|
+
class_methods do
|
36
|
+
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
|
37
|
+
new(wait_time: wait_time, user_agent: user_agent, logger: logger).call(url, &block)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
#
|
43
|
+
# retry class
|
44
|
+
#
|
45
|
+
class Retry < StandardError
|
46
|
+
attr_reader :page
|
47
|
+
attr_reader :response_code
|
48
|
+
attr_reader :wait_time
|
49
|
+
|
50
|
+
def initialize(wait_time: 2, page: nil, error: nil)
|
51
|
+
@page = page
|
52
|
+
@wait_time = wait_time
|
53
|
+
@response_code = error.try(:response_code) || page.try(:response_code)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
class Builder
|
58
|
+
attr_reader :origin_connector, :proxy_connector
|
59
|
+
|
60
|
+
def initialize(connector, socks_proxy)
|
61
|
+
@socks_proxy = socks_proxy
|
62
|
+
@origin_connector = connector
|
63
|
+
@proxy_connector =
|
64
|
+
lambda do |url, &block|
|
65
|
+
Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
|
66
|
+
connector.call(url, &block)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def proxy_disabled?
|
72
|
+
!tor?
|
73
|
+
end
|
74
|
+
|
75
|
+
def tor?
|
76
|
+
Tor::Controller.new(host: @socks_proxy[:host], port: @socks_proxy[:port]).close
|
77
|
+
true
|
78
|
+
rescue Errno::ECONNREFUSED
|
79
|
+
false
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
#
|
84
|
+
# get connection handller
|
85
|
+
#
|
86
|
+
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil, logger: nil)
|
87
|
+
return value if value.respond_to?(:call)
|
88
|
+
|
89
|
+
builder = Builder.new(const_get(value.to_s.classify).new(
|
90
|
+
wait_time: wait_time || 5,
|
91
|
+
user_agent: user_agent || USER_AGENT,
|
92
|
+
logger: logger || DEFAULT_LOGGER,
|
93
|
+
), socks_proxy)
|
94
|
+
return builder.origin_connector if socks_proxy.nil? || builder.proxy_disabled?
|
95
|
+
|
96
|
+
builder.proxy_connector
|
26
97
|
end
|
27
98
|
end
|
@@ -3,8 +3,11 @@
|
|
3
3
|
#
|
4
4
|
# Direct resource ( not network resource )
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Direct
|
7
|
+
def call(resource, &yielder)
|
8
8
|
yielder.call(resource)
|
9
9
|
end
|
10
|
+
|
11
|
+
def initialize(wait_time: nil, user_agent: nil, logger: nil)
|
12
|
+
end
|
10
13
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -3,74 +3,62 @@
|
|
3
3
|
#
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
|
-
|
7
|
-
|
8
|
-
# retry class
|
9
|
-
#
|
10
|
-
class Retry < StandardError
|
11
|
-
attr_reader :page
|
12
|
-
attr_reader :response_code
|
13
|
-
attr_reader :wait_time
|
6
|
+
class Spidy::Connector::Html
|
7
|
+
include Spidy::Connector::StaticAccessor
|
14
8
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
9
|
+
def initialize(wait_time:, user_agent:, logger: nil)
|
10
|
+
@wait_time = wait_time
|
11
|
+
@logger = logger
|
12
|
+
@agent = Mechanize.new
|
13
|
+
@user_agent = user_agent
|
14
|
+
@agent.user_agent = user_agent
|
20
15
|
end
|
21
16
|
|
22
|
-
|
23
|
-
|
24
|
-
@agent.user_agent = Spidy::Connector::USER_AGENT
|
25
|
-
|
26
|
-
class << self
|
27
|
-
attr_reader :agent
|
28
|
-
attr_accessor :logger
|
17
|
+
attr_reader :agent
|
18
|
+
attr_reader :logger
|
29
19
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
37
|
-
get(url, retry_count, yielder)
|
20
|
+
def call(url, encoding: nil, retry_count: 5, &yielder)
|
21
|
+
fail 'url is not specified' if url.blank?
|
22
|
+
if encoding
|
23
|
+
agent.default_encoding = encoding
|
24
|
+
agent.force_default_encoding = true
|
38
25
|
end
|
26
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
27
|
+
get(url, retry_count, yielder)
|
28
|
+
end
|
39
29
|
|
40
|
-
|
30
|
+
private
|
41
31
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
32
|
+
def get(url, retry_count, yielder)
|
33
|
+
connect(url, retry_count, yielder)
|
34
|
+
rescue Spidy::Connector::Retry => e
|
35
|
+
logger.call('retry.accessed': Time.current,
|
36
|
+
'retry.uri': url,
|
37
|
+
'retry.response_code': e.response_code,
|
38
|
+
'retry.rest_count': retry_count)
|
49
39
|
|
50
|
-
|
51
|
-
|
40
|
+
@agent = Mechanize.new
|
41
|
+
@agent.user_agent = @user_agent
|
52
42
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
raise e
|
43
|
+
retry_count -= 1
|
44
|
+
if retry_count.positive?
|
45
|
+
sleep e.wait_time
|
46
|
+
retry
|
59
47
|
end
|
48
|
+
raise e
|
49
|
+
end
|
60
50
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
51
|
+
def connect(url, retry_count, yielder)
|
52
|
+
result = nil
|
53
|
+
agent.get(url) do |page|
|
54
|
+
fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
|
65
55
|
|
66
|
-
|
67
|
-
end
|
68
|
-
result
|
69
|
-
rescue Mechanize::ResponseCodeError => e
|
70
|
-
raise Retry, error: e if e.response_code == '429'
|
71
|
-
raise Retry, error: e if e.response_code == '502'
|
72
|
-
raise e
|
56
|
+
result = yielder.call(page)
|
73
57
|
end
|
74
|
-
|
58
|
+
result
|
59
|
+
rescue Mechanize::ResponseCodeError => e
|
60
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
|
61
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
|
62
|
+
raise e
|
75
63
|
end
|
76
64
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -3,9 +3,35 @@
|
|
3
3
|
#
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
|
-
|
7
|
-
|
6
|
+
class Spidy::Connector::Json
|
7
|
+
include Spidy::Connector::StaticAccessor
|
8
|
+
|
9
|
+
attr_reader :logger
|
10
|
+
|
11
|
+
def initialize(wait_time: nil, user_agent: nil, logger: nil)
|
12
|
+
@wait_time = wait_time
|
13
|
+
@user_agent = user_agent
|
14
|
+
@logger = logger
|
15
|
+
end
|
16
|
+
|
17
|
+
def call(url, &block)
|
8
18
|
fail 'url is not specified' if url.blank?
|
9
|
-
|
19
|
+
connect(url, &block)
|
20
|
+
end
|
21
|
+
|
22
|
+
def connect(url, retry_count: 5)
|
23
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
24
|
+
rescue OpenURI::HTTPError => e
|
25
|
+
logger.call('retry.accessed': Time.current,
|
26
|
+
'retry.uri': url,
|
27
|
+
'retry.response_code': e.message,
|
28
|
+
'retry.rest_count': retry_count)
|
29
|
+
|
30
|
+
retry_count -= 1
|
31
|
+
if retry_count.positive?
|
32
|
+
sleep @wait_time
|
33
|
+
retry
|
34
|
+
end
|
35
|
+
raise e
|
10
36
|
end
|
11
37
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -3,12 +3,18 @@
|
|
3
3
|
#
|
4
4
|
# xml
|
5
5
|
#
|
6
|
-
|
7
|
-
|
6
|
+
class Spidy::Connector::Xml
|
7
|
+
include Spidy::Connector::StaticAccessor
|
8
|
+
|
9
|
+
def call(url)
|
8
10
|
fail 'URL is undefined' if url.blank?
|
9
11
|
|
10
|
-
OpenURI.open_uri(url, "User-Agent" =>
|
12
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
|
11
13
|
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
14
|
end
|
13
15
|
end
|
16
|
+
|
17
|
+
def initialize(user_agent: nil, logger: nil)
|
18
|
+
@user_agent = user_agent
|
19
|
+
end
|
14
20
|
end
|
data/lib/spidy/console.rb
CHANGED
data/lib/spidy/definition.rb
CHANGED
@@ -16,6 +16,18 @@ module Spidy::Definition
|
|
16
16
|
spidy.call(source, &yielder)
|
17
17
|
end
|
18
18
|
|
19
|
+
def user_agent(user_agent)
|
20
|
+
@user_agent = user_agent
|
21
|
+
end
|
22
|
+
|
23
|
+
def wait_time(wait_time)
|
24
|
+
@wait_time = wait_time
|
25
|
+
end
|
26
|
+
|
27
|
+
def socks_proxy(host, port)
|
28
|
+
@socks_proxy = { host: host, port: port }
|
29
|
+
end
|
30
|
+
|
19
31
|
def each(source = nil, name: :default, &yielder)
|
20
32
|
name = name.presence || :default
|
21
33
|
spidy = @namespace[:"#{name}_spider"]
|
@@ -26,7 +38,7 @@ module Spidy::Definition
|
|
26
38
|
|
27
39
|
def spider(name = :default, connector: nil, as: nil, &define_block)
|
28
40
|
@namespace ||= {}
|
29
|
-
connector = Spidy::Connector.get(connector || as)
|
41
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
|
30
42
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
31
43
|
define_block.call(yielder, connector, source)
|
32
44
|
end
|
@@ -34,7 +46,7 @@ module Spidy::Definition
|
|
34
46
|
|
35
47
|
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
36
48
|
@namespace ||= {}
|
37
|
-
connector = Spidy::Connector.get(connector || as)
|
49
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
|
38
50
|
binder = Spidy::Binder.get(self, binder || as)
|
39
51
|
@namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
|
40
52
|
end
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -31,7 +31,9 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_development_dependency 'ffaker'
|
32
32
|
spec.add_development_dependency 'rspec-command'
|
33
33
|
|
34
|
+
spec.add_runtime_dependency 'tor'
|
34
35
|
spec.add_runtime_dependency 'activesupport'
|
35
36
|
spec.add_runtime_dependency 'mechanize'
|
37
|
+
spec.add_runtime_dependency 'socksify'
|
36
38
|
spec.add_runtime_dependency 'pry'
|
37
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tor
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: activesupport
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,20 @@ dependencies:
|
|
122
136
|
- - ">="
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: socksify
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
- !ruby/object:Gem::Dependency
|
126
154
|
name: pry
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,7 +164,7 @@ dependencies:
|
|
136
164
|
- - ">="
|
137
165
|
- !ruby/object:Gem::Version
|
138
166
|
version: '0'
|
139
|
-
description:
|
167
|
+
description:
|
140
168
|
email:
|
141
169
|
- aileron.cc@gmail.com
|
142
170
|
executables:
|
@@ -183,7 +211,7 @@ homepage: https://github.com/aileron-inc/spidy
|
|
183
211
|
licenses:
|
184
212
|
- MIT
|
185
213
|
metadata: {}
|
186
|
-
post_install_message:
|
214
|
+
post_install_message:
|
187
215
|
rdoc_options: []
|
188
216
|
require_paths:
|
189
217
|
- lib
|
@@ -199,7 +227,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
199
227
|
version: '0'
|
200
228
|
requirements: []
|
201
229
|
rubygems_version: 3.0.3
|
202
|
-
signing_key:
|
230
|
+
signing_key:
|
203
231
|
specification_version: 4
|
204
232
|
summary: web spider dsl
|
205
233
|
test_files: []
|