spidy 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/spidy.rb +4 -0
- data/lib/spidy/binder.rb +0 -4
- data/lib/spidy/connector.rb +31 -3
- data/lib/spidy/connector/direct.rb +5 -2
- data/lib/spidy/connector/html.rb +43 -57
- data/lib/spidy/connector/json.rb +11 -3
- data/lib/spidy/connector/xml.rb +7 -3
- data/lib/spidy/console.rb +5 -0
- data/lib/spidy/definition.rb +14 -2
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +1 -0
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f123b0f2bb611ea41bafe6a9cb5a7bb7564ba6be7044127e77fddf34dc3faed1
|
4
|
+
data.tar.gz: 87f0079575c62dac9a43d6900b88da45cce77a37f5b994376b0d31dd24b03fa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6554befde64315d01d2db11ba22b479629ae602ca4acddee51c941bd091144849ad15f2b98c220d29bd97c70c02fc2726061db429db2140a50bbfe0fce850a2
|
7
|
+
data.tar.gz: 511b7dccf5f93e4cf8a652d3039ec852c556cbd49e238c676029d1b0e611dce669ac84140c8b9be3f861426e8f248f6452698bf7319f5493cb7ecccb3cd1428d
|
data/lib/spidy.rb
CHANGED
@@ -4,6 +4,7 @@ require 'spidy/version'
|
|
4
4
|
require 'active_support/all'
|
5
5
|
require 'mechanize'
|
6
6
|
require 'open-uri'
|
7
|
+
require 'socksify'
|
7
8
|
|
8
9
|
#
|
9
10
|
# web spider dsl engine
|
@@ -36,6 +37,9 @@ module Spidy
|
|
36
37
|
spidy.instance_eval do
|
37
38
|
undef :spider
|
38
39
|
undef :define
|
40
|
+
undef :wait_time
|
41
|
+
undef :user_agent
|
42
|
+
undef :socks_proxy
|
39
43
|
end
|
40
44
|
spidy
|
41
45
|
end
|
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/connector.rb
CHANGED
@@ -10,6 +10,24 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
+
#
|
14
|
+
# retry class
|
15
|
+
#
|
16
|
+
class Retry < StandardError
|
17
|
+
attr_reader :page
|
18
|
+
attr_reader :response_code
|
19
|
+
attr_reader :wait_time
|
20
|
+
|
21
|
+
def initialize(wait_time: 2, page: nil, error: nil)
|
22
|
+
@page = page
|
23
|
+
@wait_time = wait_time
|
24
|
+
@response_code = error.try(:response_code) || page.try(:response_code)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# default user agent
|
30
|
+
#
|
13
31
|
USER_AGENT = [
|
14
32
|
'Mozilla/5.0',
|
15
33
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
@@ -19,9 +37,19 @@ module Spidy::Connector
|
|
19
37
|
'Safari/537.36'
|
20
38
|
].join(' ')
|
21
39
|
|
22
|
-
|
23
|
-
|
40
|
+
#
|
41
|
+
# get connection handller
|
42
|
+
#
|
43
|
+
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
|
44
|
+
return value if value.respond_to?(:call)
|
45
|
+
|
46
|
+
connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
|
47
|
+
return connector if socks_proxy.nil?
|
24
48
|
|
25
|
-
|
49
|
+
lambda do |url, &block|
|
50
|
+
Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
|
51
|
+
connector.call(url, &block)
|
52
|
+
end
|
53
|
+
end
|
26
54
|
end
|
27
55
|
end
|
@@ -3,8 +3,11 @@
|
|
3
3
|
#
|
4
4
|
# Direct resource ( not network resource )
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Direct
|
7
|
+
def call(resource, &yielder)
|
8
8
|
yielder.call(resource)
|
9
9
|
end
|
10
|
+
|
11
|
+
def initialize(wait_time: nil, user_agent: nil)
|
12
|
+
end
|
10
13
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -3,74 +3,60 @@
|
|
3
3
|
#
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
attr_reader :wait_time
|
14
|
-
|
15
|
-
def initialize(wait_time: 2, page: nil, error: nil)
|
16
|
-
@page = page
|
17
|
-
@wait_time = wait_time
|
18
|
-
@response_code = error.try(:response_code) || page.try(:response_code)
|
19
|
-
end
|
6
|
+
class Spidy::Connector::Html
|
7
|
+
def initialize(wait_time:, user_agent:, logger: nil)
|
8
|
+
@wait_time = wait_time
|
9
|
+
@logger = logger || proc { |values| STDERR.puts(values.to_json) }
|
10
|
+
@agent = Mechanize.new
|
11
|
+
@user_agent = user_agent
|
12
|
+
@agent.user_agent = user_agent
|
20
13
|
end
|
21
14
|
|
22
|
-
|
23
|
-
|
24
|
-
@agent.user_agent = Spidy::Connector::USER_AGENT
|
25
|
-
|
26
|
-
class << self
|
27
|
-
attr_reader :agent
|
28
|
-
attr_accessor :logger
|
15
|
+
attr_reader :agent
|
16
|
+
attr_reader :logger
|
29
17
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
37
|
-
get(url, retry_count, yielder)
|
18
|
+
def call(url, encoding: nil, retry_count: 5, &yielder)
|
19
|
+
fail 'url is not specified' if url.blank?
|
20
|
+
if encoding
|
21
|
+
agent.default_encoding = encoding
|
22
|
+
agent.force_default_encoding = true
|
38
23
|
end
|
24
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
25
|
+
get(url, retry_count, yielder)
|
26
|
+
end
|
39
27
|
|
40
|
-
|
28
|
+
private
|
41
29
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
30
|
+
def get(url, retry_count, yielder)
|
31
|
+
connect(url, retry_count, yielder)
|
32
|
+
rescue Spidy::Connector::Retry => e
|
33
|
+
logger.call('retry.accessed': Time.current,
|
34
|
+
'retry.uri': url,
|
35
|
+
'retry.response_code': e.response_code,
|
36
|
+
'retry.rest_count': retry_count)
|
49
37
|
|
50
|
-
|
51
|
-
|
38
|
+
@agent = Mechanize.new
|
39
|
+
@agent.user_agent = @user_agent
|
52
40
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
raise e
|
41
|
+
retry_count -= 1
|
42
|
+
if retry_count.positive?
|
43
|
+
sleep e.wait_time
|
44
|
+
retry
|
59
45
|
end
|
46
|
+
raise e
|
47
|
+
end
|
60
48
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
49
|
+
def connect(url, retry_count, yielder)
|
50
|
+
result = nil
|
51
|
+
agent.get(url) do |page|
|
52
|
+
fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
|
65
53
|
|
66
|
-
|
67
|
-
end
|
68
|
-
result
|
69
|
-
rescue Mechanize::ResponseCodeError => e
|
70
|
-
raise Retry, error: e if e.response_code == '429'
|
71
|
-
raise Retry, error: e if e.response_code == '502'
|
72
|
-
raise e
|
54
|
+
result = yielder.call(page)
|
73
55
|
end
|
74
|
-
|
56
|
+
result
|
57
|
+
rescue Mechanize::ResponseCodeError => e
|
58
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
|
59
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
|
60
|
+
raise e
|
75
61
|
end
|
76
62
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -3,9 +3,17 @@
|
|
3
3
|
#
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Json
|
7
|
+
def initialize(wait_time: nil, user_agent: nil)
|
8
|
+
@user_agent = user_agent
|
9
|
+
end
|
10
|
+
|
11
|
+
def call(url, &block)
|
8
12
|
fail 'url is not specified' if url.blank?
|
9
|
-
|
13
|
+
connect(url, &block)
|
14
|
+
end
|
15
|
+
|
16
|
+
def connect(url)
|
17
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
10
18
|
end
|
11
19
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -3,12 +3,16 @@
|
|
3
3
|
#
|
4
4
|
# xml
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Xml
|
7
|
+
def call(url)
|
8
8
|
fail 'URL is undefined' if url.blank?
|
9
9
|
|
10
|
-
OpenURI.open_uri(url, "User-Agent" =>
|
10
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
|
11
11
|
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
12
|
end
|
13
13
|
end
|
14
|
+
|
15
|
+
def initialize(wait_time: nil, user_agent: nil)
|
16
|
+
@user_agent = user_agent
|
17
|
+
end
|
14
18
|
end
|
data/lib/spidy/console.rb
CHANGED
data/lib/spidy/definition.rb
CHANGED
@@ -16,6 +16,18 @@ module Spidy::Definition
|
|
16
16
|
spidy.call(source, &yielder)
|
17
17
|
end
|
18
18
|
|
19
|
+
def user_agent(user_agent)
|
20
|
+
@user_agent = user_agent
|
21
|
+
end
|
22
|
+
|
23
|
+
def wait_time(wait_time)
|
24
|
+
@wait_time = wait_time
|
25
|
+
end
|
26
|
+
|
27
|
+
def socks_proxy(host, port)
|
28
|
+
@socks_proxy = { host: host, port: port }
|
29
|
+
end
|
30
|
+
|
19
31
|
def each(source = nil, name: :default, &yielder)
|
20
32
|
name = name.presence || :default
|
21
33
|
spidy = @namespace[:"#{name}_spider"]
|
@@ -26,7 +38,7 @@ module Spidy::Definition
|
|
26
38
|
|
27
39
|
def spider(name = :default, connector: nil, as: nil, &define_block)
|
28
40
|
@namespace ||= {}
|
29
|
-
connector = Spidy::Connector.get(connector || as)
|
41
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
|
30
42
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
31
43
|
define_block.call(yielder, connector, source)
|
32
44
|
end
|
@@ -34,7 +46,7 @@ module Spidy::Definition
|
|
34
46
|
|
35
47
|
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
36
48
|
@namespace ||= {}
|
37
|
-
connector = Spidy::Connector.get(connector || as)
|
49
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
|
38
50
|
binder = Spidy::Binder.get(self, binder || as)
|
39
51
|
@namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
|
40
52
|
end
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: socksify
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: pry
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,7 +150,7 @@ dependencies:
|
|
136
150
|
- - ">="
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '0'
|
139
|
-
description:
|
153
|
+
description:
|
140
154
|
email:
|
141
155
|
- aileron.cc@gmail.com
|
142
156
|
executables:
|
@@ -183,7 +197,7 @@ homepage: https://github.com/aileron-inc/spidy
|
|
183
197
|
licenses:
|
184
198
|
- MIT
|
185
199
|
metadata: {}
|
186
|
-
post_install_message:
|
200
|
+
post_install_message:
|
187
201
|
rdoc_options: []
|
188
202
|
require_paths:
|
189
203
|
- lib
|
@@ -199,7 +213,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
199
213
|
version: '0'
|
200
214
|
requirements: []
|
201
215
|
rubygems_version: 3.0.3
|
202
|
-
signing_key:
|
216
|
+
signing_key:
|
203
217
|
specification_version: 4
|
204
218
|
summary: web spider dsl
|
205
219
|
test_files: []
|