spidy 0.2.1 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +18 -14
- data/README.md +1 -1
- data/lib/spidy.rb +10 -2
- data/lib/spidy/binder.rb +29 -2
- data/lib/spidy/binder/html.rb +12 -29
- data/lib/spidy/binder/json.rb +15 -32
- data/lib/spidy/binder/xml.rb +12 -29
- data/lib/spidy/connector.rb +31 -3
- data/lib/spidy/connector/direct.rb +5 -2
- data/lib/spidy/connector/html.rb +43 -56
- data/lib/spidy/connector/json.rb +11 -3
- data/lib/spidy/connector/xml.rb +7 -3
- data/lib/spidy/console.rb +5 -0
- data/lib/spidy/definition.rb +21 -5
- data/lib/spidy/version.rb +1 -1
- data/spidy.gemspec +2 -0
- metadata +35 -8
- data/lib/spidy/interface.rb +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c6c2e2e85979c5d5492564fec243c23c44ef1d762230cd1a59400fcc46bf42a3
|
4
|
+
data.tar.gz: d551534240f528923ccb862e17ae521aaccdc5596e45c09e1cee0e90f0c990c5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3348c6243500f90f7157435bf36f1675744f46c0025dc4bab81fad4ccfb65c4e2ac0d029a837fb589c768b4868fbaf8ab7c258ad67aa4e1a9f33f0949701b2da
|
7
|
+
data.tar.gz: ef8d804eae97e2300747abb1b9e97f84e100b6b151e01b587fff95c98327ce21e0024456c6d8208ecd81ceaead8a0353d251f15a4ef4deff7e3b077e13c872cb
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.6.
|
1
|
+
2.6.6
|
data/Gemfile.lock
CHANGED
@@ -1,30 +1,32 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.
|
4
|
+
spidy (0.2.6)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
8
|
+
socksify
|
9
|
+
tor
|
8
10
|
|
9
11
|
GEM
|
10
12
|
remote: https://rubygems.org/
|
11
13
|
specs:
|
12
|
-
activesupport (6.0.2
|
14
|
+
activesupport (6.0.3.2)
|
13
15
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
14
16
|
i18n (>= 0.7, < 2)
|
15
17
|
minitest (~> 5.1)
|
16
18
|
tzinfo (~> 1.1)
|
17
|
-
zeitwerk (~> 2.2)
|
19
|
+
zeitwerk (~> 2.2, >= 2.2.2)
|
18
20
|
coderay (1.1.2)
|
19
|
-
concurrent-ruby (1.1.
|
20
|
-
connection_pool (2.2.
|
21
|
+
concurrent-ruby (1.1.7)
|
22
|
+
connection_pool (2.2.3)
|
21
23
|
diff-lcs (1.3)
|
22
24
|
domain_name (0.5.20190701)
|
23
25
|
unf (>= 0.0.5, < 1.0.0)
|
24
26
|
ffaker (2.10.0)
|
25
27
|
http-cookie (1.0.3)
|
26
28
|
domain_name (~> 0.5)
|
27
|
-
i18n (1.8.
|
29
|
+
i18n (1.8.5)
|
28
30
|
concurrent-ruby (~> 1.0)
|
29
31
|
mechanize (2.7.6)
|
30
32
|
domain_name (~> 0.5, >= 0.5.1)
|
@@ -38,14 +40,14 @@ GEM
|
|
38
40
|
method_source (0.9.2)
|
39
41
|
mime-types (3.3.1)
|
40
42
|
mime-types-data (~> 3.2015)
|
41
|
-
mime-types-data (3.
|
43
|
+
mime-types-data (3.2020.0512)
|
42
44
|
mini_portile2 (2.4.0)
|
43
|
-
minitest (5.14.
|
45
|
+
minitest (5.14.2)
|
44
46
|
mixlib-shellout (2.4.4)
|
45
47
|
net-http-digest_auth (1.4.1)
|
46
|
-
net-http-persistent (
|
48
|
+
net-http-persistent (4.0.0)
|
47
49
|
connection_pool (~> 2.2)
|
48
|
-
nokogiri (1.10.
|
50
|
+
nokogiri (1.10.10)
|
49
51
|
mini_portile2 (~> 2.4.0)
|
50
52
|
ntlm-http (0.1.1)
|
51
53
|
pry (0.12.2)
|
@@ -72,14 +74,16 @@ GEM
|
|
72
74
|
diff-lcs (>= 1.2.0, < 2.0)
|
73
75
|
rspec-support (~> 3.8.0)
|
74
76
|
rspec-support (3.8.2)
|
77
|
+
socksify (1.7.1)
|
75
78
|
thread_safe (0.3.6)
|
76
|
-
|
79
|
+
tor (0.1.4)
|
80
|
+
tzinfo (1.2.7)
|
77
81
|
thread_safe (~> 0.1)
|
78
82
|
unf (0.1.4)
|
79
83
|
unf_ext
|
80
|
-
unf_ext (0.0.7.
|
84
|
+
unf_ext (0.0.7.7)
|
81
85
|
webrobots (0.1.2)
|
82
|
-
zeitwerk (2.
|
86
|
+
zeitwerk (2.4.0)
|
83
87
|
|
84
88
|
PLATFORMS
|
85
89
|
ruby
|
@@ -94,4 +98,4 @@ DEPENDENCIES
|
|
94
98
|
spidy!
|
95
99
|
|
96
100
|
BUNDLED WITH
|
97
|
-
2.1.
|
101
|
+
2.1.4
|
data/README.md
CHANGED
data/lib/spidy.rb
CHANGED
@@ -4,13 +4,14 @@ require 'spidy/version'
|
|
4
4
|
require 'active_support/all'
|
5
5
|
require 'mechanize'
|
6
6
|
require 'open-uri'
|
7
|
+
require 'socksify'
|
8
|
+
require 'tor'
|
7
9
|
|
8
10
|
#
|
9
11
|
# web spider dsl engine
|
10
12
|
#
|
11
13
|
module Spidy
|
12
14
|
extend ActiveSupport::Autoload
|
13
|
-
autoload :Interface
|
14
15
|
autoload :Shell
|
15
16
|
autoload :CommandLine
|
16
17
|
autoload :Console
|
@@ -34,6 +35,13 @@ module Spidy
|
|
34
35
|
module_eval(&block)
|
35
36
|
end
|
36
37
|
end
|
37
|
-
|
38
|
+
spidy.instance_eval do
|
39
|
+
undef :spider
|
40
|
+
undef :define
|
41
|
+
undef :wait_time
|
42
|
+
undef :user_agent
|
43
|
+
undef :socks_proxy
|
44
|
+
end
|
45
|
+
spidy
|
38
46
|
end
|
39
47
|
end
|
data/lib/spidy/binder.rb
CHANGED
@@ -9,17 +9,44 @@ module Spidy::Binder
|
|
9
9
|
autoload :Html
|
10
10
|
autoload :Xml
|
11
11
|
|
12
|
+
class Error < StandardError
|
13
|
+
end
|
14
|
+
|
12
15
|
class Caller
|
13
16
|
def initialize(spidy, binder)
|
14
17
|
@spidy = spidy
|
15
18
|
@binder = binder
|
16
19
|
end
|
17
20
|
|
18
|
-
def call(source, url: nil, define: nil)
|
19
|
-
yield Class.new(@binder, &define).new(@spidy, source, url)
|
21
|
+
def call(source, url: nil, define: nil, define_name: nil)
|
22
|
+
yield Class.new(@binder, &define).new(define_name, @spidy, source, url)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
class Base
|
27
|
+
class << self
|
28
|
+
attr_reader :attribute_names
|
29
|
+
end
|
30
|
+
|
31
|
+
attr_reader :resource, :url
|
32
|
+
|
33
|
+
def initialize(define_name, spidy, resource, url)
|
34
|
+
@define_name = define_name
|
35
|
+
@spidy = spidy
|
36
|
+
@resource = resource
|
37
|
+
@url = url
|
38
|
+
end
|
39
|
+
|
40
|
+
def to_s
|
41
|
+
to_h.to_json
|
42
|
+
end
|
43
|
+
|
44
|
+
def to_h
|
45
|
+
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
20
46
|
end
|
21
47
|
end
|
22
48
|
|
49
|
+
|
23
50
|
def self.get(spidy, value)
|
24
51
|
return Caller.new(spidy, const_get(value.to_s.classify)) if name.is_a?(String) || name.is_a?(Symbol)
|
25
52
|
|
data/lib/spidy/binder/html.rb
CHANGED
@@ -3,40 +3,23 @@
|
|
3
3
|
#
|
4
4
|
# Bind html and convert to object
|
5
5
|
#
|
6
|
-
class Spidy::Binder::Html
|
7
|
-
|
8
|
-
|
6
|
+
class Spidy::Binder::Html < Spidy::Binder::Base
|
7
|
+
def self.let(name, query = nil, &block)
|
8
|
+
@attribute_names ||= []
|
9
|
+
@attribute_names << name
|
9
10
|
|
10
|
-
|
11
|
-
@attribute_names ||= []
|
12
|
-
@attribute_names << name
|
13
|
-
define_method(name) do
|
14
|
-
return html.at(query)&.text if block.nil?
|
15
|
-
return instance_exec(&block) if query.blank?
|
11
|
+
return define_method(name) { html.at(query)&.text } if block.nil?
|
16
12
|
|
13
|
+
define_method(name) do
|
14
|
+
if query.present?
|
17
15
|
instance_exec(html.at(query), &block)
|
16
|
+
else
|
17
|
+
instance_exec(&block)
|
18
18
|
end
|
19
|
+
rescue StandardError => e
|
20
|
+
fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
|
-
|
23
|
-
alias_method :resource, :html
|
24
|
-
|
25
|
-
def initialize(spidy, html, url)
|
26
|
-
@spidy = spidy
|
27
|
-
@url = url
|
28
|
-
@html = html
|
29
|
-
end
|
30
|
-
|
31
|
-
def scraper(name, source)
|
32
|
-
lambda { |&block| @spidy.call(source, name: name, &block) }
|
33
|
-
end
|
34
|
-
|
35
|
-
def to_s
|
36
|
-
to_h.to_json
|
37
|
-
end
|
38
|
-
|
39
|
-
def to_h
|
40
|
-
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
41
|
-
end
|
24
|
+
alias_method :html, :resource
|
42
25
|
end
|
data/lib/spidy/binder/json.rb
CHANGED
@@ -3,40 +3,23 @@
|
|
3
3
|
#
|
4
4
|
# Bind json and convert to object
|
5
5
|
#
|
6
|
-
class Spidy::Binder::Json
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
instance_exec(
|
6
|
+
class Spidy::Binder::Json < Spidy::Binder::Base
|
7
|
+
def self.let(name, *query, &block)
|
8
|
+
@attribute_names ||= []
|
9
|
+
@attribute_names << name
|
10
|
+
|
11
|
+
return define_method(name) { json.dig(*query) } if block.nil?
|
12
|
+
|
13
|
+
define_method(name) do
|
14
|
+
if query.present?
|
15
|
+
instance_exec(json.dig(*query), &block)
|
16
|
+
else
|
17
|
+
instance_exec(&block)
|
18
18
|
end
|
19
|
+
rescue StandardError => e
|
20
|
+
fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
|
-
|
23
|
-
alias_method :resource, :json
|
24
|
-
|
25
|
-
def initialize(spidy, json, url)
|
26
|
-
@spidy = spidy
|
27
|
-
@json = json
|
28
|
-
@url = url
|
29
|
-
end
|
30
|
-
|
31
|
-
def scraper(name, source)
|
32
|
-
lambda { |&block| @spidy.call(source, name: name, &block) }
|
33
|
-
end
|
34
|
-
|
35
|
-
def to_s
|
36
|
-
to_h.to_json
|
37
|
-
end
|
38
|
-
|
39
|
-
def to_h
|
40
|
-
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
41
|
-
end
|
24
|
+
alias_method :json, :resource
|
42
25
|
end
|
data/lib/spidy/binder/xml.rb
CHANGED
@@ -3,40 +3,23 @@
|
|
3
3
|
#
|
4
4
|
# Bind xml and convert to object
|
5
5
|
#
|
6
|
-
class Spidy::Binder::Xml
|
7
|
-
|
8
|
-
|
6
|
+
class Spidy::Binder::Xml < Spidy::Binder::Base
|
7
|
+
def self.let(name, query = nil, &block)
|
8
|
+
@attribute_names ||= []
|
9
|
+
@attribute_names << name
|
9
10
|
|
10
|
-
|
11
|
-
@attribute_names ||= []
|
12
|
-
@attribute_names << name
|
13
|
-
define_method(name) do
|
14
|
-
return xml.at(query)&.text if block.nil?
|
15
|
-
return instance_exec(&block) if query.blank?
|
11
|
+
return define_method(name) { xml.at(query)&.text } if block.nil?
|
16
12
|
|
13
|
+
define_method(name) do
|
14
|
+
if query.present?
|
17
15
|
instance_exec(xml.at(query), &block)
|
16
|
+
else
|
17
|
+
instance_exec(&block)
|
18
18
|
end
|
19
|
+
rescue StandardError => e
|
20
|
+
fail Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
22
|
-
|
23
|
-
alias_method :resource, :xml
|
24
|
-
|
25
|
-
def initialize(spidy, xml, url)
|
26
|
-
@spidy = spidy
|
27
|
-
@xml = xml
|
28
|
-
@url = url
|
29
|
-
end
|
30
|
-
|
31
|
-
def scraper(name, source)
|
32
|
-
lambda { |&block| @spidy.call(source, name: name, &block) }
|
33
|
-
end
|
34
|
-
|
35
|
-
def to_s
|
36
|
-
to_h.to_json
|
37
|
-
end
|
38
|
-
|
39
|
-
def to_h
|
40
|
-
self.class.attribute_names.map { |name| [name, send(name)] }.to_h
|
41
|
-
end
|
24
|
+
alias_method :xml, :resource
|
42
25
|
end
|
data/lib/spidy/connector.rb
CHANGED
@@ -10,6 +10,24 @@ module Spidy::Connector
|
|
10
10
|
autoload :Json
|
11
11
|
autoload :Xml
|
12
12
|
|
13
|
+
#
|
14
|
+
# retry class
|
15
|
+
#
|
16
|
+
class Retry < StandardError
|
17
|
+
attr_reader :page
|
18
|
+
attr_reader :response_code
|
19
|
+
attr_reader :wait_time
|
20
|
+
|
21
|
+
def initialize(wait_time: 2, page: nil, error: nil)
|
22
|
+
@page = page
|
23
|
+
@wait_time = wait_time
|
24
|
+
@response_code = error.try(:response_code) || page.try(:response_code)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
#
|
29
|
+
# default user agent
|
30
|
+
#
|
13
31
|
USER_AGENT = [
|
14
32
|
'Mozilla/5.0',
|
15
33
|
'(Macintosh; Intel Mac OS X 10_12_6)',
|
@@ -19,9 +37,19 @@ module Spidy::Connector
|
|
19
37
|
'Safari/537.36'
|
20
38
|
].join(' ')
|
21
39
|
|
22
|
-
|
23
|
-
|
40
|
+
#
|
41
|
+
# get connection handller
|
42
|
+
#
|
43
|
+
def self.get(value, wait_time: nil, user_agent: nil, socks_proxy: nil)
|
44
|
+
return value if value.respond_to?(:call)
|
45
|
+
|
46
|
+
connector = const_get(value.to_s.classify).new(wait_time: wait_time || 5, user_agent: user_agent || USER_AGENT)
|
47
|
+
return connector if socks_proxy.nil?
|
24
48
|
|
25
|
-
|
49
|
+
lambda do |url, &block|
|
50
|
+
Socksify::proxy(socks_proxy[:host], socks_proxy[:port]) do
|
51
|
+
connector.call(url, &block)
|
52
|
+
end
|
53
|
+
end
|
26
54
|
end
|
27
55
|
end
|
@@ -3,8 +3,11 @@
|
|
3
3
|
#
|
4
4
|
# Direct resource ( not network resource )
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Direct
|
7
|
+
def call(resource, &yielder)
|
8
8
|
yielder.call(resource)
|
9
9
|
end
|
10
|
+
|
11
|
+
def initialize(wait_time: nil, user_agent: nil)
|
12
|
+
end
|
10
13
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -3,73 +3,60 @@
|
|
3
3
|
#
|
4
4
|
# Mechanize wrapper
|
5
5
|
#
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
attr_reader :wait_time
|
14
|
-
|
15
|
-
def initialize(wait_time: 2, page: nil, error: nil)
|
16
|
-
@page = page
|
17
|
-
@wait_time = wait_time
|
18
|
-
@response_code = error.try(:response_code) || page.try(:response_code)
|
19
|
-
end
|
6
|
+
class Spidy::Connector::Html
|
7
|
+
def initialize(wait_time:, user_agent:, logger: nil)
|
8
|
+
@wait_time = wait_time
|
9
|
+
@logger = logger || proc { |values| STDERR.puts(values.to_json) }
|
10
|
+
@agent = Mechanize.new
|
11
|
+
@user_agent = user_agent
|
12
|
+
@agent.user_agent = user_agent
|
20
13
|
end
|
21
14
|
|
22
|
-
|
23
|
-
|
24
|
-
@agent.user_agent = Spidy::Connector::USER_AGENT
|
25
|
-
|
26
|
-
class << self
|
27
|
-
attr_reader :agent
|
28
|
-
attr_accessor :logger
|
15
|
+
attr_reader :agent
|
16
|
+
attr_reader :logger
|
29
17
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
37
|
-
get(url, retry_count, yielder)
|
18
|
+
def call(url, encoding: nil, retry_count: 5, &yielder)
|
19
|
+
fail 'url is not specified' if url.blank?
|
20
|
+
if encoding
|
21
|
+
agent.default_encoding = encoding
|
22
|
+
agent.force_default_encoding = true
|
38
23
|
end
|
24
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
25
|
+
get(url, retry_count, yielder)
|
26
|
+
end
|
39
27
|
|
40
|
-
|
28
|
+
private
|
41
29
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
30
|
+
def get(url, retry_count, yielder)
|
31
|
+
connect(url, retry_count, yielder)
|
32
|
+
rescue Spidy::Connector::Retry => e
|
33
|
+
logger.call('retry.accessed': Time.current,
|
34
|
+
'retry.uri': url,
|
35
|
+
'retry.response_code': e.response_code,
|
36
|
+
'retry.rest_count': retry_count)
|
49
37
|
|
50
|
-
|
51
|
-
|
38
|
+
@agent = Mechanize.new
|
39
|
+
@agent.user_agent = @user_agent
|
52
40
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
raise e
|
41
|
+
retry_count -= 1
|
42
|
+
if retry_count.positive?
|
43
|
+
sleep e.wait_time
|
44
|
+
retry
|
59
45
|
end
|
46
|
+
raise e
|
47
|
+
end
|
60
48
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
49
|
+
def connect(url, retry_count, yielder)
|
50
|
+
result = nil
|
51
|
+
agent.get(url) do |page|
|
52
|
+
fail Spidy::Connector::Retry, page: page, wait_time: @wait_time if page.title == 'Sorry, unable to access page...'
|
65
53
|
|
66
|
-
|
67
|
-
end
|
68
|
-
result
|
69
|
-
rescue Mechanize::ResponseCodeError => e
|
70
|
-
raise Retry, error: e if e.response_code == '429'
|
71
|
-
raise e
|
54
|
+
result = yielder.call(page)
|
72
55
|
end
|
73
|
-
|
56
|
+
result
|
57
|
+
rescue Mechanize::ResponseCodeError => e
|
58
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '429'
|
59
|
+
raise Spidy::Connector::Retry, error: e, wait_time: @wait_time if e.response_code == '502'
|
60
|
+
raise e
|
74
61
|
end
|
75
62
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -3,9 +3,17 @@
|
|
3
3
|
#
|
4
4
|
# OpenURI to JSON.parse
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Json
|
7
|
+
def initialize(wait_time: nil, user_agent: nil)
|
8
|
+
@user_agent = user_agent
|
9
|
+
end
|
10
|
+
|
11
|
+
def call(url, &block)
|
8
12
|
fail 'url is not specified' if url.blank?
|
9
|
-
|
13
|
+
connect(url, &block)
|
14
|
+
end
|
15
|
+
|
16
|
+
def connect(url)
|
17
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
10
18
|
end
|
11
19
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -3,12 +3,16 @@
|
|
3
3
|
#
|
4
4
|
# xml
|
5
5
|
#
|
6
|
-
|
7
|
-
def
|
6
|
+
class Spidy::Connector::Xml
|
7
|
+
def call(url)
|
8
8
|
fail 'URL is undefined' if url.blank?
|
9
9
|
|
10
|
-
OpenURI.open_uri(url, "User-Agent" =>
|
10
|
+
OpenURI.open_uri(url, "User-Agent" => @user_agent) do |body|
|
11
11
|
yield Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
12
12
|
end
|
13
13
|
end
|
14
|
+
|
15
|
+
def initialize(wait_time: nil, user_agent: nil)
|
16
|
+
@user_agent = user_agent
|
17
|
+
end
|
14
18
|
end
|
data/lib/spidy/console.rb
CHANGED
data/lib/spidy/definition.rb
CHANGED
@@ -16,6 +16,22 @@ module Spidy::Definition
|
|
16
16
|
spidy.call(source, &yielder)
|
17
17
|
end
|
18
18
|
|
19
|
+
def user_agent(user_agent)
|
20
|
+
@user_agent = user_agent
|
21
|
+
end
|
22
|
+
|
23
|
+
def wait_time(wait_time)
|
24
|
+
@wait_time = wait_time
|
25
|
+
end
|
26
|
+
|
27
|
+
def socks_proxy(host, port)
|
28
|
+
@socks_proxy = { host: host, port: port }
|
29
|
+
end
|
30
|
+
|
31
|
+
def tor?
|
32
|
+
Tor.running?
|
33
|
+
end
|
34
|
+
|
19
35
|
def each(source = nil, name: :default, &yielder)
|
20
36
|
name = name.presence || :default
|
21
37
|
spidy = @namespace[:"#{name}_spider"]
|
@@ -26,7 +42,7 @@ module Spidy::Definition
|
|
26
42
|
|
27
43
|
def spider(name = :default, connector: nil, as: nil, &define_block)
|
28
44
|
@namespace ||= {}
|
29
|
-
connector = Spidy::Connector.get(connector || as)
|
45
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent, socks_proxy: @socks_proxy)
|
30
46
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
31
47
|
define_block.call(yielder, connector, source)
|
32
48
|
end
|
@@ -34,18 +50,18 @@ module Spidy::Definition
|
|
34
50
|
|
35
51
|
def define(name = :default, connector: nil, binder: nil, as: nil, &define_block)
|
36
52
|
@namespace ||= {}
|
37
|
-
connector = Spidy::Connector.get(connector || as)
|
53
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent)
|
38
54
|
binder = Spidy::Binder.get(self, binder || as)
|
39
|
-
@namespace[:"#{name}_scraper"] = define_proc(connector, binder, define_block)
|
55
|
+
@namespace[:"#{name}_scraper"] = define_proc(name, connector, binder, define_block)
|
40
56
|
end
|
41
57
|
|
42
58
|
private
|
43
59
|
|
44
|
-
def define_proc(connector, binder, define_block)
|
60
|
+
def define_proc(name, connector, binder, define_block)
|
45
61
|
proc do |source, &yielder|
|
46
62
|
yielder = lambda { |result| break result } if yielder.nil?
|
47
63
|
connection_yielder = lambda do |page|
|
48
|
-
binder.call(page, url: source, define: define_block) { |object| yielder.call(object) }
|
64
|
+
binder.call(page, url: source, define: define_block, define_name: name) { |object| yielder.call(object) }
|
49
65
|
end
|
50
66
|
connector.call(source, &connection_yielder)
|
51
67
|
end
|
data/lib/spidy/version.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -31,7 +31,9 @@ Gem::Specification.new do |spec|
|
|
31
31
|
spec.add_development_dependency 'ffaker'
|
32
32
|
spec.add_development_dependency 'rspec-command'
|
33
33
|
|
34
|
+
spec.add_runtime_dependency 'tor'
|
34
35
|
spec.add_runtime_dependency 'activesupport'
|
35
36
|
spec.add_runtime_dependency 'mechanize'
|
37
|
+
spec.add_runtime_dependency 'socksify'
|
36
38
|
spec.add_runtime_dependency 'pry'
|
37
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: tor
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: activesupport
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,6 +136,20 @@ dependencies:
|
|
122
136
|
- - ">="
|
123
137
|
- !ruby/object:Gem::Version
|
124
138
|
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: socksify
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
- !ruby/object:Gem::Dependency
|
126
154
|
name: pry
|
127
155
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,7 +164,7 @@ dependencies:
|
|
136
164
|
- - ">="
|
137
165
|
- !ruby/object:Gem::Version
|
138
166
|
version: '0'
|
139
|
-
description:
|
167
|
+
description:
|
140
168
|
email:
|
141
169
|
- aileron.cc@gmail.com
|
142
170
|
executables:
|
@@ -174,7 +202,6 @@ files:
|
|
174
202
|
- lib/spidy/console.rb
|
175
203
|
- lib/spidy/definition.rb
|
176
204
|
- lib/spidy/definition_file.rb
|
177
|
-
- lib/spidy/interface.rb
|
178
205
|
- lib/spidy/shell.rb
|
179
206
|
- lib/spidy/spider.rb
|
180
207
|
- lib/spidy/version.rb
|
@@ -184,7 +211,7 @@ homepage: https://github.com/aileron-inc/spidy
|
|
184
211
|
licenses:
|
185
212
|
- MIT
|
186
213
|
metadata: {}
|
187
|
-
post_install_message:
|
214
|
+
post_install_message:
|
188
215
|
rdoc_options: []
|
189
216
|
require_paths:
|
190
217
|
- lib
|
@@ -199,8 +226,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
199
226
|
- !ruby/object:Gem::Version
|
200
227
|
version: '0'
|
201
228
|
requirements: []
|
202
|
-
rubygems_version: 3.
|
203
|
-
signing_key:
|
229
|
+
rubygems_version: 3.0.3
|
230
|
+
signing_key:
|
204
231
|
specification_version: 4
|
205
232
|
summary: web spider dsl
|
206
233
|
test_files: []
|