spidy 0.3.9 → 0.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +7 -3
- data/Gemfile.lock +4 -6
- data/example/master_detail.rb +23 -23
- data/example/proxy.rb +2 -0
- data/example/retry.rb +2 -0
- data/example/wikip.rb +2 -5
- data/exe/spidy +3 -3
- data/lib/spidy/binder/error.rb +4 -0
- data/lib/spidy/binder/html.rb +2 -1
- data/lib/spidy/binder/json.rb +2 -1
- data/lib/spidy/binder/xml.rb +2 -1
- data/lib/spidy/binder.rb +1 -0
- data/lib/spidy/command_line.rb +37 -43
- data/lib/spidy/connector/direct.rb +2 -3
- data/lib/spidy/connector/html.rb +9 -7
- data/lib/spidy/connector/json.rb +3 -3
- data/lib/spidy/connector/xml.rb +2 -2
- data/lib/spidy/connector.rb +18 -16
- data/lib/spidy/definition.rb +13 -5
- data/lib/spidy/definition_file.rb +2 -3
- data/lib/spidy/{define_object.rb → definition_object.rb} +7 -2
- data/lib/spidy/version.rb +1 -1
- data/lib/spidy.rb +1 -1
- data/spidy.gemspec +7 -4
- metadata +32 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76cb60ea985d1a663f24b7b024198d222756376bd9dd979a032c46ba39b16548
|
4
|
+
data.tar.gz: ff2e7f056f7ad5afe06df90adf0bb2e438c696472cde50c8d5758b2f9801684e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a721848978135752ddcfe3da30a293317a4852b41dc99209019ae71960538fe448ec4ad54da661e0b99edef3fcb85a84b095b99ddbbba9b628fdd4ac1be2f23c
|
7
|
+
data.tar.gz: a156f47f317cd4f1f0a66a13ac5102073723f139c5b797c8dc56d7dbdd41e342cb1ad1a6814812563033c68f448c4d57775c0edf300456dd164b90211632737e
|
data/.rubocop.yml
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
inherit_from: .rubocop_todo.yml
|
2
2
|
AllCops:
|
3
|
+
TargetRubyVersion: 3.0.2
|
4
|
+
NewCops: enable
|
3
5
|
DisplayCopNames: true
|
4
|
-
TargetRubyVersion: 2.6
|
5
6
|
|
6
7
|
Style/ClassAndModuleChildren:
|
7
8
|
Enabled: false
|
@@ -9,7 +10,7 @@ Style/ClassAndModuleChildren:
|
|
9
10
|
Style/SignalException:
|
10
11
|
EnforcedStyle: semantic
|
11
12
|
|
12
|
-
Naming/
|
13
|
+
Naming/MethodParameterName:
|
13
14
|
AllowedNames:
|
14
15
|
- as
|
15
16
|
|
@@ -17,8 +18,11 @@ Metrics/AbcSize:
|
|
17
18
|
Max: 21
|
18
19
|
Exclude:
|
19
20
|
|
21
|
+
Metrics/MethodLength:
|
22
|
+
Max: 15
|
23
|
+
|
20
24
|
Metrics/LineLength:
|
21
|
-
Max:
|
25
|
+
Max: 130
|
22
26
|
|
23
27
|
Metrics/BlockLength:
|
24
28
|
Max: 120
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.3.
|
4
|
+
spidy (0.3.10)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
@@ -32,7 +32,7 @@ GEM
|
|
32
32
|
coderay (1.1.3)
|
33
33
|
concurrent-ruby (1.1.9)
|
34
34
|
connection_pool (2.2.5)
|
35
|
-
diff-lcs (1.
|
35
|
+
diff-lcs (1.5.0)
|
36
36
|
domain_name (0.5.20190701)
|
37
37
|
unf (>= 0.0.5, < 1.0.0)
|
38
38
|
ffaker (2.20.0)
|
@@ -57,7 +57,6 @@ GEM
|
|
57
57
|
mime-types-data (~> 3.2015)
|
58
58
|
mime-types-data (3.2021.1115)
|
59
59
|
mini_mime (1.1.2)
|
60
|
-
mini_portile2 (2.6.1)
|
61
60
|
minitest (5.15.0)
|
62
61
|
mixlib-shellout (2.4.4)
|
63
62
|
mustermann (1.1.1)
|
@@ -65,8 +64,7 @@ GEM
|
|
65
64
|
net-http-digest_auth (1.4.1)
|
66
65
|
net-http-persistent (4.0.1)
|
67
66
|
connection_pool (~> 2.2)
|
68
|
-
nokogiri (1.12.5)
|
69
|
-
mini_portile2 (~> 2.6.1)
|
67
|
+
nokogiri (1.12.5-arm64-darwin)
|
70
68
|
racc (~> 1.4)
|
71
69
|
pry (0.14.1)
|
72
70
|
coderay (~> 1.1)
|
@@ -121,7 +119,7 @@ GEM
|
|
121
119
|
nokogiri (~> 1.8)
|
122
120
|
|
123
121
|
PLATFORMS
|
124
|
-
|
122
|
+
arm64-darwin-20
|
125
123
|
|
126
124
|
DEPENDENCIES
|
127
125
|
bundler (~> 2.0)
|
data/example/master_detail.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
Spidy.define do
|
4
|
-
url_to_params =
|
4
|
+
url_to_params = lambda { |url|
|
5
5
|
uri = URI.parse(url)
|
6
6
|
params = URI.decode_www_form(uri.query).to_h if uri.query.present?
|
7
7
|
params if params.present?
|
@@ -13,41 +13,41 @@ Spidy.define do
|
|
13
13
|
|
14
14
|
limit_page = 3
|
15
15
|
per_page = 25
|
16
|
-
yielder.call(Nokogiri::HTML::Builder.new
|
17
|
-
doc.html
|
18
|
-
doc.body
|
19
|
-
doc.span.bold
|
20
|
-
doc.text
|
21
|
-
|
22
|
-
doc.main
|
23
|
-
(page * per_page + 1).upto((page + 1) * per_page).each do |i|
|
16
|
+
yielder.call(Nokogiri::HTML::Builder.new do |doc|
|
17
|
+
doc.html do
|
18
|
+
doc.body do
|
19
|
+
doc.span.bold do
|
20
|
+
doc.text 'Hello world'
|
21
|
+
end
|
22
|
+
doc.main do
|
23
|
+
((page * per_page) + 1).upto((page + 1) * per_page).each do |i|
|
24
24
|
doc.a("page #{i}", href: "http://localhost/?id=#{i}")
|
25
25
|
end
|
26
|
-
|
26
|
+
end
|
27
27
|
doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end.doc)
|
31
31
|
}
|
32
32
|
|
33
33
|
detail_page = proc { |url, &yielder|
|
34
34
|
params = url_to_params.call(url)
|
35
35
|
id = params['id']
|
36
36
|
|
37
|
-
yielder.call(Nokogiri::HTML::Builder.new
|
38
|
-
doc.html
|
39
|
-
doc.body
|
40
|
-
doc.span.bold
|
41
|
-
doc.text
|
42
|
-
|
37
|
+
yielder.call(Nokogiri::HTML::Builder.new do |doc|
|
38
|
+
doc.html do
|
39
|
+
doc.body do
|
40
|
+
doc.span.bold do
|
41
|
+
doc.text 'Hello world'
|
42
|
+
end
|
43
43
|
doc.h1("title_#{id}", id: 'title')
|
44
44
|
doc.main("body_#{id}", id: 'body')
|
45
45
|
doc.div.sub do
|
46
46
|
doc.span.name('testtest')
|
47
47
|
end
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end.doc)
|
51
51
|
}
|
52
52
|
|
53
53
|
define(as: :html, connector: detail_page) do
|
data/example/proxy.rb
CHANGED
data/example/retry.rb
CHANGED
data/example/wikip.rb
CHANGED
@@ -11,11 +11,8 @@ Spidy.define do
|
|
11
11
|
|
12
12
|
define(:infobox, as: :html, connector: :direct) do
|
13
13
|
let(:columns) do
|
14
|
-
html.search('tr').
|
15
|
-
{
|
16
|
-
name: tr.at('th')&.text,
|
17
|
-
value: tr.at('td')&.text
|
18
|
-
}
|
14
|
+
html.search('tr').map do |tr|
|
15
|
+
{ name: tr.at('th')&.text, value: tr.at('td')&.text }
|
19
16
|
end
|
20
17
|
end
|
21
18
|
end
|
data/exe/spidy
CHANGED
@@ -6,10 +6,10 @@ require 'pry'
|
|
6
6
|
|
7
7
|
if ARGV[1].blank?
|
8
8
|
case ARGV[0]
|
9
|
-
when 'version' then
|
9
|
+
when 'version' then $stdout.puts(Spidy::VERSION)
|
10
10
|
when 'console' then Spidy.shell.interactive
|
11
11
|
else
|
12
|
-
|
12
|
+
$stdout.puts 'usage: spidy [version console]'
|
13
13
|
end
|
14
14
|
else
|
15
15
|
case ARGV[0]
|
@@ -19,6 +19,6 @@ else
|
|
19
19
|
when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
|
20
20
|
when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
|
21
21
|
else
|
22
|
-
|
22
|
+
$stdout.puts 'usage: spidy [console function call each run] [file]'
|
23
23
|
end
|
24
24
|
end
|
data/lib/spidy/binder/html.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Html
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :html, :resource
|
25
26
|
end
|
data/lib/spidy/binder/json.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Json
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :json, :resource
|
25
26
|
end
|
data/lib/spidy/binder/xml.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Xml
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :xml, :resource
|
25
26
|
end
|
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/command_line.rb
CHANGED
@@ -5,8 +5,10 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::CommandLine
|
7
7
|
delegate :spidy, to: :@definition_file
|
8
|
-
class_attribute :output, default: (proc { |result|
|
9
|
-
class_attribute :error_handler, default: (proc { |e, url|
|
8
|
+
class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
|
9
|
+
class_attribute :error_handler, default: (proc { |e, url|
|
10
|
+
warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
|
11
|
+
})
|
10
12
|
|
11
13
|
def eval_call(script)
|
12
14
|
@definition_file.spidy.instance_eval(script)
|
@@ -14,40 +16,36 @@ class Spidy::CommandLine
|
|
14
16
|
|
15
17
|
def initialize(definition_file)
|
16
18
|
@definition_file = definition_file
|
17
|
-
|
19
|
+
fail 'unloaded spidy' if definition_file.spidy.nil?
|
18
20
|
end
|
19
21
|
|
20
22
|
def each_stdin_lines(name)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
error_handler.call(e, url)
|
26
|
-
end
|
23
|
+
$stdin.each_line do |url|
|
24
|
+
spidy.each(url.strip, name: name, &output)
|
25
|
+
rescue StandardError => e
|
26
|
+
error_handler.call(e, url)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
def call_stdin_lines(name)
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
error_handler.call(e, url)
|
36
|
-
end
|
31
|
+
$stdin.each_line do |url|
|
32
|
+
spidy.call(url.strip, name: name, &output)
|
33
|
+
rescue StandardError => e
|
34
|
+
error_handler.call(e, url)
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
40
38
|
def call(name)
|
41
|
-
return call_stdin_lines(name) if FileTest.pipe?(
|
42
|
-
spidy.call(name: name, &output) unless FileTest.pipe?(
|
43
|
-
rescue => e
|
39
|
+
return call_stdin_lines(name) if FileTest.pipe?($stdin)
|
40
|
+
spidy.call(name: name, &output) unless FileTest.pipe?($stdin)
|
41
|
+
rescue StandardError => e
|
44
42
|
error_handler.call(e, nil)
|
45
43
|
end
|
46
44
|
|
47
45
|
def each(name)
|
48
|
-
return each_stdin_lines(name) if FileTest.pipe?(
|
46
|
+
return each_stdin_lines(name) if FileTest.pipe?($stdin)
|
49
47
|
spidy.each(name: name, &output)
|
50
|
-
rescue => e
|
48
|
+
rescue StandardError => e
|
51
49
|
error_handler.call(e, nil)
|
52
50
|
end
|
53
51
|
|
@@ -63,36 +61,32 @@ class Spidy::CommandLine
|
|
63
61
|
end
|
64
62
|
|
65
63
|
def build(name)
|
66
|
-
|
67
|
-
|
64
|
+
File.write("#{name}.sh", build_shell_script(name))
|
65
|
+
File.write("#{name}.rb", build_ruby_script)
|
68
66
|
end
|
69
67
|
|
70
68
|
def build_shell(name)
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
SHELL
|
77
|
-
end
|
69
|
+
<<~SHELL
|
70
|
+
#!/bin/bash
|
71
|
+
eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
|
72
|
+
spider
|
73
|
+
SHELL
|
78
74
|
end
|
79
75
|
|
80
|
-
def build_ruby
|
81
|
-
|
82
|
-
|
83
|
-
# frozen_string_literal: true
|
76
|
+
def build_ruby
|
77
|
+
<<~RUBY
|
78
|
+
# frozen_string_literal: true
|
84
79
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
80
|
+
Spidy.define do
|
81
|
+
spider(as: :html) do |yielder, connector|
|
82
|
+
# connector.call(url) do |resource|
|
83
|
+
# yielder.call(url or resource)
|
84
|
+
# end
|
85
|
+
end
|
91
86
|
|
92
|
-
|
93
|
-
end
|
87
|
+
define(as: :html) do
|
94
88
|
end
|
95
|
-
|
96
|
-
|
89
|
+
end
|
90
|
+
RUBY
|
97
91
|
end
|
98
92
|
end
|
data/lib/spidy/connector/direct.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Direct resource ( not network resource )
|
5
5
|
#
|
6
6
|
class Spidy::Connector::Direct
|
7
|
-
def call(resource
|
7
|
+
def call(resource)
|
8
8
|
if block_given?
|
9
9
|
yield resource
|
10
10
|
else
|
@@ -12,6 +12,5 @@ class Spidy::Connector::Direct
|
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
|
-
def initialize(user_agent:)
|
16
|
-
end
|
15
|
+
def initialize(user_agent:); end
|
17
16
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -14,13 +14,13 @@ class Spidy::Connector::Html
|
|
14
14
|
|
15
15
|
attr_reader :agent
|
16
16
|
|
17
|
-
def call(url, encoding: nil,
|
17
|
+
def call(url, encoding: nil, &yielder)
|
18
18
|
fail 'url is not specified' if url.blank?
|
19
19
|
if encoding
|
20
20
|
agent.default_encoding = encoding
|
21
21
|
agent.force_default_encoding = true
|
22
22
|
end
|
23
|
-
connect(url,
|
23
|
+
connect(url, yielder)
|
24
24
|
end
|
25
25
|
|
26
26
|
def refresh!
|
@@ -30,17 +30,19 @@ class Spidy::Connector::Html
|
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
|
-
def connect(url,
|
33
|
+
def connect(url, yielder)
|
34
34
|
result = nil
|
35
35
|
agent.get(url) do |page|
|
36
|
-
|
36
|
+
if page.title == 'Sorry, unable to access page...'
|
37
|
+
fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
|
38
|
+
end
|
37
39
|
|
38
40
|
result = yielder.call(page)
|
39
41
|
end
|
40
42
|
result
|
41
43
|
rescue Mechanize::ResponseCodeError => e
|
42
|
-
raise Spidy::Connector::Retry
|
43
|
-
raise Spidy::Connector::Retry
|
44
|
-
raise Spidy::Connector::Retry
|
44
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '429'
|
45
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '502'
|
46
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code))
|
45
47
|
end
|
46
48
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -17,9 +17,9 @@ class Spidy::Connector::Json
|
|
17
17
|
connect(url, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def connect(url
|
21
|
-
OpenURI.open_uri(url,
|
20
|
+
def connect(url)
|
21
|
+
OpenURI.open_uri(url, 'User-Agent' => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
22
22
|
rescue OpenURI::HTTPError => e
|
23
|
-
raise Spidy::Connector::Retry
|
23
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
|
24
24
|
end
|
25
25
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -13,11 +13,11 @@ class Spidy::Connector::Xml
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def connect(url, &block)
|
16
|
-
OpenURI.open_uri(url,
|
16
|
+
OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
|
17
17
|
block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
18
18
|
end
|
19
19
|
rescue OpenURI::HTTPError => e
|
20
|
-
raise Spidy::Connector::Retry
|
20
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
|
21
21
|
end
|
22
22
|
|
23
23
|
def initialize(user_agent:)
|
data/lib/spidy/connector.rb
CHANGED
@@ -27,7 +27,7 @@ module Spidy::Connector
|
|
27
27
|
#
|
28
28
|
# error output logger
|
29
29
|
#
|
30
|
-
DEFAULT_LOGGER = proc { |values|
|
30
|
+
DEFAULT_LOGGER = proc { |values| warn(values.to_json) }
|
31
31
|
|
32
32
|
#
|
33
33
|
# static method
|
@@ -36,7 +36,9 @@ module Spidy::Connector
|
|
36
36
|
extend ActiveSupport::Concern
|
37
37
|
class_methods do
|
38
38
|
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
|
39
|
-
::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
|
39
|
+
::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
|
40
|
+
url, &block
|
41
|
+
)
|
40
42
|
end
|
41
43
|
end
|
42
44
|
end
|
@@ -51,6 +53,7 @@ module Spidy::Connector
|
|
51
53
|
@object = object
|
52
54
|
@response_code = response_code
|
53
55
|
@error = error
|
56
|
+
super(error)
|
54
57
|
end
|
55
58
|
end
|
56
59
|
|
@@ -58,13 +61,13 @@ module Spidy::Connector
|
|
58
61
|
# retry
|
59
62
|
#
|
60
63
|
class RetryableCaller
|
61
|
-
attr_reader :origin_connector
|
64
|
+
attr_reader :origin_connector, :logger, :wait_time
|
62
65
|
|
63
|
-
def initialize(connector, logger:, wait_time:)
|
66
|
+
def initialize(connector, logger:, wait_time:, retry_attempt_count: 5)
|
64
67
|
@origin_connector = connector
|
65
68
|
@logger = logger
|
66
69
|
@wait_time = wait_time
|
67
|
-
@retry_attempt_count =
|
70
|
+
@retry_attempt_count = retry_attempt_count
|
68
71
|
end
|
69
72
|
|
70
73
|
def call(url, &block)
|
@@ -73,18 +76,18 @@ module Spidy::Connector
|
|
73
76
|
end
|
74
77
|
|
75
78
|
def connect(url, retry_attempt_count: @retry_attempt_count, &block)
|
76
|
-
|
77
|
-
|
79
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
80
|
+
origin_connector.call(url, &block)
|
78
81
|
rescue Spidy::Connector::Retry => e
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
82
|
+
logger.call('retry.accessed': Time.current,
|
83
|
+
'retry.uri': url,
|
84
|
+
'retry.response_code': e.response_code,
|
85
|
+
'retry.attempt_count': retry_attempt_count)
|
83
86
|
|
84
87
|
retry_attempt_count -= 1
|
85
88
|
if retry_attempt_count.positive?
|
86
|
-
sleep
|
87
|
-
|
89
|
+
sleep wait_time
|
90
|
+
origin_connector.refresh! if origin_connector.respond_to?(:refresh!)
|
88
91
|
retry
|
89
92
|
end
|
90
93
|
raise e.error
|
@@ -103,7 +106,7 @@ module Spidy::Connector
|
|
103
106
|
end
|
104
107
|
|
105
108
|
def call(url, &block)
|
106
|
-
Socksify
|
109
|
+
Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
|
107
110
|
connector.call(url, &block)
|
108
111
|
end
|
109
112
|
end
|
@@ -141,7 +144,6 @@ module Spidy::Connector
|
|
141
144
|
fail "Not defined connnector[#{value}]" if connector.nil?
|
142
145
|
return connector if socks_proxy.nil?
|
143
146
|
|
144
|
-
|
145
|
-
tor
|
147
|
+
TorConnector.new(connector, socks_proxy)
|
146
148
|
end
|
147
149
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -33,26 +33,34 @@ module Spidy::Definition
|
|
33
33
|
spidy = @namespace[:"#{name}_spider"]
|
34
34
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
35
35
|
|
36
|
-
|
36
|
+
if yielder
|
37
|
+
spidy.call(source, &yielder)
|
38
|
+
else
|
39
|
+
Enumerator.new do |enumerate_yielder|
|
40
|
+
spidy.call(source, &enumerate_yielder)
|
41
|
+
end
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
def spider(name = :default, connector: nil, as: nil, &define_block)
|
40
46
|
@namespace ||= {}
|
41
|
-
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
47
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
48
|
+
socks_proxy: @socks_proxy)
|
42
49
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
43
50
|
define_block.call(yielder, connector, source)
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
54
|
def define(name = :default, connector: nil, as: nil, &define_block)
|
48
|
-
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
55
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
56
|
+
socks_proxy: @socks_proxy)
|
49
57
|
binder_base = Spidy::Binder.const_get(as.to_s.classify)
|
50
58
|
@namespace ||= {}
|
51
|
-
@namespace[:"#{name}_scraper"] = Class.new(Spidy::
|
59
|
+
@namespace[:"#{name}_scraper"] = Class.new(Spidy::DefinitionObject) do
|
52
60
|
extend binder_base
|
53
61
|
class_eval(&define_block)
|
54
62
|
define_singleton_method(:call) do |source, &yielder|
|
55
|
-
yielder =
|
63
|
+
yielder = ->(result) { break result } if yielder.nil?
|
56
64
|
connection_yielder = lambda do |page|
|
57
65
|
yielder.call(new(page, source))
|
58
66
|
end
|
data/lib/spidy/definition_file.rb
CHANGED
@@ -4,8 +4,7 @@
|
|
4
4
|
# spidy interface binding
|
5
5
|
#
|
6
6
|
class Spidy::DefinitionFile
|
7
|
-
attr_reader :path
|
8
|
-
attr_reader :spidy
|
7
|
+
attr_reader :path, :spidy
|
9
8
|
|
10
9
|
def self.open(filepath)
|
11
10
|
object = new(filepath)
|
@@ -15,7 +14,7 @@ class Spidy::DefinitionFile
|
|
15
14
|
|
16
15
|
# rubocop:disable Security/Eval
|
17
16
|
def eval_definition
|
18
|
-
@spidy = eval(File.
|
17
|
+
@spidy = eval(File.read(path)) if path
|
19
18
|
end
|
20
19
|
# rubocop:enable Security/Eval
|
21
20
|
|
data/lib/spidy/{define_object.rb → definition_object.rb}
@@ -1,4 +1,9 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# An object that represents the scraper defined by define block.
|
5
|
+
#
|
6
|
+
class Spidy::DefinitionObject
|
2
7
|
class << self
|
3
8
|
attr_reader :attribute_names
|
4
9
|
end
|
@@ -14,6 +19,6 @@ class Spidy::DefineObject
|
|
14
19
|
end
|
15
20
|
|
16
21
|
def to_h
|
17
|
-
self.class.attribute_names.
|
22
|
+
self.class.attribute_names.to_h { |name| [name, send(name)] }
|
18
23
|
end
|
19
24
|
end
|
data/lib/spidy/version.rb
CHANGED
data/lib/spidy.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -25,17 +25,20 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.require_paths = ['lib']
|
26
26
|
|
27
27
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
28
|
+
spec.add_development_dependency 'capybara_discoball'
|
29
|
+
spec.add_development_dependency 'ffaker'
|
28
30
|
spec.add_development_dependency 'pry'
|
29
31
|
spec.add_development_dependency 'rake', '~> 13.0'
|
30
32
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
31
|
-
spec.add_development_dependency 'ffaker'
|
32
33
|
spec.add_development_dependency 'rspec-command'
|
33
|
-
spec.add_development_dependency 'capybara_discoball'
|
34
34
|
spec.add_development_dependency 'sinatra'
|
35
35
|
|
36
|
-
spec.add_runtime_dependency 'tor'
|
37
36
|
spec.add_runtime_dependency 'activesupport'
|
38
37
|
spec.add_runtime_dependency 'mechanize'
|
39
|
-
spec.add_runtime_dependency 'socksify'
|
40
38
|
spec.add_runtime_dependency 'pry'
|
39
|
+
spec.add_runtime_dependency 'socksify'
|
40
|
+
spec.add_runtime_dependency 'tor'
|
41
|
+
spec.metadata = {
|
42
|
+
'rubygems_mfa_required' => 'true'
|
43
|
+
}
|
41
44
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.9
|
4
|
+
version: 0.3.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: capybara_discoball
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,63 +39,63 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: ffaker
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: pry
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
75
|
+
version: '13.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
82
|
+
version: '13.0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name: rspec
|
84
|
+
name: rspec
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
89
|
+
version: '3.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
96
|
+
version: '3.0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec-command
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - ">="
|
@@ -123,7 +123,7 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: activesupport
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
129
|
- - ">="
|
@@ -137,7 +137,7 @@ dependencies:
|
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
140
|
+
name: mechanize
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
142
142
|
requirements:
|
143
143
|
- - ">="
|
@@ -151,7 +151,7 @@ dependencies:
|
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '0'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
154
|
+
name: pry
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
157
|
- - ">="
|
@@ -179,7 +179,7 @@ dependencies:
|
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: '0'
|
181
181
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
182
|
+
name: tor
|
183
183
|
requirement: !ruby/object:Gem::Requirement
|
184
184
|
requirements:
|
185
185
|
- - ">="
|
@@ -222,6 +222,7 @@ files:
|
|
222
222
|
- exe/spidy
|
223
223
|
- lib/spidy.rb
|
224
224
|
- lib/spidy/binder.rb
|
225
|
+
- lib/spidy/binder/error.rb
|
225
226
|
- lib/spidy/binder/html.rb
|
226
227
|
- lib/spidy/binder/json.rb
|
227
228
|
- lib/spidy/binder/xml.rb
|
@@ -232,9 +233,9 @@ files:
|
|
232
233
|
- lib/spidy/connector/json.rb
|
233
234
|
- lib/spidy/connector/xml.rb
|
234
235
|
- lib/spidy/console.rb
|
235
|
-
- lib/spidy/define_object.rb
|
236
236
|
- lib/spidy/definition.rb
|
237
237
|
- lib/spidy/definition_file.rb
|
238
|
+
- lib/spidy/definition_object.rb
|
238
239
|
- lib/spidy/shell.rb
|
239
240
|
- lib/spidy/spider.rb
|
240
241
|
- lib/spidy/version.rb
|
@@ -243,7 +244,8 @@ files:
|
|
243
244
|
homepage: https://github.com/aileron-inc/spidy
|
244
245
|
licenses:
|
245
246
|
- MIT
|
246
|
-
metadata:
|
247
|
+
metadata:
|
248
|
+
rubygems_mfa_required: 'true'
|
247
249
|
post_install_message:
|
248
250
|
rdoc_options: []
|
249
251
|
require_paths:
|