spidy 0.3.9 → 0.3.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +7 -3
- data/Gemfile.lock +4 -6
- data/example/master_detail.rb +23 -23
- data/example/proxy.rb +2 -0
- data/example/retry.rb +2 -0
- data/example/wikip.rb +2 -5
- data/exe/spidy +3 -3
- data/lib/spidy/binder/error.rb +4 -0
- data/lib/spidy/binder/html.rb +2 -1
- data/lib/spidy/binder/json.rb +2 -1
- data/lib/spidy/binder/xml.rb +2 -1
- data/lib/spidy/binder.rb +1 -0
- data/lib/spidy/command_line.rb +37 -43
- data/lib/spidy/connector/direct.rb +2 -3
- data/lib/spidy/connector/html.rb +9 -7
- data/lib/spidy/connector/json.rb +3 -3
- data/lib/spidy/connector/xml.rb +2 -2
- data/lib/spidy/connector.rb +18 -16
- data/lib/spidy/definition.rb +13 -5
- data/lib/spidy/definition_file.rb +2 -3
- data/lib/spidy/{define_object.rb → definition_object.rb} +7 -2
- data/lib/spidy/version.rb +1 -1
- data/lib/spidy.rb +1 -1
- data/spidy.gemspec +7 -4
- metadata +32 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 76cb60ea985d1a663f24b7b024198d222756376bd9dd979a032c46ba39b16548
|
4
|
+
data.tar.gz: ff2e7f056f7ad5afe06df90adf0bb2e438c696472cde50c8d5758b2f9801684e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a721848978135752ddcfe3da30a293317a4852b41dc99209019ae71960538fe448ec4ad54da661e0b99edef3fcb85a84b095b99ddbbba9b628fdd4ac1be2f23c
|
7
|
+
data.tar.gz: a156f47f317cd4f1f0a66a13ac5102073723f139c5b797c8dc56d7dbdd41e342cb1ad1a6814812563033c68f448c4d57775c0edf300456dd164b90211632737e
|
data/.rubocop.yml
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
inherit_from: .rubocop_todo.yml
|
2
2
|
AllCops:
|
3
|
+
TargetRubyVersion: 3.0.2
|
4
|
+
NewCops: enable
|
3
5
|
DisplayCopNames: true
|
4
|
-
TargetRubyVersion: 2.6
|
5
6
|
|
6
7
|
Style/ClassAndModuleChildren:
|
7
8
|
Enabled: false
|
@@ -9,7 +10,7 @@ Style/ClassAndModuleChildren:
|
|
9
10
|
Style/SignalException:
|
10
11
|
EnforcedStyle: semantic
|
11
12
|
|
12
|
-
Naming/
|
13
|
+
Naming/MethodParameterName:
|
13
14
|
AllowedNames:
|
14
15
|
- as
|
15
16
|
|
@@ -17,8 +18,11 @@ Metrics/AbcSize:
|
|
17
18
|
Max: 21
|
18
19
|
Exclude:
|
19
20
|
|
21
|
+
Metrics/MethodLength:
|
22
|
+
Max: 15
|
23
|
+
|
20
24
|
Metrics/LineLength:
|
21
|
-
Max:
|
25
|
+
Max: 130
|
22
26
|
|
23
27
|
Metrics/BlockLength:
|
24
28
|
Max: 120
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
spidy (0.3.
|
4
|
+
spidy (0.3.10)
|
5
5
|
activesupport
|
6
6
|
mechanize
|
7
7
|
pry
|
@@ -32,7 +32,7 @@ GEM
|
|
32
32
|
coderay (1.1.3)
|
33
33
|
concurrent-ruby (1.1.9)
|
34
34
|
connection_pool (2.2.5)
|
35
|
-
diff-lcs (1.
|
35
|
+
diff-lcs (1.5.0)
|
36
36
|
domain_name (0.5.20190701)
|
37
37
|
unf (>= 0.0.5, < 1.0.0)
|
38
38
|
ffaker (2.20.0)
|
@@ -57,7 +57,6 @@ GEM
|
|
57
57
|
mime-types-data (~> 3.2015)
|
58
58
|
mime-types-data (3.2021.1115)
|
59
59
|
mini_mime (1.1.2)
|
60
|
-
mini_portile2 (2.6.1)
|
61
60
|
minitest (5.15.0)
|
62
61
|
mixlib-shellout (2.4.4)
|
63
62
|
mustermann (1.1.1)
|
@@ -65,8 +64,7 @@ GEM
|
|
65
64
|
net-http-digest_auth (1.4.1)
|
66
65
|
net-http-persistent (4.0.1)
|
67
66
|
connection_pool (~> 2.2)
|
68
|
-
nokogiri (1.12.5)
|
69
|
-
mini_portile2 (~> 2.6.1)
|
67
|
+
nokogiri (1.12.5-arm64-darwin)
|
70
68
|
racc (~> 1.4)
|
71
69
|
pry (0.14.1)
|
72
70
|
coderay (~> 1.1)
|
@@ -121,7 +119,7 @@ GEM
|
|
121
119
|
nokogiri (~> 1.8)
|
122
120
|
|
123
121
|
PLATFORMS
|
124
|
-
|
122
|
+
arm64-darwin-20
|
125
123
|
|
126
124
|
DEPENDENCIES
|
127
125
|
bundler (~> 2.0)
|
data/example/master_detail.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
Spidy.define do
|
4
|
-
url_to_params =
|
4
|
+
url_to_params = lambda { |url|
|
5
5
|
uri = URI.parse(url)
|
6
6
|
params = URI.decode_www_form(uri.query).to_h if uri.query.present?
|
7
7
|
params if params.present?
|
@@ -13,41 +13,41 @@ Spidy.define do
|
|
13
13
|
|
14
14
|
limit_page = 3
|
15
15
|
per_page = 25
|
16
|
-
yielder.call(Nokogiri::HTML::Builder.new
|
17
|
-
doc.html
|
18
|
-
doc.body
|
19
|
-
doc.span.bold
|
20
|
-
doc.text
|
21
|
-
|
22
|
-
doc.main
|
23
|
-
(page * per_page + 1).upto((page + 1) * per_page).each do |i|
|
16
|
+
yielder.call(Nokogiri::HTML::Builder.new do |doc|
|
17
|
+
doc.html do
|
18
|
+
doc.body do
|
19
|
+
doc.span.bold do
|
20
|
+
doc.text 'Hello world'
|
21
|
+
end
|
22
|
+
doc.main do
|
23
|
+
((page * per_page) + 1).upto((page + 1) * per_page).each do |i|
|
24
24
|
doc.a("page #{i}", href: "http://localhost/?id=#{i}")
|
25
25
|
end
|
26
|
-
|
26
|
+
end
|
27
27
|
doc.a('NEXT', href: "http://localhost/?page=#{page + 1}", class: 'next') if page < limit_page
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end.doc)
|
31
31
|
}
|
32
32
|
|
33
33
|
detail_page = proc { |url, &yielder|
|
34
34
|
params = url_to_params.call(url)
|
35
35
|
id = params['id']
|
36
36
|
|
37
|
-
yielder.call(Nokogiri::HTML::Builder.new
|
38
|
-
doc.html
|
39
|
-
doc.body
|
40
|
-
doc.span.bold
|
41
|
-
doc.text
|
42
|
-
|
37
|
+
yielder.call(Nokogiri::HTML::Builder.new do |doc|
|
38
|
+
doc.html do
|
39
|
+
doc.body do
|
40
|
+
doc.span.bold do
|
41
|
+
doc.text 'Hello world'
|
42
|
+
end
|
43
43
|
doc.h1("title_#{id}", id: 'title')
|
44
44
|
doc.main("body_#{id}", id: 'body')
|
45
45
|
doc.div.sub do
|
46
46
|
doc.span.name('testtest')
|
47
47
|
end
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end.doc)
|
51
51
|
}
|
52
52
|
|
53
53
|
define(as: :html, connector: detail_page) do
|
data/example/proxy.rb
CHANGED
data/example/retry.rb
CHANGED
data/example/wikip.rb
CHANGED
@@ -11,11 +11,8 @@ Spidy.define do
|
|
11
11
|
|
12
12
|
define(:infobox, as: :html, connector: :direct) do
|
13
13
|
let(:columns) do
|
14
|
-
html.search('tr').
|
15
|
-
{
|
16
|
-
name: tr.at('th')&.text,
|
17
|
-
value: tr.at('td')&.text
|
18
|
-
}
|
14
|
+
html.search('tr').map do |tr|
|
15
|
+
{ name: tr.at('th')&.text, value: tr.at('td')&.text }
|
19
16
|
end
|
20
17
|
end
|
21
18
|
end
|
data/exe/spidy
CHANGED
@@ -6,10 +6,10 @@ require 'pry'
|
|
6
6
|
|
7
7
|
if ARGV[1].blank?
|
8
8
|
case ARGV[0]
|
9
|
-
when 'version' then
|
9
|
+
when 'version' then $stdout.puts(Spidy::VERSION)
|
10
10
|
when 'console' then Spidy.shell.interactive
|
11
11
|
else
|
12
|
-
|
12
|
+
$stdout.puts 'usage: spidy [version console]'
|
13
13
|
end
|
14
14
|
else
|
15
15
|
case ARGV[0]
|
@@ -19,6 +19,6 @@ else
|
|
19
19
|
when 'each' then Spidy.shell(ARGV[1]).each(ARGV[2])
|
20
20
|
when 'eval' then Spidy.shell(ARGV[1]).eval_call(ARGV[2])
|
21
21
|
else
|
22
|
-
|
22
|
+
$stdout.puts 'usage: spidy [console function call each run] [file]'
|
23
23
|
end
|
24
24
|
end
|
data/lib/spidy/binder/html.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Html
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :html, :resource
|
25
26
|
end
|
data/lib/spidy/binder/json.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Json
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :json, :resource
|
25
26
|
end
|
data/lib/spidy/binder/xml.rb
CHANGED
@@ -17,9 +17,10 @@ module Spidy::Binder::Xml
|
|
17
17
|
instance_exec(&block)
|
18
18
|
end
|
19
19
|
rescue StandardError => e
|
20
|
-
|
20
|
+
raise Spidy::Binder::Error, "spidy(#{@define_name})##{name} => #{e.message}"
|
21
21
|
end
|
22
22
|
end
|
23
|
+
|
23
24
|
def self.extended(obj)
|
24
25
|
obj.alias_method :xml, :resource
|
25
26
|
end
|
data/lib/spidy/binder.rb
CHANGED
data/lib/spidy/command_line.rb
CHANGED
@@ -5,8 +5,10 @@
|
|
5
5
|
#
|
6
6
|
class Spidy::CommandLine
|
7
7
|
delegate :spidy, to: :@definition_file
|
8
|
-
class_attribute :output, default: (proc { |result|
|
9
|
-
class_attribute :error_handler, default: (proc { |e, url|
|
8
|
+
class_attribute :output, default: (proc { |result| $stdout.puts(result.to_s) })
|
9
|
+
class_attribute :error_handler, default: (proc { |e, url|
|
10
|
+
warn({ url: url, message: e.message, backtrace: e.backtrace }.to_json)
|
11
|
+
})
|
10
12
|
|
11
13
|
def eval_call(script)
|
12
14
|
@definition_file.spidy.instance_eval(script)
|
@@ -14,40 +16,36 @@ class Spidy::CommandLine
|
|
14
16
|
|
15
17
|
def initialize(definition_file)
|
16
18
|
@definition_file = definition_file
|
17
|
-
|
19
|
+
fail 'unloaded spidy' if definition_file.spidy.nil?
|
18
20
|
end
|
19
21
|
|
20
22
|
def each_stdin_lines(name)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
error_handler.call(e, url)
|
26
|
-
end
|
23
|
+
$stdin.each_line do |url|
|
24
|
+
spidy.each(url.strip, name: name, &output)
|
25
|
+
rescue StandardError => e
|
26
|
+
error_handler.call(e, url)
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
def call_stdin_lines(name)
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
error_handler.call(e, url)
|
36
|
-
end
|
31
|
+
$stdin.each_line do |url|
|
32
|
+
spidy.call(url.strip, name: name, &output)
|
33
|
+
rescue StandardError => e
|
34
|
+
error_handler.call(e, url)
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
40
38
|
def call(name)
|
41
|
-
return call_stdin_lines(name) if FileTest.pipe?(
|
42
|
-
spidy.call(name: name, &output) unless FileTest.pipe?(
|
43
|
-
rescue => e
|
39
|
+
return call_stdin_lines(name) if FileTest.pipe?($stdin)
|
40
|
+
spidy.call(name: name, &output) unless FileTest.pipe?($stdin)
|
41
|
+
rescue StandardError => e
|
44
42
|
error_handler.call(e, nil)
|
45
43
|
end
|
46
44
|
|
47
45
|
def each(name)
|
48
|
-
return each_stdin_lines(name) if FileTest.pipe?(
|
46
|
+
return each_stdin_lines(name) if FileTest.pipe?($stdin)
|
49
47
|
spidy.each(name: name, &output)
|
50
|
-
rescue => e
|
48
|
+
rescue StandardError => e
|
51
49
|
error_handler.call(e, nil)
|
52
50
|
end
|
53
51
|
|
@@ -63,36 +61,32 @@ class Spidy::CommandLine
|
|
63
61
|
end
|
64
62
|
|
65
63
|
def build(name)
|
66
|
-
|
67
|
-
|
64
|
+
File.write("#{name}.sh", build_shell_script(name))
|
65
|
+
File.write("#{name}.rb", build_ruby_script)
|
68
66
|
end
|
69
67
|
|
70
68
|
def build_shell(name)
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
SHELL
|
77
|
-
end
|
69
|
+
<<~SHELL
|
70
|
+
#!/bin/bash
|
71
|
+
eval "$(spidy $(dirname "${0}")/#{name}.rb shell)"
|
72
|
+
spider
|
73
|
+
SHELL
|
78
74
|
end
|
79
75
|
|
80
|
-
def build_ruby
|
81
|
-
|
82
|
-
|
83
|
-
# frozen_string_literal: true
|
76
|
+
def build_ruby
|
77
|
+
<<~RUBY
|
78
|
+
# frozen_string_literal: true
|
84
79
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
80
|
+
Spidy.define do
|
81
|
+
spider(as: :html) do |yielder, connector|
|
82
|
+
# connector.call(url) do |resource|
|
83
|
+
# yielder.call(url or resource)
|
84
|
+
# end
|
85
|
+
end
|
91
86
|
|
92
|
-
|
93
|
-
end
|
87
|
+
define(as: :html) do
|
94
88
|
end
|
95
|
-
|
96
|
-
|
89
|
+
end
|
90
|
+
RUBY
|
97
91
|
end
|
98
92
|
end
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Direct resource ( not network resource )
|
5
5
|
#
|
6
6
|
class Spidy::Connector::Direct
|
7
|
-
def call(resource
|
7
|
+
def call(resource)
|
8
8
|
if block_given?
|
9
9
|
yield resource
|
10
10
|
else
|
@@ -12,6 +12,5 @@ class Spidy::Connector::Direct
|
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
|
-
def initialize(user_agent:)
|
16
|
-
end
|
15
|
+
def initialize(user_agent:); end
|
17
16
|
end
|
data/lib/spidy/connector/html.rb
CHANGED
@@ -14,13 +14,13 @@ class Spidy::Connector::Html
|
|
14
14
|
|
15
15
|
attr_reader :agent
|
16
16
|
|
17
|
-
def call(url, encoding: nil,
|
17
|
+
def call(url, encoding: nil, &yielder)
|
18
18
|
fail 'url is not specified' if url.blank?
|
19
19
|
if encoding
|
20
20
|
agent.default_encoding = encoding
|
21
21
|
agent.force_default_encoding = true
|
22
22
|
end
|
23
|
-
connect(url,
|
23
|
+
connect(url, yielder)
|
24
24
|
end
|
25
25
|
|
26
26
|
def refresh!
|
@@ -30,17 +30,19 @@ class Spidy::Connector::Html
|
|
30
30
|
|
31
31
|
private
|
32
32
|
|
33
|
-
def connect(url,
|
33
|
+
def connect(url, yielder)
|
34
34
|
result = nil
|
35
35
|
agent.get(url) do |page|
|
36
|
-
|
36
|
+
if page.title == 'Sorry, unable to access page...'
|
37
|
+
fail Spidy::Connector::Retry.new(object: page, response_code: page.try(:response_code))
|
38
|
+
end
|
37
39
|
|
38
40
|
result = yielder.call(page)
|
39
41
|
end
|
40
42
|
result
|
41
43
|
rescue Mechanize::ResponseCodeError => e
|
42
|
-
raise Spidy::Connector::Retry
|
43
|
-
raise Spidy::Connector::Retry
|
44
|
-
raise Spidy::Connector::Retry
|
44
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '429'
|
45
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code)) if e.response_code == '502'
|
46
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.try(:response_code))
|
45
47
|
end
|
46
48
|
end
|
data/lib/spidy/connector/json.rb
CHANGED
@@ -17,9 +17,9 @@ class Spidy::Connector::Json
|
|
17
17
|
connect(url, &block)
|
18
18
|
end
|
19
19
|
|
20
|
-
def connect(url
|
21
|
-
OpenURI.open_uri(url,
|
20
|
+
def connect(url)
|
21
|
+
OpenURI.open_uri(url, 'User-Agent' => @user_agent) { |body| yield JSON.parse(body.read, symbolize_names: true) }
|
22
22
|
rescue OpenURI::HTTPError => e
|
23
|
-
raise Spidy::Connector::Retry
|
23
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
|
24
24
|
end
|
25
25
|
end
|
data/lib/spidy/connector/xml.rb
CHANGED
@@ -13,11 +13,11 @@ class Spidy::Connector::Xml
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def connect(url, &block)
|
16
|
-
OpenURI.open_uri(url,
|
16
|
+
OpenURI.open_uri(url, 'User-Agent' => @user_agent) do |body|
|
17
17
|
block.call Nokogiri::XML(body.read.gsub(/[\x00-\x09\x0B\x0C\x0E-\x1F\x7F]/, ''), url)
|
18
18
|
end
|
19
19
|
rescue OpenURI::HTTPError => e
|
20
|
-
raise Spidy::Connector::Retry
|
20
|
+
raise Spidy::Connector::Retry.new(error: e, response_code: e.io.status[0])
|
21
21
|
end
|
22
22
|
|
23
23
|
def initialize(user_agent:)
|
data/lib/spidy/connector.rb
CHANGED
@@ -27,7 +27,7 @@ module Spidy::Connector
|
|
27
27
|
#
|
28
28
|
# error output logger
|
29
29
|
#
|
30
|
-
DEFAULT_LOGGER = proc { |values|
|
30
|
+
DEFAULT_LOGGER = proc { |values| warn(values.to_json) }
|
31
31
|
|
32
32
|
#
|
33
33
|
# static method
|
@@ -36,7 +36,9 @@ module Spidy::Connector
|
|
36
36
|
extend ActiveSupport::Concern
|
37
37
|
class_methods do
|
38
38
|
def call(url, wait_time: 5, logger: Spidy::Connector::DEFAULT_LOGGER, user_agent: Spidy::Connector::USER_AGENT, &block)
|
39
|
-
::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
|
39
|
+
::Spidy::Connector::RetryableCaller.new(new(user_agent: user_agent), wait_time: wait_time, logger: logger).call(
|
40
|
+
url, &block
|
41
|
+
)
|
40
42
|
end
|
41
43
|
end
|
42
44
|
end
|
@@ -51,6 +53,7 @@ module Spidy::Connector
|
|
51
53
|
@object = object
|
52
54
|
@response_code = response_code
|
53
55
|
@error = error
|
56
|
+
super(error)
|
54
57
|
end
|
55
58
|
end
|
56
59
|
|
@@ -58,13 +61,13 @@ module Spidy::Connector
|
|
58
61
|
# retry
|
59
62
|
#
|
60
63
|
class RetryableCaller
|
61
|
-
attr_reader :origin_connector
|
64
|
+
attr_reader :origin_connector, :logger, :wait_time
|
62
65
|
|
63
|
-
def initialize(connector, logger:, wait_time:)
|
66
|
+
def initialize(connector, logger:, wait_time:, retry_attempt_count: 5)
|
64
67
|
@origin_connector = connector
|
65
68
|
@logger = logger
|
66
69
|
@wait_time = wait_time
|
67
|
-
@retry_attempt_count =
|
70
|
+
@retry_attempt_count = retry_attempt_count
|
68
71
|
end
|
69
72
|
|
70
73
|
def call(url, &block)
|
@@ -73,18 +76,18 @@ module Spidy::Connector
|
|
73
76
|
end
|
74
77
|
|
75
78
|
def connect(url, retry_attempt_count: @retry_attempt_count, &block)
|
76
|
-
|
77
|
-
|
79
|
+
logger.call('connnector.get': url, 'connnector.accessed': Time.current)
|
80
|
+
origin_connector.call(url, &block)
|
78
81
|
rescue Spidy::Connector::Retry => e
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
82
|
+
logger.call('retry.accessed': Time.current,
|
83
|
+
'retry.uri': url,
|
84
|
+
'retry.response_code': e.response_code,
|
85
|
+
'retry.attempt_count': retry_attempt_count)
|
83
86
|
|
84
87
|
retry_attempt_count -= 1
|
85
88
|
if retry_attempt_count.positive?
|
86
|
-
sleep
|
87
|
-
|
89
|
+
sleep wait_time
|
90
|
+
origin_connector.refresh! if origin_connector.respond_to?(:refresh!)
|
88
91
|
retry
|
89
92
|
end
|
90
93
|
raise e.error
|
@@ -103,7 +106,7 @@ module Spidy::Connector
|
|
103
106
|
end
|
104
107
|
|
105
108
|
def call(url, &block)
|
106
|
-
Socksify
|
109
|
+
Socksify.proxy(socks_proxy[:host], socks_proxy[:port]) do
|
107
110
|
connector.call(url, &block)
|
108
111
|
end
|
109
112
|
end
|
@@ -141,7 +144,6 @@ module Spidy::Connector
|
|
141
144
|
fail "Not defined connnector[#{value}]" if connector.nil?
|
142
145
|
return connector if socks_proxy.nil?
|
143
146
|
|
144
|
-
|
145
|
-
tor
|
147
|
+
TorConnector.new(connector, socks_proxy)
|
146
148
|
end
|
147
149
|
end
|
data/lib/spidy/definition.rb
CHANGED
@@ -33,26 +33,34 @@ module Spidy::Definition
|
|
33
33
|
spidy = @namespace[:"#{name}_spider"]
|
34
34
|
fail "undefined spidy [#{name}]" if spidy.nil?
|
35
35
|
|
36
|
-
|
36
|
+
if yielder
|
37
|
+
spidy.call(source, &yielder)
|
38
|
+
else
|
39
|
+
Enumerator.new do |enumerate_yielder|
|
40
|
+
spidy.call(source, &enumerate_yielder)
|
41
|
+
end
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
def spider(name = :default, connector: nil, as: nil, &define_block)
|
40
46
|
@namespace ||= {}
|
41
|
-
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
47
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
48
|
+
socks_proxy: @socks_proxy)
|
42
49
|
@namespace[:"#{name}_spider"] = proc do |source, &yielder|
|
43
50
|
define_block.call(yielder, connector, source)
|
44
51
|
end
|
45
52
|
end
|
46
53
|
|
47
54
|
def define(name = :default, connector: nil, as: nil, &define_block)
|
48
|
-
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
55
|
+
connector = Spidy::Connector.get(connector || as, wait_time: @wait_time, user_agent: @user_agent,
|
56
|
+
socks_proxy: @socks_proxy)
|
49
57
|
binder_base = Spidy::Binder.const_get(as.to_s.classify)
|
50
58
|
@namespace ||= {}
|
51
|
-
@namespace[:"#{name}_scraper"] = Class.new(Spidy::
|
59
|
+
@namespace[:"#{name}_scraper"] = Class.new(Spidy::DefinitionObject) do
|
52
60
|
extend binder_base
|
53
61
|
class_eval(&define_block)
|
54
62
|
define_singleton_method(:call) do |source, &yielder|
|
55
|
-
yielder =
|
63
|
+
yielder = ->(result) { break result } if yielder.nil?
|
56
64
|
connection_yielder = lambda do |page|
|
57
65
|
yielder.call(new(page, source))
|
58
66
|
end
|
@@ -4,8 +4,7 @@
|
|
4
4
|
# spidy interface binding
|
5
5
|
#
|
6
6
|
class Spidy::DefinitionFile
|
7
|
-
attr_reader :path
|
8
|
-
attr_reader :spidy
|
7
|
+
attr_reader :path, :spidy
|
9
8
|
|
10
9
|
def self.open(filepath)
|
11
10
|
object = new(filepath)
|
@@ -15,7 +14,7 @@ class Spidy::DefinitionFile
|
|
15
14
|
|
16
15
|
# rubocop:disable Security/Eval
|
17
16
|
def eval_definition
|
18
|
-
@spidy = eval(File.
|
17
|
+
@spidy = eval(File.read(path)) if path
|
19
18
|
end
|
20
19
|
# rubocop:enable Security/Eval
|
21
20
|
|
@@ -1,4 +1,9 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#
|
4
|
+
# An object that represents the scraper defined by define block.
|
5
|
+
#
|
6
|
+
class Spidy::DefinitionObject
|
2
7
|
class << self
|
3
8
|
attr_reader :attribute_names
|
4
9
|
end
|
@@ -14,6 +19,6 @@ class Spidy::DefineObject
|
|
14
19
|
end
|
15
20
|
|
16
21
|
def to_h
|
17
|
-
self.class.attribute_names.
|
22
|
+
self.class.attribute_names.to_h { |name| [name, send(name)] }
|
18
23
|
end
|
19
24
|
end
|
data/lib/spidy/version.rb
CHANGED
data/lib/spidy.rb
CHANGED
data/spidy.gemspec
CHANGED
@@ -25,17 +25,20 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.require_paths = ['lib']
|
26
26
|
|
27
27
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
28
|
+
spec.add_development_dependency 'capybara_discoball'
|
29
|
+
spec.add_development_dependency 'ffaker'
|
28
30
|
spec.add_development_dependency 'pry'
|
29
31
|
spec.add_development_dependency 'rake', '~> 13.0'
|
30
32
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
31
|
-
spec.add_development_dependency 'ffaker'
|
32
33
|
spec.add_development_dependency 'rspec-command'
|
33
|
-
spec.add_development_dependency 'capybara_discoball'
|
34
34
|
spec.add_development_dependency 'sinatra'
|
35
35
|
|
36
|
-
spec.add_runtime_dependency 'tor'
|
37
36
|
spec.add_runtime_dependency 'activesupport'
|
38
37
|
spec.add_runtime_dependency 'mechanize'
|
39
|
-
spec.add_runtime_dependency 'socksify'
|
40
38
|
spec.add_runtime_dependency 'pry'
|
39
|
+
spec.add_runtime_dependency 'socksify'
|
40
|
+
spec.add_runtime_dependency 'tor'
|
41
|
+
spec.metadata = {
|
42
|
+
'rubygems_mfa_required' => 'true'
|
43
|
+
}
|
41
44
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aileron
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -25,7 +25,7 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: capybara_discoball
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
@@ -39,63 +39,63 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: ffaker
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: pry
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
75
|
+
version: '13.0'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
82
|
+
version: '13.0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name: rspec
|
84
|
+
name: rspec
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
89
|
+
version: '3.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
96
|
+
version: '3.0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec-command
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - ">="
|
@@ -123,7 +123,7 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: activesupport
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
129
|
- - ">="
|
@@ -137,7 +137,7 @@ dependencies:
|
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
140
|
+
name: mechanize
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
142
142
|
requirements:
|
143
143
|
- - ">="
|
@@ -151,7 +151,7 @@ dependencies:
|
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '0'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
154
|
+
name: pry
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
157
|
- - ">="
|
@@ -179,7 +179,7 @@ dependencies:
|
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: '0'
|
181
181
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
182
|
+
name: tor
|
183
183
|
requirement: !ruby/object:Gem::Requirement
|
184
184
|
requirements:
|
185
185
|
- - ">="
|
@@ -222,6 +222,7 @@ files:
|
|
222
222
|
- exe/spidy
|
223
223
|
- lib/spidy.rb
|
224
224
|
- lib/spidy/binder.rb
|
225
|
+
- lib/spidy/binder/error.rb
|
225
226
|
- lib/spidy/binder/html.rb
|
226
227
|
- lib/spidy/binder/json.rb
|
227
228
|
- lib/spidy/binder/xml.rb
|
@@ -232,9 +233,9 @@ files:
|
|
232
233
|
- lib/spidy/connector/json.rb
|
233
234
|
- lib/spidy/connector/xml.rb
|
234
235
|
- lib/spidy/console.rb
|
235
|
-
- lib/spidy/define_object.rb
|
236
236
|
- lib/spidy/definition.rb
|
237
237
|
- lib/spidy/definition_file.rb
|
238
|
+
- lib/spidy/definition_object.rb
|
238
239
|
- lib/spidy/shell.rb
|
239
240
|
- lib/spidy/spider.rb
|
240
241
|
- lib/spidy/version.rb
|
@@ -243,7 +244,8 @@ files:
|
|
243
244
|
homepage: https://github.com/aileron-inc/spidy
|
244
245
|
licenses:
|
245
246
|
- MIT
|
246
|
-
metadata:
|
247
|
+
metadata:
|
248
|
+
rubygems_mfa_required: 'true'
|
247
249
|
post_install_message:
|
248
250
|
rdoc_options: []
|
249
251
|
require_paths:
|