spidr 0.6.1 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +19 -1
- data/Gemfile +7 -4
- data/LICENSE.txt +1 -1
- data/README.md +136 -79
- data/Rakefile +1 -0
- data/gemspec.yml +7 -0
- data/lib/spidr/agent/actions.rb +3 -1
- data/lib/spidr/agent/events.rb +3 -1
- data/lib/spidr/agent/filters.rb +57 -56
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +7 -8
- data/lib/spidr/agent.rb +232 -108
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +9 -7
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +53 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +21 -20
- data/lib/spidr/page/status_codes.rb +15 -11
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +8 -14
- data/lib/spidr/rules.rb +7 -8
- data/lib/spidr/session_cache.rb +26 -22
- data/lib/spidr/settings/proxy.rb +22 -6
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +22 -11
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spec/agent_spec.rb +356 -7
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- data/spidr.gemspec +1 -4
- metadata +8 -7
- data/.travis.yml +0 -16
data/lib/spidr/page/html.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../extensions/uri'
|
4
|
+
|
1
5
|
require 'nokogiri'
|
2
|
-
require 'spidr/extensions/uri'
|
3
6
|
|
4
7
|
module Spidr
|
5
8
|
class Page
|
@@ -105,7 +108,9 @@ module Spidr
|
|
105
108
|
def each_redirect(&block)
|
106
109
|
return enum_for(__method__) unless block
|
107
110
|
|
108
|
-
|
111
|
+
locations = @response.get_fields('Location')
|
112
|
+
|
113
|
+
unless (locations.nil? || locations.empty?)
|
109
114
|
# Location headers override any meta-refresh redirects in the HTML
|
110
115
|
locations.each(&block)
|
111
116
|
else
|
@@ -175,34 +180,30 @@ module Spidr
|
|
175
180
|
#
|
176
181
|
# @since 0.3.0
|
177
182
|
#
|
178
|
-
def each_link
|
183
|
+
def each_link(&block)
|
179
184
|
return enum_for(__method__) unless block_given?
|
180
185
|
|
181
|
-
|
182
|
-
yield url unless (url.nil? || url.empty?)
|
183
|
-
}
|
184
|
-
|
185
|
-
each_redirect(&filter) if is_redirect?
|
186
|
+
each_redirect(&block) if is_redirect?
|
186
187
|
|
187
188
|
if (html? && doc)
|
188
|
-
doc.search('//a[@href]').each do |a|
|
189
|
-
|
189
|
+
doc.search('//a[@href[string()]]').each do |a|
|
190
|
+
yield a.get_attribute('href')
|
190
191
|
end
|
191
192
|
|
192
|
-
doc.search('//frame[@src]').each do |iframe|
|
193
|
-
|
193
|
+
doc.search('//frame[@src[string()]]').each do |iframe|
|
194
|
+
yield iframe.get_attribute('src')
|
194
195
|
end
|
195
196
|
|
196
|
-
doc.search('//iframe[@src]').each do |iframe|
|
197
|
-
|
197
|
+
doc.search('//iframe[@src[string()]]').each do |iframe|
|
198
|
+
yield iframe.get_attribute('src')
|
198
199
|
end
|
199
200
|
|
200
|
-
doc.search('//link[@href]').each do |link|
|
201
|
-
|
201
|
+
doc.search('//link[@href[string()]]').each do |link|
|
202
|
+
yield link.get_attribute('href')
|
202
203
|
end
|
203
204
|
|
204
|
-
doc.search('//script[@src]').each do |script|
|
205
|
-
|
205
|
+
doc.search('//script[@src[string()]]').each do |script|
|
206
|
+
yield script.get_attribute('src')
|
206
207
|
end
|
207
208
|
end
|
208
209
|
end
|
@@ -211,7 +212,7 @@ module Spidr
|
|
211
212
|
# The links from within the page.
|
212
213
|
#
|
213
214
|
# @return [Array<String>]
|
214
|
-
# All links within the HTML page, frame
|
215
|
+
# All links within the HTML page, `frame`/`iframe` source URLs and any
|
215
216
|
# links in the `Location` header.
|
216
217
|
#
|
217
218
|
def links
|
@@ -267,7 +268,7 @@ module Spidr
|
|
267
268
|
link = link.to_s
|
268
269
|
new_url = begin
|
269
270
|
url.merge(link)
|
270
|
-
rescue
|
271
|
+
rescue URI::Error
|
271
272
|
return
|
272
273
|
end
|
273
274
|
|
data/lib/spidr/page/status_codes.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
class Page
|
3
5
|
#
|
@@ -22,16 +24,6 @@ module Spidr
|
|
22
24
|
|
23
25
|
alias ok? is_ok?
|
24
26
|
|
25
|
-
#
|
26
|
-
# Determines if the response code is `308`.
|
27
|
-
#
|
28
|
-
# @return [Boolean]
|
29
|
-
# Specifies whether the response code is `308`.
|
30
|
-
#
|
31
|
-
def timedout?
|
32
|
-
code == 308
|
33
|
-
end
|
34
|
-
|
35
27
|
#
|
36
28
|
# Determines if the response code is `400`.
|
37
29
|
#
|
@@ -78,6 +70,18 @@ module Spidr
|
|
78
70
|
|
79
71
|
alias missing? is_missing?
|
80
72
|
|
73
|
+
#
|
74
|
+
# Determines if the response code is `408`.
|
75
|
+
#
|
76
|
+
# @return [Boolean]
|
77
|
+
# Specifies whether the response code is `408`.
|
78
|
+
#
|
79
|
+
def is_timedout?
|
80
|
+
code == 408
|
81
|
+
end
|
82
|
+
|
83
|
+
alias timedout? is_timedout?
|
84
|
+
|
81
85
|
#
|
82
86
|
# Determines if the response code is `500`.
|
83
87
|
#
|
@@ -90,7 +94,7 @@ module Spidr
|
|
90
94
|
|
91
95
|
#
|
92
96
|
# Determines if the response code is `300`, `301`, `302`, `303`
|
93
|
-
# or `307`. Also checks for "soft" redirects added at the page
|
97
|
+
# or `307`. Also checks for "soft" redirects added at the page
|
94
98
|
# level by a meta refresh tag.
|
95
99
|
#
|
96
100
|
# @return [Boolean]
|
data/lib/spidr/page.rb
CHANGED
data/lib/spidr/proxy.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
#
|
3
5
|
# @since 0.6.0
|
@@ -10,28 +12,20 @@ module Spidr
|
|
10
12
|
#
|
11
13
|
# Initializes the proxy.
|
12
14
|
#
|
13
|
-
# @param [
|
14
|
-
# Attributes for the proxy.
|
15
|
-
#
|
16
|
-
# @option attributes [String] :host
|
15
|
+
# @param [String] host
|
17
16
|
# The host the proxy is running on.
|
18
17
|
#
|
19
|
-
# @
|
18
|
+
# @param [Integer] port
|
20
19
|
# The port the proxy is running on.
|
21
20
|
#
|
22
|
-
# @
|
21
|
+
# @param [String] user
|
23
22
|
# The user to authenticate as with the proxy.
|
24
23
|
#
|
25
|
-
# @
|
24
|
+
# @param [String] password
|
26
25
|
# The password to authenticate with.
|
27
26
|
#
|
28
|
-
def initialize(
|
29
|
-
super(
|
30
|
-
attributes[:host],
|
31
|
-
attributes.fetch(:port,DEFAULT_PORT),
|
32
|
-
attributes[:user],
|
33
|
-
attributes[:password]
|
34
|
-
)
|
27
|
+
def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
|
28
|
+
super(host,port,user,password)
|
35
29
|
end
|
36
30
|
|
37
31
|
#
|
data/lib/spidr/rules.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
#
|
3
5
|
# The {Rules} class represents collections of acceptance and rejection
|
@@ -14,21 +16,18 @@ module Spidr
|
|
14
16
|
#
|
15
17
|
# Creates a new Rules object.
|
16
18
|
#
|
17
|
-
# @param [
|
18
|
-
# Additional options.
|
19
|
-
#
|
20
|
-
# @option options [Array<String, Regexp, Proc>] :accept
|
19
|
+
# @param [Array<String, Regexp, Proc>, nil] accept
|
21
20
|
# The patterns to accept data with.
|
22
21
|
#
|
23
|
-
# @
|
22
|
+
# @param [Array<String, Regexp, Proc>, nil] reject
|
24
23
|
# The patterns to reject data with.
|
25
24
|
#
|
26
|
-
def initialize(
|
25
|
+
def initialize(accept: nil, reject: nil)
|
27
26
|
@accept = []
|
28
27
|
@reject = []
|
29
28
|
|
30
|
-
@accept +=
|
31
|
-
@reject +=
|
29
|
+
@accept += accept if accept
|
30
|
+
@reject += reject if reject
|
32
31
|
end
|
33
32
|
|
34
33
|
#
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/proxy'
|
4
|
+
require_relative 'settings/timeouts'
|
5
|
+
require_relative 'spidr'
|
4
6
|
|
5
7
|
require 'net/http'
|
6
8
|
require 'openssl'
|
@@ -17,37 +19,39 @@ module Spidr
|
|
17
19
|
#
|
18
20
|
# Creates a new session cache.
|
19
21
|
#
|
20
|
-
# @param [Hash]
|
21
|
-
# Configuration options.
|
22
|
-
#
|
23
|
-
# @option [Hash] :proxy (Spidr.proxy)
|
22
|
+
# @param [Hash] proxy
|
24
23
|
# Proxy options.
|
25
24
|
#
|
26
|
-
# @
|
27
|
-
# Optional open timeout.
|
25
|
+
# @param [Integer] open_timeout
|
26
|
+
# Optional open connection timeout.
|
28
27
|
#
|
29
|
-
# @
|
30
|
-
# Optional
|
28
|
+
# @param [Integer] ssl_timeout
|
29
|
+
# Optional SSL connection timeout.
|
31
30
|
#
|
32
|
-
# @
|
31
|
+
# @param [Integer] read_timeout
|
33
32
|
# Optional read timeout.
|
34
33
|
#
|
35
|
-
# @
|
34
|
+
# @param [Integer] continue_timeout
|
36
35
|
# Optional `Continue` timeout.
|
37
36
|
#
|
38
|
-
# @
|
37
|
+
# @param [Integer] keep_alive_timeout
|
39
38
|
# Optional `Keep-Alive` timeout.
|
40
39
|
#
|
41
40
|
# @since 0.6.0
|
42
41
|
#
|
43
|
-
def initialize(
|
44
|
-
|
42
|
+
def initialize(proxy: Spidr.proxy,
|
43
|
+
open_timeout: Spidr.open_timeout,
|
44
|
+
ssl_timeout: Spidr.ssl_timeout,
|
45
|
+
read_timeout: Spidr.read_timeout,
|
46
|
+
continue_timeout: Spidr.continue_timeout,
|
47
|
+
keep_alive_timeout: Spidr.keep_alive_timeout)
|
48
|
+
self.proxy = proxy
|
45
49
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
50
|
+
self.open_timeout = open_timeout
|
51
|
+
self.ssl_timeout = ssl_timeout
|
52
|
+
self.read_timeout = read_timeout
|
53
|
+
self.continue_timeout = continue_timeout
|
54
|
+
self.keep_alive_timeout = keep_alive_timeout
|
51
55
|
|
52
56
|
@sessions = {}
|
53
57
|
end
|
@@ -133,7 +137,7 @@ module Spidr
|
|
133
137
|
key = key_for(url)
|
134
138
|
|
135
139
|
if (sess = @sessions[key])
|
136
|
-
begin
|
140
|
+
begin
|
137
141
|
sess.finish
|
138
142
|
rescue IOError
|
139
143
|
end
|
data/lib/spidr/settings/proxy.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../proxy'
|
4
|
+
|
5
|
+
require 'uri/http'
|
2
6
|
|
3
7
|
module Spidr
|
4
8
|
module Settings
|
@@ -21,7 +25,7 @@ module Spidr
|
|
21
25
|
#
|
22
26
|
# Sets the proxy information used by Agent objects.
|
23
27
|
#
|
24
|
-
# @param [Spidr::Proxy, Hash, nil] new_proxy
|
28
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
25
29
|
# The new proxy information.
|
26
30
|
#
|
27
31
|
# @option new_proxy [String] :host
|
@@ -41,11 +45,23 @@ module Spidr
|
|
41
45
|
#
|
42
46
|
def proxy=(new_proxy)
|
43
47
|
@proxy = case new_proxy
|
44
|
-
when Spidr::Proxy
|
45
|
-
|
46
|
-
when
|
48
|
+
when Spidr::Proxy
|
49
|
+
new_proxy
|
50
|
+
when Hash
|
51
|
+
Spidr::Proxy.new(**new_proxy)
|
52
|
+
when String, URI::HTTP
|
53
|
+
proxy_uri = URI(new_proxy)
|
54
|
+
|
55
|
+
Spidr::Proxy.new(
|
56
|
+
host: proxy_uri.host,
|
57
|
+
port: proxy_uri.port,
|
58
|
+
user: proxy_uri.user,
|
59
|
+
password: proxy_uri.password
|
60
|
+
)
|
61
|
+
when nil
|
62
|
+
Spidr::Proxy.new
|
47
63
|
else
|
48
|
-
raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
|
64
|
+
raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
|
49
65
|
end
|
50
66
|
end
|
51
67
|
|
data/lib/spidr/settings.rb
CHANGED
data/lib/spidr/spidr.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/proxy'
|
4
|
+
require_relative 'settings/timeouts'
|
5
|
+
require_relative 'settings/user_agent'
|
6
|
+
require_relative 'agent'
|
5
7
|
|
6
8
|
module Spidr
|
7
9
|
extend Settings::Proxy
|
@@ -36,25 +38,34 @@ module Spidr
|
|
36
38
|
#
|
37
39
|
# @see Agent.start_at
|
38
40
|
#
|
39
|
-
def self.start_at(url
|
40
|
-
Agent.start_at(url
|
41
|
+
def self.start_at(url,**kwargs,&block)
|
42
|
+
Agent.start_at(url,**kwargs,&block)
|
41
43
|
end
|
42
44
|
|
43
45
|
#
|
44
46
|
# @see Agent.host
|
45
47
|
#
|
46
|
-
def self.host(name
|
47
|
-
Agent.host(name
|
48
|
+
def self.host(name,**kwargs,&block)
|
49
|
+
Agent.host(name,**kwargs,&block)
|
50
|
+
end
|
51
|
+
|
52
|
+
#
|
53
|
+
# @see Agent.domain
|
54
|
+
#
|
55
|
+
# @since 0.7.0
|
56
|
+
#
|
57
|
+
def self.domain(name,**kwargs,&block)
|
58
|
+
Agent.domain(name,**kwargs,&block)
|
48
59
|
end
|
49
60
|
|
50
61
|
#
|
51
62
|
# @see Agent.site
|
52
63
|
#
|
53
|
-
def self.site(url
|
54
|
-
Agent.site(url
|
64
|
+
def self.site(url,**kwargs,&block)
|
65
|
+
Agent.site(url,**kwargs,&block)
|
55
66
|
end
|
56
67
|
|
57
|
-
#
|
68
|
+
#
|
58
69
|
# @abstract
|
59
70
|
#
|
60
71
|
def self.robots
|
data/lib/spidr/version.rb
CHANGED
data/lib/spidr.rb
CHANGED