spidr 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/lib/spidr/page/html.rb
CHANGED
@@ -105,7 +105,9 @@ module Spidr
|
|
105
105
|
def each_redirect(&block)
|
106
106
|
return enum_for(__method__) unless block
|
107
107
|
|
108
|
-
|
108
|
+
locations = @response.get_fields('Location')
|
109
|
+
|
110
|
+
unless (locations.nil? || locations.empty?)
|
109
111
|
# Location headers override any meta-refresh redirects in the HTML
|
110
112
|
locations.each(&block)
|
111
113
|
else
|
@@ -175,34 +177,30 @@ module Spidr
|
|
175
177
|
#
|
176
178
|
# @since 0.3.0
|
177
179
|
#
|
178
|
-
def each_link
|
180
|
+
def each_link(&block)
|
179
181
|
return enum_for(__method__) unless block_given?
|
180
182
|
|
181
|
-
|
182
|
-
yield url unless (url.nil? || url.empty?)
|
183
|
-
}
|
184
|
-
|
185
|
-
each_redirect(&filter) if is_redirect?
|
183
|
+
each_redirect(&block) if is_redirect?
|
186
184
|
|
187
185
|
if (html? && doc)
|
188
|
-
doc.search('//a[@href]').each do |a|
|
189
|
-
|
186
|
+
doc.search('//a[@href[string()]]').each do |a|
|
187
|
+
yield a.get_attribute('href')
|
190
188
|
end
|
191
189
|
|
192
|
-
doc.search('//frame[@src]').each do |iframe|
|
193
|
-
|
190
|
+
doc.search('//frame[@src[string()]]').each do |iframe|
|
191
|
+
yield iframe.get_attribute('src')
|
194
192
|
end
|
195
193
|
|
196
|
-
doc.search('//iframe[@src]').each do |iframe|
|
197
|
-
|
194
|
+
doc.search('//iframe[@src[string()]]').each do |iframe|
|
195
|
+
yield iframe.get_attribute('src')
|
198
196
|
end
|
199
197
|
|
200
|
-
doc.search('//link[@href]').each do |link|
|
201
|
-
|
198
|
+
doc.search('//link[@href[string()]]').each do |link|
|
199
|
+
yield link.get_attribute('href')
|
202
200
|
end
|
203
201
|
|
204
|
-
doc.search('//script[@src]').each do |script|
|
205
|
-
|
202
|
+
doc.search('//script[@src[string()]]').each do |script|
|
203
|
+
yield script.get_attribute('src')
|
206
204
|
end
|
207
205
|
end
|
208
206
|
end
|
@@ -211,7 +209,7 @@ module Spidr
|
|
211
209
|
# The links from within the page.
|
212
210
|
#
|
213
211
|
# @return [Array<String>]
|
214
|
-
# All links within the HTML page, frame
|
212
|
+
# All links within the HTML page, `frame`/`iframe` source URLs and any
|
215
213
|
# links in the `Location` header.
|
216
214
|
#
|
217
215
|
def links
|
@@ -271,7 +269,7 @@ module Spidr
|
|
271
269
|
return
|
272
270
|
end
|
273
271
|
|
274
|
-
if (path = new_url.path)
|
272
|
+
if (!new_url.opaque) && (path = new_url.path)
|
275
273
|
# ensure that paths begin with a leading '/' for URI::FTP
|
276
274
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
277
275
|
path.insert(0,'/')
|
@@ -22,16 +22,6 @@ module Spidr
|
|
22
22
|
|
23
23
|
alias ok? is_ok?
|
24
24
|
|
25
|
-
#
|
26
|
-
# Determines if the response code is `308`.
|
27
|
-
#
|
28
|
-
# @return [Boolean]
|
29
|
-
# Specifies whether the response code is `308`.
|
30
|
-
#
|
31
|
-
def timedout?
|
32
|
-
code == 308
|
33
|
-
end
|
34
|
-
|
35
25
|
#
|
36
26
|
# Determines if the response code is `400`.
|
37
27
|
#
|
@@ -78,6 +68,18 @@ module Spidr
|
|
78
68
|
|
79
69
|
alias missing? is_missing?
|
80
70
|
|
71
|
+
#
|
72
|
+
# Determines if the response code is `408`.
|
73
|
+
#
|
74
|
+
# @return [Boolean]
|
75
|
+
# Specifies whether the response code is `408`.
|
76
|
+
#
|
77
|
+
def is_timedout?
|
78
|
+
code == 408
|
79
|
+
end
|
80
|
+
|
81
|
+
alias timedout? is_timedout?
|
82
|
+
|
81
83
|
#
|
82
84
|
# Determines if the response code is `500`.
|
83
85
|
#
|
data/lib/spidr/proxy.rb
CHANGED
@@ -10,28 +10,20 @@ module Spidr
|
|
10
10
|
#
|
11
11
|
# Initializes the proxy.
|
12
12
|
#
|
13
|
-
# @param [
|
14
|
-
# Attributes for the proxy.
|
15
|
-
#
|
16
|
-
# @option attributes [String] :host
|
13
|
+
# @param [String] host
|
17
14
|
# The host the proxy is running on.
|
18
15
|
#
|
19
|
-
# @
|
16
|
+
# @param [Integer] port
|
20
17
|
# The port the proxy is running on.
|
21
18
|
#
|
22
|
-
# @
|
19
|
+
# @param [String] user
|
23
20
|
# The user to authenticate as with the proxy.
|
24
21
|
#
|
25
|
-
# @
|
22
|
+
# @param [String] password
|
26
23
|
# The password to authenticate with.
|
27
24
|
#
|
28
|
-
def initialize(
|
29
|
-
super(
|
30
|
-
attributes[:host],
|
31
|
-
attributes.fetch(:port,DEFAULT_PORT),
|
32
|
-
attributes[:user],
|
33
|
-
attributes[:password]
|
34
|
-
)
|
25
|
+
def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
|
26
|
+
super(host,port,user,password)
|
35
27
|
end
|
36
28
|
|
37
29
|
#
|
data/lib/spidr/rules.rb
CHANGED
@@ -14,21 +14,18 @@ module Spidr
|
|
14
14
|
#
|
15
15
|
# Creates a new Rules object.
|
16
16
|
#
|
17
|
-
# @param [
|
18
|
-
# Additional options.
|
19
|
-
#
|
20
|
-
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# @param [Array<String, Regexp, Proc>, nil] accept
|
21
18
|
# The patterns to accept data with.
|
22
19
|
#
|
23
|
-
# @
|
20
|
+
# @param [Array<String, Regexp, Proc>, nil] reject
|
24
21
|
# The patterns to reject data with.
|
25
22
|
#
|
26
|
-
def initialize(
|
23
|
+
def initialize(accept: nil, reject: nil)
|
27
24
|
@accept = []
|
28
25
|
@reject = []
|
29
26
|
|
30
|
-
@accept +=
|
31
|
-
@reject +=
|
27
|
+
@accept += accept if accept
|
28
|
+
@reject += reject if reject
|
32
29
|
end
|
33
30
|
|
34
31
|
#
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -17,37 +17,39 @@ module Spidr
|
|
17
17
|
#
|
18
18
|
# Creates a new session cache.
|
19
19
|
#
|
20
|
-
# @param [Hash]
|
21
|
-
# Configuration options.
|
22
|
-
#
|
23
|
-
# @option [Hash] :proxy (Spidr.proxy)
|
20
|
+
# @param [Hash] proxy
|
24
21
|
# Proxy options.
|
25
22
|
#
|
26
|
-
# @
|
27
|
-
# Optional open timeout.
|
23
|
+
# @param [Integer] open_timeout
|
24
|
+
# Optional open connection timeout.
|
28
25
|
#
|
29
|
-
# @
|
30
|
-
# Optional
|
26
|
+
# @param [Integer] ssl_timeout
|
27
|
+
# Optional SSL connection timeout.
|
31
28
|
#
|
32
|
-
# @
|
29
|
+
# @param [Integer] read_timeout
|
33
30
|
# Optional read timeout.
|
34
31
|
#
|
35
|
-
# @
|
32
|
+
# @param [Integer] continue_timeout
|
36
33
|
# Optional `Continue` timeout.
|
37
34
|
#
|
38
|
-
# @
|
35
|
+
# @param [Integer] keep_alive_timeout
|
39
36
|
# Optional `Keep-Alive` timeout.
|
40
37
|
#
|
41
38
|
# @since 0.6.0
|
42
39
|
#
|
43
|
-
def initialize(
|
44
|
-
|
40
|
+
def initialize(proxy: Spidr.proxy,
|
41
|
+
open_timeout: Spidr.open_timeout,
|
42
|
+
ssl_timeout: Spidr.ssl_timeout,
|
43
|
+
read_timeout: Spidr.read_timeout,
|
44
|
+
continue_timeout: Spidr.continue_timeout,
|
45
|
+
keep_alive_timeout: Spidr.keep_alive_timeout)
|
46
|
+
self.proxy = proxy
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
self.open_timeout = open_timeout
|
49
|
+
self.ssl_timeout = ssl_timeout
|
50
|
+
self.read_timeout = read_timeout
|
51
|
+
self.continue_timeout = continue_timeout
|
52
|
+
self.keep_alive_timeout = keep_alive_timeout
|
51
53
|
|
52
54
|
@sessions = {}
|
53
55
|
end
|
@@ -65,7 +67,7 @@ module Spidr
|
|
65
67
|
#
|
66
68
|
def active?(url)
|
67
69
|
# normalize the url
|
68
|
-
url = URI(url
|
70
|
+
url = URI(url)
|
69
71
|
|
70
72
|
# session key
|
71
73
|
key = key_for(url)
|
@@ -84,7 +86,7 @@ module Spidr
|
|
84
86
|
#
|
85
87
|
def [](url)
|
86
88
|
# normalize the url
|
87
|
-
url = URI(url
|
89
|
+
url = URI(url)
|
88
90
|
|
89
91
|
# session key
|
90
92
|
key = key_for(url)
|
@@ -127,7 +129,7 @@ module Spidr
|
|
127
129
|
#
|
128
130
|
def kill!(url)
|
129
131
|
# normalize the url
|
130
|
-
url = URI(url
|
132
|
+
url = URI(url)
|
131
133
|
|
132
134
|
# session key
|
133
135
|
key = key_for(url)
|
data/lib/spidr/settings/proxy.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'spidr/proxy'
|
2
2
|
|
3
|
+
require 'uri/http'
|
4
|
+
|
3
5
|
module Spidr
|
4
6
|
module Settings
|
5
7
|
#
|
@@ -21,7 +23,7 @@ module Spidr
|
|
21
23
|
#
|
22
24
|
# Sets the proxy information used by Agent objects.
|
23
25
|
#
|
24
|
-
# @param [Spidr::Proxy, Hash, nil] new_proxy
|
26
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
25
27
|
# The new proxy information.
|
26
28
|
#
|
27
29
|
# @option new_proxy [String] :host
|
@@ -41,11 +43,23 @@ module Spidr
|
|
41
43
|
#
|
42
44
|
def proxy=(new_proxy)
|
43
45
|
@proxy = case new_proxy
|
44
|
-
when Spidr::Proxy
|
45
|
-
|
46
|
-
when
|
46
|
+
when Spidr::Proxy
|
47
|
+
new_proxy
|
48
|
+
when Hash
|
49
|
+
Spidr::Proxy.new(**new_proxy)
|
50
|
+
when String, URI::HTTP
|
51
|
+
proxy_uri = URI(new_proxy)
|
52
|
+
|
53
|
+
Spidr::Proxy.new(
|
54
|
+
host: proxy_uri.host,
|
55
|
+
port: proxy_uri.port,
|
56
|
+
user: proxy_uri.user,
|
57
|
+
password: proxy_uri.password
|
58
|
+
)
|
59
|
+
when nil
|
60
|
+
Spidr::Proxy.new
|
47
61
|
else
|
48
|
-
raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
|
62
|
+
raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
|
49
63
|
end
|
50
64
|
end
|
51
65
|
|
data/lib/spidr/spidr.rb
CHANGED
@@ -16,6 +16,7 @@ module Spidr
|
|
16
16
|
# @since 0.5.0
|
17
17
|
#
|
18
18
|
def self.robots?
|
19
|
+
@robots ||= false
|
19
20
|
@robots
|
20
21
|
end
|
21
22
|
|
@@ -35,22 +36,31 @@ module Spidr
|
|
35
36
|
#
|
36
37
|
# @see Agent.start_at
|
37
38
|
#
|
38
|
-
def self.start_at(url
|
39
|
-
Agent.start_at(url
|
39
|
+
def self.start_at(url,**kwargs,&block)
|
40
|
+
Agent.start_at(url,**kwargs,&block)
|
40
41
|
end
|
41
42
|
|
42
43
|
#
|
43
44
|
# @see Agent.host
|
44
45
|
#
|
45
|
-
def self.host(name
|
46
|
-
Agent.host(name
|
46
|
+
def self.host(name,**kwargs,&block)
|
47
|
+
Agent.host(name,**kwargs,&block)
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# @see Agent.domain
|
52
|
+
#
|
53
|
+
# @since 0.7.0
|
54
|
+
#
|
55
|
+
def self.domain(name,options={},&block)
|
56
|
+
Agent.domain(name,options,&block)
|
47
57
|
end
|
48
58
|
|
49
59
|
#
|
50
60
|
# @see Agent.site
|
51
61
|
#
|
52
|
-
def self.site(url
|
53
|
-
Agent.site(url
|
62
|
+
def self.site(url,**kwargs,&block)
|
63
|
+
Agent.site(url,**kwargs,&block)
|
54
64
|
end
|
55
65
|
|
56
66
|
#
|
data/lib/spidr/version.rb
CHANGED