spidr 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.editorconfig +11 -0
- data/.github/workflows/ruby.yml +26 -0
- data/.gitignore +4 -5
- data/ChangeLog.md +17 -0
- data/Gemfile +8 -5
- data/LICENSE.txt +1 -1
- data/README.md +137 -78
- data/Rakefile +1 -0
- data/gemspec.yml +8 -1
- data/lib/spidr/agent/actions.rb +1 -1
- data/lib/spidr/agent/events.rb +1 -1
- data/lib/spidr/agent/filters.rb +55 -56
- data/lib/spidr/agent/sanitizers.rb +6 -9
- data/lib/spidr/agent.rb +230 -120
- data/lib/spidr/auth_store.rb +10 -6
- data/lib/spidr/page/content_types.rb +51 -0
- data/lib/spidr/page/html.rb +17 -19
- data/lib/spidr/page/status_codes.rb +12 -10
- data/lib/spidr/proxy.rb +6 -14
- data/lib/spidr/rules.rb +5 -8
- data/lib/spidr/session_cache.rb +23 -21
- data/lib/spidr/settings/proxy.rb +19 -5
- data/lib/spidr/spidr.rb +16 -6
- data/lib/spidr/version.rb +1 -1
- data/spec/agent_spec.rb +357 -10
- data/spec/example_page.rb +2 -0
- data/spec/page/content_types_spec.rb +22 -0
- data/spec/page/html_spec.rb +255 -51
- data/spec/page/status_codes_spec.rb +4 -4
- data/spec/proxy_spec.rb +2 -2
- data/spec/settings/proxy_examples.rb +31 -11
- data/spec/spec_helper.rb +3 -0
- metadata +19 -19
- data/.travis.yml +0 -14
data/lib/spidr/page/html.rb
CHANGED
@@ -105,7 +105,9 @@ module Spidr
|
|
105
105
|
def each_redirect(&block)
|
106
106
|
return enum_for(__method__) unless block
|
107
107
|
|
108
|
-
|
108
|
+
locations = @response.get_fields('Location')
|
109
|
+
|
110
|
+
unless (locations.nil? || locations.empty?)
|
109
111
|
# Location headers override any meta-refresh redirects in the HTML
|
110
112
|
locations.each(&block)
|
111
113
|
else
|
@@ -175,34 +177,30 @@ module Spidr
|
|
175
177
|
#
|
176
178
|
# @since 0.3.0
|
177
179
|
#
|
178
|
-
def each_link
|
180
|
+
def each_link(&block)
|
179
181
|
return enum_for(__method__) unless block_given?
|
180
182
|
|
181
|
-
|
182
|
-
yield url unless (url.nil? || url.empty?)
|
183
|
-
}
|
184
|
-
|
185
|
-
each_redirect(&filter) if is_redirect?
|
183
|
+
each_redirect(&block) if is_redirect?
|
186
184
|
|
187
185
|
if (html? && doc)
|
188
|
-
doc.search('//a[@href]').each do |a|
|
189
|
-
|
186
|
+
doc.search('//a[@href[string()]]').each do |a|
|
187
|
+
yield a.get_attribute('href')
|
190
188
|
end
|
191
189
|
|
192
|
-
doc.search('//frame[@src]').each do |iframe|
|
193
|
-
|
190
|
+
doc.search('//frame[@src[string()]]').each do |iframe|
|
191
|
+
yield iframe.get_attribute('src')
|
194
192
|
end
|
195
193
|
|
196
|
-
doc.search('//iframe[@src]').each do |iframe|
|
197
|
-
|
194
|
+
doc.search('//iframe[@src[string()]]').each do |iframe|
|
195
|
+
yield iframe.get_attribute('src')
|
198
196
|
end
|
199
197
|
|
200
|
-
doc.search('//link[@href]').each do |link|
|
201
|
-
|
198
|
+
doc.search('//link[@href[string()]]').each do |link|
|
199
|
+
yield link.get_attribute('href')
|
202
200
|
end
|
203
201
|
|
204
|
-
doc.search('//script[@src]').each do |script|
|
205
|
-
|
202
|
+
doc.search('//script[@src[string()]]').each do |script|
|
203
|
+
yield script.get_attribute('src')
|
206
204
|
end
|
207
205
|
end
|
208
206
|
end
|
@@ -211,7 +209,7 @@ module Spidr
|
|
211
209
|
# The links from within the page.
|
212
210
|
#
|
213
211
|
# @return [Array<String>]
|
214
|
-
# All links within the HTML page, frame
|
212
|
+
# All links within the HTML page, `frame`/`iframe` source URLs and any
|
215
213
|
# links in the `Location` header.
|
216
214
|
#
|
217
215
|
def links
|
@@ -271,7 +269,7 @@ module Spidr
|
|
271
269
|
return
|
272
270
|
end
|
273
271
|
|
274
|
-
if (path = new_url.path)
|
272
|
+
if (!new_url.opaque) && (path = new_url.path)
|
275
273
|
# ensure that paths begin with a leading '/' for URI::FTP
|
276
274
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
277
275
|
path.insert(0,'/')
|
@@ -22,16 +22,6 @@ module Spidr
|
|
22
22
|
|
23
23
|
alias ok? is_ok?
|
24
24
|
|
25
|
-
#
|
26
|
-
# Determines if the response code is `308`.
|
27
|
-
#
|
28
|
-
# @return [Boolean]
|
29
|
-
# Specifies whether the response code is `308`.
|
30
|
-
#
|
31
|
-
def timedout?
|
32
|
-
code == 308
|
33
|
-
end
|
34
|
-
|
35
25
|
#
|
36
26
|
# Determines if the response code is `400`.
|
37
27
|
#
|
@@ -78,6 +68,18 @@ module Spidr
|
|
78
68
|
|
79
69
|
alias missing? is_missing?
|
80
70
|
|
71
|
+
#
|
72
|
+
# Determines if the response code is `408`.
|
73
|
+
#
|
74
|
+
# @return [Boolean]
|
75
|
+
# Specifies whether the response code is `408`.
|
76
|
+
#
|
77
|
+
def is_timedout?
|
78
|
+
code == 408
|
79
|
+
end
|
80
|
+
|
81
|
+
alias timedout? is_timedout?
|
82
|
+
|
81
83
|
#
|
82
84
|
# Determines if the response code is `500`.
|
83
85
|
#
|
data/lib/spidr/proxy.rb
CHANGED
@@ -10,28 +10,20 @@ module Spidr
|
|
10
10
|
#
|
11
11
|
# Initializes the proxy.
|
12
12
|
#
|
13
|
-
# @param [
|
14
|
-
# Attributes for the proxy.
|
15
|
-
#
|
16
|
-
# @option attributes [String] :host
|
13
|
+
# @param [String] host
|
17
14
|
# The host the proxy is running on.
|
18
15
|
#
|
19
|
-
# @
|
16
|
+
# @param [Integer] port
|
20
17
|
# The port the proxy is running on.
|
21
18
|
#
|
22
|
-
# @
|
19
|
+
# @param [String] user
|
23
20
|
# The user to authenticate as with the proxy.
|
24
21
|
#
|
25
|
-
# @
|
22
|
+
# @param [String] password
|
26
23
|
# The password to authenticate with.
|
27
24
|
#
|
28
|
-
def initialize(
|
29
|
-
super(
|
30
|
-
attributes[:host],
|
31
|
-
attributes.fetch(:port,DEFAULT_PORT),
|
32
|
-
attributes[:user],
|
33
|
-
attributes[:password]
|
34
|
-
)
|
25
|
+
def initialize(host: nil, port: DEFAULT_PORT, user: nil, password: nil)
|
26
|
+
super(host,port,user,password)
|
35
27
|
end
|
36
28
|
|
37
29
|
#
|
data/lib/spidr/rules.rb
CHANGED
@@ -14,21 +14,18 @@ module Spidr
|
|
14
14
|
#
|
15
15
|
# Creates a new Rules object.
|
16
16
|
#
|
17
|
-
# @param [
|
18
|
-
# Additional options.
|
19
|
-
#
|
20
|
-
# @option options [Array<String, Regexp, Proc>] :accept
|
17
|
+
# @param [Array<String, Regexp, Proc>, nil] accept
|
21
18
|
# The patterns to accept data with.
|
22
19
|
#
|
23
|
-
# @
|
20
|
+
# @param [Array<String, Regexp, Proc>, nil] reject
|
24
21
|
# The patterns to reject data with.
|
25
22
|
#
|
26
|
-
def initialize(
|
23
|
+
def initialize(accept: nil, reject: nil)
|
27
24
|
@accept = []
|
28
25
|
@reject = []
|
29
26
|
|
30
|
-
@accept +=
|
31
|
-
@reject +=
|
27
|
+
@accept += accept if accept
|
28
|
+
@reject += reject if reject
|
32
29
|
end
|
33
30
|
|
34
31
|
#
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -17,37 +17,39 @@ module Spidr
|
|
17
17
|
#
|
18
18
|
# Creates a new session cache.
|
19
19
|
#
|
20
|
-
# @param [Hash]
|
21
|
-
# Configuration options.
|
22
|
-
#
|
23
|
-
# @option [Hash] :proxy (Spidr.proxy)
|
20
|
+
# @param [Hash] proxy
|
24
21
|
# Proxy options.
|
25
22
|
#
|
26
|
-
# @
|
27
|
-
# Optional open timeout.
|
23
|
+
# @param [Integer] open_timeout
|
24
|
+
# Optional open connection timeout.
|
28
25
|
#
|
29
|
-
# @
|
30
|
-
# Optional
|
26
|
+
# @param [Integer] ssl_timeout
|
27
|
+
# Optional SSL connection timeout.
|
31
28
|
#
|
32
|
-
# @
|
29
|
+
# @param [Integer] read_timeout
|
33
30
|
# Optional read timeout.
|
34
31
|
#
|
35
|
-
# @
|
32
|
+
# @param [Integer] continue_timeout
|
36
33
|
# Optional `Continue` timeout.
|
37
34
|
#
|
38
|
-
# @
|
35
|
+
# @param [Integer] keep_alive_timeout
|
39
36
|
# Optional `Keep-Alive` timeout.
|
40
37
|
#
|
41
38
|
# @since 0.6.0
|
42
39
|
#
|
43
|
-
def initialize(
|
44
|
-
|
40
|
+
def initialize(proxy: Spidr.proxy,
|
41
|
+
open_timeout: Spidr.open_timeout,
|
42
|
+
ssl_timeout: Spidr.ssl_timeout,
|
43
|
+
read_timeout: Spidr.read_timeout,
|
44
|
+
continue_timeout: Spidr.continue_timeout,
|
45
|
+
keep_alive_timeout: Spidr.keep_alive_timeout)
|
46
|
+
self.proxy = proxy
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
48
|
+
self.open_timeout = open_timeout
|
49
|
+
self.ssl_timeout = ssl_timeout
|
50
|
+
self.read_timeout = read_timeout
|
51
|
+
self.continue_timeout = continue_timeout
|
52
|
+
self.keep_alive_timeout = keep_alive_timeout
|
51
53
|
|
52
54
|
@sessions = {}
|
53
55
|
end
|
@@ -65,7 +67,7 @@ module Spidr
|
|
65
67
|
#
|
66
68
|
def active?(url)
|
67
69
|
# normalize the url
|
68
|
-
url = URI(url
|
70
|
+
url = URI(url)
|
69
71
|
|
70
72
|
# session key
|
71
73
|
key = key_for(url)
|
@@ -84,7 +86,7 @@ module Spidr
|
|
84
86
|
#
|
85
87
|
def [](url)
|
86
88
|
# normalize the url
|
87
|
-
url = URI(url
|
89
|
+
url = URI(url)
|
88
90
|
|
89
91
|
# session key
|
90
92
|
key = key_for(url)
|
@@ -127,7 +129,7 @@ module Spidr
|
|
127
129
|
#
|
128
130
|
def kill!(url)
|
129
131
|
# normalize the url
|
130
|
-
url = URI(url
|
132
|
+
url = URI(url)
|
131
133
|
|
132
134
|
# session key
|
133
135
|
key = key_for(url)
|
data/lib/spidr/settings/proxy.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
require 'spidr/proxy'
|
2
2
|
|
3
|
+
require 'uri/http'
|
4
|
+
|
3
5
|
module Spidr
|
4
6
|
module Settings
|
5
7
|
#
|
@@ -21,7 +23,7 @@ module Spidr
|
|
21
23
|
#
|
22
24
|
# Sets the proxy information used by Agent objects.
|
23
25
|
#
|
24
|
-
# @param [Spidr::Proxy, Hash, nil] new_proxy
|
26
|
+
# @param [Spidr::Proxy, Hash, URI::HTTP, String, nil] new_proxy
|
25
27
|
# The new proxy information.
|
26
28
|
#
|
27
29
|
# @option new_proxy [String] :host
|
@@ -41,11 +43,23 @@ module Spidr
|
|
41
43
|
#
|
42
44
|
def proxy=(new_proxy)
|
43
45
|
@proxy = case new_proxy
|
44
|
-
when Spidr::Proxy
|
45
|
-
|
46
|
-
when
|
46
|
+
when Spidr::Proxy
|
47
|
+
new_proxy
|
48
|
+
when Hash
|
49
|
+
Spidr::Proxy.new(**new_proxy)
|
50
|
+
when String, URI::HTTP
|
51
|
+
proxy_uri = URI(new_proxy)
|
52
|
+
|
53
|
+
Spidr::Proxy.new(
|
54
|
+
host: proxy_uri.host,
|
55
|
+
port: proxy_uri.port,
|
56
|
+
user: proxy_uri.user,
|
57
|
+
password: proxy_uri.password
|
58
|
+
)
|
59
|
+
when nil
|
60
|
+
Spidr::Proxy.new
|
47
61
|
else
|
48
|
-
raise(TypeError,"#{self.class}#{__method__} only accepts Proxy, Hash or nil")
|
62
|
+
raise(TypeError,"#{self.class}#{__method__} only accepts Spidr::Proxy, URI::HTTP, Hash, or nil")
|
49
63
|
end
|
50
64
|
end
|
51
65
|
|
data/lib/spidr/spidr.rb
CHANGED
@@ -16,6 +16,7 @@ module Spidr
|
|
16
16
|
# @since 0.5.0
|
17
17
|
#
|
18
18
|
def self.robots?
|
19
|
+
@robots ||= false
|
19
20
|
@robots
|
20
21
|
end
|
21
22
|
|
@@ -35,22 +36,31 @@ module Spidr
|
|
35
36
|
#
|
36
37
|
# @see Agent.start_at
|
37
38
|
#
|
38
|
-
def self.start_at(url
|
39
|
-
Agent.start_at(url
|
39
|
+
def self.start_at(url,**kwargs,&block)
|
40
|
+
Agent.start_at(url,**kwargs,&block)
|
40
41
|
end
|
41
42
|
|
42
43
|
#
|
43
44
|
# @see Agent.host
|
44
45
|
#
|
45
|
-
def self.host(name
|
46
|
-
Agent.host(name
|
46
|
+
def self.host(name,**kwargs,&block)
|
47
|
+
Agent.host(name,**kwargs,&block)
|
48
|
+
end
|
49
|
+
|
50
|
+
#
|
51
|
+
# @see Agent.domain
|
52
|
+
#
|
53
|
+
# @since 0.7.0
|
54
|
+
#
|
55
|
+
def self.domain(name,options={},&block)
|
56
|
+
Agent.domain(name,options,&block)
|
47
57
|
end
|
48
58
|
|
49
59
|
#
|
50
60
|
# @see Agent.site
|
51
61
|
#
|
52
|
-
def self.site(url
|
53
|
-
Agent.site(url
|
62
|
+
def self.site(url,**kwargs,&block)
|
63
|
+
Agent.site(url,**kwargs,&block)
|
54
64
|
end
|
55
65
|
|
56
66
|
#
|
data/lib/spidr/version.rb
CHANGED