spidr 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/lib/spidr/agent/filters.rb
CHANGED
@@ -400,7 +400,7 @@ module Spidr
|
|
400
400
|
@schemes = []
|
401
401
|
|
402
402
|
if options[:schemes]
|
403
|
-
|
403
|
+
self.schemes = options[:schemes]
|
404
404
|
else
|
405
405
|
@schemes << 'http'
|
406
406
|
|
@@ -439,14 +439,6 @@ module Spidr
|
|
439
439
|
if options[:host]
|
440
440
|
visit_hosts_like(options[:host])
|
441
441
|
end
|
442
|
-
|
443
|
-
if options[:queue]
|
444
|
-
self.queue = options[:queue]
|
445
|
-
end
|
446
|
-
|
447
|
-
if options[:history]
|
448
|
-
self.history = options[:history]
|
449
|
-
end
|
450
442
|
end
|
451
443
|
|
452
444
|
#
|
@@ -0,0 +1,36 @@
|
|
1
|
+
begin
|
2
|
+
require 'robots'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module Spidr
|
7
|
+
class Agent
|
8
|
+
#
|
9
|
+
# Initializes the robots filter.
|
10
|
+
#
|
11
|
+
def initialize_robots
|
12
|
+
unless Object.const_defined?(:Robots)
|
13
|
+
raise(ArgumentError,":robots option given but unable to require 'robots' gem")
|
14
|
+
end
|
15
|
+
|
16
|
+
@robots = Robots.new(@user_agent)
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Determines whether a URL is allowed by the robot policy.
|
21
|
+
#
|
22
|
+
# @param [URI::HTTP, String] url
|
23
|
+
# The URL to check.
|
24
|
+
#
|
25
|
+
# @return [Boolean]
|
26
|
+
# Specifies whether a URL is allowed by the robot policy.
|
27
|
+
#
|
28
|
+
def robot_allowed?(url)
|
29
|
+
if @robots
|
30
|
+
@robots.allowed?(url)
|
31
|
+
else
|
32
|
+
true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/lib/spidr/page.rb
CHANGED
@@ -1,7 +1,3 @@
|
|
1
|
-
require 'spidr/page/headers'
|
2
|
-
require 'spidr/page/body'
|
3
|
-
require 'spidr/page/links'
|
4
|
-
|
5
1
|
module Spidr
|
6
2
|
#
|
7
3
|
# Represents a requested page from a website.
|
@@ -34,42 +30,89 @@ module Spidr
|
|
34
30
|
end
|
35
31
|
|
36
32
|
#
|
37
|
-
# The
|
33
|
+
# The body of the response.
|
34
|
+
#
|
35
|
+
# @return [String]
|
36
|
+
# The body of the response.
|
38
37
|
#
|
39
|
-
|
40
|
-
|
38
|
+
def body
|
39
|
+
(response.body || '')
|
40
|
+
end
|
41
|
+
|
42
|
+
alias to_s body
|
43
|
+
|
41
44
|
#
|
42
|
-
#
|
43
|
-
# Deprecated in 0.3.0 and will be removed in 0.4.0.
|
44
|
-
# Use {#meta_redirects} instead.
|
45
|
+
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
45
46
|
#
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
48
|
+
# The document that represents HTML or XML pages.
|
49
|
+
# Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
|
50
|
+
# the page could not be parsed properly.
|
51
|
+
#
|
52
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
53
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
54
|
+
#
|
55
|
+
def doc
|
56
|
+
unless body.empty?
|
57
|
+
doc_class = if html?
|
58
|
+
Nokogiri::HTML::Document
|
59
|
+
elsif rss? || atom? || xml? || xsl?
|
60
|
+
Nokogiri::XML::Document
|
61
|
+
end
|
49
62
|
|
50
|
-
|
63
|
+
if doc_class
|
64
|
+
begin
|
65
|
+
@doc ||= doc_class.parse(body, @url.to_s, content_charset)
|
66
|
+
rescue
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
51
70
|
end
|
52
71
|
|
53
72
|
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
73
|
+
# Searches the document for XPath or CSS Path paths.
|
74
|
+
#
|
75
|
+
# @param [Array<String>] paths
|
76
|
+
# CSS or XPath expressions to search the document with.
|
57
77
|
#
|
58
|
-
# @return [
|
59
|
-
#
|
78
|
+
# @return [Array]
|
79
|
+
# The matched nodes from the document.
|
80
|
+
# Returns an empty Array if no nodes were matched, or if the page
|
81
|
+
# is not an HTML or XML document.
|
60
82
|
#
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
83
|
+
# @example
|
84
|
+
# page.search('//a[@href]')
|
85
|
+
#
|
86
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
87
|
+
#
|
88
|
+
def search(*paths)
|
89
|
+
if doc
|
90
|
+
doc.search(*paths)
|
67
91
|
else
|
68
|
-
|
92
|
+
[]
|
69
93
|
end
|
70
94
|
end
|
71
95
|
|
72
|
-
|
96
|
+
#
|
97
|
+
# Searches for the first occurrence of an XPath or CSS Path expression.
|
98
|
+
#
|
99
|
+
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
100
|
+
# The first matched node. Returns `nil` if no nodes could be matched,
|
101
|
+
# or if the page is not an HTML or XML document.
|
102
|
+
#
|
103
|
+
# @example
|
104
|
+
# page.at('//title')
|
105
|
+
#
|
106
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
|
107
|
+
#
|
108
|
+
def at(*arguments)
|
109
|
+
if doc
|
110
|
+
doc.at(*arguments)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
alias / search
|
115
|
+
alias % at
|
73
116
|
|
74
117
|
protected
|
75
118
|
|
@@ -90,7 +133,7 @@ module Spidr
|
|
90
133
|
#
|
91
134
|
def method_missing(name,*arguments,&block)
|
92
135
|
if (arguments.empty? && block.nil?)
|
93
|
-
header_name = name.to_s.
|
136
|
+
header_name = name.to_s.tr('_','-')
|
94
137
|
|
95
138
|
if @response.key?(header_name)
|
96
139
|
return @response[header_name]
|
@@ -102,3 +145,8 @@ module Spidr
|
|
102
145
|
|
103
146
|
end
|
104
147
|
end
|
148
|
+
|
149
|
+
require 'spidr/page/status_codes'
|
150
|
+
require 'spidr/page/content_types'
|
151
|
+
require 'spidr/page/cookies'
|
152
|
+
require 'spidr/page/html'
|
@@ -1,98 +1,5 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module Spidr
|
4
2
|
class Page
|
5
|
-
# Reserved names used within Cookie strings
|
6
|
-
RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
|
7
|
-
|
8
|
-
#
|
9
|
-
# The response code from the page.
|
10
|
-
#
|
11
|
-
# @return [Integer]
|
12
|
-
# Response code from the page.
|
13
|
-
#
|
14
|
-
def code
|
15
|
-
response.code.to_i
|
16
|
-
end
|
17
|
-
|
18
|
-
#
|
19
|
-
# Determines if the response code is `200`.
|
20
|
-
#
|
21
|
-
# @return [Boolean]
|
22
|
-
# Specifies whether the response code is `200`.
|
23
|
-
#
|
24
|
-
def is_ok?
|
25
|
-
code == 200
|
26
|
-
end
|
27
|
-
|
28
|
-
alias ok? is_ok?
|
29
|
-
|
30
|
-
#
|
31
|
-
# Determines if the response code is `308`.
|
32
|
-
#
|
33
|
-
# @return [Boolean]
|
34
|
-
# Specifies whether the response code is `308`.
|
35
|
-
#
|
36
|
-
def timedout?
|
37
|
-
code == 308
|
38
|
-
end
|
39
|
-
|
40
|
-
#
|
41
|
-
# Determines if the response code is `400`.
|
42
|
-
#
|
43
|
-
# @return [Boolean]
|
44
|
-
# Specifies whether the response code is `400`.
|
45
|
-
#
|
46
|
-
def bad_request?
|
47
|
-
code == 400
|
48
|
-
end
|
49
|
-
|
50
|
-
#
|
51
|
-
# Determines if the response code is `401`.
|
52
|
-
#
|
53
|
-
# @return [Boolean]
|
54
|
-
# Specifies whether the response code is `401`.
|
55
|
-
#
|
56
|
-
def is_unauthorized?
|
57
|
-
code == 401
|
58
|
-
end
|
59
|
-
|
60
|
-
alias unauthorized? is_unauthorized?
|
61
|
-
|
62
|
-
#
|
63
|
-
# Determines if the response code is `403`.
|
64
|
-
#
|
65
|
-
# @return [Boolean]
|
66
|
-
# Specifies whether the response code is `403`.
|
67
|
-
#
|
68
|
-
def is_forbidden?
|
69
|
-
code == 403
|
70
|
-
end
|
71
|
-
|
72
|
-
alias forbidden? is_forbidden?
|
73
|
-
|
74
|
-
#
|
75
|
-
# Determines if the response code is `404`.
|
76
|
-
#
|
77
|
-
# @return [Boolean]
|
78
|
-
# Specifies whether the response code is `404`.
|
79
|
-
#
|
80
|
-
def is_missing?
|
81
|
-
code == 404
|
82
|
-
end
|
83
|
-
|
84
|
-
alias missing? is_missing?
|
85
|
-
|
86
|
-
#
|
87
|
-
# Determines if the response code is `500`.
|
88
|
-
#
|
89
|
-
# @return [Boolean]
|
90
|
-
# Specifies whether the response code is `500`.
|
91
|
-
#
|
92
|
-
def had_internal_server_error?
|
93
|
-
code == 500
|
94
|
-
end
|
95
|
-
|
96
3
|
#
|
97
4
|
# The Content-Type of the page.
|
98
5
|
#
|
@@ -100,7 +7,7 @@ module Spidr
|
|
100
7
|
# The Content-Type of the page.
|
101
8
|
#
|
102
9
|
def content_type
|
103
|
-
|
10
|
+
@response['Content-Type'] || ''
|
104
11
|
end
|
105
12
|
|
106
13
|
#
|
@@ -112,7 +19,7 @@ module Spidr
|
|
112
19
|
# @since 0.2.2
|
113
20
|
#
|
114
21
|
def content_types
|
115
|
-
(
|
22
|
+
@response.get_fields('content-type') || []
|
116
23
|
end
|
117
24
|
|
118
25
|
#
|
@@ -314,57 +221,5 @@ module Spidr
|
|
314
221
|
def zip?
|
315
222
|
is_content_type?('application/zip')
|
316
223
|
end
|
317
|
-
|
318
|
-
#
|
319
|
-
# The raw Cookie String sent along with the page.
|
320
|
-
#
|
321
|
-
# @return [String]
|
322
|
-
# The raw Cookie from the response.
|
323
|
-
#
|
324
|
-
# @since 0.2.7
|
325
|
-
#
|
326
|
-
def cookie
|
327
|
-
(response['Set-Cookie'] || '')
|
328
|
-
end
|
329
|
-
|
330
|
-
alias raw_cookie cookie
|
331
|
-
|
332
|
-
#
|
333
|
-
# The Cookie values sent along with the page.
|
334
|
-
#
|
335
|
-
# @return [Array<String>]
|
336
|
-
# The Cookies from the response.
|
337
|
-
#
|
338
|
-
# @since 0.2.2
|
339
|
-
#
|
340
|
-
def cookies
|
341
|
-
(headers['set-cookie'] || [])
|
342
|
-
end
|
343
|
-
|
344
|
-
#
|
345
|
-
# The Cookie key -> value pairs returned with the response.
|
346
|
-
#
|
347
|
-
# @return [Hash{String => String}]
|
348
|
-
# The cookie keys and values.
|
349
|
-
#
|
350
|
-
# @since 0.2.2
|
351
|
-
#
|
352
|
-
def cookie_params
|
353
|
-
params = {}
|
354
|
-
|
355
|
-
cookies.each do |value|
|
356
|
-
value.split(';').each do |param|
|
357
|
-
param.strip!
|
358
|
-
|
359
|
-
name, value = param.split('=',2)
|
360
|
-
|
361
|
-
unless RESERVED_COOKIE_NAMES.include?(name)
|
362
|
-
params[name] = (value || '')
|
363
|
-
end
|
364
|
-
end
|
365
|
-
end
|
366
|
-
|
367
|
-
return params
|
368
|
-
end
|
369
224
|
end
|
370
225
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Spidr
|
4
|
+
class Page
|
5
|
+
# Reserved names used within Cookie strings
|
6
|
+
RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i
|
7
|
+
|
8
|
+
#
|
9
|
+
# The raw Cookie String sent along with the page.
|
10
|
+
#
|
11
|
+
# @return [String]
|
12
|
+
# The raw Cookie from the response.
|
13
|
+
#
|
14
|
+
# @since 0.2.7
|
15
|
+
#
|
16
|
+
def cookie
|
17
|
+
@response['Set-Cookie'] || ''
|
18
|
+
end
|
19
|
+
|
20
|
+
alias raw_cookie cookie
|
21
|
+
|
22
|
+
#
|
23
|
+
# The Cookie values sent along with the page.
|
24
|
+
#
|
25
|
+
# @return [Array<String>]
|
26
|
+
# The Cookies from the response.
|
27
|
+
#
|
28
|
+
# @since 0.2.2
|
29
|
+
#
|
30
|
+
def cookies
|
31
|
+
(@response.get_fields('Set-Cookie') || [])
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# The Cookie key -> value pairs returned with the response.
|
36
|
+
#
|
37
|
+
# @return [Hash{String => String}]
|
38
|
+
# The cookie keys and values.
|
39
|
+
#
|
40
|
+
# @since 0.2.2
|
41
|
+
#
|
42
|
+
def cookie_params
|
43
|
+
params = {}
|
44
|
+
|
45
|
+
cookies.each do |value|
|
46
|
+
value.split(';').each do |param|
|
47
|
+
param.strip!
|
48
|
+
|
49
|
+
name, value = param.split('=',2)
|
50
|
+
|
51
|
+
unless name =~ RESERVED_COOKIE_NAMES
|
52
|
+
params[name] = (value || '')
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
return params
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -1,10 +1,22 @@
|
|
1
|
+
require 'nokogiri'
|
1
2
|
require 'spidr/extensions/uri'
|
2
|
-
require 'uri'
|
3
3
|
|
4
4
|
module Spidr
|
5
5
|
class Page
|
6
6
|
include Enumerable
|
7
7
|
|
8
|
+
#
|
9
|
+
# The title of the HTML page.
|
10
|
+
#
|
11
|
+
# @return [String]
|
12
|
+
# The inner-text of the title element of the page.
|
13
|
+
#
|
14
|
+
def title
|
15
|
+
if (node = at('//title'))
|
16
|
+
node.inner_text
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
8
20
|
#
|
9
21
|
# Enumerates over the meta-redirect links in the page.
|
10
22
|
#
|
@@ -21,7 +33,7 @@ module Spidr
|
|
21
33
|
# @since 0.3.0
|
22
34
|
#
|
23
35
|
def each_meta_redirect
|
24
|
-
return enum_for(
|
36
|
+
return enum_for(__method__) unless block_given?
|
25
37
|
|
26
38
|
if (html? && doc)
|
27
39
|
search('//meta[@http-equiv and @content]').each do |node|
|
@@ -44,7 +56,7 @@ module Spidr
|
|
44
56
|
# Specifies whether the page includes page-level redirects.
|
45
57
|
#
|
46
58
|
def meta_redirect?
|
47
|
-
!
|
59
|
+
!each_meta_redirect.first.nil?
|
48
60
|
end
|
49
61
|
|
50
62
|
#
|
@@ -59,6 +71,23 @@ module Spidr
|
|
59
71
|
each_meta_redirect.to_a
|
60
72
|
end
|
61
73
|
|
74
|
+
#
|
75
|
+
# The meta-redirect links of the page.
|
76
|
+
#
|
77
|
+
# @return [Array<String>]
|
78
|
+
# All meta-redirect links in the page.
|
79
|
+
#
|
80
|
+
# @deprecated
|
81
|
+
# Deprecated in 0.3.0 and will be removed in 0.4.0.
|
82
|
+
# Use {#meta_redirects} instead.
|
83
|
+
#
|
84
|
+
def meta_redirect
|
85
|
+
warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
|
86
|
+
warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
|
87
|
+
|
88
|
+
meta_redirects
|
89
|
+
end
|
90
|
+
|
62
91
|
#
|
63
92
|
# Enumerates over every HTTP or meta-redirect link in the page.
|
64
93
|
#
|
@@ -74,18 +103,14 @@ module Spidr
|
|
74
103
|
# @since 0.3.0
|
75
104
|
#
|
76
105
|
def each_redirect(&block)
|
77
|
-
return enum_for(
|
106
|
+
return enum_for(__method__) unless block
|
78
107
|
|
79
|
-
|
80
|
-
|
81
|
-
|
108
|
+
if (locations = @response.get_fields('Location'))
|
109
|
+
# Location headers override any meta-refresh redirects in the HTML
|
110
|
+
locations.each(&block)
|
111
|
+
else
|
82
112
|
# check page-level meta redirects if there isn't a location header
|
83
113
|
each_meta_redirect(&block)
|
84
|
-
elsif location.kind_of?(Array)
|
85
|
-
location.each(&block)
|
86
|
-
else
|
87
|
-
# usually the location header contains a single String
|
88
|
-
yield location
|
89
114
|
end
|
90
115
|
end
|
91
116
|
|
@@ -115,7 +140,7 @@ module Spidr
|
|
115
140
|
# @since 0.5.0
|
116
141
|
#
|
117
142
|
def each_mailto
|
118
|
-
return enum_for(
|
143
|
+
return enum_for(__method__) unless block_given?
|
119
144
|
|
120
145
|
if (html? && doc)
|
121
146
|
doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
|
@@ -151,7 +176,7 @@ module Spidr
|
|
151
176
|
# @since 0.3.0
|
152
177
|
#
|
153
178
|
def each_link
|
154
|
-
return enum_for(
|
179
|
+
return enum_for(__method__) unless block_given?
|
155
180
|
|
156
181
|
filter = lambda { |url|
|
157
182
|
yield url unless (url.nil? || url.empty?)
|
@@ -208,7 +233,7 @@ module Spidr
|
|
208
233
|
# @since 0.3.0
|
209
234
|
#
|
210
235
|
def each_url
|
211
|
-
return enum_for(
|
236
|
+
return enum_for(__method__) unless block_given?
|
212
237
|
|
213
238
|
each_link do |link|
|
214
239
|
if (url = to_absolute(link))
|
@@ -239,15 +264,14 @@ module Spidr
|
|
239
264
|
# The normalized URI.
|
240
265
|
#
|
241
266
|
def to_absolute(link)
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
if new_url.path
|
249
|
-
path = new_url.path
|
267
|
+
link = link.to_s
|
268
|
+
new_url = begin
|
269
|
+
url.merge(link)
|
270
|
+
rescue Exception
|
271
|
+
return
|
272
|
+
end
|
250
273
|
|
274
|
+
if (path = new_url.path)
|
251
275
|
# ensure that paths begin with a leading '/' for URI::FTP
|
252
276
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
253
277
|
path.insert(0,'/')
|