spidr 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +14 -0
- data/ChangeLog.md +20 -2
- data/Gemfile +2 -2
- data/README.md +4 -2
- data/Rakefile +1 -0
- data/gemspec.yml +1 -1
- data/lib/spidr/agent.rb +145 -85
- data/lib/spidr/agent/filters.rb +1 -9
- data/lib/spidr/agent/robots.rb +36 -0
- data/lib/spidr/page.rb +76 -28
- data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
- data/lib/spidr/page/cookies.rb +60 -0
- data/lib/spidr/page/{links.rb → html.rb} +47 -23
- data/lib/spidr/page/status_codes.rb +112 -0
- data/lib/spidr/proxy.rb +56 -0
- data/lib/spidr/session_cache.rb +60 -24
- data/lib/spidr/settings.rb +3 -0
- data/lib/spidr/settings/proxy.rb +61 -0
- data/lib/spidr/settings/timeouts.rb +33 -0
- data/lib/spidr/settings/user_agent.rb +14 -0
- data/lib/spidr/spidr.rb +15 -79
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +158 -32
- data/spec/agent/filters_spec.rb +46 -29
- data/spec/agent/sanitizers_spec.rb +25 -31
- data/spec/agent_spec.rb +772 -50
- data/spec/example_app.rb +27 -0
- data/spec/example_page.rb +33 -0
- data/spec/page/content_types_spec.rb +150 -0
- data/spec/page/cookies_spec.rb +58 -0
- data/spec/page/html_spec.rb +524 -0
- data/spec/page/status_codes_spec.rb +87 -0
- data/spec/page_spec.rb +114 -78
- data/spec/proxy_spec.rb +45 -0
- data/spec/session_cache.rb +103 -2
- data/spec/settings/proxy_examples.rb +82 -0
- data/spec/settings/timeouts_examples.rb +93 -0
- data/spec/settings/user_agent_examples.rb +25 -0
- data/spec/spidr_spec.rb +6 -29
- data/spidr.gemspec +38 -109
- metadata +35 -31
- data/lib/spidr/page/body.rb +0 -98
- data/spec/helpers/history.rb +0 -34
- data/spec/helpers/page.rb +0 -8
- data/spec/helpers/wsoc.rb +0 -83
- data/spec/page_examples.rb +0 -21
data/lib/spidr/agent/filters.rb
CHANGED
@@ -400,7 +400,7 @@ module Spidr
|
|
400
400
|
@schemes = []
|
401
401
|
|
402
402
|
if options[:schemes]
|
403
|
-
|
403
|
+
self.schemes = options[:schemes]
|
404
404
|
else
|
405
405
|
@schemes << 'http'
|
406
406
|
|
@@ -439,14 +439,6 @@ module Spidr
|
|
439
439
|
if options[:host]
|
440
440
|
visit_hosts_like(options[:host])
|
441
441
|
end
|
442
|
-
|
443
|
-
if options[:queue]
|
444
|
-
self.queue = options[:queue]
|
445
|
-
end
|
446
|
-
|
447
|
-
if options[:history]
|
448
|
-
self.history = options[:history]
|
449
|
-
end
|
450
442
|
end
|
451
443
|
|
452
444
|
#
|
@@ -0,0 +1,36 @@
|
|
1
|
+
begin
|
2
|
+
require 'robots'
|
3
|
+
rescue LoadError
|
4
|
+
end
|
5
|
+
|
6
|
+
module Spidr
|
7
|
+
class Agent
|
8
|
+
#
|
9
|
+
# Initializes the robots filter.
|
10
|
+
#
|
11
|
+
def initialize_robots
|
12
|
+
unless Object.const_defined?(:Robots)
|
13
|
+
raise(ArgumentError,":robots option given but unable to require 'robots' gem")
|
14
|
+
end
|
15
|
+
|
16
|
+
@robots = Robots.new(@user_agent)
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Determines whether a URL is allowed by the robot policy.
|
21
|
+
#
|
22
|
+
# @param [URI::HTTP, String] url
|
23
|
+
# The URL to check.
|
24
|
+
#
|
25
|
+
# @return [Boolean]
|
26
|
+
# Specifies whether a URL is allowed by the robot policy.
|
27
|
+
#
|
28
|
+
def robot_allowed?(url)
|
29
|
+
if @robots
|
30
|
+
@robots.allowed?(url)
|
31
|
+
else
|
32
|
+
true
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
data/lib/spidr/page.rb
CHANGED
@@ -1,7 +1,3 @@
|
|
1
|
-
require 'spidr/page/headers'
|
2
|
-
require 'spidr/page/body'
|
3
|
-
require 'spidr/page/links'
|
4
|
-
|
5
1
|
module Spidr
|
6
2
|
#
|
7
3
|
# Represents a requested page from a website.
|
@@ -34,42 +30,89 @@ module Spidr
|
|
34
30
|
end
|
35
31
|
|
36
32
|
#
|
37
|
-
# The
|
33
|
+
# The body of the response.
|
34
|
+
#
|
35
|
+
# @return [String]
|
36
|
+
# The body of the response.
|
38
37
|
#
|
39
|
-
|
40
|
-
|
38
|
+
def body
|
39
|
+
(response.body || '')
|
40
|
+
end
|
41
|
+
|
42
|
+
alias to_s body
|
43
|
+
|
41
44
|
#
|
42
|
-
#
|
43
|
-
# Deprecated in 0.3.0 and will be removed in 0.4.0.
|
44
|
-
# Use {#meta_redirects} instead.
|
45
|
+
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
|
45
46
|
#
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
|
48
|
+
# The document that represents HTML or XML pages.
|
49
|
+
# Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
|
50
|
+
# the page could not be parsed properly.
|
51
|
+
#
|
52
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
|
53
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
|
54
|
+
#
|
55
|
+
def doc
|
56
|
+
unless body.empty?
|
57
|
+
doc_class = if html?
|
58
|
+
Nokogiri::HTML::Document
|
59
|
+
elsif rss? || atom? || xml? || xsl?
|
60
|
+
Nokogiri::XML::Document
|
61
|
+
end
|
49
62
|
|
50
|
-
|
63
|
+
if doc_class
|
64
|
+
begin
|
65
|
+
@doc ||= doc_class.parse(body, @url.to_s, content_charset)
|
66
|
+
rescue
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
51
70
|
end
|
52
71
|
|
53
72
|
#
|
54
|
-
#
|
55
|
-
#
|
56
|
-
#
|
73
|
+
# Searches the document using XPath or CSS Path expressions.
|
74
|
+
#
|
75
|
+
# @param [Array<String>] paths
|
76
|
+
# CSS or XPath expressions to search the document with.
|
57
77
|
#
|
58
|
-
# @return [
|
59
|
-
#
|
78
|
+
# @return [Array]
|
79
|
+
# The matched nodes from the document.
|
80
|
+
# Returns an empty Array if no nodes were matched, or if the page
|
81
|
+
# is not an HTML or XML document.
|
60
82
|
#
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
83
|
+
# @example
|
84
|
+
# page.search('//a[@href]')
|
85
|
+
#
|
86
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
|
87
|
+
#
|
88
|
+
def search(*paths)
|
89
|
+
if doc
|
90
|
+
doc.search(*paths)
|
67
91
|
else
|
68
|
-
|
92
|
+
[]
|
69
93
|
end
|
70
94
|
end
|
71
95
|
|
72
|
-
|
96
|
+
#
|
97
|
+
# Searches for the first occurrence of an XPath or CSS Path expression.
|
98
|
+
#
|
99
|
+
# @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
|
100
|
+
# The first matched node. Returns `nil` if no nodes could be matched,
|
101
|
+
# or if the page is not an HTML or XML document.
|
102
|
+
#
|
103
|
+
# @example
|
104
|
+
# page.at('//title')
|
105
|
+
#
|
106
|
+
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
|
107
|
+
#
|
108
|
+
def at(*arguments)
|
109
|
+
if doc
|
110
|
+
doc.at(*arguments)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
alias / search
|
115
|
+
alias % at
|
73
116
|
|
74
117
|
protected
|
75
118
|
|
@@ -90,7 +133,7 @@ module Spidr
|
|
90
133
|
#
|
91
134
|
def method_missing(name,*arguments,&block)
|
92
135
|
if (arguments.empty? && block.nil?)
|
93
|
-
header_name = name.to_s.
|
136
|
+
header_name = name.to_s.tr('_','-')
|
94
137
|
|
95
138
|
if @response.key?(header_name)
|
96
139
|
return @response[header_name]
|
@@ -102,3 +145,8 @@ module Spidr
|
|
102
145
|
|
103
146
|
end
|
104
147
|
end
|
148
|
+
|
149
|
+
require 'spidr/page/status_codes'
|
150
|
+
require 'spidr/page/content_types'
|
151
|
+
require 'spidr/page/cookies'
|
152
|
+
require 'spidr/page/html'
|
@@ -1,98 +1,5 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module Spidr
|
4
2
|
class Page
|
5
|
-
# Reserved names used within Cookie strings
|
6
|
-
RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
|
7
|
-
|
8
|
-
#
|
9
|
-
# The response code from the page.
|
10
|
-
#
|
11
|
-
# @return [Integer]
|
12
|
-
# Response code from the page.
|
13
|
-
#
|
14
|
-
def code
|
15
|
-
response.code.to_i
|
16
|
-
end
|
17
|
-
|
18
|
-
#
|
19
|
-
# Determines if the response code is `200`.
|
20
|
-
#
|
21
|
-
# @return [Boolean]
|
22
|
-
# Specifies whether the response code is `200`.
|
23
|
-
#
|
24
|
-
def is_ok?
|
25
|
-
code == 200
|
26
|
-
end
|
27
|
-
|
28
|
-
alias ok? is_ok?
|
29
|
-
|
30
|
-
#
|
31
|
-
# Determines if the response code is `308`.
|
32
|
-
#
|
33
|
-
# @return [Boolean]
|
34
|
-
# Specifies whether the response code is `308`.
|
35
|
-
#
|
36
|
-
def timedout?
|
37
|
-
code == 308
|
38
|
-
end
|
39
|
-
|
40
|
-
#
|
41
|
-
# Determines if the response code is `400`.
|
42
|
-
#
|
43
|
-
# @return [Boolean]
|
44
|
-
# Specifies whether the response code is `400`.
|
45
|
-
#
|
46
|
-
def bad_request?
|
47
|
-
code == 400
|
48
|
-
end
|
49
|
-
|
50
|
-
#
|
51
|
-
# Determines if the response code is `401`.
|
52
|
-
#
|
53
|
-
# @return [Boolean]
|
54
|
-
# Specifies whether the response code is `401`.
|
55
|
-
#
|
56
|
-
def is_unauthorized?
|
57
|
-
code == 401
|
58
|
-
end
|
59
|
-
|
60
|
-
alias unauthorized? is_unauthorized?
|
61
|
-
|
62
|
-
#
|
63
|
-
# Determines if the response code is `403`.
|
64
|
-
#
|
65
|
-
# @return [Boolean]
|
66
|
-
# Specifies whether the response code is `403`.
|
67
|
-
#
|
68
|
-
def is_forbidden?
|
69
|
-
code == 403
|
70
|
-
end
|
71
|
-
|
72
|
-
alias forbidden? is_forbidden?
|
73
|
-
|
74
|
-
#
|
75
|
-
# Determines if the response code is `404`.
|
76
|
-
#
|
77
|
-
# @return [Boolean]
|
78
|
-
# Specifies whether the response code is `404`.
|
79
|
-
#
|
80
|
-
def is_missing?
|
81
|
-
code == 404
|
82
|
-
end
|
83
|
-
|
84
|
-
alias missing? is_missing?
|
85
|
-
|
86
|
-
#
|
87
|
-
# Determines if the response code is `500`.
|
88
|
-
#
|
89
|
-
# @return [Boolean]
|
90
|
-
# Specifies whether the response code is `500`.
|
91
|
-
#
|
92
|
-
def had_internal_server_error?
|
93
|
-
code == 500
|
94
|
-
end
|
95
|
-
|
96
3
|
#
|
97
4
|
# The Content-Type of the page.
|
98
5
|
#
|
@@ -100,7 +7,7 @@ module Spidr
|
|
100
7
|
# The Content-Type of the page.
|
101
8
|
#
|
102
9
|
def content_type
|
103
|
-
|
10
|
+
@response['Content-Type'] || ''
|
104
11
|
end
|
105
12
|
|
106
13
|
#
|
@@ -112,7 +19,7 @@ module Spidr
|
|
112
19
|
# @since 0.2.2
|
113
20
|
#
|
114
21
|
def content_types
|
115
|
-
(
|
22
|
+
@response.get_fields('content-type') || []
|
116
23
|
end
|
117
24
|
|
118
25
|
#
|
@@ -314,57 +221,5 @@ module Spidr
|
|
314
221
|
def zip?
|
315
222
|
is_content_type?('application/zip')
|
316
223
|
end
|
317
|
-
|
318
|
-
#
|
319
|
-
# The raw Cookie String sent along with the page.
|
320
|
-
#
|
321
|
-
# @return [String]
|
322
|
-
# The raw Cookie from the response.
|
323
|
-
#
|
324
|
-
# @since 0.2.7
|
325
|
-
#
|
326
|
-
def cookie
|
327
|
-
(response['Set-Cookie'] || '')
|
328
|
-
end
|
329
|
-
|
330
|
-
alias raw_cookie cookie
|
331
|
-
|
332
|
-
#
|
333
|
-
# The Cookie values sent along with the page.
|
334
|
-
#
|
335
|
-
# @return [Array<String>]
|
336
|
-
# The Cookies from the response.
|
337
|
-
#
|
338
|
-
# @since 0.2.2
|
339
|
-
#
|
340
|
-
def cookies
|
341
|
-
(headers['set-cookie'] || [])
|
342
|
-
end
|
343
|
-
|
344
|
-
#
|
345
|
-
# The Cookie key -> value pairs returned with the response.
|
346
|
-
#
|
347
|
-
# @return [Hash{String => String}]
|
348
|
-
# The cookie keys and values.
|
349
|
-
#
|
350
|
-
# @since 0.2.2
|
351
|
-
#
|
352
|
-
def cookie_params
|
353
|
-
params = {}
|
354
|
-
|
355
|
-
cookies.each do |value|
|
356
|
-
value.split(';').each do |param|
|
357
|
-
param.strip!
|
358
|
-
|
359
|
-
name, value = param.split('=',2)
|
360
|
-
|
361
|
-
unless RESERVED_COOKIE_NAMES.include?(name)
|
362
|
-
params[name] = (value || '')
|
363
|
-
end
|
364
|
-
end
|
365
|
-
end
|
366
|
-
|
367
|
-
return params
|
368
|
-
end
|
369
224
|
end
|
370
225
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Spidr
|
4
|
+
class Page
|
5
|
+
# Reserved names used within Cookie strings
|
6
|
+
RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i
|
7
|
+
|
8
|
+
#
|
9
|
+
# The raw Cookie String sent along with the page.
|
10
|
+
#
|
11
|
+
# @return [String]
|
12
|
+
# The raw Cookie from the response.
|
13
|
+
#
|
14
|
+
# @since 0.2.7
|
15
|
+
#
|
16
|
+
def cookie
|
17
|
+
@response['Set-Cookie'] || ''
|
18
|
+
end
|
19
|
+
|
20
|
+
alias raw_cookie cookie
|
21
|
+
|
22
|
+
#
|
23
|
+
# The Cookie values sent along with the page.
|
24
|
+
#
|
25
|
+
# @return [Array<String>]
|
26
|
+
# The Cookies from the response.
|
27
|
+
#
|
28
|
+
# @since 0.2.2
|
29
|
+
#
|
30
|
+
def cookies
|
31
|
+
(@response.get_fields('Set-Cookie') || [])
|
32
|
+
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# The Cookie key -> value pairs returned with the response.
|
36
|
+
#
|
37
|
+
# @return [Hash{String => String}]
|
38
|
+
# The cookie keys and values.
|
39
|
+
#
|
40
|
+
# @since 0.2.2
|
41
|
+
#
|
42
|
+
def cookie_params
|
43
|
+
params = {}
|
44
|
+
|
45
|
+
cookies.each do |value|
|
46
|
+
value.split(';').each do |param|
|
47
|
+
param.strip!
|
48
|
+
|
49
|
+
name, value = param.split('=',2)
|
50
|
+
|
51
|
+
unless name =~ RESERVED_COOKIE_NAMES
|
52
|
+
params[name] = (value || '')
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
return params
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
@@ -1,10 +1,22 @@
|
|
1
|
+
require 'nokogiri'
|
1
2
|
require 'spidr/extensions/uri'
|
2
|
-
require 'uri'
|
3
3
|
|
4
4
|
module Spidr
|
5
5
|
class Page
|
6
6
|
include Enumerable
|
7
7
|
|
8
|
+
#
|
9
|
+
# The title of the HTML page.
|
10
|
+
#
|
11
|
+
# @return [String]
|
12
|
+
# The inner-text of the title element of the page.
|
13
|
+
#
|
14
|
+
def title
|
15
|
+
if (node = at('//title'))
|
16
|
+
node.inner_text
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
8
20
|
#
|
9
21
|
# Enumerates over the meta-redirect links in the page.
|
10
22
|
#
|
@@ -21,7 +33,7 @@ module Spidr
|
|
21
33
|
# @since 0.3.0
|
22
34
|
#
|
23
35
|
def each_meta_redirect
|
24
|
-
return enum_for(
|
36
|
+
return enum_for(__method__) unless block_given?
|
25
37
|
|
26
38
|
if (html? && doc)
|
27
39
|
search('//meta[@http-equiv and @content]').each do |node|
|
@@ -44,7 +56,7 @@ module Spidr
|
|
44
56
|
# Specifies whether the page includes page-level redirects.
|
45
57
|
#
|
46
58
|
def meta_redirect?
|
47
|
-
!
|
59
|
+
!each_meta_redirect.first.nil?
|
48
60
|
end
|
49
61
|
|
50
62
|
#
|
@@ -59,6 +71,23 @@ module Spidr
|
|
59
71
|
each_meta_redirect.to_a
|
60
72
|
end
|
61
73
|
|
74
|
+
#
|
75
|
+
# The meta-redirect links of the page.
|
76
|
+
#
|
77
|
+
# @return [Array<String>]
|
78
|
+
# All meta-redirect links in the page.
|
79
|
+
#
|
80
|
+
# @deprecated
|
81
|
+
# Deprecated in 0.3.0 and will be removed in 0.4.0.
|
82
|
+
# Use {#meta_redirects} instead.
|
83
|
+
#
|
84
|
+
def meta_redirect
|
85
|
+
warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
|
86
|
+
warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
|
87
|
+
|
88
|
+
meta_redirects
|
89
|
+
end
|
90
|
+
|
62
91
|
#
|
63
92
|
# Enumerates over every HTTP or meta-redirect link in the page.
|
64
93
|
#
|
@@ -74,18 +103,14 @@ module Spidr
|
|
74
103
|
# @since 0.3.0
|
75
104
|
#
|
76
105
|
def each_redirect(&block)
|
77
|
-
return enum_for(
|
106
|
+
return enum_for(__method__) unless block
|
78
107
|
|
79
|
-
|
80
|
-
|
81
|
-
|
108
|
+
if (locations = @response.get_fields('Location'))
|
109
|
+
# Location headers override any meta-refresh redirects in the HTML
|
110
|
+
locations.each(&block)
|
111
|
+
else
|
82
112
|
# check page-level meta redirects if there isn't a location header
|
83
113
|
each_meta_redirect(&block)
|
84
|
-
elsif location.kind_of?(Array)
|
85
|
-
location.each(&block)
|
86
|
-
else
|
87
|
-
# usually the location header contains a single String
|
88
|
-
yield location
|
89
114
|
end
|
90
115
|
end
|
91
116
|
|
@@ -115,7 +140,7 @@ module Spidr
|
|
115
140
|
# @since 0.5.0
|
116
141
|
#
|
117
142
|
def each_mailto
|
118
|
-
return enum_for(
|
143
|
+
return enum_for(__method__) unless block_given?
|
119
144
|
|
120
145
|
if (html? && doc)
|
121
146
|
doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
|
@@ -151,7 +176,7 @@ module Spidr
|
|
151
176
|
# @since 0.3.0
|
152
177
|
#
|
153
178
|
def each_link
|
154
|
-
return enum_for(
|
179
|
+
return enum_for(__method__) unless block_given?
|
155
180
|
|
156
181
|
filter = lambda { |url|
|
157
182
|
yield url unless (url.nil? || url.empty?)
|
@@ -208,7 +233,7 @@ module Spidr
|
|
208
233
|
# @since 0.3.0
|
209
234
|
#
|
210
235
|
def each_url
|
211
|
-
return enum_for(
|
236
|
+
return enum_for(__method__) unless block_given?
|
212
237
|
|
213
238
|
each_link do |link|
|
214
239
|
if (url = to_absolute(link))
|
@@ -239,15 +264,14 @@ module Spidr
|
|
239
264
|
# The normalized URI.
|
240
265
|
#
|
241
266
|
def to_absolute(link)
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
if new_url.path
|
249
|
-
path = new_url.path
|
267
|
+
link = link.to_s
|
268
|
+
new_url = begin
|
269
|
+
url.merge(link)
|
270
|
+
rescue Exception
|
271
|
+
return
|
272
|
+
end
|
250
273
|
|
274
|
+
if (path = new_url.path)
|
251
275
|
# ensure that paths begin with a leading '/' for URI::FTP
|
252
276
|
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
253
277
|
path.insert(0,'/')
|