spidr 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
@@ -1,13 +1,33 @@
|
|
1
|
-
require 'spidr/actions/exceptions/paused'
|
2
|
-
require 'spidr/actions/exceptions/skip_link'
|
3
|
-
require 'spidr/actions/exceptions/skip_page'
|
4
|
-
|
5
1
|
module Spidr
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
class Agent
|
3
|
+
module Actions
|
4
|
+
#
|
5
|
+
# The base {Actions} exception class.
|
6
|
+
#
|
7
|
+
class Action < RuntimeError
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# An {Actions} exception class used to pause a running {Agent}.
|
12
|
+
#
|
13
|
+
class Paused < Action
|
14
|
+
end
|
15
|
+
|
16
|
+
#
|
17
|
+
# An {Actions} exception class which causes a running {Agent} to
|
18
|
+
# skip a link.
|
19
|
+
#
|
20
|
+
class SkipLink < Action
|
21
|
+
end
|
22
|
+
|
23
|
+
#
|
24
|
+
# An {Actions} exception class which causes a running {Agent} to
|
25
|
+
# skip a {Page}, and all links within that page.
|
26
|
+
#
|
27
|
+
class SkipPage < Action
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
11
31
|
#
|
12
32
|
# Continue spidering.
|
13
33
|
#
|
@@ -40,7 +60,7 @@ module Spidr
|
|
40
60
|
#
|
41
61
|
def pause!
|
42
62
|
@paused = true
|
43
|
-
raise(Paused)
|
63
|
+
raise(Actions::Paused)
|
44
64
|
end
|
45
65
|
|
46
66
|
#
|
@@ -61,7 +81,7 @@ module Spidr
|
|
61
81
|
# and not enqueued or visited.
|
62
82
|
#
|
63
83
|
def skip_link!
|
64
|
-
raise(SkipLink)
|
84
|
+
raise(Actions::SkipLink)
|
65
85
|
end
|
66
86
|
|
67
87
|
#
|
@@ -71,7 +91,7 @@ module Spidr
|
|
71
91
|
# Indicates to the agent, that the current page should be skipped.
|
72
92
|
#
|
73
93
|
def skip_page!
|
74
|
-
raise(SkipPage)
|
94
|
+
raise(Actions::SkipPage)
|
75
95
|
end
|
76
96
|
|
77
97
|
protected
|
@@ -1,10 +1,5 @@
|
|
1
1
|
module Spidr
|
2
|
-
|
3
|
-
# The {Events} module adds methods to {Agent} for registering
|
4
|
-
# callbacks which will receive URLs, links, headers and pages, when
|
5
|
-
# they are visited.
|
6
|
-
#
|
7
|
-
module Events
|
2
|
+
class Agent
|
8
3
|
#
|
9
4
|
# Pass each URL from each page visited to the given block.
|
10
5
|
#
|
@@ -526,12 +521,13 @@ module Spidr
|
|
526
521
|
protected
|
527
522
|
|
528
523
|
def initialize_events(options={})
|
529
|
-
@every_url_blocks
|
524
|
+
@every_url_blocks = []
|
530
525
|
@every_failed_url_blocks = []
|
531
|
-
@every_url_like_blocks
|
526
|
+
@every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
532
527
|
|
533
528
|
@every_page_blocks = []
|
534
529
|
@every_link_blocks = []
|
535
530
|
end
|
531
|
+
|
536
532
|
end
|
537
533
|
end
|
@@ -1,11 +1,8 @@
|
|
1
1
|
require 'spidr/rules'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
# URLs the agent will visit.
|
7
|
-
#
|
8
|
-
module Filters
|
4
|
+
class Agent
|
5
|
+
|
9
6
|
# List of acceptable URL schemes to follow
|
10
7
|
attr_reader :schemes
|
11
8
|
|
@@ -419,24 +416,24 @@ module Spidr
|
|
419
416
|
end
|
420
417
|
|
421
418
|
@host_rules = Rules.new(
|
422
|
-
:
|
423
|
-
:
|
419
|
+
accept: options[:hosts],
|
420
|
+
reject: options[:ignore_hosts]
|
424
421
|
)
|
425
422
|
@port_rules = Rules.new(
|
426
|
-
:
|
427
|
-
:
|
423
|
+
accept: options[:ports],
|
424
|
+
reject: options[:ignore_ports]
|
428
425
|
)
|
429
426
|
@link_rules = Rules.new(
|
430
|
-
:
|
431
|
-
:
|
427
|
+
accept: options[:links],
|
428
|
+
reject: options[:ignore_links]
|
432
429
|
)
|
433
430
|
@url_rules = Rules.new(
|
434
|
-
:
|
435
|
-
:
|
431
|
+
accept: options[:urls],
|
432
|
+
reject: options[:ignore_urls]
|
436
433
|
)
|
437
434
|
@ext_rules = Rules.new(
|
438
|
-
:
|
439
|
-
:
|
435
|
+
accept: options[:exts],
|
436
|
+
reject: options[:ignore_exts]
|
440
437
|
)
|
441
438
|
|
442
439
|
if options[:host]
|
@@ -511,7 +508,7 @@ module Spidr
|
|
511
508
|
#
|
512
509
|
# Determines if a given URL should be visited.
|
513
510
|
#
|
514
|
-
# @param [URI::HTTP, URI::HTTPS]
|
511
|
+
# @param [URI::HTTP, URI::HTTPS] link
|
515
512
|
# The URL.
|
516
513
|
#
|
517
514
|
# @return [Boolean]
|
@@ -535,5 +532,6 @@ module Spidr
|
|
535
532
|
def visit_ext?(path)
|
536
533
|
@ext_rules.accept?(File.extname(path)[1..-1])
|
537
534
|
end
|
535
|
+
|
538
536
|
end
|
539
537
|
end
|
@@ -1,11 +1,8 @@
|
|
1
1
|
require 'uri'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
# sanitation of incoming links.
|
7
|
-
#
|
8
|
-
module Sanitizers
|
4
|
+
class Agent
|
5
|
+
|
9
6
|
# Specifies whether the Agent will strip URI fragments
|
10
7
|
attr_accessor :strip_fragments
|
11
8
|
|
@@ -27,7 +24,7 @@ module Spidr
|
|
27
24
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
28
25
|
|
29
26
|
url.fragment = nil if @strip_fragments
|
30
|
-
url.query
|
27
|
+
url.query = nil if @strip_query
|
31
28
|
|
32
29
|
return url
|
33
30
|
end
|
@@ -50,7 +47,8 @@ module Spidr
|
|
50
47
|
#
|
51
48
|
def initialize_sanitizers(options={})
|
52
49
|
@strip_fragments = options.fetch(:strip_fragments,true)
|
53
|
-
@strip_query
|
50
|
+
@strip_query = options.fetch(:strip_query,false)
|
54
51
|
end
|
52
|
+
|
55
53
|
end
|
56
54
|
end
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -57,10 +57,10 @@ module Spidr
|
|
57
57
|
#
|
58
58
|
# Add an auth credential to the store for supplied base URL.
|
59
59
|
#
|
60
|
-
# @param [URI]
|
60
|
+
# @param [URI] url
|
61
61
|
# A URL pattern to associate with a set of auth credentials.
|
62
62
|
#
|
63
|
-
# @param [AuthCredential]
|
63
|
+
# @param [AuthCredential] auth
|
64
64
|
# The auth credential for this URL pattern.
|
65
65
|
#
|
66
66
|
# @return [AuthCredential]
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -18,7 +18,7 @@ module Spidr
|
|
18
18
|
def initialize
|
19
19
|
@params = {}
|
20
20
|
|
21
|
-
@dirty
|
21
|
+
@dirty = Set[]
|
22
22
|
@cookies = {}
|
23
23
|
end
|
24
24
|
|
@@ -147,7 +147,7 @@ module Spidr
|
|
147
147
|
#
|
148
148
|
def cookies_for_host(host)
|
149
149
|
host_cookies = (@params[host] || {})
|
150
|
-
sub_domains
|
150
|
+
sub_domains = host.split('.')
|
151
151
|
|
152
152
|
while sub_domains.length > 2
|
153
153
|
sub_domains.shift
|
data/lib/spidr/extensions/uri.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'uri'
|
2
|
+
require 'strscan'
|
2
3
|
|
3
4
|
module URI
|
4
5
|
#
|
@@ -26,27 +27,38 @@ module URI
|
|
26
27
|
# URI.expand_path('/test/../path')
|
27
28
|
# # => "/path"
|
28
29
|
#
|
29
|
-
def
|
30
|
-
|
30
|
+
def self.expand_path(path)
|
31
|
+
if path.start_with?('/')
|
32
|
+
leading_slash, path = path[0,1], path[1..-1]
|
33
|
+
else
|
34
|
+
leading_slash = ''
|
35
|
+
end
|
31
36
|
|
32
|
-
|
33
|
-
|
37
|
+
if path.end_with?('/')
|
38
|
+
trailing_slash, path = path[-1,1], path[0..-2]
|
39
|
+
else
|
40
|
+
trailing_slash = ''
|
41
|
+
end
|
34
42
|
|
35
|
-
|
43
|
+
scanner = StringScanner.new(path)
|
44
|
+
stack = []
|
36
45
|
|
37
|
-
|
38
|
-
if dir
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
until scanner.eos?
|
47
|
+
if (dir = scanner.scan(/^[^\/]+/))
|
48
|
+
case dir
|
49
|
+
when '..' then stack.pop
|
50
|
+
when '.' then false
|
51
|
+
else stack.push(dir)
|
52
|
+
end
|
53
|
+
else
|
54
|
+
scanner.skip(/\/+/)
|
42
55
|
end
|
43
56
|
end
|
44
57
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
return full_path
|
58
|
+
unless stack.empty?
|
59
|
+
"#{leading_slash}#{stack.join('/')}#{trailing_slash}"
|
60
|
+
else
|
61
|
+
'/'
|
62
|
+
end
|
51
63
|
end
|
52
64
|
end
|
data/lib/spidr/page.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require 'spidr/headers'
|
2
|
-
require 'spidr/body'
|
3
|
-
require 'spidr/links'
|
1
|
+
require 'spidr/page/headers'
|
2
|
+
require 'spidr/page/body'
|
3
|
+
require 'spidr/page/links'
|
4
4
|
|
5
5
|
module Spidr
|
6
6
|
#
|
@@ -8,10 +8,6 @@ module Spidr
|
|
8
8
|
#
|
9
9
|
class Page
|
10
10
|
|
11
|
-
include Headers
|
12
|
-
include Body
|
13
|
-
include Links
|
14
|
-
|
15
11
|
# URL of the page
|
16
12
|
attr_reader :url
|
17
13
|
|
@@ -27,14 +23,14 @@ module Spidr
|
|
27
23
|
# @param [URI::HTTP] url
|
28
24
|
# The URL of the page.
|
29
25
|
#
|
30
|
-
# @param [Net::
|
26
|
+
# @param [Net::HTTPResponse] response
|
31
27
|
# The response from the request for the page.
|
32
28
|
#
|
33
29
|
def initialize(url,response)
|
34
|
-
@url
|
30
|
+
@url = url
|
35
31
|
@response = response
|
36
|
-
@headers
|
37
|
-
@doc
|
32
|
+
@headers = response.to_hash
|
33
|
+
@doc = nil
|
38
34
|
end
|
39
35
|
|
40
36
|
#
|
@@ -2,7 +2,7 @@ require 'spidr/extensions/uri'
|
|
2
2
|
require 'uri'
|
3
3
|
|
4
4
|
module Spidr
|
5
|
-
|
5
|
+
class Page
|
6
6
|
include Enumerable
|
7
7
|
|
8
8
|
#
|
@@ -100,6 +100,42 @@ module Spidr
|
|
100
100
|
each_redirect.to_a
|
101
101
|
end
|
102
102
|
|
103
|
+
#
|
104
|
+
# Enumerates over every `mailto:` link in the page.
|
105
|
+
#
|
106
|
+
# @yield [link]
|
107
|
+
# The given block will be passed every `mailto:` link from the page.
|
108
|
+
#
|
109
|
+
# @yieldparam [String] link
|
110
|
+
# A `mailto:` link from the page.
|
111
|
+
#
|
112
|
+
# @return [Enumerator]
|
113
|
+
# If no block is given, an enumerator object will be returned.
|
114
|
+
#
|
115
|
+
# @since 0.5.0
|
116
|
+
#
|
117
|
+
def each_mailto
|
118
|
+
return enum_for(:each_mailto) unless block_given?
|
119
|
+
|
120
|
+
if (html? && doc)
|
121
|
+
doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
|
122
|
+
yield a.get_attribute('href')[7..-1]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
#
|
128
|
+
# `mailto:` links in the page.
|
129
|
+
#
|
130
|
+
# @return [Array<String>]
|
131
|
+
# The `mailto:` links found within the page.
|
132
|
+
#
|
133
|
+
# @since 0.5.0
|
134
|
+
#
|
135
|
+
def mailtos
|
136
|
+
each_mailto.to_a
|
137
|
+
end
|
138
|
+
|
103
139
|
#
|
104
140
|
# Enumerates over every link in the page.
|
105
141
|
#
|
@@ -124,23 +160,23 @@ module Spidr
|
|
124
160
|
each_redirect(&filter) if is_redirect?
|
125
161
|
|
126
162
|
if (html? && doc)
|
127
|
-
doc.search('a[@href]').each do |a|
|
163
|
+
doc.search('//a[@href]').each do |a|
|
128
164
|
filter.call(a.get_attribute('href'))
|
129
165
|
end
|
130
166
|
|
131
|
-
doc.search('frame[@src]').each do |iframe|
|
167
|
+
doc.search('//frame[@src]').each do |iframe|
|
132
168
|
filter.call(iframe.get_attribute('src'))
|
133
169
|
end
|
134
170
|
|
135
|
-
doc.search('iframe[@src]').each do |iframe|
|
171
|
+
doc.search('//iframe[@src]').each do |iframe|
|
136
172
|
filter.call(iframe.get_attribute('src'))
|
137
173
|
end
|
138
174
|
|
139
|
-
doc.search('link[@href]').each do |link|
|
175
|
+
doc.search('//link[@href]').each do |link|
|
140
176
|
filter.call(link.get_attribute('href'))
|
141
177
|
end
|
142
178
|
|
143
|
-
doc.search('script[@src]').each do |script|
|
179
|
+
doc.search('//script[@src]').each do |script|
|
144
180
|
filter.call(script.get_attribute('src'))
|
145
181
|
end
|
146
182
|
end
|
@@ -213,7 +249,7 @@ module Spidr
|
|
213
249
|
path = new_url.path
|
214
250
|
|
215
251
|
# ensure that paths begin with a leading '/' for URI::FTP
|
216
|
-
if (new_url.scheme == 'ftp' && path
|
252
|
+
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
217
253
|
path.insert(0,'/')
|
218
254
|
end
|
219
255
|
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -32,7 +32,7 @@ module Spidr
|
|
32
32
|
# @since 0.2.2
|
33
33
|
#
|
34
34
|
def initialize(proxy=Spidr.proxy)
|
35
|
-
@proxy
|
35
|
+
@proxy = proxy
|
36
36
|
@sessions = {}
|
37
37
|
end
|
38
38
|
|
@@ -82,7 +82,7 @@ module Spidr
|
|
82
82
|
).new(url.host,url.port)
|
83
83
|
|
84
84
|
if url.scheme == 'https'
|
85
|
-
session.use_ssl
|
85
|
+
session.use_ssl = true
|
86
86
|
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
87
87
|
session.start
|
88
88
|
end
|