spidr 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ChangeLog.md +69 -54
- data/Gemfile +9 -5
- data/LICENSE.txt +1 -1
- data/README.md +34 -26
- data/Rakefile +4 -15
- data/gemspec.yml +3 -2
- data/lib/spidr/agent.rb +101 -44
- data/lib/spidr/{actions → agent}/actions.rb +32 -12
- data/lib/spidr/{events.rb → agent/events.rb} +4 -8
- data/lib/spidr/{filters.rb → agent/filters.rb} +14 -16
- data/lib/spidr/{sanitizers.rb → agent/sanitizers.rb} +5 -7
- data/lib/spidr/auth_store.rb +2 -2
- data/lib/spidr/cookie_jar.rb +2 -2
- data/lib/spidr/extensions/uri.rb +28 -16
- data/lib/spidr/page.rb +7 -11
- data/lib/spidr/{body.rb → page/body.rb} +1 -1
- data/lib/spidr/{headers.rb → page/headers.rb} +1 -1
- data/lib/spidr/{links.rb → page/links.rb} +43 -7
- data/lib/spidr/session_cache.rb +2 -2
- data/lib/spidr/spidr.rb +32 -5
- data/lib/spidr/version.rb +1 -1
- data/spec/agent/actions_spec.rb +60 -0
- data/spec/agent/filters_spec.rb +62 -0
- data/spec/agent/sanitizers_spec.rb +62 -0
- data/spec/agent_spec.rb +13 -13
- data/spec/auth_store_spec.rb +17 -17
- data/spec/cookie_jar_spec.rb +26 -26
- data/spec/extensions/uri_spec.rb +19 -9
- data/spec/helpers/history.rb +5 -5
- data/spec/helpers/wsoc.rb +2 -2
- data/spec/page_examples.rb +4 -4
- data/spec/page_spec.rb +28 -25
- data/spec/rules_spec.rb +14 -14
- data/spec/session_cache.rb +7 -7
- data/spec/spidr_spec.rb +10 -10
- metadata +37 -51
- data/lib/spidr/actions.rb +0 -2
- data/lib/spidr/actions/exceptions.rb +0 -4
- data/lib/spidr/actions/exceptions/action.rb +0 -9
- data/lib/spidr/actions/exceptions/paused.rb +0 -11
- data/lib/spidr/actions/exceptions/skip_link.rb +0 -12
- data/lib/spidr/actions/exceptions/skip_page.rb +0 -12
- data/spec/actions_spec.rb +0 -59
- data/spec/filters_spec.rb +0 -61
- data/spec/sanitizers_spec.rb +0 -61
@@ -1,13 +1,33 @@
|
|
1
|
-
require 'spidr/actions/exceptions/paused'
|
2
|
-
require 'spidr/actions/exceptions/skip_link'
|
3
|
-
require 'spidr/actions/exceptions/skip_page'
|
4
|
-
|
5
1
|
module Spidr
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
class Agent
|
3
|
+
module Actions
|
4
|
+
#
|
5
|
+
# The base {Actions} exception class.
|
6
|
+
#
|
7
|
+
class Action < RuntimeError
|
8
|
+
end
|
9
|
+
|
10
|
+
#
|
11
|
+
# An {Actions} exception class used to pause a running {Agent}.
|
12
|
+
#
|
13
|
+
class Paused < Action
|
14
|
+
end
|
15
|
+
|
16
|
+
#
|
17
|
+
# An {Actions} exception class which causes a running {Agent} to
|
18
|
+
# skip a link.
|
19
|
+
#
|
20
|
+
class SkipLink < Action
|
21
|
+
end
|
22
|
+
|
23
|
+
#
|
24
|
+
# An {Actions} exception class which causes a running {Agent} to
|
25
|
+
# skip a {Page}, and all links within that page.
|
26
|
+
#
|
27
|
+
class SkipPage < Action
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
11
31
|
#
|
12
32
|
# Continue spidering.
|
13
33
|
#
|
@@ -40,7 +60,7 @@ module Spidr
|
|
40
60
|
#
|
41
61
|
def pause!
|
42
62
|
@paused = true
|
43
|
-
raise(Paused)
|
63
|
+
raise(Actions::Paused)
|
44
64
|
end
|
45
65
|
|
46
66
|
#
|
@@ -61,7 +81,7 @@ module Spidr
|
|
61
81
|
# and not enqueued or visited.
|
62
82
|
#
|
63
83
|
def skip_link!
|
64
|
-
raise(SkipLink)
|
84
|
+
raise(Actions::SkipLink)
|
65
85
|
end
|
66
86
|
|
67
87
|
#
|
@@ -71,7 +91,7 @@ module Spidr
|
|
71
91
|
# Indicates to the agent, that the current page should be skipped.
|
72
92
|
#
|
73
93
|
def skip_page!
|
74
|
-
raise(SkipPage)
|
94
|
+
raise(Actions::SkipPage)
|
75
95
|
end
|
76
96
|
|
77
97
|
protected
|
@@ -1,10 +1,5 @@
|
|
1
1
|
module Spidr
|
2
|
-
|
3
|
-
# The {Events} module adds methods to {Agent} for registering
|
4
|
-
# callbacks which will receive URLs, links, headers and pages, when
|
5
|
-
# they are visited.
|
6
|
-
#
|
7
|
-
module Events
|
2
|
+
class Agent
|
8
3
|
#
|
9
4
|
# Pass each URL from each page visited to the given block.
|
10
5
|
#
|
@@ -526,12 +521,13 @@ module Spidr
|
|
526
521
|
protected
|
527
522
|
|
528
523
|
def initialize_events(options={})
|
529
|
-
@every_url_blocks
|
524
|
+
@every_url_blocks = []
|
530
525
|
@every_failed_url_blocks = []
|
531
|
-
@every_url_like_blocks
|
526
|
+
@every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
532
527
|
|
533
528
|
@every_page_blocks = []
|
534
529
|
@every_link_blocks = []
|
535
530
|
end
|
531
|
+
|
536
532
|
end
|
537
533
|
end
|
@@ -1,11 +1,8 @@
|
|
1
1
|
require 'spidr/rules'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
# URLs the agent will visit.
|
7
|
-
#
|
8
|
-
module Filters
|
4
|
+
class Agent
|
5
|
+
|
9
6
|
# List of acceptable URL schemes to follow
|
10
7
|
attr_reader :schemes
|
11
8
|
|
@@ -419,24 +416,24 @@ module Spidr
|
|
419
416
|
end
|
420
417
|
|
421
418
|
@host_rules = Rules.new(
|
422
|
-
:
|
423
|
-
:
|
419
|
+
accept: options[:hosts],
|
420
|
+
reject: options[:ignore_hosts]
|
424
421
|
)
|
425
422
|
@port_rules = Rules.new(
|
426
|
-
:
|
427
|
-
:
|
423
|
+
accept: options[:ports],
|
424
|
+
reject: options[:ignore_ports]
|
428
425
|
)
|
429
426
|
@link_rules = Rules.new(
|
430
|
-
:
|
431
|
-
:
|
427
|
+
accept: options[:links],
|
428
|
+
reject: options[:ignore_links]
|
432
429
|
)
|
433
430
|
@url_rules = Rules.new(
|
434
|
-
:
|
435
|
-
:
|
431
|
+
accept: options[:urls],
|
432
|
+
reject: options[:ignore_urls]
|
436
433
|
)
|
437
434
|
@ext_rules = Rules.new(
|
438
|
-
:
|
439
|
-
:
|
435
|
+
accept: options[:exts],
|
436
|
+
reject: options[:ignore_exts]
|
440
437
|
)
|
441
438
|
|
442
439
|
if options[:host]
|
@@ -511,7 +508,7 @@ module Spidr
|
|
511
508
|
#
|
512
509
|
# Determines if a given URL should be visited.
|
513
510
|
#
|
514
|
-
# @param [URI::HTTP, URI::HTTPS]
|
511
|
+
# @param [URI::HTTP, URI::HTTPS] link
|
515
512
|
# The URL.
|
516
513
|
#
|
517
514
|
# @return [Boolean]
|
@@ -535,5 +532,6 @@ module Spidr
|
|
535
532
|
def visit_ext?(path)
|
536
533
|
@ext_rules.accept?(File.extname(path)[1..-1])
|
537
534
|
end
|
535
|
+
|
538
536
|
end
|
539
537
|
end
|
@@ -1,11 +1,8 @@
|
|
1
1
|
require 'uri'
|
2
2
|
|
3
3
|
module Spidr
|
4
|
-
|
5
|
-
|
6
|
-
# sanitation of incoming links.
|
7
|
-
#
|
8
|
-
module Sanitizers
|
4
|
+
class Agent
|
5
|
+
|
9
6
|
# Specifies whether the Agent will strip URI fragments
|
10
7
|
attr_accessor :strip_fragments
|
11
8
|
|
@@ -27,7 +24,7 @@ module Spidr
|
|
27
24
|
url = URI(url.to_s) unless url.kind_of?(URI)
|
28
25
|
|
29
26
|
url.fragment = nil if @strip_fragments
|
30
|
-
url.query
|
27
|
+
url.query = nil if @strip_query
|
31
28
|
|
32
29
|
return url
|
33
30
|
end
|
@@ -50,7 +47,8 @@ module Spidr
|
|
50
47
|
#
|
51
48
|
def initialize_sanitizers(options={})
|
52
49
|
@strip_fragments = options.fetch(:strip_fragments,true)
|
53
|
-
@strip_query
|
50
|
+
@strip_query = options.fetch(:strip_query,false)
|
54
51
|
end
|
52
|
+
|
55
53
|
end
|
56
54
|
end
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -57,10 +57,10 @@ module Spidr
|
|
57
57
|
#
|
58
58
|
# Add an auth credential to the store for supplied base URL.
|
59
59
|
#
|
60
|
-
# @param [URI]
|
60
|
+
# @param [URI] url
|
61
61
|
# A URL pattern to associate with a set of auth credentials.
|
62
62
|
#
|
63
|
-
# @param [AuthCredential]
|
63
|
+
# @param [AuthCredential] auth
|
64
64
|
# The auth credential for this URL pattern.
|
65
65
|
#
|
66
66
|
# @return [AuthCredential]
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -18,7 +18,7 @@ module Spidr
|
|
18
18
|
def initialize
|
19
19
|
@params = {}
|
20
20
|
|
21
|
-
@dirty
|
21
|
+
@dirty = Set[]
|
22
22
|
@cookies = {}
|
23
23
|
end
|
24
24
|
|
@@ -147,7 +147,7 @@ module Spidr
|
|
147
147
|
#
|
148
148
|
def cookies_for_host(host)
|
149
149
|
host_cookies = (@params[host] || {})
|
150
|
-
sub_domains
|
150
|
+
sub_domains = host.split('.')
|
151
151
|
|
152
152
|
while sub_domains.length > 2
|
153
153
|
sub_domains.shift
|
data/lib/spidr/extensions/uri.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'uri'
|
2
|
+
require 'strscan'
|
2
3
|
|
3
4
|
module URI
|
4
5
|
#
|
@@ -26,27 +27,38 @@ module URI
|
|
26
27
|
# URI.expand_path('/test/../path')
|
27
28
|
# # => "/path"
|
28
29
|
#
|
29
|
-
def
|
30
|
-
|
30
|
+
def self.expand_path(path)
|
31
|
+
if path.start_with?('/')
|
32
|
+
leading_slash, path = path[0,1], path[1..-1]
|
33
|
+
else
|
34
|
+
leading_slash = ''
|
35
|
+
end
|
31
36
|
|
32
|
-
|
33
|
-
|
37
|
+
if path.end_with?('/')
|
38
|
+
trailing_slash, path = path[-1,1], path[0..-2]
|
39
|
+
else
|
40
|
+
trailing_slash = ''
|
41
|
+
end
|
34
42
|
|
35
|
-
|
43
|
+
scanner = StringScanner.new(path)
|
44
|
+
stack = []
|
36
45
|
|
37
|
-
|
38
|
-
if dir
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
until scanner.eos?
|
47
|
+
if (dir = scanner.scan(/^[^\/]+/))
|
48
|
+
case dir
|
49
|
+
when '..' then stack.pop
|
50
|
+
when '.' then false
|
51
|
+
else stack.push(dir)
|
52
|
+
end
|
53
|
+
else
|
54
|
+
scanner.skip(/\/+/)
|
42
55
|
end
|
43
56
|
end
|
44
57
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
return full_path
|
58
|
+
unless stack.empty?
|
59
|
+
"#{leading_slash}#{stack.join('/')}#{trailing_slash}"
|
60
|
+
else
|
61
|
+
'/'
|
62
|
+
end
|
51
63
|
end
|
52
64
|
end
|
data/lib/spidr/page.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require 'spidr/headers'
|
2
|
-
require 'spidr/body'
|
3
|
-
require 'spidr/links'
|
1
|
+
require 'spidr/page/headers'
|
2
|
+
require 'spidr/page/body'
|
3
|
+
require 'spidr/page/links'
|
4
4
|
|
5
5
|
module Spidr
|
6
6
|
#
|
@@ -8,10 +8,6 @@ module Spidr
|
|
8
8
|
#
|
9
9
|
class Page
|
10
10
|
|
11
|
-
include Headers
|
12
|
-
include Body
|
13
|
-
include Links
|
14
|
-
|
15
11
|
# URL of the page
|
16
12
|
attr_reader :url
|
17
13
|
|
@@ -27,14 +23,14 @@ module Spidr
|
|
27
23
|
# @param [URI::HTTP] url
|
28
24
|
# The URL of the page.
|
29
25
|
#
|
30
|
-
# @param [Net::
|
26
|
+
# @param [Net::HTTPResponse] response
|
31
27
|
# The response from the request for the page.
|
32
28
|
#
|
33
29
|
def initialize(url,response)
|
34
|
-
@url
|
30
|
+
@url = url
|
35
31
|
@response = response
|
36
|
-
@headers
|
37
|
-
@doc
|
32
|
+
@headers = response.to_hash
|
33
|
+
@doc = nil
|
38
34
|
end
|
39
35
|
|
40
36
|
#
|
@@ -2,7 +2,7 @@ require 'spidr/extensions/uri'
|
|
2
2
|
require 'uri'
|
3
3
|
|
4
4
|
module Spidr
|
5
|
-
|
5
|
+
class Page
|
6
6
|
include Enumerable
|
7
7
|
|
8
8
|
#
|
@@ -100,6 +100,42 @@ module Spidr
|
|
100
100
|
each_redirect.to_a
|
101
101
|
end
|
102
102
|
|
103
|
+
#
|
104
|
+
# Enumerates over every `mailto:` link in the page.
|
105
|
+
#
|
106
|
+
# @yield [link]
|
107
|
+
# The given block will be passed every `mailto:` link from the page.
|
108
|
+
#
|
109
|
+
# @yieldparam [String] link
|
110
|
+
# A `mailto:` link from the page.
|
111
|
+
#
|
112
|
+
# @return [Enumerator]
|
113
|
+
# If no block is given, an enumerator object will be returned.
|
114
|
+
#
|
115
|
+
# @since 0.5.0
|
116
|
+
#
|
117
|
+
def each_mailto
|
118
|
+
return enum_for(:each_mailto) unless block_given?
|
119
|
+
|
120
|
+
if (html? && doc)
|
121
|
+
doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
|
122
|
+
yield a.get_attribute('href')[7..-1]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
#
|
128
|
+
# `mailto:` links in the page.
|
129
|
+
#
|
130
|
+
# @return [Array<String>]
|
131
|
+
# The `mailto:` links found within the page.
|
132
|
+
#
|
133
|
+
# @since 0.5.0
|
134
|
+
#
|
135
|
+
def mailtos
|
136
|
+
each_mailto.to_a
|
137
|
+
end
|
138
|
+
|
103
139
|
#
|
104
140
|
# Enumerates over every link in the page.
|
105
141
|
#
|
@@ -124,23 +160,23 @@ module Spidr
|
|
124
160
|
each_redirect(&filter) if is_redirect?
|
125
161
|
|
126
162
|
if (html? && doc)
|
127
|
-
doc.search('a[@href]').each do |a|
|
163
|
+
doc.search('//a[@href]').each do |a|
|
128
164
|
filter.call(a.get_attribute('href'))
|
129
165
|
end
|
130
166
|
|
131
|
-
doc.search('frame[@src]').each do |iframe|
|
167
|
+
doc.search('//frame[@src]').each do |iframe|
|
132
168
|
filter.call(iframe.get_attribute('src'))
|
133
169
|
end
|
134
170
|
|
135
|
-
doc.search('iframe[@src]').each do |iframe|
|
171
|
+
doc.search('//iframe[@src]').each do |iframe|
|
136
172
|
filter.call(iframe.get_attribute('src'))
|
137
173
|
end
|
138
174
|
|
139
|
-
doc.search('link[@href]').each do |link|
|
175
|
+
doc.search('//link[@href]').each do |link|
|
140
176
|
filter.call(link.get_attribute('href'))
|
141
177
|
end
|
142
178
|
|
143
|
-
doc.search('script[@src]').each do |script|
|
179
|
+
doc.search('//script[@src]').each do |script|
|
144
180
|
filter.call(script.get_attribute('src'))
|
145
181
|
end
|
146
182
|
end
|
@@ -213,7 +249,7 @@ module Spidr
|
|
213
249
|
path = new_url.path
|
214
250
|
|
215
251
|
# ensure that paths begin with a leading '/' for URI::FTP
|
216
|
-
if (new_url.scheme == 'ftp' && path
|
252
|
+
if (new_url.scheme == 'ftp' && !path.start_with?('/'))
|
217
253
|
path.insert(0,'/')
|
218
254
|
end
|
219
255
|
|
data/lib/spidr/session_cache.rb
CHANGED
@@ -32,7 +32,7 @@ module Spidr
|
|
32
32
|
# @since 0.2.2
|
33
33
|
#
|
34
34
|
def initialize(proxy=Spidr.proxy)
|
35
|
-
@proxy
|
35
|
+
@proxy = proxy
|
36
36
|
@sessions = {}
|
37
37
|
end
|
38
38
|
|
@@ -82,7 +82,7 @@ module Spidr
|
|
82
82
|
).new(url.host,url.port)
|
83
83
|
|
84
84
|
if url.scheme == 'https'
|
85
|
-
session.use_ssl
|
85
|
+
session.use_ssl = true
|
86
86
|
session.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
87
87
|
session.start
|
88
88
|
end
|