spidr 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +43 -0
- data/Manifest.txt +19 -0
- data/README.txt +100 -11
- data/Rakefile +15 -5
- data/lib/spidr/actions.rb +2 -0
- data/lib/spidr/actions/actions.rb +79 -0
- data/lib/spidr/actions/exceptions.rb +4 -0
- data/lib/spidr/actions/exceptions/action.rb +6 -0
- data/lib/spidr/actions/exceptions/paused.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
- data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
- data/lib/spidr/agent.rb +385 -444
- data/lib/spidr/events.rb +87 -0
- data/lib/spidr/extensions.rb +1 -0
- data/lib/spidr/extensions/uri.rb +45 -0
- data/lib/spidr/filters.rb +438 -0
- data/lib/spidr/page.rb +211 -70
- data/lib/spidr/rules.rb +40 -18
- data/lib/spidr/spidr.rb +57 -7
- data/lib/spidr/version.rb +2 -1
- data/spec/actions_spec.rb +61 -0
- data/spec/agent_spec.rb +24 -31
- data/spec/extensions/uri_spec.rb +39 -0
- data/spec/filters_spec.rb +53 -0
- data/spec/helpers/page.rb +8 -0
- data/spec/page_examples.rb +17 -0
- data/spec/page_spec.rb +81 -0
- data/spec/rules_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/spidr_spec.rb +30 -0
- data/static/course/specs.json +1 -1
- data/tasks/course.rb +8 -1
- data/tasks/spec.rb +1 -0
- data/tasks/yard.rb +12 -0
- metadata +45 -6
- metadata.gz.sig +0 -0
data/lib/spidr/events.rb
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
module Spidr
  #
  # Mixin that stores callback blocks for crawl events: every URL, every
  # failed URL, URLs matching a pattern, and every page.
  #
  # This module only registers the blocks; nothing in it invokes them.
  # NOTE(review): dispatching is presumably done by the including agent
  # class - confirm against the agent implementation.
  #
  module Events
    #
    # Initializes the callback lists, after passing the options on to
    # +super+.
    #
    # @param [Hash] options
    #   Options, forwarded unchanged to +super+.
    #
    def initialize(options={})
      super(options)

      @every_url_blocks = []
      @every_failed_url_blocks = []
      # each pattern lazily gets its own Array of blocks
      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }

      @every_page_blocks = []
    end

    #
    # Pass each URL from each page visited to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL from every page visited.
    #
    # @yieldparam [URI::HTTP] url
    #   Each URL from each page visited.
    #
    # @return [self]
    #   The receiver, so that registrations can be chained. Calling the
    #   method without a block registers nothing.
    #
    def every_url(&block)
      # never store nil: a nil entry cannot be called by a dispatcher
      @every_url_blocks << block if block
      return self
    end

    #
    # Pass each URL that could not be requested to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL that could not be requested.
    #
    # @yieldparam [URI::HTTP] url
    #   A failed URL.
    #
    # @return [self]
    #   The receiver. Calling the method without a block registers nothing.
    #
    def every_failed_url(&block)
      @every_failed_url_blocks << block if block
      return self
    end

    #
    # Pass every URL that the agent visits, and matches a given pattern,
    # to a given block.
    #
    # @param [Regexp, String] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   The block will be passed every URL that matches the given pattern.
    #
    # @yieldparam [URI::HTTP] url
    #   A matching URL.
    #
    # @return [self]
    #   The receiver. Calling the method without a block registers nothing
    #   and does not create an entry for the pattern.
    #
    def urls_like(pattern,&block)
      @urls_like_blocks[pattern] << block if block
      return self
    end

    #
    # Pass every page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    # @return [self]
    #   The receiver. Calling the method without a block registers nothing.
    #
    def every_page(&block)
      @every_page_blocks << block if block
      return self
    end

    #
    # Pass the headers from every response the agent receives to a given
    # block.
    #
    # @yield [headers]
    #   The block will be passed the headers of every response.
    #
    # @yieldparam [Hash] headers
    #   The headers from a response.
    #
    # @return [self]
    #   The receiver. Calling the method without a block registers nothing.
    #
    def all_headers(&block)
      # without a block the wrapper below would call nil for every page
      return self unless block

      every_page { |page| block.call(page.headers) }
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'spidr/extensions/uri'
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
module URI
  #
  # Expands a URI decoded path, into a proper absolute path.
  #
  # Runs of consecutive slashes are collapsed into one, +.+ segments are
  # dropped and each +..+ segment removes the segment before it. A +..+
  # never removes the leading +/+ of an absolute path.
  #
  # @param [String] path
  #   The path from a URI.
  #
  # @return [String]
  #   The expanded path.
  #
  # @example
  #   URI.expand_path('./path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('test/../path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('/test/path/')
  #   # => "/test/path/"
  #
  # @example
  #   URI.expand_path('/test/../path')
  #   # => "/path"
  #
  def URI.expand_path(path)
    # split into segments that keep their trailing slash, e.g.
    # "/a/b" => ["/", "a/", "b"]
    segments = path.squeeze('/').scan(%r{[^/]*/|[^/]+$})
    expanded = []

    segments.each do |segment|
      case segment
      when '..', '../'
        # step up one level, but never above the root slash
        expanded.pop unless expanded == ['/']
      when '.', './'
        # a current-directory segment contributes nothing
      else
        expanded.push(segment)
      end
    end

    return expanded.join
  end
end
|
@@ -0,0 +1,438 @@
|
|
1
|
+
require 'spidr/rules'
|
2
|
+
|
3
|
+
module Spidr
  #
  # Mixin that holds the accept / reject rules used to decide which URI
  # schemes, host-names, ports, links and path extensions may be visited.
  #
  module Filters
    #
    # Adds the +schemes+ reader to the class which includes this module.
    #
    # @param [Module] base
    #   The class or module including Filters.
    #
    def self.included(base)
      base.module_eval do
        # List of acceptable URL schemes to follow
        attr_reader :schemes
      end
    end

    #
    # Initializes filtering rules.
    #
    # Note that this method does not call +super+.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Each entry is
    #   converted to a String.
    #   The +https+ scheme will be ignored if +net/https+ cannot be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   If given, assigned through +queue=+.
    #   NOTE(review): +queue=+ is not defined in this module; presumably
    #   the including class provides it - confirm.
    #
    # @option options [Object] :history
    #   If given, assigned through +history=+.
    #   NOTE(review): +history=+ is not defined in this module; presumably
    #   the including class provides it - confirm.
    #
    def initialize(options={})
      if options[:schemes]
        # normalize exactly like #schemes= does, so that Symbol schemes
        # still match the String schemes checked by #visit_scheme?
        @schemes = options[:schemes].map { |scheme| scheme.to_s }
      else
        @schemes = ['http']

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # a RubyGems activation problem is not a missing library;
          # let it propagate
          raise
        rescue ::LoadError
          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit. Each entry is converted to a String.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map { |scheme| scheme.to_s }
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the visit_hosts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #
    def visit_hosts_like(pattern=nil,&block)
      matcher = (pattern || block)
      visit_hosts << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the ignore_hosts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #
    def ignore_hosts_like(pattern=nil,&block)
      matcher = (pattern || block)
      ignore_hosts << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the visit_ports.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #
    def visit_ports_like(pattern=nil,&block)
      matcher = (pattern || block)
      visit_ports << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the ignore_ports.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #
    def ignore_ports_like(pattern=nil,&block)
      matcher = (pattern || block)
      ignore_ports << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the visit_links.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #
    def visit_links_like(pattern=nil,&block)
      matcher = (pattern || block)
      visit_links << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the ignore_links.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #
    def ignore_links_like(pattern=nil,&block)
      matcher = (pattern || block)
      ignore_links << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the visit_exts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #
    def visit_exts_like(pattern=nil,&block)
      matcher = (pattern || block)
      visit_exts << matcher if matcher

      return self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the ignore_exts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #   The block is only used when no pattern is given.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #
    def ignore_exts_like(pattern=nil,&block)
      matcher = (pattern || block)
      ignore_exts << matcher if matcher

      return self
    end

    protected

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A missing (+nil+) scheme is always accepted.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      return @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # drop the leading "." of the extension; a path without an
      # extension yields nil here
      extension = File.extname(path)[1..-1]

      @ext_rules.accept?(extension)
    end
  end
end
|