spidr 0.1.9 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,87 @@
1
module Spidr
  #
  # Mixin that lets callers register callback blocks for URLs and pages.
  # The blocks are only collected here; invoking them is left to the
  # object that includes this module.
  #
  module Events
    #
    # Forwards the options to the next +initialize+ in the ancestor chain
    # and then creates the empty callback lists.
    #
    # @param [Hash] options
    #   Options passed through unchanged to +super+.
    #
    def initialize(options={})
      super(options)

      @every_url_blocks        = []
      @every_failed_url_blocks = []
      # every pattern lazily receives its own Array of blocks
      @urls_like_blocks        = Hash.new { |hash,key| hash[key] = [] }

      @every_page_blocks = []
    end

    #
    # Registers a block that is to receive each URL from each page visited.
    #
    # @yield [url]
    #   The block will be passed every URL from every page visited.
    #
    # @yieldparam [URI::HTTP] url
    #   Each URL from each page visited.
    #
    # @return [self]
    #   The receiver, so registrations may be chained.
    #
    def every_url(&block)
      @every_url_blocks.push(block)
      self
    end

    #
    # Registers a block that is to receive each URL that could not be
    # requested.
    #
    # @yield [url]
    #   The block will be passed every URL that could not be requested.
    #
    # @yieldparam [URI::HTTP] url
    #   A failed URL.
    #
    # @return [self]
    #   The receiver, so registrations may be chained.
    #
    def every_failed_url(&block)
      @every_failed_url_blocks.push(block)
      self
    end

    #
    # Registers a block that is to receive every visited URL matching a
    # given pattern.
    #
    # @param [Regexp, String] pattern
    #   The pattern to match URLs with. It is used as the key under which
    #   the block is stored.
    #
    # @yield [url]
    #   The block will be passed every URL that matches the given pattern.
    #
    # @yieldparam [URI::HTTP] url
    #   A matching URL.
    #
    # @return [self]
    #   The receiver, so registrations may be chained.
    #
    def urls_like(pattern,&block)
      @urls_like_blocks[pattern].push(block)
      self
    end

    #
    # Registers a block that is to receive every page the agent visits.
    #
    # @yield [page]
    #   The block will be passed every page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    # @return [self]
    #   The receiver, so registrations may be chained.
    #
    def every_page(&block)
      @every_page_blocks.push(block)
      self
    end

    #
    # Registers a block that is to receive the headers of every response.
    # Implemented as an +every_page+ callback that hands +page.headers+
    # to the given block.
    #
    # @yield [headers]
    #   The block will be passed the headers of every response.
    #
    # @yieldparam [Hash] headers
    #   The headers from a response.
    #
    # @return [self]
    #   The value returned by +every_page+.
    #
    def all_headers(&block)
      every_page do |page|
        block.call(page.headers)
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'spidr/extensions/uri'
@@ -0,0 +1,45 @@
1
+ require 'uri'
2
+
3
# Reopens the standard library URI module to add a path helper.
module URI
  #
  # Collapses repeated slashes and resolves the "." and ".." segments of
  # a decoded URI path.
  #
  # @param [String] path
  #   The path from a URI.
  #
  # @return [String]
  #   The expanded path. A ".." segment never removes a leading "/".
  #
  # @example
  #   URI.expand_path('./path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('test/../path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('/test/path/')
  #   # => "/test/path/"
  #
  # @example
  #   URI.expand_path('/test/../path')
  #   # => "/path"
  #
  def self.expand_path(path)
    # Each segment keeps its trailing slash; a final segment without a
    # slash is captured by the second alternative.
    segments = path.gsub(%r{/{2,}}, '/').scan(%r{[^/]*/|[^/]+$})

    expanded = segments.inject([]) do |stack,segment|
      case segment
      when '..', '../'
        # step up one level, but never above the root
        stack.pop unless stack == ['/']
      when '.', './'
        # the current directory contributes nothing
      else
        stack.push(segment)
      end

      stack
    end

    expanded.join
  end
end
@@ -0,0 +1,438 @@
1
+ require 'spidr/rules'
2
+
3
module Spidr
  #
  # Mixin holding the filtering rules of a spidering agent: the accepted
  # URI schemes plus accept/reject patterns for host-names, ports, links
  # and URI path extensions.
  #
  module Filters
    #
    # Hook run when the module is included; defines the +schemes+ reader
    # on the including class or module.
    #
    # @param [Module] base
    #   The class or module including {Filters}.
    #
    def self.included(base)
      base.module_eval do
        # List of acceptable URL schemes to follow
        attr_reader :schemes
      end
    end

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Entries are converted
    #   to Strings.
    #   The +https+ scheme will be ignored if +net/https+ cannot be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   If given, assigned through the +queue=+ writer, which the including
    #   object is expected to provide.
    #
    # @option options [Object] :history
    #   If given, assigned through the +history=+ writer, which the
    #   including object is expected to provide.
    #
    def initialize(options={})
      if options[:schemes]
        # Convert to Strings exactly like #schemes= does; otherwise Symbol
        # schemes such as :http would never equal the String scheme that
        # visit_scheme? looks up.
        @schemes = options[:schemes].map { |scheme| scheme.to_s }
      else
        @schemes = ['http']

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError => e
          # a RubyGems activation problem is not a missing library; re-raise
          raise(e)
        rescue ::LoadError
          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit. Each entry is converted with +to_s+.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map { |scheme| scheme.to_s }
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the visit_hosts.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #
    def visit_hosts_like(pattern=nil,&block)
      filter = pattern || block
      visit_hosts << filter if filter

      return self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the ignore_hosts.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #
    def ignore_hosts_like(pattern=nil,&block)
      filter = pattern || block
      ignore_hosts << filter if filter

      return self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the visit_ports.
    # A given pattern takes precedence over a given block.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #
    def visit_ports_like(pattern=nil,&block)
      filter = pattern || block
      visit_ports << filter if filter

      return self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the ignore_ports.
    # A given pattern takes precedence over a given block.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #
    def ignore_ports_like(pattern=nil,&block)
      filter = pattern || block
      ignore_ports << filter if filter

      return self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the visit_links.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #
    def visit_links_like(pattern=nil,&block)
      filter = pattern || block
      visit_links << filter if filter

      return self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the ignore_links.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #
    def ignore_links_like(pattern=nil,&block)
      filter = pattern || block
      ignore_links << filter if filter

      return self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the visit_exts.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #
    def visit_exts_like(pattern=nil,&block)
      filter = pattern || block
      visit_exts << filter if filter

      return self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the ignore_exts.
    # A given pattern takes precedence over a given block.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #
    def ignore_exts_like(pattern=nil,&block)
      filter = pattern || block
      ignore_exts << filter if filter

      return self
    end

    protected

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A missing (+nil+) scheme is always accepted.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # File.extname includes the leading "."; [1..-1] strips it
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end