spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
module Spidr
  #
  # Mixin that stores event callbacks (blocks) on the including object.
  #
  # This module only registers callbacks; invoking them is left to code
  # outside of this module.
  #
  module Events
    #
    # Initializes the callback lists, after passing the options up to
    # +super+.
    #
    # @param [Hash] options
    #   Additional options, passed through to +super+ unchanged.
    #
    def initialize(options={})
      super(options)

      @every_url_blocks = []
      @every_failed_url_blocks = []

      # every pattern lazily receives its own Array of blocks
      @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }

      @every_page_blocks = []
    end

    #
    # Pass each URL from each page visited to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL from every page visited.
    #   If no block is given, nothing is registered.
    #
    # @yieldparam [URI::HTTP] url
    #   Each URL from each page visited.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def every_url(&block)
      # a nil callback would only fail later, when events are dispatched
      @every_url_blocks << block if block
      return self
    end

    #
    # Pass each URL that could not be requested to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL that could not be requested.
    #   If no block is given, nothing is registered.
    #
    # @yieldparam [URI::HTTP] url
    #   A failed URL.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def every_failed_url(&block)
      @every_failed_url_blocks << block if block
      return self
    end

    #
    # Pass every URL that the agent visits, and that matches a given
    # pattern, to a given block.
    #
    # @param [Regexp, String] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   The block will be passed every URL that matches the given pattern.
    #   If no block is given, nothing is registered.
    #
    # @yieldparam [URI::HTTP] url
    #   A matching URL.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def urls_like(pattern,&block)
      # guard first, so that no empty entry is created for the pattern
      @urls_like_blocks[pattern] << block if block
      return self
    end

    #
    # Pass every page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every page visited.
    #   If no block is given, nothing is registered.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def every_page(&block)
      @every_page_blocks << block if block
      return self
    end

    #
    # Pass the headers from every response the agent receives to a given
    # block.
    #
    # @yield [headers]
    #   The block will be passed the headers of every response.
    #   If no block is given, nothing is registered.
    #
    # @yieldparam [Hash] headers
    #   The headers from a response.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def all_headers(&block)
      # without this guard a wrapper calling nil would be registered
      return self unless block

      every_page { |page| block.call(page.headers) }
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'spidr/extensions/uri'
@@ -0,0 +1,45 @@
1
+ require 'uri'
2
+
3
module URI
  #
  # Expands a URI decoded path, into a proper absolute path.
  #
  # Runs of slashes are collapsed into a single slash, <tt>.</tt> segments
  # are dropped and <tt>..</tt> segments remove the preceding segment.
  # A <tt>..</tt> directly under the root is ignored.
  #
  # @param [String] path
  #   The path from a URI.
  #
  # @return [String]
  #   The expanded path.
  #
  # @example
  #   URI.expand_path('./path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('test/../path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('/test/path/')
  #   # => "/test/path/"
  #
  # @example
  #   URI.expand_path('/test/../path')
  #   # => "/path"
  #
  def self.expand_path(path)
    # split into segments that keep their trailing slash, plus an
    # optional final segment that has none
    segments = path.squeeze('/').scan(/[^\/]*\/|[^\/]+$/)
    expanded = []

    segments.each do |segment|
      case segment
      when '..', '../'
        # never climb above the root directory
        expanded.pop unless expanded == ['/']
      when '.', './'
        # the current directory contributes nothing
      else
        expanded.push(segment)
      end
    end

    return expanded.join
  end
end
@@ -0,0 +1,438 @@
1
+ require 'spidr/rules'
2
+
3
module Spidr
  #
  # Mixin that keeps the accept / reject rules used to decide which
  # schemes, host-names, ports, links and URI path extensions are visited.
  #
  module Filters
    #
    # Adds the +schemes+ reader to the class that includes this module.
    #
    # @param [Module] base
    #   The class or module including Filters.
    #
    def self.included(base)
      base.module_eval do
        # List of acceptable URL schemes to follow
        attr_reader :schemes
      end
    end

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Every entry is
    #   converted to a String.
    #   The +https+ scheme will be ignored if +net/https+ cannot be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   Passed to the +queue=+ writer, if given.
    #
    # @option options [Object] :history
    #   Passed to the +history=+ writer, if given.
    #
    def initialize(options={})
      if options[:schemes]
        # go through the writer, so that Symbols and Strings are both
        # stored as Strings, exactly as with a later schemes= call
        self.schemes = options[:schemes]
      else
        @schemes = ['http']

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # a broken gem environment is not the same as missing https
          raise
        rescue ::LoadError
          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      # NOTE(review): queue= and history= are not defined in this module;
      # presumably the including class provides them - confirm.
      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit. Every entry is converted to a String.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map { |scheme| scheme.to_s }
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the visit_hosts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block, if both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def visit_hosts_like(pattern=nil,&block)
      rule = (pattern || block)

      visit_hosts << rule if rule
      return self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the ignore_hosts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block, if both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def ignore_hosts_like(pattern=nil,&block)
      rule = (pattern || block)

      ignore_hosts << rule if rule
      return self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the visit_ports.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block,
    #   if both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def visit_ports_like(pattern=nil,&block)
      rule = (pattern || block)

      visit_ports << rule if rule
      return self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the ignore_ports.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block,
    #   if both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def ignore_ports_like(pattern=nil,&block)
      rule = (pattern || block)

      ignore_ports << rule if rule
      return self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the visit_links.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block,
    #   if both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def visit_links_like(pattern=nil,&block)
      rule = (pattern || block)

      visit_links << rule if rule
      return self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the ignore_links.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block,
    #   if both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def ignore_links_like(pattern=nil,&block)
      rule = (pattern || block)

      ignore_links << rule if rule
      return self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the visit_exts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block, if both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def visit_exts_like(pattern=nil,&block)
      rule = (pattern || block)

      visit_exts << rule if rule
      return self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the ignore_exts.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block, if both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls may be chained.
    #
    def ignore_exts_like(pattern=nil,&block)
      rule = (pattern || block)

      ignore_exts << rule if rule
      return self
    end

    protected

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A missing (+nil+) scheme is always visited.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      return @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # drop the leading '.'; a path without an extension yields nil
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end