spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,539 @@
1
+ require 'spidrs/rules'
2
+
3
module Spidr
  #
  # The {Filters} module adds methods to {Agent} for controlling which
  # URLs the agent will visit.
  #
  # Every kind of filter (hosts, ports, links, URLs and URI path
  # extensions) is stored in its own {Rules} object, created by
  # {#initialize_filters}. The `visit_*` readers expose the `accept` list
  # of that object, the `ignore_*` readers expose its `reject` list, and
  # the protected `visit_*?` predicates delegate to `Rules#accept?`.
  #
  module Filters
    # List of acceptable URL schemes to follow
    attr_reader :schemes

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array, String, Symbol] new_schemes
    #   The new schemes to visit. A single scheme is wrapped into a list,
    #   and every entry is converted to a String.
    #
    # @return [Array<String>]
    #   The normalized list of schemes.
    #
    # @example
    #   agent.schemes = ['http']
    #
    # @example Symbols and single values are accepted as well
    #   agent.schemes = :https
    #
    def schemes=(new_schemes)
      @schemes = Array(new_schemes).map(&:to_s)
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block when both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def visit_hosts_like(pattern=nil,&block)
      rule = pattern || block
      visit_hosts << rule if rule

      self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block when both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def ignore_hosts_like(pattern=nil,&block)
      rule = pattern || block
      ignore_hosts << rule if rule

      self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def visit_ports_like(pattern=nil,&block)
      rule = pattern || block
      visit_ports << rule if rule

      self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def ignore_ports_like(pattern=nil,&block)
      rule = pattern || block
      ignore_ports << rule if rule

      self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    # @since 0.2.4
    #
    def visit_links_like(pattern=nil,&block)
      rule = pattern || block
      visit_links << rule if rule

      self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def ignore_links_like(pattern=nil,&block)
      rule = pattern || block
      ignore_links << rule if rule

      self
    end

    #
    # Specifies the patterns that match the URLs to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_urls
      @url_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    # @since 0.2.4
    #
    def visit_urls_like(pattern=nil,&block)
      rule = pattern || block
      visit_urls << rule if rule

      self
    end

    #
    # Specifies the patterns that match URLs to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to not visit.
    #
    # @since 0.2.4
    #
    def ignore_urls
      @url_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    # @since 0.2.4
    #
    def ignore_urls_like(pattern=nil,&block)
      rule = pattern || block
      ignore_urls << rule if rule

      self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block when both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def visit_exts_like(pattern=nil,&block)
      rule = pattern || block
      visit_exts << rule if rule

      self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block when both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #   The receiver, so that calls can be chained.
    #
    def ignore_exts_like(pattern=nil,&block)
      rule = pattern || block
      ignore_exts << rule if rule

      self
    end

    protected

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Entries are
    #   converted to Strings, exactly as {#schemes=} does.
    #   The `https` scheme will be ignored if `net/https` cannot be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :urls
    #   The patterns which match the URLs to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_urls
    #   The patterns which match the URLs to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   When given, assigned through `queue=`. That writer is not defined
    #   in this module; presumably the including class provides it
    #   (TODO confirm against Agent).
    #
    # @option options [Object] :history
    #   When given, assigned through `history=`. That writer is not
    #   defined in this module; presumably the including class provides
    #   it (TODO confirm against Agent).
    #
    def initialize_filters(options={})
      if options[:schemes]
        # Go through the public writer so that Symbols are normalized to
        # Strings; otherwise `:schemes => [:http]` would never match the
        # String schemes checked by #visit_scheme?.
        self.schemes = options[:schemes]
      else
        @schemes = ['http']

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # Gem::LoadError is a subclass of LoadError, so it has to be
          # rescued first in order to be re-raised instead of being
          # downgraded to the warning below.
          raise
        rescue ::LoadError
          warn "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @url_rules = Rules.new(
        :accept => options[:urls],
        :reject => options[:ignore_urls]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String, nil] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited. A missing
    #   (`nil` or `false`) scheme is always allowed.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URL should be visited.
    #
    # @param [URI::HTTP, URI::HTTPS] link
    #   The URL.
    #
    # @return [Boolean]
    #   Specifies whether the given URL should be visited.
    #
    # @since 0.2.4
    #
    def visit_url?(link)
      @url_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # File.extname includes the leading dot, which is stripped here.
      # For a path without an extension it returns "", and ""[1..-1] is
      # nil, so the extension rules are then asked about nil.
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end