spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,539 @@
1
+ require 'spidrs/rules'
2
+
3
module Spidr
  #
  # The {Filters} module adds methods to {Agent} for controlling which
  # URLs the agent will visit.
  #
  # Each kind of filter (hosts, ports, links, URLs, path extensions) is
  # backed by one `Rules` object that holds an accept list and a reject
  # list. The public `visit_*` / `ignore_*` readers expose those lists and
  # the `*_like` methods append to them.
  #
  module Filters
    # List of acceptable URL schemes to follow
    attr_reader :schemes

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit. Every element is converted with `to_s`.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map(&:to_s)
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block when both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #
    def visit_hosts_like(pattern=nil,&block)
      rule = pattern || block
      visit_hosts << rule if rule

      self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with. Takes precedence over the
    #   block when both are given.
    #
    # @yield [host]
    #   If a block is given, it will be used to filter host-names.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #
    def ignore_hosts_like(pattern=nil,&block)
      rule = pattern || block
      ignore_hosts << rule if rule

      self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #
    def visit_ports_like(pattern=nil,&block)
      rule = pattern || block
      visit_ports << rule if rule

      self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array<Integer, Regexp, Proc>]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [port]
    #   If a block is given, it will be used to filter ports.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #
    def ignore_ports_like(pattern=nil,&block)
      rule = pattern || block
      ignore_ports << rule if rule

      self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_links_like(pattern=nil,&block)
      rule = pattern || block
      visit_links << rule if rule

      self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [link]
    #   If a block is given, it will be used to filter links.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #
    def ignore_links_like(pattern=nil,&block)
      rule = pattern || block
      ignore_links << rule if rule

      self
    end

    #
    # Specifies the patterns that match the URLs to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_urls
      @url_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_urls_like(pattern=nil,&block)
      rule = pattern || block
      visit_urls << rule if rule

      self
    end

    #
    # Specifies the patterns that match URLs to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URL patterns to not visit.
    #
    # @since 0.2.4
    #
    def ignore_urls
      @url_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with. Takes precedence over the block
    #   when both are given.
    #
    # @yield [url]
    #   If a block is given, it will be used to filter URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to reject or accept.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def ignore_urls_like(pattern=nil,&block)
      rule = pattern || block
      ignore_urls << rule if rule

      self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block when both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #
    def visit_exts_like(pattern=nil,&block)
      rule = pattern || block
      visit_exts << rule if rule

      self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array<String, Regexp, Proc>]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with. Takes precedence
    #   over the block when both are given.
    #
    # @yield [ext]
    #   If a block is given, it will be used to filter URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #
    def ignore_exts_like(pattern=nil,&block)
      rule = pattern || block
      ignore_exts << rule if rule

      self
    end

    protected

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Elements are converted
    #   to Strings, so Symbols such as `:http` are accepted.
    #   When this option is omitted, the `https` scheme is only enabled if
    #   `net/https` can be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array<Integer, Regexp, Proc>] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :urls
    #   The patterns which match the URLs to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_urls
    #   The patterns which match the URLs to not visit.
    #
    # @option options [Array<String, Regexp, Proc>] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array<String, Regexp, Proc>] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   If given, assigned through `self.queue=`. That writer is not defined
    #   in this module; it is expected to come from the including class.
    #
    # @option options [Object] :history
    #   If given, assigned through `self.history=`. That writer is not
    #   defined in this module; it is expected to come from the including
    #   class.
    #
    # @raise [Gem::LoadError]
    #   Re-raised untouched if RubyGems itself fails while loading
    #   `net/https`.
    #
    def initialize_filters(options={})
      @schemes = []

      if options[:schemes]
        # Go through the writer so the option is normalized exactly like
        # `agent.schemes = [...]`; otherwise Symbol schemes would never
        # match in #visit_scheme?.
        self.schemes = options[:schemes]
      else
        @schemes << 'http'

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # Gem::LoadError is a LoadError subclass; it must be listed first
          # so that it propagates instead of being downgraded to a warning.
          raise
        rescue ::LoadError
          warn "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @url_rules = Rules.new(
        :accept => options[:urls],
        :reject => options[:ignore_urls]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue   = options[:queue]   if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A `nil` (or `false`) scheme is always allowed.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URL should be visited.
    #
    # @param [URI::HTTP, URI::HTTPS] link
    #   The URL.
    #
    # @return [Boolean]
    #   Specifies whether the given URL should be visited.
    #
    # @since 0.2.4
    #
    def visit_url?(link)
      @url_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # File.extname returns ".html" (or "" when there is no extension);
      # the slice drops the leading dot and yields nil for the "" case,
      # so the rules are tested against nil for extension-less paths.
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end