spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,537 @@
1
+ module Spidr
2
+ #
3
+ # The {Events} module adds methods to {Agent} for registering
4
+ # callbacks which will receive URLs, links, headers and pages, when
5
+ # they are visited.
6
+ #
7
+ module Events
8
+ #
9
+ # Pass each URL from each page visited to the given block.
10
+ #
11
+ # @yield [url]
12
+ # The block will be passed every URL from every page visited.
13
+ #
14
+ # @yieldparam [URI::HTTP] url
15
+ # Each URL from each page visited.
16
+ #
17
+ def every_url(&block)
18
+ @every_url_blocks << block
19
+ return self
20
+ end
21
+
22
+ #
23
+ # Pass each URL that could not be requested to the given block.
24
+ #
25
+ # @yield [url]
26
+ # The block will be passed every URL that could not be requested.
27
+ #
28
+ # @yieldparam [URI::HTTP] url
29
+ # A failed URL.
30
+ #
31
+ def every_failed_url(&block)
32
+ @every_failed_url_blocks << block
33
+ return self
34
+ end
35
+
36
+ #
37
+ # Pass every URL that the agent visits and that matches a given pattern,
38
+ # to a given block.
39
+ #
40
+ # @param [Regexp, String] pattern
41
+ # The pattern to match URLs with.
42
+ #
43
+ # @yield [url]
44
+ # The block will be passed every URL that matches the given pattern.
45
+ #
46
+ # @yieldparam [URI::HTTP] url
47
+ # A matching URL.
48
+ #
49
+ # @since 0.3.2
50
+ #
51
+ def every_url_like(pattern,&block)
52
+ @every_url_like_blocks[pattern] << block
53
+ return self
54
+ end
55
+
56
+ #
57
+ # @see #every_url_like
58
+ #
59
+ def urls_like(pattern,&block)
60
+ every_url_like(pattern,&block)
61
+ end
62
+
63
+ #
64
+ # Pass the headers from every response the agent receives to a given
65
+ # block.
66
+ #
67
+ # @yield [headers]
68
+ # The block will be passed the headers of every response.
69
+ #
70
+ # @yieldparam [Hash] headers
71
+ # The headers from a response.
72
+ #
73
+ def all_headers
74
+ every_page { |page| yield page.headers }
75
+ end
76
+
77
+ #
78
+ # Pass every page that the agent visits to a given block.
79
+ #
80
+ # @yield [page]
81
+ # The block will be passed every page visited.
82
+ #
83
+ # @yieldparam [Page] page
84
+ # A visited page.
85
+ #
86
+ def every_page(&block)
87
+ @every_page_blocks << block
88
+ return self
89
+ end
90
+
91
+ #
92
+ # Pass every OK page that the agent visits to a given block.
93
+ #
94
+ # @yield [page]
95
+ # The block will be passed every OK page visited.
96
+ #
97
+ # @yieldparam [Page] page
98
+ # A visited page.
99
+ #
100
+ def every_ok_page
101
+ every_page do |page|
102
+ yield page if (block_given? && page.ok?)
103
+ end
104
+ end
105
+
106
+ #
107
+ # Pass every Redirect page that the agent visits to a given block.
108
+ #
109
+ # @yield [page]
110
+ # The block will be passed every Redirect page visited.
111
+ #
112
+ # @yieldparam [Page] page
113
+ # A visited page.
114
+ #
115
+ def every_redirect_page
116
+ every_page do |page|
117
+ yield page if (block_given? && page.redirect?)
118
+ end
119
+ end
120
+
121
+ #
122
+ # Pass every Timeout page that the agent visits to a given block.
123
+ #
124
+ # @yield [page]
125
+ # The block will be passed every Timeout page visited.
126
+ #
127
+ # @yieldparam [Page] page
128
+ # A visited page.
129
+ #
130
+ def every_timedout_page
131
+ every_page do |page|
132
+ yield page if (block_given? && page.timedout?)
133
+ end
134
+ end
135
+
136
+ #
137
+ # Pass every Bad Request page that the agent visits to a given block.
138
+ #
139
+ # @yield [page]
140
+ # The block will be passed every Bad Request page visited.
141
+ #
142
+ # @yieldparam [Page] page
143
+ # A visited page.
144
+ #
145
+ def every_bad_request_page
146
+ every_page do |page|
147
+ yield page if (block_given? && page.bad_request?)
148
+ end
149
+ end
150
+
151
+ #
152
+ # Pass every Unauthorized page that the agent visits to a given block.
153
+ #
154
+ # @yield [page]
155
+ # The block will be passed every Unauthorized page visited.
156
+ #
157
+ # @yieldparam [Page] page
158
+ # A visited page.
159
+ #
160
+ def every_unauthorized_page
161
+ every_page do |page|
162
+ yield page if (block_given? && page.unauthorized?)
163
+ end
164
+ end
165
+
166
+ #
167
+ # Pass every Forbidden page that the agent visits to a given block.
168
+ #
169
+ # @yield [page]
170
+ # The block will be passed every Forbidden page visited.
171
+ #
172
+ # @yieldparam [Page] page
173
+ # A visited page.
174
+ #
175
+ def every_forbidden_page
176
+ every_page do |page|
177
+ yield page if (block_given? && page.forbidden?)
178
+ end
179
+ end
180
+
181
+ #
182
+ # Pass every Missing page that the agent visits to a given block.
183
+ #
184
+ # @yield [page]
185
+ # The block will be passed every Missing page visited.
186
+ #
187
+ # @yieldparam [Page] page
188
+ # A visited page.
189
+ #
190
+ def every_missing_page
191
+ every_page do |page|
192
+ yield page if (block_given? && page.missing?)
193
+ end
194
+ end
195
+
196
+ #
197
+ # Pass every Internal Server Error page that the agent visits to a
198
+ # given block.
199
+ #
200
+ # @yield [page]
201
+ # The block will be passed every Internal Server Error page visited.
202
+ #
203
+ # @yieldparam [Page] page
204
+ # A visited page.
205
+ #
206
+ def every_internal_server_error_page
207
+ every_page do |page|
208
+ yield page if (block_given? && page.had_internal_server_error?)
209
+ end
210
+ end
211
+
212
+ #
213
+ # Pass every Plain Text page that the agent visits to a given block.
214
+ #
215
+ # @yield [page]
216
+ # The block will be passed every Plain Text page visited.
217
+ #
218
+ # @yieldparam [Page] page
219
+ # A visited page.
220
+ #
221
+ def every_txt_page
222
+ every_page do |page|
223
+ yield page if (block_given? && page.txt?)
224
+ end
225
+ end
226
+
227
+ #
228
+ # Pass every HTML page that the agent visits to a given block.
229
+ #
230
+ # @yield [page]
231
+ # The block will be passed every HTML page visited.
232
+ #
233
+ # @yieldparam [Page] page
234
+ # A visited page.
235
+ #
236
+ def every_html_page
237
+ every_page do |page|
238
+ yield page if (block_given? && page.html?)
239
+ end
240
+ end
241
+
242
+ #
243
+ # Pass every XML page that the agent visits to a given block.
244
+ #
245
+ # @yield [page]
246
+ # The block will be passed every XML page visited.
247
+ #
248
+ # @yieldparam [Page] page
249
+ # A visited page.
250
+ #
251
+ def every_xml_page
252
+ every_page do |page|
253
+ yield page if (block_given? && page.xml?)
254
+ end
255
+ end
256
+
257
+ #
258
+ # Pass every XML Stylesheet (XSL) page that the agent visits to a
259
+ # given block.
260
+ #
261
+ # @yield [page]
262
+ # The block will be passed every XML Stylesheet (XSL) page visited.
263
+ #
264
+ # @yieldparam [Page] page
265
+ # A visited page.
266
+ #
267
+ def every_xsl_page
268
+ every_page do |page|
269
+ yield page if (block_given? && page.xsl?)
270
+ end
271
+ end
272
+
273
+ #
274
+ # Pass every HTML or XML document that the agent parses to a given
275
+ # block.
276
+ #
277
+ # @yield [doc]
278
+ # The block will be passed every HTML or XML document parsed.
279
+ #
280
+ # @yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
281
+ # A parsed HTML or XML document.
282
+ #
283
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
284
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
285
+ #
286
+ def every_doc
287
+ every_page do |page|
288
+ if block_given?
289
+ if (doc = page.doc)
290
+ yield doc
291
+ end
292
+ end
293
+ end
294
+ end
295
+
296
+ #
297
+ # Pass every HTML document that the agent parses to a given block.
298
+ #
299
+ # @yield [doc]
300
+ # The block will be passed every HTML document parsed.
301
+ #
302
+ # @yieldparam [Nokogiri::HTML::Document] doc
303
+ # A parsed HTML document.
304
+ #
305
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
306
+ #
307
+ def every_html_doc
308
+ every_page do |page|
309
+ if (block_given? && page.html?)
310
+ if (doc = page.doc)
311
+ yield doc
312
+ end
313
+ end
314
+ end
315
+ end
316
+
317
+ #
318
+ # Pass every XML document that the agent parses to a given block.
319
+ #
320
+ # @yield [doc]
321
+ # The block will be passed every XML document parsed.
322
+ #
323
+ # @yieldparam [Nokogiri::XML::Document] doc
324
+ # A parsed XML document.
325
+ #
326
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
327
+ #
328
+ def every_xml_doc
329
+ every_page do |page|
330
+ if (block_given? && page.xml?)
331
+ if (doc = page.doc)
332
+ yield doc
333
+ end
334
+ end
335
+ end
336
+ end
337
+
338
+ #
339
+ # Pass every XML Stylesheet (XSL) that the agent parses to a given
340
+ # block.
341
+ #
342
+ # @yield [doc]
343
+ # The block will be passed every XML Stylesheet (XSL) parsed.
344
+ #
345
+ # @yieldparam [Nokogiri::XML::Document] doc
346
+ # A parsed XML document.
347
+ #
348
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
349
+ #
350
+ def every_xsl_doc
351
+ every_page do |page|
352
+ if (block_given? && page.xsl?)
353
+ if (doc = page.doc)
354
+ yield doc
355
+ end
356
+ end
357
+ end
358
+ end
359
+
360
+ #
361
+ # Pass every RSS document that the agent parses to a given block.
362
+ #
363
+ # @yield [doc]
364
+ # The block will be passed every RSS document parsed.
365
+ #
366
+ # @yieldparam [Nokogiri::XML::Document] doc
367
+ # A parsed XML document.
368
+ #
369
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
370
+ #
371
+ def every_rss_doc
372
+ every_page do |page|
373
+ if (block_given? && page.rss?)
374
+ if (doc = page.doc)
375
+ yield doc
376
+ end
377
+ end
378
+ end
379
+ end
380
+
381
+ #
382
+ # Pass every Atom document that the agent parses to a given block.
383
+ #
384
+ # @yield [doc]
385
+ # The block will be passed every Atom document parsed.
386
+ #
387
+ # @yieldparam [Nokogiri::XML::Document] doc
388
+ # A parsed XML document.
389
+ #
390
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
391
+ #
392
+ def every_atom_doc
393
+ every_page do |page|
394
+ if (block_given? && page.atom?)
395
+ if (doc = page.doc)
396
+ yield doc
397
+ end
398
+ end
399
+ end
400
+ end
401
+
402
+ #
403
+ # Pass every JavaScript page that the agent visits to a given block.
404
+ #
405
+ # @yield [page]
406
+ # The block will be passed every JavaScript page visited.
407
+ #
408
+ # @yieldparam [Page] page
409
+ # A visited page.
410
+ #
411
+ def every_javascript_page
412
+ every_page do |page|
413
+ yield page if (block_given? && page.javascript?)
414
+ end
415
+ end
416
+
417
+ #
418
+ # Pass every CSS page that the agent visits to a given block.
419
+ #
420
+ # @yield [page]
421
+ # The block will be passed every CSS page visited.
422
+ #
423
+ # @yieldparam [Page] page
424
+ # A visited page.
425
+ #
426
+ def every_css_page
427
+ every_page do |page|
428
+ yield page if (block_given? && page.css?)
429
+ end
430
+ end
431
+
432
+ #
433
+ # Pass every RSS feed that the agent visits to a given block.
434
+ #
435
+ # @yield [feed]
436
+ # The block will be passed every RSS feed visited.
437
+ #
438
+ # @yieldparam [Page] feed
439
+ # A visited page.
440
+ #
441
+ def every_rss_page
442
+ every_page do |page|
443
+ yield page if (block_given? && page.rss?)
444
+ end
445
+ end
446
+
447
+ #
448
+ # Pass every Atom feed that the agent visits to a given block.
449
+ #
450
+ # @yield [feed]
451
+ # The block will be passed every Atom feed visited.
452
+ #
453
+ # @yieldparam [Page] feed
454
+ # A visited page.
455
+ #
456
+ def every_atom_page
457
+ every_page do |page|
458
+ yield page if (block_given? && page.atom?)
459
+ end
460
+ end
461
+
462
+ #
463
+ # Pass every MS Word page that the agent visits to a given block.
464
+ #
465
+ # @yield [page]
466
+ # The block will be passed every MS Word page visited.
467
+ #
468
+ # @yieldparam [Page] page
469
+ # A visited page.
470
+ #
471
+ def every_ms_word_page
472
+ every_page do |page|
473
+ yield page if (block_given? && page.ms_word?)
474
+ end
475
+ end
476
+
477
+ #
478
+ # Pass every PDF page that the agent visits to a given block.
479
+ #
480
+ # @yield [page]
481
+ # The block will be passed every PDF page visited.
482
+ #
483
+ # @yieldparam [Page] page
484
+ # A visited page.
485
+ #
486
+ def every_pdf_page
487
+ every_page do |page|
488
+ yield page if (block_given? && page.pdf?)
489
+ end
490
+ end
491
+
492
+ #
493
+ # Pass every ZIP page that the agent visits to a given block.
494
+ #
495
+ # @yield [page]
496
+ # The block will be passed every ZIP page visited.
497
+ #
498
+ # @yieldparam [Page] page
499
+ # A visited page.
500
+ #
501
+ def every_zip_page
502
+ every_page do |page|
503
+ yield page if (block_given? && page.zip?)
504
+ end
505
+ end
506
+
507
+ #
508
+ # Passes every origin and destination URI of each link to a given
509
+ # block.
510
+ #
511
+ # @yield [origin,dest]
512
+ # The block will be passed every origin and destination URI of
513
+ # each link.
514
+ #
515
+ # @yieldparam [URI::HTTP] origin
516
+ # The URI that a link originated from.
517
+ #
518
+ # @yieldparam [URI::HTTP] dest
519
+ # The destination URI of a link.
520
+ #
521
+ def every_link(&block)
522
+ @every_link_blocks << block
523
+ return self
524
+ end
525
+
526
+ protected
527
+
528
+ def initialize_events(options={})
529
+ @every_url_blocks = []
530
+ @every_failed_url_blocks = []
531
+ @every_url_like_blocks = Hash.new { |hash,key| hash[key] = [] }
532
+
533
+ @every_page_blocks = []
534
+ @every_link_blocks = []
535
+ end
536
+ end
537
+ end
@@ -0,0 +1,52 @@
1
+ require 'uri'
2
+
3
+ module URI
4
+ #
5
+ # Expands a URI-decoded path by resolving its '.' and '..' segments.
6
+ #
7
+ # @param [String] path
8
+ # The path from a URI.
9
+ #
10
+ # @return [String]
11
+ # The expanded path.
12
+ #
13
+ # @example
14
+ # URI.expand_path('./path')
15
+ # # => "path"
16
+ #
17
+ # @example
18
+ # URI.expand_path('test/../path')
19
+ # # => "path"
20
+ #
21
+ # @example
22
+ # URI.expand_path('/test/path/')
23
+ # # => "/test/path/"
24
+ #
25
+ # @example
26
+ # URI.expand_path('/test/../path')
27
+ # # => "/path"
28
+ #
29
+ def URI.expand_path(path)
30
+ dirs = path.split(/\/+/)
31
+
32
+ # append any trailing '/' chars, lost due to String#split
33
+ dirs << '' if path[-1,1] == '/'
34
+
35
+ new_dirs = []
36
+
37
+ dirs.each do |dir|
38
+ if dir == '..'
39
+ new_dirs.pop
40
+ elsif dir != '.'
41
+ new_dirs.push(dir)
42
+ end
43
+ end
44
+
45
+ full_path = new_dirs.join('/')
46
+
47
+ # default empty paths to '/'
48
+ full_path = '/' if full_path.empty?
49
+
50
+ return full_path
51
+ end
52
+ end
@@ -0,0 +1 @@
1
+ require 'spidrs/extensions/uri'