spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,537 @@
1
module Spidr
  #
  # The {Events} module adds methods to {Agent} for registering
  # callbacks which will receive URLs, links, headers and pages, when
  # they are visited.
  #
  # Registration methods return `self`, so calls may be chained. Calling a
  # registration method without a block registers nothing.
  #
  module Events
    #
    # Pass each URL from each page visited to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL from every page visited.
    #
    # @yieldparam [URI::HTTP] url
    #   Each URL from each page visited.
    #
    def every_url(&block)
      # a missing block would otherwise be stored as nil
      @every_url_blocks << block if block
      return self
    end

    #
    # Pass each URL that could not be requested to the given block.
    #
    # @yield [url]
    #   The block will be passed every URL that could not be requested.
    #
    # @yieldparam [URI::HTTP] url
    #   A failed URL.
    #
    def every_failed_url(&block)
      @every_failed_url_blocks << block if block
      return self
    end

    #
    # Pass every URL that the agent visits, and matches a given pattern,
    # to a given block.
    #
    # @param [Regexp, String] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   The block will be passed every URL that matches the given pattern.
    #
    # @yieldparam [URI::HTTP] url
    #   A matching URL.
    #
    # @since 0.3.2
    #
    def every_url_like(pattern,&block)
      # the guard runs first, so no empty entry is created for the pattern
      @every_url_like_blocks[pattern] << block if block
      return self
    end

    #
    # @see #every_url_like
    #
    def urls_like(pattern,&block)
      every_url_like(pattern,&block)
    end

    #
    # Pass the headers from every response the agent receives to a given
    # block.
    #
    # @yield [headers]
    #   The block will be passed the headers of every response.
    #
    # @yieldparam [Hash] headers
    #   The headers from a response.
    #
    def all_headers
      every_page do |page|
        # guarded like the other filters, so a missing block is a no-op
        yield page.headers if block_given?
      end
    end

    #
    # Pass every page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_page(&block)
      @every_page_blocks << block if block
      return self
    end

    #
    # Pass every OK page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every OK page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_ok_page
      every_page do |page|
        yield page if (block_given? && page.ok?)
      end
    end

    #
    # Pass every Redirect page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Redirect page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_redirect_page
      every_page do |page|
        yield page if (block_given? && page.redirect?)
      end
    end

    #
    # Pass every Timeout page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Timeout page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_timedout_page
      every_page do |page|
        yield page if (block_given? && page.timedout?)
      end
    end

    #
    # Pass every Bad Request page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Bad Request page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_bad_request_page
      every_page do |page|
        yield page if (block_given? && page.bad_request?)
      end
    end

    #
    # Pass every Unauthorized page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Unauthorized page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_unauthorized_page
      every_page do |page|
        yield page if (block_given? && page.unauthorized?)
      end
    end

    #
    # Pass every Forbidden page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Forbidden page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_forbidden_page
      every_page do |page|
        yield page if (block_given? && page.forbidden?)
      end
    end

    #
    # Pass every Missing page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Missing page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_missing_page
      every_page do |page|
        yield page if (block_given? && page.missing?)
      end
    end

    #
    # Pass every Internal Server Error page that the agent visits to a
    # given block.
    #
    # @yield [page]
    #   The block will be passed every Internal Server Error page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_internal_server_error_page
      every_page do |page|
        yield page if (block_given? && page.had_internal_server_error?)
      end
    end

    #
    # Pass every Plain Text page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every Plain Text page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_txt_page
      every_page do |page|
        yield page if (block_given? && page.txt?)
      end
    end

    #
    # Pass every HTML page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every HTML page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_html_page
      every_page do |page|
        yield page if (block_given? && page.html?)
      end
    end

    #
    # Pass every XML page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every XML page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_xml_page
      every_page do |page|
        yield page if (block_given? && page.xml?)
      end
    end

    #
    # Pass every XML Stylesheet (XSL) page that the agent visits to a
    # given block.
    #
    # @yield [page]
    #   The block will be passed every XML Stylesheet (XSL) page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_xsl_page
      every_page do |page|
        yield page if (block_given? && page.xsl?)
      end
    end

    #
    # Pass every HTML or XML document that the agent parses to a given
    # block.
    #
    # @yield [doc]
    #   The block will be passed every HTML or XML document parsed.
    #
    # @yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
    #   A parsed HTML or XML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def every_doc
      every_page do |page|
        if block_given?
          # pages without a parsed document are skipped
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every HTML document that the agent parses to a given block.
    #
    # @yield [doc]
    #   The block will be passed every HTML document parsed.
    #
    # @yieldparam [Nokogiri::HTML::Document] doc
    #   A parsed HTML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def every_html_doc
      every_page do |page|
        if (block_given? && page.html?)
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every XML document that the agent parses to a given block.
    #
    # @yield [doc]
    #   The block will be passed every XML document parsed.
    #
    # @yieldparam [Nokogiri::XML::Document] doc
    #   A parsed XML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    #
    def every_xml_doc
      every_page do |page|
        if (block_given? && page.xml?)
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every XML Stylesheet (XSL) that the agent parses to a given
    # block.
    #
    # @yield [doc]
    #   The block will be passed every XML Stylesheet (XSL) parsed.
    #
    # @yieldparam [Nokogiri::XML::Document] doc
    #   A parsed XML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    #
    def every_xsl_doc
      every_page do |page|
        if (block_given? && page.xsl?)
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every RSS document that the agent parses to a given block.
    #
    # @yield [doc]
    #   The block will be passed every RSS document parsed.
    #
    # @yieldparam [Nokogiri::XML::Document] doc
    #   A parsed XML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    #
    def every_rss_doc
      every_page do |page|
        if (block_given? && page.rss?)
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every Atom document that the agent parses to a given block.
    #
    # @yield [doc]
    #   The block will be passed every Atom document parsed.
    #
    # @yieldparam [Nokogiri::XML::Document] doc
    #   A parsed XML document.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    #
    def every_atom_doc
      every_page do |page|
        if (block_given? && page.atom?)
          if (doc = page.doc)
            yield doc
          end
        end
      end
    end

    #
    # Pass every JavaScript page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every JavaScript page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_javascript_page
      every_page do |page|
        yield page if (block_given? && page.javascript?)
      end
    end

    #
    # Pass every CSS page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every CSS page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_css_page
      every_page do |page|
        yield page if (block_given? && page.css?)
      end
    end

    #
    # Pass every RSS feed that the agent visits to a given block.
    #
    # @yield [feed]
    #   The block will be passed every RSS feed visited.
    #
    # @yieldparam [Page] feed
    #   A visited page.
    #
    def every_rss_page
      every_page do |page|
        yield page if (block_given? && page.rss?)
      end
    end

    #
    # Pass every Atom feed that the agent visits to a given block.
    #
    # @yield [feed]
    #   The block will be passed every Atom feed visited.
    #
    # @yieldparam [Page] feed
    #   A visited page.
    #
    def every_atom_page
      every_page do |page|
        yield page if (block_given? && page.atom?)
      end
    end

    #
    # Pass every MS Word page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every MS Word page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_ms_word_page
      every_page do |page|
        yield page if (block_given? && page.ms_word?)
      end
    end

    #
    # Pass every PDF page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every PDF page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_pdf_page
      every_page do |page|
        yield page if (block_given? && page.pdf?)
      end
    end

    #
    # Pass every ZIP page that the agent visits to a given block.
    #
    # @yield [page]
    #   The block will be passed every ZIP page visited.
    #
    # @yieldparam [Page] page
    #   A visited page.
    #
    def every_zip_page
      every_page do |page|
        yield page if (block_given? && page.zip?)
      end
    end

    #
    # Passes every origin and destination URI of each link to a given
    # block.
    #
    # @yield [origin,dest]
    #   The block will be passed every origin and destination URI of
    #   each link.
    #
    # @yieldparam [URI::HTTP] origin
    #   The URI that a link originated from.
    #
    # @yieldparam [URI::HTTP] dest
    #   The destination URI of a link.
    #
    def every_link(&block)
      @every_link_blocks << block if block
      return self
    end

    protected

    #
    # Resets every callback list to empty.
    #
    # @param [Hash] options
    #   Currently unused; kept so existing callers can keep passing it.
    #
    def initialize_events(options={})
      @every_url_blocks        = []
      @every_failed_url_blocks = []
      # each pattern lazily gets its own callback list
      @every_url_like_blocks   = Hash.new { |hash,key| hash[key] = [] }

      @every_page_blocks = []
      @every_link_blocks = []
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'uri'
2
+
3
module URI
  #
  # Expands a URI decoded path by collapsing repeated `/` separators and
  # resolving `.` and `..` segments.
  #
  # A `..` segment never climbs above the root of an absolute path, so an
  # absolute input always yields an absolute result.
  #
  # @param [String] path
  #   The path from a URI.
  #
  # @return [String]
  #   The expanded path. An empty result is returned as `"/"`.
  #
  # @example
  #   URI.expand_path('./path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('test/../path')
  #   # => "path"
  #
  # @example
  #   URI.expand_path('/test/path/')
  #   # => "/test/path/"
  #
  # @example
  #   URI.expand_path('/test/../path')
  #   # => "/path"
  #
  # @example
  #   URI.expand_path('/../path')
  #   # => "/path"
  #
  def URI.expand_path(path)
    dirs = path.split(/\/+/)

    # append any trailing '/' chars, lost due to String#split
    dirs << '' if path[-1,1] == '/'

    # an absolute path splits into a leading '' segment that marks the root
    absolute = (path[0,1] == '/')
    new_dirs = []

    dirs.each do |dir|
      case dir
      when '..'
        # never pop the root marker, otherwise "/../x" would turn relative
        new_dirs.pop unless (absolute && new_dirs.length <= 1)
      when '.'
        # the current directory adds nothing to the path
      else
        new_dirs.push(dir)
      end
    end

    full_path = new_dirs.join('/')

    # default empty paths to '/'
    full_path = '/' if full_path.empty?

    return full_path
  end
end
@@ -0,0 +1 @@
1
# Load the URI extensions shipped under lib/spidr/extensions/.
require 'spidr/extensions/uri'