spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,866 @@
1
+ require 'spidrs/sanitizers'
2
+ require 'spidrs/filters'
3
+ require 'spidrs/events'
4
+ require 'spidrs/actions'
5
+ require 'spidrs/page'
6
+ require 'spidrs/session_cache'
7
+ require 'spidrs/cookie_jar'
8
+ require 'spidrs/auth_store'
9
+ require 'spidrs/spidrs'
10
+
11
+ require 'openssl'
12
+ require 'net/http'
13
+ require 'set'
14
+
15
+ module Spidr
16
+ class Agent
17
+
18
+ include Sanitizers
19
+ include Filters
20
+ include Events
21
+ include Actions
22
+
23
# Value of the HTTP `Host` header sent with requests
attr_accessor :host_header

# `Host` header overrides, keyed by the host they apply to
attr_reader :host_headers

# Value of the `User-Agent` header sent with requests
attr_accessor :user_agent

# Store of HTTP authentication credentials
attr_accessor :authorized

# Value of the `Referer` header sent with requests
attr_accessor :referer

# Seconds to wait before each request
attr_accessor :delay

# Set of URLs that have been visited
attr_reader :history

# Set of URLs that could not be requested
attr_reader :failures

# URLs waiting to be visited
attr_reader :queue

# Cookie jar holding cookies collected from visited pages
attr_reader :cookies

# Maximum link depth to follow (`nil` means unlimited)
attr_reader :max_depth

# Depth at which each enqueued URL was discovered
attr_reader :levels
58
+
59
#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option :proxy [String] :host
#   The host the proxy is running on.
#
# @option :proxy [Integer] :port
#   The port the proxy is running on.
#
# @option :proxy [String] :user
#   The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
#   The password to authenticate with.
#
# @option options [String] :host_header
#   The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
#   The HTTP Host headers to use for specific hosts.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs.
#
# @option options [Integer] :max_depth
#   The maximum link depth to follow.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   for further configuration.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @see #initialize_sanitizers
# @see #initialize_filters
# @see #initialize_actions
# @see #initialize_events
#
def initialize(options={})
  @host_header  = options[:host_header]
  @host_headers = {}
  @host_headers.merge!(options[:host_headers]) if options[:host_headers]

  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay,0)
  @history  = Set[]
  @failures = Set[]
  @queue    = []

  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  # The :queue and :history options are documented above but were
  # previously ignored; route them through the setters so every entry
  # is coerced to a URI.
  self.queue   = options[:queue]   if options[:queue]
  self.history = options[:history] if options[:history]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  yield self if block_given?
end
147
+
148
#
# Creates a new agent and begins spidering at the given URL.
#
# This single definition replaces two same-named definitions, of which
# only the later one (requiring `regex`) was ever in effect. Both call
# forms are accepted:
#
#     Agent.start_at(url, options)
#     Agent.start_at(url, regex, options)
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Regexp, nil] regex
#   Optional pattern handed to {Agent#start_at}. A Hash given in this
#   position is treated as `options`.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever {Agent#start_at} returns.
#
def self.start_at(url,regex=nil,options={},&block)
  # Two-argument form: the options Hash lands in the `regex` slot.
  if regex.kind_of?(Hash)
    options = regex
    regex   = nil
  end

  agent = new(options,&block)

  if regex
    agent.start_at(url,regex)
  else
    agent.start_at(url)
  end
end
190
+
191
#
# Creates a new agent and spiders the web-site located at the given URL.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}. The `:host` option is
#   set to the host of `url`.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever {Agent#run} returns.
#
def self.site(url,options={},&block)
  url = URI(url.to_s) unless url.kind_of?(URI)

  agent = new(options.merge(:host => url.host),&block)

  # Enqueue and run directly: the class defines #start_at twice and the
  # effective definition requires a regex, so `agent.start_at(url)`
  # raised ArgumentError.
  agent.enqueue(url)
  agent.run
end
213
+
214
+
215
#
# Creates a new agent restricted to the host of the given URL and
# starts spidering there, passing `regex` on to {Agent#start_at}.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}. The `:host` option is
#   set to the host of `url`.
#
# @param [Regexp] regex
#   The pattern handed to {Agent#start_at}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
def self._site(url,options={},regex,&block)
  url = URI(url.to_s) unless url.kind_of?(URI)

  agent = new(options.merge(:host => url.host),&block)

  # Agent#start_at takes (url, regex); the arguments used to be swapped.
  agent.start_at(url,regex)
end
221
+
222
#
# Creates a new agent and spiders the given host, starting at
# `http://<name>/`.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}. The `:host` option is
#   set to `name`.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever {Agent#run} returns.
#
def self.host(name,options={},&block)
  agent = new(options.merge(:host => name),&block)

  # Enqueue and run directly: the class defines #start_at twice and the
  # effective definition requires a regex, so the one-argument call
  # raised ArgumentError.
  agent.enqueue(URI::HTTP.build(:host => name, :path => '/'))
  agent.run
end
242
+
243
#
# Empties the queue, the history and the failures list of the agent.
#
# @return [Agent]
#   The agent itself.
#
def clear
  [@queue, @history, @failures].each { |collection| collection.clear }
  self
end
252
+
253
#
# Start spidering at a given URL.
#
# This single definition replaces two same-named definitions, of which
# only the later one (requiring `regex`) was ever in effect, so
# `start_at(url)` raised ArgumentError.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Regexp, nil] regex
#   When given, the crawl is driven by {#_run} with this pattern;
#   otherwise it is driven by {#run}.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return
#   Whatever {#run} or {#_run} returns.
#
def start_at(url,regex=nil,&block)
  enqueue(url)

  if regex
    _run(regex,&block)
  else
    run(&block)
  end
end
275
+
276
+
277
#
# Visits queued URLs with {#_visit_page} until the queue becomes empty
# or the agent is paused.
#
# @param [Regexp] regex
#   The pattern handed to {#_visit_page} for every URL.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def _run(regex,&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      _visit_page(regex,dequeue,&block)
    rescue Actions::Paused
      # returns before @running is reset and before sessions are cleared
      return self
    rescue Actions::Action
      # any other action only ends the current visit
    end
  end

  @running = false
  @sessions.clear
  self
end
303
+
304
#
# Visits a given URL, and enqueues the links recovered from the page
# whose string form matches `regex`.
#
# @param [Regexp] regex
#   Only links matching this pattern are enqueued.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The result of {#get_page}, or `nil` when the page was skipped via
#   `Actions::SkipPage`.
#
# @raise [Actions::Paused]
#   Re-raised when a page or link callback pauses the agent.
#
def _visit_page(regex,url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      raise(action)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        raise(action)
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      next unless (@max_depth.nil? || @max_depth > @levels[url])

      # Regexp#match needs a String; a URI link raised TypeError, so
      # match against the link's string form.
      enqueue(next_url,@levels[url] + 1) if regex.match(next_url.to_s)
    end
  end
end
358
+
359
#
# Visits queued URLs with {#visit_page} until the queue becomes empty
# or the agent is paused.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def run(&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      # returns before @running is reset and before sessions are cleared
      return self
    rescue Actions::Action
      # any other action only ends the current visit
    end
  end

  @running = false
  @sessions.clear
  self
end
385
+
386
#
# Reports whether the agent's running flag is set.
#
# @return [Boolean]
#   `true` only while `@running` is `true`.
#
def running?
  true == @running
end
395
+
396
#
# The proxy information held by the agent's session cache.
#
# @return [Hash]
#   The proxy information.
#
# @see SessionCache#proxy
#
# @since 0.2.2
#
def proxy
  return @sessions.proxy
end
409
+
410
#
# Replaces the proxy information held by the agent's session cache.
#
# @param [Hash] new_proxy
#   The new proxy information.
#
# @return [Hash]
#   The new proxy information.
#
# @see SessionCache#proxy=
#
# @since 0.2.2
#
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
  return new_proxy
end
426
+
427
#
# Replaces the history of visited URLs.
#
# @param [#each] new_history
#   The URLs to populate the history with. Entries that are not
#   already URI objects are converted with `URI(entry.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The history of the agent.
#
# @example
#   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
#
def history=(new_history)
  @history.clear

  new_history.each do |entry|
    @history << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  @history
end
452
+
453
+ alias visited_urls history
454
+
455
#
# The visited URLs, each converted to a String.
#
# @return [Array<String>]
#   The links which have been visited.
#
def visited_links
  @history.map(&:to_s)
end
464
+
465
#
# The distinct hosts of all visited URLs.
#
# @return [Array<String>]
#   The hosts which have been visited.
#
def visited_hosts
  hosts = visited_urls.map(&:host)
  hosts.uniq
end
474
+
475
#
# Determines whether a URL is present in the history.
#
# @param [URI::HTTP, String] url
#   The URL to search for. Non-URI values are converted with
#   `URI(url.to_s)` first.
#
# @return [Boolean]
#   Specifies whether the URL was visited.
#
def visited?(url)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  @history.include?(target)
end
489
+
490
#
# Replaces the list of failed URLs.
#
# @param [#each] new_failures
#   The new list of failed URLs. Entries that are not already URI
#   objects are converted with `URI(entry.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The failures of the agent.
#
# @example
#   agent.failures = ['http://localhost/']
#
def failures=(new_failures)
  @failures.clear

  new_failures.each do |entry|
    @failures << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  @failures
end
515
+
516
#
# Determines whether a URL is present in the failures list.
#
# @param [URI::HTTP, String] url
#   The URL to check for failures. Non-URI values are converted with
#   `URI(url.to_s)` first.
#
# @return [Boolean]
#   Specifies whether the given URL was unable to be visited.
#
def failed?(url)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  @failures.include?(target)
end
530
+
531
+ alias pending_urls queue
532
+
533
#
# Replaces the queue of URLs to visit.
#
# @param [#each] new_queue
#   The new list of URLs to visit. Entries that are not already URI
#   objects are converted with `URI(entry.to_s)`.
#
# @return [Array<URI::HTTP>]
#   The list of URLs to visit.
#
# @example
#   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
#
def queue=(new_queue)
  @queue.clear

  new_queue.each do |entry|
    @queue << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  @queue
end
558
+
559
#
# Determines whether a given URL is currently in the queue.
#
# @param [URI::HTTP] url
#   The URL to search for in the queue. It is compared as given,
#   without conversion.
#
# @return [Boolean]
#   Specifies whether the given URL has been queued for visiting.
#
def queued?(url)
  return @queue.include?(url)
end
571
+
572
#
# Enqueues a given URL for visiting, only if it is not already queued
# and passes {#visit?}.
#
# Before the URL is queued, every `every_url` callback is called, then
# the callbacks of each `every_url_like` pattern that matches the URL.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting.
#
# @param [Integer] level
#   The depth recorded for the URL in `@levels`.
#
# @return [Boolean]
#   Specifies whether the URL was enqueued, or ignored.
#
# @raise [Actions::Paused]
#   Re-raised when a callback pauses the agent.
#
def enqueue(url,level=0)
  url = sanitize_url(url)

  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern,callbacks|
      matched = if pattern.kind_of?(Regexp)
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end

      callbacks.each { |callback| callback.call(url) } if matched
    end
  rescue Actions::Paused => pause
    raise(pause)
  rescue Actions::SkipLink
    # a callback vetoed this link
    return false
  rescue Actions::Action
    # any other action still lets the URL be queued
  end

  @queue << url
  @levels[url] = level
  true
end
617
+
618
#
# Issues a GET request for a URL and wraps the response in a Page.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever {#prepare_request} returns
#   when it does not yield a session.
#
def get_page(url)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    page     = Page.new(url,response)

    # remember any cookies the response carried
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
647
+
648
#
# Issues a POST request with the supplied form data and wraps the
# response in a Page.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever {#prepare_request} returns
#   when it does not yield a session.
#
# @since 0.2.2
#
def post_page(url,post_data='')
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    page     = Page.new(url,response)

    # remember any cookies the response carried
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
682
+
683
#
# Visits a given URL, and enqueues the links recovered from the page
# to be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The result of {#get_page}, or `nil` when the page was skipped via
#   `Actions::SkipPage`.
#
# @raise [Actions::Paused]
#   Re-raised when a page or link callback pauses the agent.
#
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |callback| callback.call(page) }

      yield page if block_given?
    rescue Actions::Paused => pause
      raise(pause)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
    end

    page.each_url do |link|
      begin
        @every_link_blocks.each { |callback| callback.call(page.url,link) }
      rescue Actions::Paused => pause
        raise(pause)
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      # links are queued one level below the page they were found on
      depth = @levels[url]
      enqueue(link,depth + 1) if @max_depth.nil? || @max_depth > depth
    end
  end
end
735
+
736
#
# Converts the agent into a Hash.
#
# @return [Hash]
#   A Hash holding the agent's history under `:history` and its queue
#   under `:queue`.
#
def to_hash
  state = {}
  state[:history] = @history
  state[:queue]   = @queue
  state
end
746
+
747
+ protected
748
+
749
#
# Builds the request path and headers for a URL and yields them,
# together with the session for that URL, to the given block.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session, path, headers]
#   A block whose purpose is to make a page request.
#
# @yieldparam [Net::HTTP] session
#   An HTTP session object.
#
# @yieldparam [String] path
#   The URL path (`/` when empty) with the query string appended.
#
# @yieldparam [Hash] headers
#   A Hash of request header options.
#
# @return
#   The block's result, or `nil` when one of the rescued network errors
#   occurred; in that case the session is killed and the URL is
#   recorded via {#failed}.
#
# @since 0.2.2
#
def prepare_request(url,&block)
  host = url.host
  port = url.port

  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path
  path += "?#{url.query}" if url.query

  # set any additional HTTP headers
  headers = {}

  # the first matching per-host entry wins
  override = @host_headers.find { |name,header| host.match(name) }
  headers['Host'] = override.last if override

  headers['Host'] ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer'] = @referer if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse

    @sessions.kill!(url)

    failed(url)
    return nil
  end
end
823
+
824
#
# Removes the URL at the front of the queue.
#
# @return [URI::HTTP, nil]
#   The URL that was at the front of the queue, or `nil` when the
#   queue is empty.
#
def dequeue
  return @queue.shift
end
833
+
834
#
# Determines if a given URL should be visited: it must not have been
# visited already, and its scheme, host, port, link string, URL and
# path must each pass the corresponding `visit_*?` check.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   Specifies whether the given URL should be visited.
#
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path)
end
852
+
853
#
# Records a URL in the failures list and calls every
# `every_failed_url` callback with it.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
#
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end
864
+
865
+ end
866
+ end