spidr_epg 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
@@ -0,0 +1,866 @@
1
+ require 'spidrs/sanitizers'
2
+ require 'spidrs/filters'
3
+ require 'spidrs/events'
4
+ require 'spidrs/actions'
5
+ require 'spidrs/page'
6
+ require 'spidrs/session_cache'
7
+ require 'spidrs/cookie_jar'
8
+ require 'spidrs/auth_store'
9
+ require 'spidrs/spidrs'
10
+
11
+ require 'openssl'
12
+ require 'net/http'
13
+ require 'set'
14
+
15
+ module Spidr
16
+ class Agent
17
+
18
# Behaviour mixed into every agent.
include Sanitizers
include Filters
include Events
include Actions

# Default value for the HTTP `Host` header.
attr_accessor :host_header

# `Host` header values to use for specific hosts.
attr_reader :host_headers

# Value for the `User-Agent` header.
attr_accessor :user_agent

# Store of HTTP authentication credentials.
attr_accessor :authorized

# Value for the `Referer` header.
attr_accessor :referer

# Pause (in seconds) between page fetches.
attr_accessor :delay

# URLs that have been visited.
attr_reader :history

# URLs that could not be reached.
attr_reader :failures

# URLs waiting to be visited.
attr_reader :queue

# Cookies cached from visited pages.
attr_reader :cookies

# Maximum link depth to follow (`nil` means unlimited).
attr_reader :max_depth

# Depth at which each enqueued URL was found.
attr_reader :levels
58
+
59
#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option :proxy [String] :host
#   The host the proxy is running on.
#
# @option :proxy [Integer] :port
#   The port the proxy is running on.
#
# @option :proxy [String] :user
#   The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
#   The password to authenticate with.
#
# @option options [String] :host_header
#   The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
#   The HTTP Host headers to use for specific hosts.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs.
#
# @option options [Integer] :max_depth
#   The maximum link depth to follow.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   for further configuration.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @see #initialize_sanitizers
# @see #initialize_filters
# @see #initialize_actions
# @see #initialize_events
#
def initialize(options={})
  @host_header  = options[:host_header]
  @host_headers = {}
  @host_headers.merge!(options[:host_headers]) if options[:host_headers]

  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay,0)
  @history  = Set[]
  @failures = Set[]
  @queue    = []

  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  # The :history and :queue options are documented above but were never
  # read; seed them through the setters so entries are converted to URIs.
  self.history = options[:history] if options[:history]
  self.queue   = options[:queue]   if options[:queue]

  yield self if block_given?
end
147
+
148
#
# Creates a new agent and begins spidering at the given URL.
#
# Ruby has no method overloading: this single definition replaces two
# same-named definitions, of which only the later one (requiring `regex`)
# was ever callable.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Regexp, nil] regex
#   Optional pattern. When given, the agent is driven with {#_run}, which
#   only enqueues links that match the pattern; otherwise {#run} is used.
#   For backwards compatibility a Hash in this position is treated as
#   `options`.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def self.start_at(url,regex=nil,options={},&block)
  # support the original two-argument form: start_at(url, options)
  options, regex = regex, nil if regex.kind_of?(Hash)

  agent = new(options,&block)
  agent.enqueue(url)

  regex ? agent._run(regex) : agent.run
end
190
+
191
#
# Creates a new agent for the web-site at the given URL and starts
# spidering there. The URL's host is handed to the agent as the `:host`
# option.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def self.site(url,options={},&block)
  site_uri     = url.kind_of?(URI) ? url : URI(url.to_s)
  site_options = options.merge(:host => site_uri.host)

  new(site_options,&block).start_at(site_uri)
end
213
+
214
+
215
#
# Creates a new agent for the web-site at the given URL and spiders it,
# following only links that match `regex`. The URL's host is handed to
# the agent as the `:host` option.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @param [Regexp] regex
#   The pattern that links must match to be followed.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def self._site(url,options={},regex,&block)
  url = URI(url.to_s) unless url.kind_of?(URI)

  agent = new(options.merge(:host => url.host),&block)

  # Agent#start_at takes (url, regex); the arguments used to be passed
  # the other way around, which enqueued the regex as the start URL.
  agent.start_at(url,regex)
end
221
+
222
#
# Creates a new agent for the given host and starts spidering at the
# root path (`/`) of that host over HTTP. The host name is handed to the
# agent as the `:host` option.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def self.host(name,options={},&block)
  host_options = options.merge(:host => name)
  root_url     = URI::HTTP.build(:host => name, :path => '/')

  new(host_options,&block).start_at(root_url)
end
242
+
243
#
# Empties the queue, the history and the list of failures.
#
# @return [Agent]
#   The agent itself.
#
def clear
  [@queue, @history, @failures].each(&:clear)
  self
end
252
+
253
#
# Start spidering at a given URL.
#
# Ruby has no method overloading: this single definition replaces two
# same-named definitions, of which only the later one (requiring `regex`)
# was callable, so every one-argument call raised ArgumentError.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Regexp, nil] regex
#   Optional pattern. When given, spidering goes through {#_run}, which
#   only enqueues links that match the pattern; otherwise {#run} is used.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return
#   Whatever {#run} or {#_run} returns.
#
def start_at(url,regex=nil,&block)
  enqueue(url)

  return _run(regex,&block) if regex

  run(&block)
end
275
+
276
+
277
#
# Visits queued URLs with {#_visit_page} until the queue becomes empty
# or the agent is paused.
#
# @param [Regexp] regex
#   The pattern handed to {#_visit_page} for every dequeued URL.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def _run(regex,&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      _visit_page(regex,dequeue,&block)
    rescue Actions::Paused
      # leave immediately; @running is deliberately left untouched
      return self
    rescue Actions::Action
      # any other action only ends the current page
    end
  end

  @running = false
  @sessions.clear
  self
end
303
+
304
#
# Visits a given URL, and enqueues those links recovered from the page
# that match `regex`, to be visited later.
#
# @param [Regexp] regex
#   Only links whose string form matches this pattern are enqueued.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The page that was visited. If `nil` is returned, either the request
#   for the page failed, or the page was skipped.
#
def _visit_page(regex,url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      raise(action)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
      # other actions do not stop link extraction
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        raise(action)
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      if (@max_depth.nil? || @max_depth > @levels[url])
        # Regexp#match raises TypeError for non-String arguments such as
        # URI objects (presumably what Page#each_url yields -- confirm),
        # so match against the link's string form.
        if regex.match(next_url.to_s)
          enqueue(next_url,@levels[url] + 1)
        end
      end
    end
  end
end
358
+
359
#
# Visits queued URLs with {#visit_page} until the queue becomes empty or
# the agent is paused.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def run(&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      # leave immediately; @running is deliberately left untouched
      return self
    rescue Actions::Action
      # any other action only ends the current page
    end
  end

  @running = false
  @sessions.clear
  self
end
385
+
386
#
# Reports whether the agent is currently running.
#
# @return [Boolean]
#   `true` while the running flag is set, `false` otherwise.
#
def running?
  true == @running
end
395
+
396
#
# Reads the proxy information from the agent's session cache.
#
# @return [Hash]
#   The proxy information.
#
# @see SessionCache#proxy
#
# @since 0.2.2
#
def proxy
  return @sessions.proxy
end
409
+
410
#
# Stores new proxy information in the agent's session cache.
#
# @param [Hash] new_proxy
#   The new proxy information.
#
# @return [Hash]
#   The new proxy information.
#
# @see SessionCache#proxy=
#
# @since 0.2.2
#
def proxy=(new_proxy)
  @sessions.send(:proxy=, new_proxy)
end
426
+
427
#
# Replaces the history of previously visited URLs.
#
# @param [#each] new_history
#   A list of URLs to populate the history with. Entries that are not
#   already URIs are converted with `URI(entry.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The history of the agent.
#
# @example
#   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
#
def history=(new_history)
  @history.clear

  new_history.each do |entry|
    @history << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  return @history
end
452
+
453
+ alias visited_urls history
454
+
455
#
# Lists the visited URLs as Strings.
#
# @return [Array<String>]
#   The links which have been visited.
#
def visited_links
  @history.map(&:to_s)
end
464
+
465
#
# Lists every distinct host among the visited URLs.
#
# @return [Array<String>]
#   The hosts which have been visited.
#
def visited_hosts
  hosts = visited_urls.map(&:host)
  hosts.uniq
end
474
+
475
#
# Checks whether a URL is part of the history.
#
# @param [URI::HTTP, String] url
#   The URL to search for. Non-URI values are converted with
#   `URI(url.to_s)` before the lookup.
#
# @return [Boolean]
#   Specifies whether a URL was visited.
#
def visited?(url)
  candidate = url.kind_of?(URI) ? url : URI(url.to_s)

  @history.include?(candidate)
end
489
+
490
#
# Replaces the list of failed URLs.
#
# @param [#each] new_failures
#   The new list of failed URLs. Entries that are not already URIs are
#   converted with `URI(entry.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The list of failed URLs.
#
# @example
#   agent.failures = ['http://localhost/']
#
def failures=(new_failures)
  @failures.clear

  new_failures.each do |entry|
    @failures << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  return @failures
end
515
+
516
#
# Checks whether a URL is in the list of failures.
#
# @param [URI::HTTP, String] url
#   The URL to check for failures. Non-URI values are converted with
#   `URI(url.to_s)` before the lookup.
#
# @return [Boolean]
#   Specifies whether the given URL was unable to be visited.
#
def failed?(url)
  candidate = url.kind_of?(URI) ? url : URI(url.to_s)

  @failures.include?(candidate)
end
530
+
531
+ alias pending_urls queue
532
+
533
#
# Replaces the queue of URLs to visit.
#
# @param [#each] new_queue
#   The new list of URLs to visit. Entries that are not already URIs
#   are converted with `URI(entry.to_s)`.
#
# @return [Array<URI::HTTP>]
#   The list of URLs to visit.
#
# @example
#   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
#
def queue=(new_queue)
  @queue.clear

  new_queue.each do |entry|
    @queue << (entry.kind_of?(URI) ? entry : URI(entry.to_s))
  end

  return @queue
end
558
+
559
#
# Checks whether a URL is waiting in the queue.
#
# @param [URI::HTTP] url
#   The URL to search for in the queue. It is compared as given; no
#   conversion is applied.
#
# @return [Boolean]
#   Specifies whether the given URL has been queued for visiting.
#
def queued?(url)
  return @queue.include?(url)
end
571
+
572
#
# Enqueues a given URL for visiting, only if it is not already queued
# and passes {#visit?}. Before the URL is queued, the every-URL callbacks
# and the callbacks registered for matching patterns are invoked.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting.
#
# @param [Integer] level
#   The depth recorded for the URL in `levels`.
#
# @return [Boolean]
#   Specifies whether the URL was enqueued, or ignored.
#
def enqueue(url,level=0)
  url = sanitize_url(url)

  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |url_block| url_block.call(url) }

    @every_url_like_blocks.each do |pattern,url_blocks|
      # Regexp patterns are matched against the String form; anything
      # else must equal either the String or the URL object
      matched = if pattern.kind_of?(Regexp)
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end

      next unless matched

      url_blocks.each { |url_block| url_block.call(url) }
    end
  rescue Actions::Paused => action
    raise(action)
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action still lets the URL be queued
  end

  @queue << url
  @levels[url] = level
  true
end
617
+
618
#
# Issues a GET request for a URL and wraps the response in a new Page.
# Cookies found on the page are saved in the agent's cookie jar.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or `nil` if the request failed.
#
def get_page(url)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    new_page = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(new_page)

    yield new_page if block_given?
    return new_page
  end
end
647
+
648
#
# Issues a POST request with the supplied form data and wraps the
# response in a new Page. Cookies found on the page are saved in the
# agent's cookie jar.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or `nil` if the request failed.
#
# @since 0.2.2
#
def post_page(url,post_data='')
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    new_page = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(new_page)

    yield new_page if block_given?
    return new_page
  end
end
682
+
683
#
# Visits a given URL, and enqueues the links recovered from the page to
# be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The page that was visited. If `nil` is returned, either the request
#   for the page failed, or the page was skipped.
#
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => paused
      raise(paused)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
      # other actions do not stop link extraction
    end

    page.each_url do |link|
      begin
        @every_link_blocks.each { |link_block| link_block.call(page.url,link) }
      rescue Actions::Paused => paused
        raise(paused)
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      # only follow the link while the current page is above the depth limit
      next unless @max_depth.nil? || @max_depth > @levels[url]

      enqueue(link,@levels[url] + 1)
    end
  end
end
735
+
736
#
# Converts the agent into a Hash.
#
# @return [Hash]
#   A Hash with the agent's history under `:history` and its queue
#   under `:queue`.
#
def to_hash
  snapshot = {}
  snapshot[:history] = @history
  snapshot[:queue]   = @queue
  snapshot
end
746
+
747
+ protected
748
+
749
#
# Builds the request path and headers for a URL, then yields them to
# the given block together with the session for that URL. Errors in the
# rescue list below mark the URL as failed and kill its session.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [request]
#   A block whose purpose is to make a page request.
#
# @yieldparam [Net::HTTP] session
#   An HTTP session object.
#
# @yieldparam [String] path
#   The request path, including the query string if there is one.
#
# @yieldparam [Hash] headers
#   A Hash of request header options.
#
# @return
#   The value of the block, or `nil` if one of the rescued errors
#   occurred.
#
# @since 0.2.2
#
def prepare_request(url,&block)
  host = url.host
  port = url.port

  # an empty path is requested as '/'; the query string is appended
  path = url.path.empty? ? '/' : url.path
  path = "#{path}?#{url.query}" if url.query

  # set any additional HTTP headers
  headers = {}

  # the first host_headers entry whose key matches the host wins
  @host_headers.each do |name,header|
    next unless host.match(name)

    headers['Host'] = header
    break
  end

  headers['Host']     ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent  if @user_agent
  headers['Referer']    = @referer     if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse
    @sessions.kill!(url)

    failed(url)
    return nil
  end
end
823
+
824
#
# Removes the URL at the front of the queue and hands it back.
#
# @return [URI::HTTP]
#   The URL that was at the front of the queue.
#
def dequeue
  return @queue.shift
end
833
+
834
#
# Decides whether a URL should be visited: it must not have been
# visited already, and its scheme, host, port, link, URL and path must
# each pass the corresponding `visit_*?` check, in that order.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   Specifies whether the given URL should be visited.
#
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path)
end
852
+
853
#
# Records a URL in the failures list and calls every failed-URL
# callback with it.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
#
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |fail_block|
    fail_block.call(url)
  end

  true
end
864
+
865
+ end
866
+ end