spidr_epg_gem 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,747 @@
1
+ require 'spidr/sanitizers'
2
+ require 'spidr/filters'
3
+ require 'spidr/events'
4
+ require 'spidr/actions'
5
+ require 'spidr/page'
6
+ require 'spidr/session_cache'
7
+ require 'spidr/cookie_jar'
8
+ require 'spidr/auth_store'
9
+ require 'spidr/spidr'
10
+
11
+ require 'openssl'
12
+ require 'net/http'
13
+ require 'set'
14
+
15
+ module Spidr
16
+ class Agent
17
+
18
+ include Sanitizers
19
+ include Filters
20
+ include Events
21
+ include Actions
22
+
23
# HTTP Host header to send with every request
attr_accessor :host_header

# HTTP Host headers to use for specific hosts
attr_reader :host_headers

# User-Agent header to send
attr_accessor :user_agent

# HTTP Authentication credentials
attr_accessor :authorized

# Referer header to send
attr_accessor :referer

# Delay (in seconds) in between fetching pages
attr_accessor :delay

# History containing visited URLs
attr_reader :history

# List of unreachable URLs
attr_reader :failures

# Queue of URLs to visit
attr_reader :queue

# Cached cookies
attr_reader :cookies

# Maximum depth
attr_reader :max_depth

# The visited URLs and their depth within a site
attr_reader :levels
58
+
59
#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option :proxy [String] :host
#   The host the proxy is running on.
#
# @option :proxy [Integer] :port
#   The port the proxy is running on.
#
# @option :proxy [String] :user
#   The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
#   The password to authenticate with.
#
# @option options [String] :host_header
#   The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
#   The HTTP Host headers to use for specific hosts.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit. Loaded via {#queue=}.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs. Loaded via {#history=}.
#
# @option options [Integer] :max_depth
#   The maximum link depth to follow.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   for further configuration.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @see #initialize_sanitizers
# @see #initialize_filters
# @see #initialize_actions
# @see #initialize_events
#
def initialize(options={})
  @host_header  = options[:host_header]
  @host_headers = {}
  @host_headers.merge!(options[:host_headers]) if options[:host_headers]

  @user_agent = options.fetch(:user_agent, Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options.fetch(:proxy, Spidr.proxy))
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay, 0)
  @history  = Set[]
  @failures = Set[]
  @queue    = []

  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  # honor the documented :history and :queue options; the collections
  # above must exist first, because the setters refill them in place
  self.history = options[:history] if options[:history]
  self.queue   = options[:queue]   if options[:queue]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  yield self if block_given?
end
147
+
148
#
# Creates a new agent and begins spidering at the given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever the new agent's {#start_at} returns.
#
def self.start_at(url,options={},&block)
  new(options, &block).start_at(url)
end
168
+
169
#
# Creates a new agent and spiders the web-site located at the given URL.
# The host of the URL is passed to the new agent as its `:host` option.
#
# @param [URI::HTTP, String] url
#   The web-site to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever the new agent's {#start_at} returns.
#
def self.site(url,options={},&block)
  url = URI(url.to_s) unless url.kind_of?(URI)

  # merge returns a copy, so the caller's options Hash is left untouched
  agent = new(options.merge(:host => url.host), &block)
  agent.start_at(url)
end
191
+
192
#
# Creates a new agent and spiders the given host, starting at the root
# path (`/`) over HTTP.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options. See {Agent#initialize}.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   before it begins spidering.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
# @return
#   Whatever the new agent's {#start_at} returns.
#
def self.host(name,options={},&block)
  agent = new(options.merge(:host => name), &block)
  agent.start_at(URI::HTTP.build(:host => name, :path => '/'))
end
212
+
213
#
# Clears the queue, the history, the failures and the recorded link
# depths of the agent.
#
# @return [Agent]
#   The cleared agent.
#
def clear
  @queue.clear
  @history.clear
  @failures.clear
  # depths of forgotten URLs must not leak into a later crawl, where a
  # re-queued URL would otherwise be treated as already being that deep
  @levels.clear
  self
end
222
+
223
#
# Start spidering at a given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return
#   The result of {#run}.
#
def start_at(url,&block)
  enqueue(url)
  run(&block)
end
239
+
240
#
# Start spidering until the queue becomes empty or the agent is
# paused.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def run(&block)
  @running = true

  until @queue.empty? || paused?
    begin
      visit_page(dequeue, &block)
    rescue Actions::Paused
      # leave the agent flagged as running and keep its sessions
      return self
    rescue Actions::Action
      # any other action only abandons the current page
    end
  end

  @running = false
  @sessions.clear
  self
end
266
+
267
#
# Determines if the agent is running.
#
# @return [Boolean]
#   Specifies whether the agent is running or stopped.
#
def running?
  # strict comparison, so that the result is always `true` or `false`
  @running == true
end
276
+
277
#
# The proxy information the agent uses, as held by its session cache.
#
# @return [Hash]
#   The proxy information.
#
# @see SessionCache#proxy
#
# @since 0.2.2
#
def proxy
  @sessions.proxy
end
290
+
291
#
# Sets the proxy information that the agent uses, by handing it to the
# agent's session cache.
#
# @param [Hash] new_proxy
#   The new proxy information.
#
# @return [Hash]
#   The new proxy information.
#
# @see SessionCache#proxy=
#
# @since 0.2.2
#
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end
307
+
308
#
# Sets the history of URLs that were previously visited.
#
# @param [#each] new_history
#   A list of URLs to populate the history with. Anything that is not
#   already a URI is converted with `URI(url.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The history of the agent.
#
# @example
#   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
#
def history=(new_history)
  # convert everything before touching @history, so that neither an
  # unparsable URL nor being handed the agent's own history Set can
  # leave the history emptied or half-populated
  urls = []
  new_history.each do |url|
    urls << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @history.replace(urls)
end
333
+
334
+ alias visited_urls history
335
+
336
#
# Specifies the links which have been visited.
#
# @return [Array<String>]
#   The visited URLs, each converted to a String.
#
def visited_links
  @history.map(&:to_s)
end
345
+
346
#
# Specifies all hosts that were visited.
#
# @return [Array<String>]
#   The distinct hosts of the visited URLs.
#
def visited_hosts
  @history.map(&:host).uniq
end
355
+
356
#
# Determines whether a URL was visited or not.
#
# @param [URI::HTTP, String] url
#   The URL to search for.
#
# @return [Boolean]
#   Specifies whether a URL was visited.
#
def visited?(url)
  url = URI(url.to_s) unless url.kind_of?(URI)

  @history.include?(url)
end
370
+
371
#
# Sets the list of failed URLs.
#
# @param [#each] new_failures
#   The new list of failed URLs. Anything that is not already a URI is
#   converted with `URI(url.to_s)`.
#
# @return [Set<URI::HTTP>]
#   The list of failed URLs.
#
# @example
#   agent.failures = ['http://localhost/']
#
def failures=(new_failures)
  # convert everything before touching @failures, so that neither an
  # unparsable URL nor being handed the agent's own failures Set can
  # leave the list emptied or half-populated
  urls = []
  new_failures.each do |url|
    urls << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @failures.replace(urls)
end
396
+
397
#
# Determines whether a given URL could not be visited.
#
# @param [URI::HTTP, String] url
#   The URL to check for failures.
#
# @return [Boolean]
#   Specifies whether the given URL was unable to be visited.
#
def failed?(url)
  url = URI(url.to_s) unless url.kind_of?(URI)

  @failures.include?(url)
end
411
+
412
+ alias pending_urls queue
413
+
414
#
# Sets the queue of URLs to visit. The URLs are stored as given, without
# being checked against the agent's rules for visiting a URL.
#
# @param [#each] new_queue
#   The new list of URLs to visit. Anything that is not already a URI is
#   converted with `URI(url.to_s)`.
#
# @return [Array<URI::HTTP>]
#   The list of URLs to visit.
#
# @example
#   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
#
def queue=(new_queue)
  # convert everything before touching @queue, so that neither an
  # unparsable URL nor being handed the agent's own queue can leave the
  # queue emptied or half-populated
  urls = []
  new_queue.each do |url|
    urls << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @queue.replace(urls)
end
439
+
440
#
# Determines whether a given URL has been enqueued.
#
# @param [URI::HTTP, String] url
#   The URL to search for in the queue.
#
# @return [Boolean]
#   Specifies whether the given URL has been queued for visiting.
#
def queued?(url)
  # the queue holds URI objects, so anything else has to be converted
  # before it can compare equal to an entry
  url = URI(url.to_s) unless url.kind_of?(URI)

  @queue.include?(url)
rescue URI::InvalidURIError
  # a URL that cannot even be parsed cannot have been queued
  false
end
452
+
453
#
# Enqueues a given URL for visiting, only if it passes all of the
# agent's rules for visiting a given URL.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting.
#
# @param [Integer] level
#   The link depth to record for the URL.
#
# @return [Boolean]
#   Specifies whether the URL was enqueued, or ignored.
#
# @raise [Actions::Paused]
#   Re-raised when one of the URL callbacks raises it.
#
def enqueue(url,level=0)
  url = sanitize_url(url)

  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |url_block| url_block.call(url) }

    @every_url_like_blocks.each do |pattern,url_blocks|
      match = case pattern
              when Regexp
                link =~ pattern
              else
                (pattern == link) || (pattern == url)
              end

      url_blocks.each { |url_block| url_block.call(url) } if match
    end
  rescue Actions::Paused
    # re-raised explicitly, so the generic Action clause below cannot
    # swallow it
    raise
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action stops the callbacks, but the URL is still enqueued
  end

  @queue << url
  @levels[url] = level
  true
end
498
+
499
#
# Requests and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or `nil` if the request failed.
#
def get_page(url)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    new_page = Page.new(url, session.get(path, headers))

    # save any new cookies
    @cookies.from_page(new_page)

    yield new_page if block_given?

    # returns from get_page itself, not just from this block
    return new_page
  end
end
528
+
529
#
# Posts supplied form data and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or `nil` if the request failed.
#
# @since 0.2.2
#
def post_page(url,post_data='')
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    new_page = Page.new(url, session.post(path, post_data, headers))

    # save any new cookies
    @cookies.from_page(new_page)

    yield new_page if block_given?

    # returns from post_page itself, not just from this block
    return new_page
  end
end
563
+
564
#
# Visits a given URL, and enqueues the links recovered from the URL
# to be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The page that was visited. If `nil` is returned, either the request
#   for the page failed, or the page was skipped.
#
# @raise [Actions::Paused]
#   Re-raised when a page or link callback raises it.
#
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused
      # re-raised explicitly, so the generic Action clause below cannot
      # swallow it
      raise
    rescue Actions::SkipPage
      # returns from visit_page itself; none of the links are followed
      return nil
    rescue Actions::Action
      # any other action stops the callbacks; links are still followed
    end

    # depth of the page being visited; its links sit one level deeper
    depth = @levels[url]

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url, next_url)
        end
      rescue Actions::Paused
        raise
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      enqueue(next_url, depth + 1) if @max_depth.nil? || @max_depth > depth
    end
  end
end
616
+
617
#
# Converts the agent into a Hash.
#
# @return [Hash]
#   The agent represented as a Hash containing the `history` and
#   the `queue` of the agent. The values are the agent's own
#   collections, not copies.
#
def to_hash
  {:history => @history, :queue => @queue}
end
627
+
628
+ protected
629
+
630
#
# Normalizes the request path and grabs a session to handle page
# get and post requests. If the request fails with a network-level
# error, the session is killed and the URL is recorded as failed.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [request]
#   A block whose purpose is to make a page request.
#
# @yieldparam [Net::HTTP] session
#   An HTTP session object.
#
# @yieldparam [String] path
#   Normalized URL string.
#
# @yieldparam [Hash] headers
#   A Hash of request header options.
#
# @return
#   The result of the block, or `nil` if the request failed.
#
# @since 0.2.2
#
def prepare_request(url,&block)
  host = url.host
  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path; `+=` builds a new String, so the
  # path stored inside the URL is not modified
  path += "?#{url.query}" if url.query

  # set any additional HTTP headers
  headers = {}

  # the first matching entry wins; note that String#match converts a
  # String key into a Regexp, so String keys act as patterns too
  @host_headers.each do |name,header|
    if host.match(name)
      headers['Host'] = header
      break
    end
  end

  headers['Host']     ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent  if @user_agent
  headers['Referer']    = @referer     if @referer

  if (authorization = @authorized.for_url(url))
    headers['Authorization'] = "Basic #{authorization}"
  end

  if (header_cookies = @cookies.for_host(host))
    headers['Cookie'] = header_cookies
  end

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse

    @sessions.kill!(url)

    failed(url)
    return nil
  end
end
704
+
705
#
# Removes the URL at the front of the queue, so that it can be visited.
#
# @return [URI::HTTP, nil]
#   The URL that was at the front of the queue, or `nil` if the queue
#   was empty.
#
def dequeue
  @queue.shift
end
714
+
715
#
# Determines if a given URL should be visited: it must not have been
# visited already, and its scheme, host, port, link text, URL and path
# extension must each pass the corresponding filter.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   Specifies whether the given URL should be visited.
#
def visit?(url)
  # checks short-circuit in this order, so later filters only ever see
  # URLs which passed the earlier ones
  !visited?(url) &&
    visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path)
end
733
+
734
#
# Adds a given URL to the failures list, and passes it to every
# registered failed-URL callback.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
#
def failed(url)
  @failures << url
  @every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
  true
end
745
+
746
+ end
747
+ end