spidr 0.1.8 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,18 @@
1
+ === 0.1.9 / 2009-06-13
2
+
3
+ * Upgraded to Hoe 2.0.0.
4
+ * Use Hoe.spec instead of Hoe.new.
5
+ * Use the Hoe signing task for signed gems.
6
+ * Added the Agent#schemes and Agent#schemes= methods.
7
+ * Added a warning message if 'net/https' cannot be loaded.
8
+ * Allow the list of acceptable URL schemes to be passed into Agent.new.
9
+ * Allow history and queue information to be passed into Agent.new.
10
+ * Agent#start_at no longer clears the history or the queue.
11
+ * Fixed a bug in the sanitization of semi-escaped URLs.
12
+ * Fixed a bug where https URLs would be followed even if 'net/https'
13
+ could not be loaded.
14
+ * Removed Agent::SCHEMES.
15
+
1
16
  === 0.1.8 / 2009-05-27
2
17
 
3
18
  * Added the Agent#pause! and Agent#continue! methods.
data/README.txt CHANGED
@@ -18,6 +18,7 @@ and easy to use.
18
18
  * frame tags.
19
19
  * HTTP 300, 301, 302, 303 and 307 Redirects.
20
20
  * Black-list or white-list URLs based upon:
21
+ * URL scheme
21
22
  * Host name
22
23
  * Port number
23
24
  * Full link
data/Rakefile CHANGED
@@ -2,11 +2,12 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'hoe'
5
+ require 'hoe/signing'
5
6
  require './tasks/spec.rb'
6
7
  require './tasks/course.rb'
7
8
  require './lib/spidr/version.rb'
8
9
 
9
- Hoe.new('spidr', Spidr::VERSION) do |p|
10
+ Hoe.spec('spidr') do |p|
10
11
  p.rubyforge_name = 'spidr'
11
12
  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
13
  p.remote_rdoc_dir = 'docs'
data/lib/spidr/agent.rb CHANGED
@@ -7,9 +7,6 @@ require 'net/http'
7
7
  module Spidr
8
8
  class Agent
9
9
 
10
- # URL schemes to visit
11
- SCHEMES = ['http', 'https']
12
-
13
10
  # Proxy to use
14
11
  attr_accessor :proxy
15
12
 
@@ -22,6 +19,9 @@ module Spidr
22
19
  # Delay in between fetching pages
23
20
  attr_accessor :delay
24
21
 
22
+ # List of acceptable URL schemes to follow
23
+ attr_reader :schemes
24
+
25
25
  # History containing visited URLs
26
26
  attr_reader :history
27
27
 
@@ -42,6 +42,10 @@ module Spidr
42
42
  # <tt>:referer</tt>:: The referer URL to send.
43
43
  # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
44
44
  # link. Defaults to 0.
45
+ # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
46
+ # Defaults to +http+ and +https+. +https+ URL
47
+ # schemes will be ignored if <tt>net/https</tt>
48
+ # cannot be loaded.
45
49
  # <tt>:host</tt>:: The host-name to visit.
46
50
  # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
47
51
  # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
@@ -52,12 +56,32 @@ module Spidr
52
56
  # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
53
57
  # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
54
58
  # visit.
59
+ # <tt>:queue</tt>:: An initial queue of URLs to visit.
60
+ # <tt>:history</tt>:: An initial list of visited URLs.
55
61
  #
56
62
  def initialize(options={},&block)
57
63
  @proxy = (options[:proxy] || Spidr.proxy)
58
64
  @user_agent = (options[:user_agent] || Spidr.user_agent)
59
65
  @referer = options[:referer]
60
66
 
67
+ @schemes = []
68
+
69
+ if options[:schemes]
70
+ @schemes += options[:schemes]
71
+ else
72
+ @schemes << 'http'
73
+
74
+ begin
75
+ require 'net/https'
76
+
77
+ @schemes << 'https'
78
+ rescue Gem::LoadError => e
79
+ raise(e)
80
+ rescue ::LoadError
81
+ STDERR.puts "Warning: cannot load 'net/https', https support disabled"
82
+ end
83
+ end
84
+
61
85
  @host_rules = Rules.new(
62
86
  :accept => options[:hosts],
63
87
  :reject => options[:ignore_hosts]
@@ -91,6 +115,14 @@ module Spidr
91
115
  visit_hosts_like(options[:host])
92
116
  end
93
117
 
118
+ if options[:queue]
119
+ self.queue = options[:queue]
120
+ end
121
+
122
+ if options[:history]
123
+ self.history = options[:history]
124
+ end
125
+
94
126
  block.call(self) if block
95
127
  end
96
128
 
@@ -361,10 +393,9 @@ module Spidr
361
393
  end
362
394
 
363
395
  #
364
- # Clear the history and start spidering at the specified _url_.
396
+ # Start spidering at the specified _url_.
365
397
  #
366
398
  def start_at(url)
367
- clear
368
399
  enqueue(url)
369
400
 
370
401
  return continue!
@@ -413,6 +444,16 @@ module Spidr
413
444
  return self
414
445
  end
415
446
 
447
+ #
448
+ # Sets the list of acceptable URL schemes to follow to the
449
+ # _new_schemes_.
450
+ #
451
+ # agent.schemes = ['http']
452
+ #
453
+ def schemes=(new_schemes)
454
+ @schemes = new_schemes.map { |scheme| scheme.to_s }
455
+ end
456
+
416
457
  #
417
458
  # Sets the history of links that were previously visited to the
418
459
  # specified _new_history_.
@@ -575,7 +616,7 @@ module Spidr
575
616
  #
576
617
  def visit_scheme?(url)
577
618
  if url.scheme
578
- return SCHEMES.include?(url.scheme)
619
+ return @schemes.include?(url.scheme)
579
620
  else
580
621
  return true
581
622
  end
data/lib/spidr/page.rb CHANGED
@@ -252,8 +252,8 @@ module Spidr
252
252
  # based on the url of the page.
253
253
  #
254
254
  def to_absolute(link)
255
- # clean the link
256
- link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
255
+ # decode, clean then re-encode the URL
256
+ link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
257
257
 
258
258
  begin
259
259
  relative = URI(link)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.8'
2
+ VERSION = '0.1.9'
3
3
  end
data.tar.gz.sig ADDED
Binary file
metadata CHANGED
@@ -1,15 +1,36 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
- cert_chain: []
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
14
+ bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
15
+ ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
16
+ BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
17
+ CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
18
+ 1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
19
+ mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
20
+ W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
21
+ 7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
22
+ +Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
23
+ T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
24
+ sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
25
+ ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
26
+ xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
27
+ cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
28
+ GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
29
+ /fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
30
+ pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
+ -----END CERTIFICATE-----
11
32
 
12
- date: 2009-05-27 00:00:00 -07:00
33
+ date: 2009-06-13 00:00:00 -07:00
13
34
  default_executable:
14
35
  dependencies:
15
36
  - !ruby/object:Gem::Dependency
@@ -30,9 +51,12 @@ dependencies:
30
51
  requirements:
31
52
  - - ">="
32
53
  - !ruby/object:Gem::Version
33
- version: 1.12.2
54
+ version: 2.0.0
34
55
  version:
35
- description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
56
+ description: |-
57
+ Spidr is a versatile Ruby web spidering library that can spider a site,
58
+ multiple domains, certain links or infinitely. Spidr is designed to be fast
59
+ and easy to use.
36
60
  email:
37
61
  - postmodern.mod3@gmail.com
38
62
  executables: []
@@ -92,6 +116,8 @@ files:
92
116
  - static/course/specs.json
93
117
  has_rdoc: true
94
118
  homepage: http://spidr.rubyforge.org/
119
+ licenses: []
120
+
95
121
  post_install_message:
96
122
  rdoc_options:
97
123
  - --main
@@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
113
139
  requirements: []
114
140
 
115
141
  rubyforge_project: spidr
116
- rubygems_version: 1.3.1
142
+ rubygems_version: 1.3.4
117
143
  signing_key:
118
- specification_version: 2
144
+ specification_version: 3
119
145
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
120
146
  test_files: []
121
147
 
metadata.gz.sig ADDED
Binary file