spidr 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,18 @@
1
+ === 0.1.9 / 2009-06-13
2
+
3
+ * Upgraded to Hoe 2.0.0.
4
+ * Use Hoe.spec instead of Hoe.new.
5
+ * Use the Hoe signing task for signed gems.
6
+ * Added the Agent#schemes and Agent#schemes= methods.
7
+ * Added a warning message if 'net/https' cannot be loaded.
8
+ * Allow the list of acceptable URL schemes to be passed into Agent.new.
9
+ * Allow history and queue information to be passed into Agent.new.
10
+ * Agent#start_at no longer clears the history or the queue.
11
+ * Fixed a bug in the sanitization of semi-escaped URLs.
12
+ * Fixed a bug where https URLs would be followed even if 'net/https'
13
+ could not be loaded.
14
+ * Removed Agent::SCHEMES.
15
+
1
16
  === 0.1.8 / 2009-05-27
2
17
 
3
18
  * Added the Agent#pause! and Agent#continue! methods.
data/README.txt CHANGED
@@ -18,6 +18,7 @@ and easy to use.
18
18
  * frame tags.
19
19
  * HTTP 300, 301, 302, 303 and 307 Redirects.
20
20
  * Black-list or white-list URLs based upon:
21
+ * URL scheme
21
22
  * Host name
22
23
  * Port number
23
24
  * Full link
data/Rakefile CHANGED
@@ -2,11 +2,12 @@
2
2
 
3
3
  require 'rubygems'
4
4
  require 'hoe'
5
+ require 'hoe/signing'
5
6
  require './tasks/spec.rb'
6
7
  require './tasks/course.rb'
7
8
  require './lib/spidr/version.rb'
8
9
 
9
- Hoe.new('spidr', Spidr::VERSION) do |p|
10
+ Hoe.spec('spidr') do |p|
10
11
  p.rubyforge_name = 'spidr'
11
12
  p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
13
  p.remote_rdoc_dir = 'docs'
data/lib/spidr/agent.rb CHANGED
@@ -7,9 +7,6 @@ require 'net/http'
7
7
  module Spidr
8
8
  class Agent
9
9
 
10
- # URL schemes to visit
11
- SCHEMES = ['http', 'https']
12
-
13
10
  # Proxy to use
14
11
  attr_accessor :proxy
15
12
 
@@ -22,6 +19,9 @@ module Spidr
22
19
  # Delay in between fetching pages
23
20
  attr_accessor :delay
24
21
 
22
+ # List of acceptable URL schemes to follow
23
+ attr_reader :schemes
24
+
25
25
  # History containing visited URLs
26
26
  attr_reader :history
27
27
 
@@ -42,6 +42,10 @@ module Spidr
42
42
  # <tt>:referer</tt>:: The referer URL to send.
43
43
  # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
44
44
  # link. Defaults to 0.
45
+ # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
46
+ # Defaults to +http+ and +https+. +https+ URL
47
+ # schemes will be ignored if <tt>net/https</tt>
48
+ # cannot be loaded.
45
49
  # <tt>:host</tt>:: The host-name to visit.
46
50
  # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
47
51
  # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
@@ -52,12 +56,32 @@ module Spidr
52
56
  # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
53
57
  # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
54
58
  # visit.
59
+ # <tt>:queue</tt>:: An initial queue of URLs to visit.
60
+ # <tt>:history</tt>:: An initial list of visited URLs.
55
61
  #
56
62
  def initialize(options={},&block)
57
63
  @proxy = (options[:proxy] || Spidr.proxy)
58
64
  @user_agent = (options[:user_agent] || Spidr.user_agent)
59
65
  @referer = options[:referer]
60
66
 
67
+ @schemes = []
68
+
69
+ if options[:schemes]
70
+ @schemes += options[:schemes]
71
+ else
72
+ @schemes << 'http'
73
+
74
+ begin
75
+ require 'net/https'
76
+
77
+ @schemes << 'https'
78
+ rescue Gem::LoadError => e
79
+ raise(e)
80
+ rescue ::LoadError
81
+ STDERR.puts "Warning: cannot load 'net/https', https support disabled"
82
+ end
83
+ end
84
+
61
85
  @host_rules = Rules.new(
62
86
  :accept => options[:hosts],
63
87
  :reject => options[:ignore_hosts]
@@ -91,6 +115,14 @@ module Spidr
91
115
  visit_hosts_like(options[:host])
92
116
  end
93
117
 
118
+ if options[:queue]
119
+ self.queue = options[:queue]
120
+ end
121
+
122
+ if options[:history]
123
+ self.history = options[:history]
124
+ end
125
+
94
126
  block.call(self) if block
95
127
  end
96
128
 
@@ -361,10 +393,9 @@ module Spidr
361
393
  end
362
394
 
363
395
  #
364
- # Clear the history and start spidering at the specified _url_.
396
+ # Start spidering at the specified _url_.
365
397
  #
366
398
  def start_at(url)
367
- clear
368
399
  enqueue(url)
369
400
 
370
401
  return continue!
@@ -413,6 +444,16 @@ module Spidr
413
444
  return self
414
445
  end
415
446
 
447
+ #
448
+ # Sets the list of acceptable URL schemes to follow to the
449
+ # _new_schemes_.
450
+ #
451
+ # agent.schemes = ['http']
452
+ #
453
+ def schemes=(new_schemes)
454
+ @schemes = new_schemes.map { |scheme| scheme.to_s }
455
+ end
456
+
416
457
  #
417
458
  # Sets the history of links that were previously visited to the
418
459
  # specified _new_history_.
@@ -575,7 +616,7 @@ module Spidr
575
616
  #
576
617
  def visit_scheme?(url)
577
618
  if url.scheme
578
- return SCHEMES.include?(url.scheme)
619
+ return @schemes.include?(url.scheme)
579
620
  else
580
621
  return true
581
622
  end
data/lib/spidr/page.rb CHANGED
@@ -252,8 +252,8 @@ module Spidr
252
252
  # based on the url of the page.
253
253
  #
254
254
  def to_absolute(link)
255
- # clean the link
256
- link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
255
+ # decode, clean then re-encode the URL
256
+ link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
257
257
 
258
258
  begin
259
259
  relative = URI(link)
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.8'
2
+ VERSION = '0.1.9'
3
3
  end
data.tar.gz.sig ADDED
Binary file
metadata CHANGED
@@ -1,15 +1,36 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
8
8
  autorequire:
9
9
  bindir: bin
10
- cert_chain: []
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
14
+ bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
15
+ ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
16
+ BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
17
+ CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
18
+ 1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
19
+ mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
20
+ W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
21
+ 7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
22
+ +Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
23
+ T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
24
+ sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
25
+ ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
26
+ xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
27
+ cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
28
+ GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
29
+ /fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
30
+ pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
31
+ -----END CERTIFICATE-----
11
32
 
12
- date: 2009-05-27 00:00:00 -07:00
33
+ date: 2009-06-13 00:00:00 -07:00
13
34
  default_executable:
14
35
  dependencies:
15
36
  - !ruby/object:Gem::Dependency
@@ -30,9 +51,12 @@ dependencies:
30
51
  requirements:
31
52
  - - ">="
32
53
  - !ruby/object:Gem::Version
33
- version: 1.12.2
54
+ version: 2.0.0
34
55
  version:
35
- description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
56
+ description: |-
57
+ Spidr is a versatile Ruby web spidering library that can spider a site,
58
+ multiple domains, certain links or infinitely. Spidr is designed to be fast
59
+ and easy to use.
36
60
  email:
37
61
  - postmodern.mod3@gmail.com
38
62
  executables: []
@@ -92,6 +116,8 @@ files:
92
116
  - static/course/specs.json
93
117
  has_rdoc: true
94
118
  homepage: http://spidr.rubyforge.org/
119
+ licenses: []
120
+
95
121
  post_install_message:
96
122
  rdoc_options:
97
123
  - --main
@@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
113
139
  requirements: []
114
140
 
115
141
  rubyforge_project: spidr
116
- rubygems_version: 1.3.1
142
+ rubygems_version: 1.3.4
117
143
  signing_key:
118
- specification_version: 2
144
+ specification_version: 3
119
145
  summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
120
146
  test_files: []
121
147
 
metadata.gz.sig ADDED
Binary file