spidr 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/README.txt +1 -0
- data/Rakefile +2 -1
- data/lib/spidr/agent.rb +47 -6
- data/lib/spidr/page.rb +2 -2
- data/lib/spidr/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +33 -7
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
=== 0.1.9 / 2009-06-13
|
2
|
+
|
3
|
+
* Upgraded to Hoe 2.0.0.
|
4
|
+
* Use Hoe.spec instead of Hoe.new.
|
5
|
+
* Use the Hoe signing task for signed gems.
|
6
|
+
* Added the Agent#schemes and Agent#schemes= methods.
|
7
|
+
* Added a warning message if 'net/https' cannot be loaded.
|
8
|
+
* Allow the list of acceptable URL schemes to be passed into Agent.new.
|
9
|
+
* Allow history and queue information to be passed into Agent.new.
|
10
|
+
* Agent#start_at no longer clears the history or the queue.
|
11
|
+
* Fixed a bug in the sanitization of semi-escaped URLs.
|
12
|
+
* Fixed a bug where https URLs would be followed even if 'net/https'
|
13
|
+
could not be loaded.
|
14
|
+
* Removed Agent::SCHEMES.
|
15
|
+
|
1
16
|
=== 0.1.8 / 2009-05-27
|
2
17
|
|
3
18
|
* Added the Agent#pause! and Agent#continue! methods.
|
data/README.txt
CHANGED
data/Rakefile
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'hoe'
|
5
|
+
require 'hoe/signing'
|
5
6
|
require './tasks/spec.rb'
|
6
7
|
require './tasks/course.rb'
|
7
8
|
require './lib/spidr/version.rb'
|
8
9
|
|
9
|
-
Hoe.
|
10
|
+
Hoe.spec('spidr') do |p|
|
10
11
|
p.rubyforge_name = 'spidr'
|
11
12
|
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
12
13
|
p.remote_rdoc_dir = 'docs'
|
data/lib/spidr/agent.rb
CHANGED
@@ -7,9 +7,6 @@ require 'net/http'
|
|
7
7
|
module Spidr
|
8
8
|
class Agent
|
9
9
|
|
10
|
-
# URL schemes to visit
|
11
|
-
SCHEMES = ['http', 'https']
|
12
|
-
|
13
10
|
# Proxy to use
|
14
11
|
attr_accessor :proxy
|
15
12
|
|
@@ -22,6 +19,9 @@ module Spidr
|
|
22
19
|
# Delay in between fetching pages
|
23
20
|
attr_accessor :delay
|
24
21
|
|
22
|
+
# List of acceptable URL schemes to follow
|
23
|
+
attr_reader :schemes
|
24
|
+
|
25
25
|
# History containing visited URLs
|
26
26
|
attr_reader :history
|
27
27
|
|
@@ -42,6 +42,10 @@ module Spidr
|
|
42
42
|
# <tt>:referer</tt>:: The referer URL to send.
|
43
43
|
# <tt>:delay</tt>:: Duration in seconds to pause between spidering each
|
44
44
|
# link. Defaults to 0.
|
45
|
+
# <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
|
46
|
+
# Defaults to +http+ and +https+. +https+ URL
|
47
|
+
# schemes will be ignored if <tt>net/https</tt>
|
48
|
+
# cannot be loaded.
|
45
49
|
# <tt>:host</tt>:: The host-name to visit.
|
46
50
|
# <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
|
47
51
|
# <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
|
@@ -52,12 +56,32 @@ module Spidr
|
|
52
56
|
# <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
|
53
57
|
# <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
|
54
58
|
# visit.
|
59
|
+
# <tt>:queue</tt>:: An initial queue of URLs to visit.
|
60
|
+
# <tt>:history</tt>:: An initial list of visited URLs.
|
55
61
|
#
|
56
62
|
def initialize(options={},&block)
|
57
63
|
@proxy = (options[:proxy] || Spidr.proxy)
|
58
64
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
59
65
|
@referer = options[:referer]
|
60
66
|
|
67
|
+
@schemes = []
|
68
|
+
|
69
|
+
if options[:schemes]
|
70
|
+
@schemes += options[:schemes]
|
71
|
+
else
|
72
|
+
@schemes << 'http'
|
73
|
+
|
74
|
+
begin
|
75
|
+
require 'net/https'
|
76
|
+
|
77
|
+
@schemes << 'https'
|
78
|
+
rescue Gem::LoadError => e
|
79
|
+
raise(e)
|
80
|
+
rescue ::LoadError
|
81
|
+
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
61
85
|
@host_rules = Rules.new(
|
62
86
|
:accept => options[:hosts],
|
63
87
|
:reject => options[:ignore_hosts]
|
@@ -91,6 +115,14 @@ module Spidr
|
|
91
115
|
visit_hosts_like(options[:host])
|
92
116
|
end
|
93
117
|
|
118
|
+
if options[:queue]
|
119
|
+
self.queue = options[:queue]
|
120
|
+
end
|
121
|
+
|
122
|
+
if options[:history]
|
123
|
+
self.history = options[:history]
|
124
|
+
end
|
125
|
+
|
94
126
|
block.call(self) if block
|
95
127
|
end
|
96
128
|
|
@@ -361,10 +393,9 @@ module Spidr
|
|
361
393
|
end
|
362
394
|
|
363
395
|
#
|
364
|
-
#
|
396
|
+
# Start spidering at the specified _url_.
|
365
397
|
#
|
366
398
|
def start_at(url)
|
367
|
-
clear
|
368
399
|
enqueue(url)
|
369
400
|
|
370
401
|
return continue!
|
@@ -413,6 +444,16 @@ module Spidr
|
|
413
444
|
return self
|
414
445
|
end
|
415
446
|
|
447
|
+
#
|
448
|
+
# Sets the list of acceptable URL schemes to follow to the
|
449
|
+
# _new_schemes_.
|
450
|
+
#
|
451
|
+
# agent.schemes = ['http']
|
452
|
+
#
|
453
|
+
def schemes=(new_schemes)
|
454
|
+
@schemes = new_schemes.map { |scheme| scheme.to_s }
|
455
|
+
end
|
456
|
+
|
416
457
|
#
|
417
458
|
# Sets the history of links that were previously visited to the
|
418
459
|
# specified _new_history_.
|
@@ -575,7 +616,7 @@ module Spidr
|
|
575
616
|
#
|
576
617
|
def visit_scheme?(url)
|
577
618
|
if url.scheme
|
578
|
-
return
|
619
|
+
return @schemes.include?(url.scheme)
|
579
620
|
else
|
580
621
|
return true
|
581
622
|
end
|
data/lib/spidr/page.rb
CHANGED
@@ -252,8 +252,8 @@ module Spidr
|
|
252
252
|
# based on the url of the page.
|
253
253
|
#
|
254
254
|
def to_absolute(link)
|
255
|
-
# clean the
|
256
|
-
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
255
|
+
# decode, clean then re-encode the URL
|
256
|
+
link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
|
257
257
|
|
258
258
|
begin
|
259
259
|
relative = URI(link)
|
data/lib/spidr/version.rb
CHANGED
data.tar.gz.sig
ADDED
Binary file
|
metadata
CHANGED
@@ -1,15 +1,36 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.8
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
|
-
cert_chain:
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
|
14
|
+
bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
|
15
|
+
ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
|
16
|
+
BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
|
17
|
+
CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
|
18
|
+
1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
|
19
|
+
mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
|
20
|
+
W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
|
21
|
+
7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
|
22
|
+
+Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
|
23
|
+
T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
|
24
|
+
sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
|
25
|
+
ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
|
26
|
+
xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
|
27
|
+
cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
|
28
|
+
GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
|
29
|
+
/fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
|
30
|
+
pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
|
31
|
+
-----END CERTIFICATE-----
|
11
32
|
|
12
|
-
date: 2009-
|
33
|
+
date: 2009-06-13 00:00:00 -07:00
|
13
34
|
default_executable:
|
14
35
|
dependencies:
|
15
36
|
- !ruby/object:Gem::Dependency
|
@@ -30,9 +51,12 @@ dependencies:
|
|
30
51
|
requirements:
|
31
52
|
- - ">="
|
32
53
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
54
|
+
version: 2.0.0
|
34
55
|
version:
|
35
|
-
description:
|
56
|
+
description: |-
|
57
|
+
Spidr is a versatile Ruby web spidering library that can spider a site,
|
58
|
+
multiple domains, certain links or infinitely. Spidr is designed to be fast
|
59
|
+
and easy to use.
|
36
60
|
email:
|
37
61
|
- postmodern.mod3@gmail.com
|
38
62
|
executables: []
|
@@ -92,6 +116,8 @@ files:
|
|
92
116
|
- static/course/specs.json
|
93
117
|
has_rdoc: true
|
94
118
|
homepage: http://spidr.rubyforge.org/
|
119
|
+
licenses: []
|
120
|
+
|
95
121
|
post_install_message:
|
96
122
|
rdoc_options:
|
97
123
|
- --main
|
@@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
139
|
requirements: []
|
114
140
|
|
115
141
|
rubyforge_project: spidr
|
116
|
-
rubygems_version: 1.3.
|
142
|
+
rubygems_version: 1.3.4
|
117
143
|
signing_key:
|
118
|
-
specification_version:
|
144
|
+
specification_version: 3
|
119
145
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|
120
146
|
test_files: []
|
121
147
|
|
metadata.gz.sig
ADDED
Binary file
|