spidr 0.1.8 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +15 -0
- data/README.txt +1 -0
- data/Rakefile +2 -1
- data/lib/spidr/agent.rb +47 -6
- data/lib/spidr/page.rb +2 -2
- data/lib/spidr/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +33 -7
- metadata.gz.sig +0 -0
data/History.txt
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
=== 0.1.9 / 2009-06-13
|
2
|
+
|
3
|
+
* Upgraded to Hoe 2.0.0.
|
4
|
+
* Use Hoe.spec instead of Hoe.new.
|
5
|
+
* Use the Hoe signing task for signed gems.
|
6
|
+
* Added the Agent#schemes and Agent#schemes= methods.
|
7
|
+
* Added a warning message if 'net/https' cannot be loaded.
|
8
|
+
* Allow the list of acceptable URL schemes to be passed into Agent.new.
|
9
|
+
* Allow history and queue information to be passed into Agent.new.
|
10
|
+
* Agent#start_at no longer clears the history or the queue.
|
11
|
+
* Fixed a bug in the sanitization of semi-escaped URLs.
|
12
|
+
* Fixed a bug where https URLs would be followed even if 'net/https'
|
13
|
+
could not be loaded.
|
14
|
+
* Removed Agent::SCHEMES.
|
15
|
+
|
1
16
|
=== 0.1.8 / 2009-05-27
|
2
17
|
|
3
18
|
* Added the Agent#pause! and Agent#continue! methods.
|
data/README.txt
CHANGED
data/Rakefile
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'hoe'
|
5
|
+
require 'hoe/signing'
|
5
6
|
require './tasks/spec.rb'
|
6
7
|
require './tasks/course.rb'
|
7
8
|
require './lib/spidr/version.rb'
|
8
9
|
|
9
|
-
Hoe.new('spidr', Spidr::VERSION) do |p|
|
10
|
+
Hoe.spec('spidr') do |p|
|
10
11
|
p.rubyforge_name = 'spidr'
|
11
12
|
p.developer('Postmodern', 'postmodern.mod3@gmail.com')
|
12
13
|
p.remote_rdoc_dir = 'docs'
|
data/lib/spidr/agent.rb
CHANGED
@@ -7,9 +7,6 @@ require 'net/http'
|
|
7
7
|
module Spidr
|
8
8
|
class Agent
|
9
9
|
|
10
|
-
# URL schemes to visit
|
11
|
-
SCHEMES = ['http', 'https']
|
12
|
-
|
13
10
|
# Proxy to use
|
14
11
|
attr_accessor :proxy
|
15
12
|
|
@@ -22,6 +19,9 @@ module Spidr
|
|
22
19
|
# Delay in between fetching pages
|
23
20
|
attr_accessor :delay
|
24
21
|
|
22
|
+
# List of acceptable URL schemes to follow
|
23
|
+
attr_reader :schemes
|
24
|
+
|
25
25
|
# History containing visited URLs
|
26
26
|
attr_reader :history
|
27
27
|
|
@@ -42,6 +42,10 @@ module Spidr
|
|
42
42
|
# <tt>:referer</tt>:: The referer URL to send.
|
43
43
|
# <tt>:delay</tt>:: Duration in seconds to pause between spidering each
|
44
44
|
# link. Defaults to 0.
|
45
|
+
# <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
|
46
|
+
# Defaults to +http+ and +https+. +https+ URL
|
47
|
+
# schemes will be ignored if <tt>net/https</tt>
|
48
|
+
# cannot be loaded.
|
45
49
|
# <tt>:host</tt>:: The host-name to visit.
|
46
50
|
# <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
|
47
51
|
# <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
|
@@ -52,12 +56,32 @@ module Spidr
|
|
52
56
|
# <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
|
53
57
|
# <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
|
54
58
|
# visit.
|
59
|
+
# <tt>:queue</tt>:: An initial queue of URLs to visit.
|
60
|
+
# <tt>:history</tt>:: An initial list of visited URLs.
|
55
61
|
#
|
56
62
|
def initialize(options={},&block)
|
57
63
|
@proxy = (options[:proxy] || Spidr.proxy)
|
58
64
|
@user_agent = (options[:user_agent] || Spidr.user_agent)
|
59
65
|
@referer = options[:referer]
|
60
66
|
|
67
|
+
@schemes = []
|
68
|
+
|
69
|
+
if options[:schemes]
|
70
|
+
@schemes += options[:schemes]
|
71
|
+
else
|
72
|
+
@schemes << 'http'
|
73
|
+
|
74
|
+
begin
|
75
|
+
require 'net/https'
|
76
|
+
|
77
|
+
@schemes << 'https'
|
78
|
+
rescue Gem::LoadError => e
|
79
|
+
raise(e)
|
80
|
+
rescue ::LoadError
|
81
|
+
STDERR.puts "Warning: cannot load 'net/https', https support disabled"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
61
85
|
@host_rules = Rules.new(
|
62
86
|
:accept => options[:hosts],
|
63
87
|
:reject => options[:ignore_hosts]
|
@@ -91,6 +115,14 @@ module Spidr
|
|
91
115
|
visit_hosts_like(options[:host])
|
92
116
|
end
|
93
117
|
|
118
|
+
if options[:queue]
|
119
|
+
self.queue = options[:queue]
|
120
|
+
end
|
121
|
+
|
122
|
+
if options[:history]
|
123
|
+
self.history = options[:history]
|
124
|
+
end
|
125
|
+
|
94
126
|
block.call(self) if block
|
95
127
|
end
|
96
128
|
|
@@ -361,10 +393,9 @@ module Spidr
|
|
361
393
|
end
|
362
394
|
|
363
395
|
#
|
364
|
-
#
|
396
|
+
# Start spidering at the specified _url_.
|
365
397
|
#
|
366
398
|
def start_at(url)
|
367
|
-
clear
|
368
399
|
enqueue(url)
|
369
400
|
|
370
401
|
return continue!
|
@@ -413,6 +444,16 @@ module Spidr
|
|
413
444
|
return self
|
414
445
|
end
|
415
446
|
|
447
|
+
#
|
448
|
+
# Sets the list of acceptable URL schemes to follow to the
|
449
|
+
# _new_schemes_.
|
450
|
+
#
|
451
|
+
# agent.schemes = ['http']
|
452
|
+
#
|
453
|
+
def schemes=(new_schemes)
|
454
|
+
@schemes = new_schemes.map { |scheme| scheme.to_s }
|
455
|
+
end
|
456
|
+
|
416
457
|
#
|
417
458
|
# Sets the history of links that were previously visited to the
|
418
459
|
# specified _new_history_.
|
@@ -575,7 +616,7 @@ module Spidr
|
|
575
616
|
#
|
576
617
|
def visit_scheme?(url)
|
577
618
|
if url.scheme
|
578
|
-
return SCHEMES.include?(url.scheme)
|
619
|
+
return @schemes.include?(url.scheme)
|
579
620
|
else
|
580
621
|
return true
|
581
622
|
end
|
data/lib/spidr/page.rb
CHANGED
@@ -252,8 +252,8 @@ module Spidr
|
|
252
252
|
# based on the url of the page.
|
253
253
|
#
|
254
254
|
def to_absolute(link)
|
255
|
-
# clean the URL
|
256
|
-
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
|
255
|
+
# decode, clean then re-encode the URL
|
256
|
+
link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
|
257
257
|
|
258
258
|
begin
|
259
259
|
relative = URI(link)
|
data/lib/spidr/version.rb
CHANGED
data.tar.gz.sig
ADDED
Binary file
|
metadata
CHANGED
@@ -1,15 +1,36 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.8
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
|
-
cert_chain:
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
|
14
|
+
bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
|
15
|
+
ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
|
16
|
+
BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
|
17
|
+
CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
|
18
|
+
1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
|
19
|
+
mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
|
20
|
+
W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
|
21
|
+
7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
|
22
|
+
+Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
|
23
|
+
T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
|
24
|
+
sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
|
25
|
+
ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
|
26
|
+
xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
|
27
|
+
cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
|
28
|
+
GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
|
29
|
+
/fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
|
30
|
+
pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
|
31
|
+
-----END CERTIFICATE-----
|
11
32
|
|
12
|
-
date: 2009-
|
33
|
+
date: 2009-06-13 00:00:00 -07:00
|
13
34
|
default_executable:
|
14
35
|
dependencies:
|
15
36
|
- !ruby/object:Gem::Dependency
|
@@ -30,9 +51,12 @@ dependencies:
|
|
30
51
|
requirements:
|
31
52
|
- - ">="
|
32
53
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
54
|
+
version: 2.0.0
|
34
55
|
version:
|
35
|
-
description:
|
56
|
+
description: |-
|
57
|
+
Spidr is a versatile Ruby web spidering library that can spider a site,
|
58
|
+
multiple domains, certain links or infinitely. Spidr is designed to be fast
|
59
|
+
and easy to use.
|
36
60
|
email:
|
37
61
|
- postmodern.mod3@gmail.com
|
38
62
|
executables: []
|
@@ -92,6 +116,8 @@ files:
|
|
92
116
|
- static/course/specs.json
|
93
117
|
has_rdoc: true
|
94
118
|
homepage: http://spidr.rubyforge.org/
|
119
|
+
licenses: []
|
120
|
+
|
95
121
|
post_install_message:
|
96
122
|
rdoc_options:
|
97
123
|
- --main
|
@@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
139
|
requirements: []
|
114
140
|
|
115
141
|
rubyforge_project: spidr
|
116
|
-
rubygems_version: 1.3.
|
142
|
+
rubygems_version: 1.3.4
|
117
143
|
signing_key:
|
118
|
-
specification_version:
|
144
|
+
specification_version: 3
|
119
145
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|
120
146
|
test_files: []
|
121
147
|
|
metadata.gz.sig
ADDED
Binary file
|