spidr 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +15 -9
- data/README.txt +1 -1
- data/lib/spidr/agent.rb +9 -9
- data/lib/spidr/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +3 -3
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.1.3 / 2009-01-10
|
2
|
+
|
3
|
+
* Added the :host option to Spidr::Agent#initialize.
|
4
|
+
* Added the Web Spider Obstacle Course files to the Manifest.
|
5
|
+
* Aliased Spidr::Agent#visited_urls to Spidr::Agent#history.
|
6
|
+
|
1
7
|
=== 0.1.2 / 2008-11-06
|
2
8
|
|
3
9
|
* Fixed a bug in Page#to_absolute where URLs with no path were not
|
@@ -19,13 +25,13 @@
|
|
19
25
|
=== 0.1.0 / 2008-05-23
|
20
26
|
|
21
27
|
* Initial release.
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
* Black-list or white-list URLs based upon:
|
29
|
+
* Host name
|
30
|
+
* Port number
|
31
|
+
* Full link
|
32
|
+
* URL extension
|
33
|
+
* Provides call-backs for:
|
34
|
+
* Every visited Page.
|
35
|
+
* Every visited URL.
|
36
|
+
* Every visited URL that matches a specified pattern.
|
31
37
|
|
data/README.txt
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -33,10 +33,11 @@ module Spidr
|
|
33
33
|
#
|
34
34
|
# _options_ may contain the following keys:
|
35
35
|
# <tt>:proxy</tt>:: The proxy to use while spidering.
|
36
|
-
# <tt>:user_agent</tt>::
|
36
|
+
# <tt>:user_agent</tt>:: The User-Agent string to send.
|
37
37
|
# <tt>:referer</tt>:: The referer URL to send.
|
38
38
|
# <tt>:delay</tt>:: Duration in seconds to pause between spidering each
|
39
39
|
# link. Defaults to 0.
|
40
|
+
# <tt>:host</tt>:: The host-name to visit.
|
40
41
|
# <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
|
41
42
|
# <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
|
42
43
|
# <tt>:ports</tt>:: An +Array+ of port patterns to visit.
|
@@ -78,6 +79,10 @@ module Spidr
|
|
78
79
|
@history = []
|
79
80
|
@queue = []
|
80
81
|
|
82
|
+
if options[:host]
|
83
|
+
visit_hosts_like(options[:host])
|
84
|
+
end
|
85
|
+
|
81
86
|
block.call(self) if block
|
82
87
|
end
|
83
88
|
|
@@ -102,7 +107,7 @@ module Spidr
|
|
102
107
|
# spidering.
|
103
108
|
#
|
104
109
|
def self.host(name,options={},&block)
|
105
|
-
self.new(options.merge(:
|
110
|
+
self.new(options.merge(:host => name)) do |spider|
|
106
111
|
block.call(spider) if block
|
107
112
|
|
108
113
|
spider.start_at("http://#{name}/")
|
@@ -118,7 +123,7 @@ module Spidr
|
|
118
123
|
def self.site(url,options={},&block)
|
119
124
|
url = URI(url.to_s)
|
120
125
|
|
121
|
-
return self.new(options.merge(:
|
126
|
+
return self.new(options.merge(:host => url.host)) do |spider|
|
122
127
|
block.call(spider) if block
|
123
128
|
|
124
129
|
spider.start_at(url)
|
@@ -341,12 +346,7 @@ module Spidr
|
|
341
346
|
return self
|
342
347
|
end
|
343
348
|
|
344
|
-
|
345
|
-
# Returns the +Array+ of visited URLs.
|
346
|
-
#
|
347
|
-
def visited_urls
|
348
|
-
@history
|
349
|
-
end
|
349
|
+
alias visited_urls history
|
350
350
|
|
351
351
|
#
|
352
352
|
# Returns the +Array+ of visited URLs.
|
data/lib/spidr/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-01-10 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -106,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
106
|
requirements: []
|
107
107
|
|
108
108
|
rubyforge_project: spidr
|
109
|
-
rubygems_version: 1.3.
|
109
|
+
rubygems_version: 1.3.1
|
110
110
|
signing_key:
|
111
111
|
specification_version: 2
|
112
112
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|