spidr 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -9
- data/README.txt +1 -1
- data/lib/spidr/agent.rb +9 -9
- data/lib/spidr/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +3 -3
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.1.3 / 2009-01-10
|
2
|
+
|
3
|
+
* Added the :host option to Spidr::Agent#initialize.
|
4
|
+
* Added the Web Spider Obstacle Course files to the Manifest.
|
5
|
+
* Aliased Spidr::Agent#visited_urls to Spidr::Agent#history.
|
6
|
+
|
1
7
|
=== 0.1.2 / 2008-11-06
|
2
8
|
|
3
9
|
* Fixed a bug in Page#to_absolute where URLs with no path were not
|
@@ -19,13 +25,13 @@
|
|
19
25
|
=== 0.1.0 / 2008-05-23
|
20
26
|
|
21
27
|
* Initial release.
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
28
|
+
* Black-list or white-list URLs based upon:
|
29
|
+
* Host name
|
30
|
+
* Port number
|
31
|
+
* Full link
|
32
|
+
* URL extension
|
33
|
+
* Provides call-backs for:
|
34
|
+
* Every visited Page.
|
35
|
+
* Every visited URL.
|
36
|
+
* Every visited URL that matches a specified pattern.
|
31
37
|
|
data/README.txt
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -33,10 +33,11 @@ module Spidr
|
|
33
33
|
#
|
34
34
|
# _options_ may contain the following keys:
|
35
35
|
# <tt>:proxy</tt>:: The proxy to use while spidering.
|
36
|
-
# <tt>:user_agent</tt>::
|
36
|
+
# <tt>:user_agent</tt>:: The User-Agent string to send.
|
37
37
|
# <tt>:referer</tt>:: The referer URL to send.
|
38
38
|
# <tt>:delay</tt>:: Duration in seconds to pause between spidering each
|
39
39
|
# link. Defaults to 0.
|
40
|
+
# <tt>:host</tt>:: The host-name to visit.
|
40
41
|
# <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
|
41
42
|
# <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
|
42
43
|
# <tt>:ports</tt>:: An +Array+ of port patterns to visit.
|
@@ -78,6 +79,10 @@ module Spidr
|
|
78
79
|
@history = []
|
79
80
|
@queue = []
|
80
81
|
|
82
|
+
if options[:host]
|
83
|
+
visit_hosts_like(options[:host])
|
84
|
+
end
|
85
|
+
|
81
86
|
block.call(self) if block
|
82
87
|
end
|
83
88
|
|
@@ -102,7 +107,7 @@ module Spidr
|
|
102
107
|
# spidering.
|
103
108
|
#
|
104
109
|
def self.host(name,options={},&block)
|
105
|
-
self.new(options.merge(:
|
110
|
+
self.new(options.merge(:host => name)) do |spider|
|
106
111
|
block.call(spider) if block
|
107
112
|
|
108
113
|
spider.start_at("http://#{name}/")
|
@@ -118,7 +123,7 @@ module Spidr
|
|
118
123
|
def self.site(url,options={},&block)
|
119
124
|
url = URI(url.to_s)
|
120
125
|
|
121
|
-
return self.new(options.merge(:
|
126
|
+
return self.new(options.merge(:host => url.host)) do |spider|
|
122
127
|
block.call(spider) if block
|
123
128
|
|
124
129
|
spider.start_at(url)
|
@@ -341,12 +346,7 @@ module Spidr
|
|
341
346
|
return self
|
342
347
|
end
|
343
348
|
|
344
|
-
|
345
|
-
# Returns the +Array+ of visited URLs.
|
346
|
-
#
|
347
|
-
def visited_urls
|
348
|
-
@history
|
349
|
-
end
|
349
|
+
alias visited_urls history
|
350
350
|
|
351
351
|
#
|
352
352
|
# Returns the +Array+ of visited URLs.
|
data/lib/spidr/version.rb
CHANGED
data/tasks/spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-01-10 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -106,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
106
|
requirements: []
|
107
107
|
|
108
108
|
rubyforge_project: spidr
|
109
|
-
rubygems_version: 1.3.
|
109
|
+
rubygems_version: 1.3.1
|
110
110
|
signing_key:
|
111
111
|
specification_version: 2
|
112
112
|
summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
|