spidr 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/Manifest.txt +6 -0
- data/README.txt +6 -1
- data/lib/spidr/agent.rb +9 -1
- data/lib/spidr/page.rb +22 -4
- data/lib/spidr/version.rb +1 -1
- data/static/course/frames/frame.html +15 -0
- data/static/course/frames/frame_next.html +9 -0
- data/static/course/frames/iframe.html +15 -0
- data/static/course/frames/iframe_next.html +9 -0
- data/static/course/frames/index.html +10 -0
- data/static/course/frames/start.html +15 -0
- data/static/course/specs.json +1 -1
- data/static/course/start.html +1 -1
- data/tasks/course.rb +4 -4
- metadata +9 -3
data/History.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
=== 0.1.7 / 2009-04-24
|
2
|
+
|
3
|
+
* Added Agent#all_headers.
|
4
|
+
* Fixed a bug where Page#headers was always +nil+.
|
5
|
+
* Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
|
6
|
+
303 and 307 Redirects.
|
7
|
+
* Spidr::Agent will now follow iframe and frame tags.
|
8
|
+
|
1
9
|
=== 0.1.6 / 2009-04-14
|
2
10
|
|
3
11
|
* Added Agent#failures, a list of URLs which could not be visited.
|
data/Manifest.txt
CHANGED
@@ -36,4 +36,10 @@ static/course/absolute/next.html
|
|
36
36
|
static/course/remote/index.html
|
37
37
|
static/course/remote/start.html
|
38
38
|
static/course/remote/next.html
|
39
|
+
static/course/frames/index.html
|
40
|
+
static/course/frames/start.html
|
41
|
+
static/course/frames/iframe.html
|
42
|
+
static/course/frames/iframe_next.html
|
43
|
+
static/course/frames/frame.html
|
44
|
+
static/course/frames/frame_next.html
|
39
45
|
static/course/specs.json
|
data/README.txt
CHANGED
@@ -10,8 +10,13 @@ Spidr is a versatile Ruby web spidering library that can spider a site,
|
|
10
10
|
multiple domains, certain links or infinitely. Spidr is designed to be fast
|
11
11
|
and easy to use.
|
12
12
|
|
13
|
-
== FEATURES
|
13
|
+
== FEATURES:
|
14
14
|
|
15
|
+
* Follows:
|
16
|
+
* a tags.
|
17
|
+
* iframe tags.
|
18
|
+
* frame tags.
|
19
|
+
* HTTP 300, 301, 302, 303 and 307 Redirects.
|
15
20
|
* Black-list or white-list URLs based upon:
|
16
21
|
* Host name
|
17
22
|
* Port number
|
data/lib/spidr/agent.rb
CHANGED
@@ -330,7 +330,7 @@ module Spidr
|
|
330
330
|
end
|
331
331
|
|
332
332
|
#
|
333
|
-
# For every Page that the agent visits
|
333
|
+
# For every Page that the agent visits, pass the page to the
|
334
334
|
# specified _block_.
|
335
335
|
#
|
336
336
|
def every_page(&block)
|
@@ -338,6 +338,14 @@ module Spidr
|
|
338
338
|
return self
|
339
339
|
end
|
340
340
|
|
341
|
+
#
|
342
|
+
# For every Page that the agent visits, pass the headers to the given
|
343
|
+
# _block_.
|
344
|
+
#
|
345
|
+
def all_headers(&block)
|
346
|
+
every_page { |page| block.call(page.headers) }
|
347
|
+
end
|
348
|
+
|
341
349
|
#
|
342
350
|
# Clears the history of the agent.
|
343
351
|
#
|
data/lib/spidr/page.rb
CHANGED
@@ -23,6 +23,7 @@ module Spidr
|
|
23
23
|
def initialize(url,response)
|
24
24
|
@url = url
|
25
25
|
@response = response
|
26
|
+
@headers = response.to_hash
|
26
27
|
@doc = nil
|
27
28
|
end
|
28
29
|
|
@@ -192,6 +193,8 @@ module Spidr
|
|
192
193
|
# returned.
|
193
194
|
#
|
194
195
|
def doc
|
196
|
+
return nil if (body.nil? || body.empty?)
|
197
|
+
|
195
198
|
begin
|
196
199
|
if html?
|
197
200
|
return @doc ||= Nokogiri::HTML(body)
|
@@ -209,11 +212,26 @@ module Spidr
|
|
209
212
|
def links
|
210
213
|
urls = []
|
211
214
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
+
add_url = lambda { |url|
|
216
|
+
urls << url unless (url.nil? || url.empty?)
|
217
|
+
}
|
218
|
+
|
219
|
+
case code
|
220
|
+
when 300..303, 307
|
221
|
+
add_url.call(@headers['location'])
|
222
|
+
end
|
223
|
+
|
224
|
+
if (html? && doc)
|
225
|
+
doc.search('a[@href]').each do |a|
|
226
|
+
add_url.call(a.get_attribute('href'))
|
227
|
+
end
|
228
|
+
|
229
|
+
doc.search('frame[@src]').each do |iframe|
|
230
|
+
add_url.call(iframe.get_attribute('src'))
|
231
|
+
end
|
215
232
|
|
216
|
-
|
233
|
+
doc.search('iframe[@src]').each do |iframe|
|
234
|
+
add_url.call(iframe.get_attribute('src'))
|
217
235
|
end
|
218
236
|
end
|
219
237
|
|
data/lib/spidr/version.rb
CHANGED
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
|
4
|
+
</head>
|
5
|
+
|
6
|
+
<body>
|
7
|
+
<p>frame contents</p>
|
8
|
+
|
9
|
+
<ul>
|
10
|
+
<li class="follow">
|
11
|
+
<a href="frame_next.html">should follow links within frames</a>
|
12
|
+
</li>
|
13
|
+
</ul>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
|
4
|
+
</head>
|
5
|
+
|
6
|
+
<body>
|
7
|
+
<p>iframe contents</p>
|
8
|
+
|
9
|
+
<ul>
|
10
|
+
<li class="follow">
|
11
|
+
<a href="iframe_next.html">should follow links within iframes</a>
|
12
|
+
</li>
|
13
|
+
</ul>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,10 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
|
4
|
+
<script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
|
5
|
+
<script type="text/javascript" src="../scripts/course.js"></script>
|
6
|
+
<script type="text/javascript">
|
7
|
+
fail();
|
8
|
+
</script>
|
9
|
+
</head>
|
10
|
+
</html>
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"link":"\/course\/absolute\/next.html","
|
1
|
+
[{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
|
data/static/course/start.html
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
<li><a href="relative/start.html">Relative links</a></li>
|
22
22
|
<li><a href="empty/start.html">Empty links</a></li>
|
23
23
|
<li><a href="javascript/start.html">Bogus JavaScript Links</a></li>
|
24
|
-
<li><a href="
|
24
|
+
<li><a href="frames/start.html">Frames</a></li>
|
25
25
|
</ul>
|
26
26
|
</body>
|
27
27
|
</html>
|
data/tasks/course.rb
CHANGED
@@ -8,9 +8,9 @@ namespace :course do
|
|
8
8
|
|
9
9
|
COURSE_DIR = File.join(STATIC_DIR,'course')
|
10
10
|
|
11
|
-
desc "Build the JSON
|
12
|
-
task :
|
13
|
-
File.open(File.join(COURSE_DIR,'specs.json'),'w') do |
|
11
|
+
desc "Build the JSON specs file for the course"
|
12
|
+
task :specs do
|
13
|
+
File.open(File.join(COURSE_DIR,'specs.json'),'w') do |file|
|
14
14
|
specs = []
|
15
15
|
|
16
16
|
Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
|
@@ -50,7 +50,7 @@ namespace :course do
|
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
file.write(specs.to_json)
|
54
54
|
end
|
55
55
|
end
|
56
56
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-04-
|
12
|
+
date: 2009-04-24 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.12.
|
33
|
+
version: 1.12.2
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|
@@ -82,6 +82,12 @@ files:
|
|
82
82
|
- static/course/remote/index.html
|
83
83
|
- static/course/remote/start.html
|
84
84
|
- static/course/remote/next.html
|
85
|
+
- static/course/frames/index.html
|
86
|
+
- static/course/frames/start.html
|
87
|
+
- static/course/frames/iframe.html
|
88
|
+
- static/course/frames/iframe_next.html
|
89
|
+
- static/course/frames/frame.html
|
90
|
+
- static/course/frames/frame_next.html
|
85
91
|
- static/course/specs.json
|
86
92
|
has_rdoc: true
|
87
93
|
homepage: http://spidr.rubyforge.org/
|