spidr 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +8 -0
- data/Manifest.txt +6 -0
- data/README.txt +6 -1
- data/lib/spidr/agent.rb +9 -1
- data/lib/spidr/page.rb +22 -4
- data/lib/spidr/version.rb +1 -1
- data/static/course/frames/frame.html +15 -0
- data/static/course/frames/frame_next.html +9 -0
- data/static/course/frames/iframe.html +15 -0
- data/static/course/frames/iframe_next.html +9 -0
- data/static/course/frames/index.html +10 -0
- data/static/course/frames/start.html +15 -0
- data/static/course/specs.json +1 -1
- data/static/course/start.html +1 -1
- data/tasks/course.rb +4 -4
- metadata +9 -3
data/History.txt
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
=== 0.1.7 / 2009-04-24
|
2
|
+
|
3
|
+
* Added Agent#all_headers.
|
4
|
+
* Fixed a bug where Page#headers was always +nil+.
|
5
|
+
* Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
|
6
|
+
303 and 307 Redirects.
|
7
|
+
* Spidr::Agent will now follow iframe and frame tags.
|
8
|
+
|
1
9
|
=== 0.1.6 / 2009-04-14
|
2
10
|
|
3
11
|
* Added Agent#failures, a list of URLs which could not be visited.
|
data/Manifest.txt
CHANGED
@@ -36,4 +36,10 @@ static/course/absolute/next.html
|
|
36
36
|
static/course/remote/index.html
|
37
37
|
static/course/remote/start.html
|
38
38
|
static/course/remote/next.html
|
39
|
+
static/course/frames/index.html
|
40
|
+
static/course/frames/start.html
|
41
|
+
static/course/frames/iframe.html
|
42
|
+
static/course/frames/iframe_next.html
|
43
|
+
static/course/frames/frame.html
|
44
|
+
static/course/frames/frame_next.html
|
39
45
|
static/course/specs.json
|
data/README.txt
CHANGED
@@ -10,8 +10,13 @@ Spidr is a versatile Ruby web spidering library that can spider a site,
|
|
10
10
|
multiple domains, certain links or infinitely. Spidr is designed to be fast
|
11
11
|
and easy to use.
|
12
12
|
|
13
|
-
== FEATURES
|
13
|
+
== FEATURES:
|
14
14
|
|
15
|
+
* Follows:
|
16
|
+
* a tags.
|
17
|
+
* iframe tags.
|
18
|
+
* frame tags.
|
19
|
+
* HTTP 300, 301, 302, 303 and 307 Redirects.
|
15
20
|
* Black-list or white-list URLs based upon:
|
16
21
|
* Host name
|
17
22
|
* Port number
|
data/lib/spidr/agent.rb
CHANGED
@@ -330,7 +330,7 @@ module Spidr
|
|
330
330
|
end
|
331
331
|
|
332
332
|
#
|
333
|
-
# For every Page that the agent visits
|
333
|
+
# For every Page that the agent visits, pass the page to the
|
334
334
|
# specified _block_.
|
335
335
|
#
|
336
336
|
def every_page(&block)
|
@@ -338,6 +338,14 @@ module Spidr
|
|
338
338
|
return self
|
339
339
|
end
|
340
340
|
|
341
|
+
#
|
342
|
+
# For every Page that the agent visits, pass the headers to the given
|
343
|
+
# _block_.
|
344
|
+
#
|
345
|
+
def all_headers(&block)
|
346
|
+
every_page { |page| block.call(page.headers) }
|
347
|
+
end
|
348
|
+
|
341
349
|
#
|
342
350
|
# Clears the history of the agent.
|
343
351
|
#
|
data/lib/spidr/page.rb
CHANGED
@@ -23,6 +23,7 @@ module Spidr
|
|
23
23
|
def initialize(url,response)
|
24
24
|
@url = url
|
25
25
|
@response = response
|
26
|
+
@headers = response.to_hash
|
26
27
|
@doc = nil
|
27
28
|
end
|
28
29
|
|
@@ -192,6 +193,8 @@ module Spidr
|
|
192
193
|
# returned.
|
193
194
|
#
|
194
195
|
def doc
|
196
|
+
return nil if (body.nil? || body.empty?)
|
197
|
+
|
195
198
|
begin
|
196
199
|
if html?
|
197
200
|
return @doc ||= Nokogiri::HTML(body)
|
@@ -209,11 +212,26 @@ module Spidr
|
|
209
212
|
def links
|
210
213
|
urls = []
|
211
214
|
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
+
add_url = lambda { |url|
|
216
|
+
urls << url unless (url.nil? || url.empty?)
|
217
|
+
}
|
218
|
+
|
219
|
+
case code
|
220
|
+
when 300..303, 307
|
221
|
+
add_url.call(@headers['location'])
|
222
|
+
end
|
223
|
+
|
224
|
+
if (html? && doc)
|
225
|
+
doc.search('a[@href]').each do |a|
|
226
|
+
add_url.call(a.get_attribute('href'))
|
227
|
+
end
|
228
|
+
|
229
|
+
doc.search('frame[@src]').each do |iframe|
|
230
|
+
add_url.call(iframe.get_attribute('src'))
|
231
|
+
end
|
215
232
|
|
216
|
-
|
233
|
+
doc.search('iframe[@src]').each do |iframe|
|
234
|
+
add_url.call(iframe.get_attribute('src'))
|
217
235
|
end
|
218
236
|
end
|
219
237
|
|
data/lib/spidr/version.rb
CHANGED
data/static/course/frames/frame.html
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
|
4
|
+
</head>
|
5
|
+
|
6
|
+
<body>
|
7
|
+
<p>frame contents</p>
|
8
|
+
|
9
|
+
<ul>
|
10
|
+
<li class="follow">
|
11
|
+
<a href="frame_next.html">should follow links within frames</a>
|
12
|
+
</li>
|
13
|
+
</ul>
|
14
|
+
</body>
|
15
|
+
</html>
|
data/static/course/frames/iframe.html
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
|
4
|
+
</head>
|
5
|
+
|
6
|
+
<body>
|
7
|
+
<p>iframe contents</p>
|
8
|
+
|
9
|
+
<ul>
|
10
|
+
<li class="follow">
|
11
|
+
<a href="iframe_next.html">should follow links within iframes</a>
|
12
|
+
</li>
|
13
|
+
</ul>
|
14
|
+
</body>
|
15
|
+
</html>
|
data/static/course/frames/index.html
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
|
4
|
+
<script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
|
5
|
+
<script type="text/javascript" src="../scripts/course.js"></script>
|
6
|
+
<script type="text/javascript">
|
7
|
+
fail();
|
8
|
+
</script>
|
9
|
+
</head>
|
10
|
+
</html>
|
data/static/course/specs.json
CHANGED
@@ -1 +1 @@
|
|
1
|
-
[{"link":"\/course\/absolute\/next.html","
|
1
|
+
[{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow 
links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
|
data/static/course/start.html
CHANGED
@@ -21,7 +21,7 @@
|
|
21
21
|
<li><a href="relative/start.html">Relative links</a></li>
|
22
22
|
<li><a href="empty/start.html">Empty links</a></li>
|
23
23
|
<li><a href="javascript/start.html">Bogus JavaScript Links</a></li>
|
24
|
-
<li><a href="
|
24
|
+
<li><a href="frames/start.html">Frames</a></li>
|
25
25
|
</ul>
|
26
26
|
</body>
|
27
27
|
</html>
|
data/tasks/course.rb
CHANGED
@@ -8,9 +8,9 @@ namespace :course do
|
|
8
8
|
|
9
9
|
COURSE_DIR = File.join(STATIC_DIR,'course')
|
10
10
|
|
11
|
-
desc "Build the JSON
|
12
|
-
task :
|
13
|
-
File.open(File.join(COURSE_DIR,'specs.json'),'w') do |
|
11
|
+
desc "Build the JSON specs file for the course"
|
12
|
+
task :specs do
|
13
|
+
File.open(File.join(COURSE_DIR,'specs.json'),'w') do |file|
|
14
14
|
specs = []
|
15
15
|
|
16
16
|
Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
|
@@ -50,7 +50,7 @@ namespace :course do
|
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
|
-
|
53
|
+
file.write(specs.to_json)
|
54
54
|
end
|
55
55
|
end
|
56
56
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-04-14 00:00:00 -07:00
|
12
|
+
date: 2009-04-24 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.12.
|
33
|
+
version: 1.12.2
|
34
34
|
version:
|
35
35
|
description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
|
36
36
|
email:
|
@@ -82,6 +82,12 @@ files:
|
|
82
82
|
- static/course/remote/index.html
|
83
83
|
- static/course/remote/start.html
|
84
84
|
- static/course/remote/next.html
|
85
|
+
- static/course/frames/index.html
|
86
|
+
- static/course/frames/start.html
|
87
|
+
- static/course/frames/iframe.html
|
88
|
+
- static/course/frames/iframe_next.html
|
89
|
+
- static/course/frames/frame.html
|
90
|
+
- static/course/frames/frame_next.html
|
85
91
|
- static/course/specs.json
|
86
92
|
has_rdoc: true
|
87
93
|
homepage: http://spidr.rubyforge.org/
|