spidr 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ === 0.1.7 / 2009-04-24
2
+
3
+ * Added Agent#all_headers.
4
+ * Fixed a bug where Page#headers was always +nil+.
5
+ * Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
6
+ 303 and 307 Redirects.
7
+ * Spidr::Agent will now follow iframe and frame tags.
8
+
1
9
  === 0.1.6 / 2009-04-14
2
10
 
3
11
  * Added Agent#failures, a list of URLs which could not be visited.
data/Manifest.txt CHANGED
@@ -36,4 +36,10 @@ static/course/absolute/next.html
36
36
  static/course/remote/index.html
37
37
  static/course/remote/start.html
38
38
  static/course/remote/next.html
39
+ static/course/frames/index.html
40
+ static/course/frames/start.html
41
+ static/course/frames/iframe.html
42
+ static/course/frames/iframe_next.html
43
+ static/course/frames/frame.html
44
+ static/course/frames/frame_next.html
39
45
  static/course/specs.json
data/README.txt CHANGED
@@ -10,8 +10,13 @@ Spidr is a versatile Ruby web spidering library that can spider a site,
10
10
  multiple domains, certain links or infinitely. Spidr is designed to be fast
11
11
  and easy to use.
12
12
 
13
- == FEATURES/PROBLEMS:
13
+ == FEATURES:
14
14
 
15
+ * Follows:
16
+ * a tags.
17
+ * iframe tags.
18
+ * frame tags.
19
+ * HTTP 300, 301, 302, 303 and 307 Redirects.
15
20
  * Black-list or white-list URLs based upon:
16
21
  * Host name
17
22
  * Port number
data/lib/spidr/agent.rb CHANGED
@@ -330,7 +330,7 @@ module Spidr
330
330
  end
331
331
 
332
332
  #
333
- # For every Page that the agent visits it will be passed to the
333
+ # For every Page that the agent visits, pass the page to the
334
334
  # specified _block_.
335
335
  #
336
336
  def every_page(&block)
@@ -338,6 +338,14 @@ module Spidr
338
338
  return self
339
339
  end
340
340
 
341
+ #
342
+ # For every Page that the agent visits, pass the headers to the given
343
+ # _block_.
344
+ #
345
+ def all_headers(&block)
346
+ every_page { |page| block.call(page.headers) }
347
+ end
348
+
341
349
  #
342
350
  # Clears the history of the agent.
343
351
  #
data/lib/spidr/page.rb CHANGED
@@ -23,6 +23,7 @@ module Spidr
23
23
  def initialize(url,response)
24
24
  @url = url
25
25
  @response = response
26
+ @headers = response.to_hash
26
27
  @doc = nil
27
28
  end
28
29
 
@@ -192,6 +193,8 @@ module Spidr
192
193
  # returned.
193
194
  #
194
195
  def doc
196
+ return nil if (body.nil? || body.empty?)
197
+
195
198
  begin
196
199
  if html?
197
200
  return @doc ||= Nokogiri::HTML(body)
@@ -209,11 +212,26 @@ module Spidr
209
212
  def links
210
213
  urls = []
211
214
 
212
- if (html? && self.doc)
213
- self.doc.search('a[@href]').each do |a|
214
- url = a.get_attribute('href')
215
+ add_url = lambda { |url|
216
+ urls << url unless (url.nil? || url.empty?)
217
+ }
218
+
219
+ case code
220
+ when 300..303, 307
221
+ add_url.call(@headers['location'])
222
+ end
223
+
224
+ if (html? && doc)
225
+ doc.search('a[@href]').each do |a|
226
+ add_url.call(a.get_attribute('href'))
227
+ end
228
+
229
+ doc.search('frame[@src]').each do |iframe|
230
+ add_url.call(iframe.get_attribute('src'))
231
+ end
215
232
 
216
- urls << url unless url.empty?
233
+ doc.search('iframe[@src]').each do |iframe|
234
+ add_url.call(iframe.get_attribute('src'))
217
235
  end
218
236
  end
219
237
 
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.6'
2
+ VERSION = '0.1.7'
3
3
  end
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>frame contents</p>
8
+
9
+ <ul>
10
+ <li class="follow">
11
+ <a href="frame_next.html">should follow links within frames</a>
12
+ </li>
13
+ </ul>
14
+ </body>
15
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Links from within frames</p>
8
+ </body>
9
+ </html>
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>iframe contents</p>
8
+
9
+ <ul>
10
+ <li class="follow">
11
+ <a href="iframe_next.html">should follow links within iframes</a>
12
+ </li>
13
+ </ul>
14
+ </body>
15
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Links from within an iframe</p>
8
+ </body>
9
+ </html>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
+ <script type="text/javascript" src="../scripts/course.js"></script>
6
+ <script type="text/javascript">
7
+ fail();
8
+ </script>
9
+ </head>
10
+ </html>
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Frames</p>
8
+
9
+ <iframe src="iframe.html"></iframe>
10
+
11
+ <frameset>
12
+ <frame src="frame.html" />
13
+ </frameset>
14
+ </body>
15
+ </html>
@@ -1 +1 @@
1
- [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
1
+ [{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow 
links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
@@ -21,7 +21,7 @@
21
21
  <li><a href="relative/start.html">Relative links</a></li>
22
22
  <li><a href="empty/start.html">Empty links</a></li>
23
23
  <li><a href="javascript/start.html">Bogus JavaScript Links</a></li>
24
- <li><a href="malformed/start.html">Malformed HTML</a></li>
24
+ <li><a href="frames/start.html">Frames</a></li>
25
25
  </ul>
26
26
  </body>
27
27
  </html>
data/tasks/course.rb CHANGED
@@ -8,9 +8,9 @@ namespace :course do
8
8
 
9
9
  COURSE_DIR = File.join(STATIC_DIR,'course')
10
10
 
11
- desc "Build the JSON spec file for the course"
12
- task :spec do
13
- File.open(File.join(COURSE_DIR,'specs.json'),'w') do |spec|
11
+ desc "Build the JSON specs file for the course"
12
+ task :specs do
13
+ File.open(File.join(COURSE_DIR,'specs.json'),'w') do |file|
14
14
  specs = []
15
15
 
16
16
  Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
@@ -50,7 +50,7 @@ namespace :course do
50
50
  end
51
51
  end
52
52
 
53
- spec.write(specs.to_json)
53
+ file.write(specs.to_json)
54
54
  end
55
55
  end
56
56
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-14 00:00:00 -07:00
12
+ date: 2009-04-24 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.12.1
33
+ version: 1.12.2
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email:
@@ -82,6 +82,12 @@ files:
82
82
  - static/course/remote/index.html
83
83
  - static/course/remote/start.html
84
84
  - static/course/remote/next.html
85
+ - static/course/frames/index.html
86
+ - static/course/frames/start.html
87
+ - static/course/frames/iframe.html
88
+ - static/course/frames/iframe_next.html
89
+ - static/course/frames/frame.html
90
+ - static/course/frames/frame_next.html
85
91
  - static/course/specs.json
86
92
  has_rdoc: true
87
93
  homepage: http://spidr.rubyforge.org/