spidr 0.1.6 → 0.1.7

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ === 0.1.7 / 2009-04-24
2
+
3
+ * Added Agent#all_headers.
4
+ * Fixed a bug where Page#headers was always +nil+.
5
+ * Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
6
+ 303 and 307 Redirects.
7
+ * Spidr::Agent will now follow iframe and frame tags.
8
+
1
9
  === 0.1.6 / 2009-04-14
2
10
 
3
11
  * Added Agent#failures, a list of URLs which could not be visited.
data/Manifest.txt CHANGED
@@ -36,4 +36,10 @@ static/course/absolute/next.html
36
36
  static/course/remote/index.html
37
37
  static/course/remote/start.html
38
38
  static/course/remote/next.html
39
+ static/course/frames/index.html
40
+ static/course/frames/start.html
41
+ static/course/frames/iframe.html
42
+ static/course/frames/iframe_next.html
43
+ static/course/frames/frame.html
44
+ static/course/frames/frame_next.html
39
45
  static/course/specs.json
data/README.txt CHANGED
@@ -10,8 +10,13 @@ Spidr is a versatile Ruby web spidering library that can spider a site,
10
10
  multiple domains, certain links or infinitely. Spidr is designed to be fast
11
11
  and easy to use.
12
12
 
13
- == FEATURES/PROBLEMS:
13
+ == FEATURES:
14
14
 
15
+ * Follows:
16
+ * a tags.
17
+ * iframe tags.
18
+ * frame tags.
19
+ * HTTP 300, 301, 302, 303 and 307 Redirects.
15
20
  * Black-list or white-list URLs based upon:
16
21
  * Host name
17
22
  * Port number
data/lib/spidr/agent.rb CHANGED
@@ -330,7 +330,7 @@ module Spidr
330
330
  end
331
331
 
332
332
  #
333
- # For every Page that the agent visits it will be passed to the
333
+ # For every Page that the agent visits, pass the page to the
334
334
  # specified _block_.
335
335
  #
336
336
  def every_page(&block)
@@ -338,6 +338,14 @@ module Spidr
338
338
  return self
339
339
  end
340
340
 
341
+ #
342
+ # For every Page that the agent visits, pass the headers to the given
343
+ # _block_.
344
+ #
345
+ def all_headers(&block)
346
+ every_page { |page| block.call(page.headers) }
347
+ end
348
+
341
349
  #
342
350
  # Clears the history of the agent.
343
351
  #
data/lib/spidr/page.rb CHANGED
@@ -23,6 +23,7 @@ module Spidr
23
23
  def initialize(url,response)
24
24
  @url = url
25
25
  @response = response
26
+ @headers = response.to_hash
26
27
  @doc = nil
27
28
  end
28
29
 
@@ -192,6 +193,8 @@ module Spidr
192
193
  # returned.
193
194
  #
194
195
  def doc
196
+ return nil if (body.nil? || body.empty?)
197
+
195
198
  begin
196
199
  if html?
197
200
  return @doc ||= Nokogiri::HTML(body)
@@ -209,11 +212,26 @@ module Spidr
209
212
  def links
210
213
  urls = []
211
214
 
212
- if (html? && self.doc)
213
- self.doc.search('a[@href]').each do |a|
214
- url = a.get_attribute('href')
215
+ add_url = lambda { |url|
216
+ urls << url unless (url.nil? || url.empty?)
217
+ }
218
+
219
+ case code
220
+ when 300..303, 307
221
+ add_url.call(@headers['location'])
222
+ end
223
+
224
+ if (html? && doc)
225
+ doc.search('a[@href]').each do |a|
226
+ add_url.call(a.get_attribute('href'))
227
+ end
228
+
229
+ doc.search('frame[@src]').each do |iframe|
230
+ add_url.call(iframe.get_attribute('src'))
231
+ end
215
232
 
216
- urls << url unless url.empty?
233
+ doc.search('iframe[@src]').each do |iframe|
234
+ add_url.call(iframe.get_attribute('src'))
217
235
  end
218
236
  end
219
237
 
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.6'
2
+ VERSION = '0.1.7'
3
3
  end
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>frame contents</p>
8
+
9
+ <ul>
10
+ <li class="follow">
11
+ <a href="frame_next.html">should follow links within frames</a>
12
+ </li>
13
+ </ul>
14
+ </body>
15
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Links from within frames</p>
8
+ </body>
9
+ </html>
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>iframe contents</p>
8
+
9
+ <ul>
10
+ <li class="follow">
11
+ <a href="iframe_next.html">should follow links within iframes</a>
12
+ </li>
13
+ </ul>
14
+ </body>
15
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Links from within an iframe</p>
8
+ </body>
9
+ </html>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
4
+ <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
5
+ <script type="text/javascript" src="../scripts/course.js"></script>
6
+ <script type="text/javascript">
7
+ fail();
8
+ </script>
9
+ </head>
10
+ </html>
@@ -0,0 +1,15 @@
1
+ <html>
2
+ <head>
3
+ <title>Spidr :: Web-Spider Obstacle Course :: Frames</title>
4
+ </head>
5
+
6
+ <body>
7
+ <p>Frames</p>
8
+
9
+ <iframe src="iframe.html"></iframe>
10
+
11
+ <frameset>
12
+ <frame src="frame.html" />
13
+ </frameset>
14
+ </body>
15
+ </html>
@@ -1 +1 @@
1
- [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]
1
+ [{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","behavior":"follow"},{"link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","behavior":"nofollow"},{"link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","behavior":"nofollow"},{"link":" ","example":"<a href=\"\">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","behavior":"ignore"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","behavior":"ignore"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","behavior":"ignore"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow 
links to previously visited pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","behavior":"follow"},{"link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","behavior":"nofollow"},{"link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","behavior":"follow"},{"link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","behavior":"follow"},{"link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","behavior":"follow"},{"link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","behavior":"ignore"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","behavior":"fail"},{"link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes<\/a>","message":"should follow links within iframes","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/iframe_next.html","behavior":"follow"},{"link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames<\/a>","message":"should follow links within frames","url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","behavior":"follow"}]
@@ -21,7 +21,7 @@
21
21
  <li><a href="relative/start.html">Relative links</a></li>
22
22
  <li><a href="empty/start.html">Empty links</a></li>
23
23
  <li><a href="javascript/start.html">Bogus JavaScript Links</a></li>
24
- <li><a href="malformed/start.html">Malformed HTML</a></li>
24
+ <li><a href="frames/start.html">Frames</a></li>
25
25
  </ul>
26
26
  </body>
27
27
  </html>
data/tasks/course.rb CHANGED
@@ -8,9 +8,9 @@ namespace :course do
8
8
 
9
9
  COURSE_DIR = File.join(STATIC_DIR,'course')
10
10
 
11
- desc "Build the JSON spec file for the course"
12
- task :spec do
13
- File.open(File.join(COURSE_DIR,'specs.json'),'w') do |spec|
11
+ desc "Build the JSON specs file for the course"
12
+ task :specs do
13
+ File.open(File.join(COURSE_DIR,'specs.json'),'w') do |file|
14
14
  specs = []
15
15
 
16
16
  Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
@@ -50,7 +50,7 @@ namespace :course do
50
50
  end
51
51
  end
52
52
 
53
- spec.write(specs.to_json)
53
+ file.write(specs.to_json)
54
54
  end
55
55
  end
56
56
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-04-14 00:00:00 -07:00
12
+ date: 2009-04-24 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.12.1
33
+ version: 1.12.2
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email:
@@ -82,6 +82,12 @@ files:
82
82
  - static/course/remote/index.html
83
83
  - static/course/remote/start.html
84
84
  - static/course/remote/next.html
85
+ - static/course/frames/index.html
86
+ - static/course/frames/start.html
87
+ - static/course/frames/iframe.html
88
+ - static/course/frames/iframe_next.html
89
+ - static/course/frames/frame.html
90
+ - static/course/frames/frame_next.html
85
91
  - static/course/specs.json
86
92
  has_rdoc: true
87
93
  homepage: http://spidr.rubyforge.org/