spidr 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.5 / 2009-03-22
2
+
3
+ * Catch malformed URIs in Page#to_absolute and return +nil+.
4
+ * Filter out +nil+ URIs in Page#urls.
5
+
1
6
  === 0.1.4 / 2009-01-15
2
7
 
3
8
  * Use Nokogiri for HTML and XML parsing.
data/README.txt CHANGED
@@ -1,6 +1,7 @@
1
1
  = Spidr
2
2
 
3
3
  * http://spidr.rubyforge.org/
4
+ * http://github.com/postmodern/spidr/
4
5
  * Postmodern (postmodern.mod3 at gmail.com)
5
6
 
6
7
  == DESCRIPTION:
@@ -25,7 +26,7 @@ and easy to use.
25
26
 
26
27
  == REQUIREMENTS:
27
28
 
28
- * nokogiri
29
+ * {nokogiri}[http://nokogiri.rubyforge.org/]
29
30
 
30
31
  == INSTALL:
31
32
 
data/lib/spidr/page.rb CHANGED
@@ -192,10 +192,14 @@ module Spidr
192
192
  # returned.
193
193
  #
194
194
  def doc
195
- if html?
196
- return @doc ||= Nokogiri::HTML(body)
197
- elsif xml?
198
- return @doc ||= Nokogiri::XML(body)
195
+ begin
196
+ if html?
197
+ return @doc ||= Nokogiri::HTML(body)
198
+ elsif xml?
199
+ return @doc ||= Nokogiri::XML(body)
200
+ end
201
+ rescue
202
+ return nil
199
203
  end
200
204
  end
201
205
 
@@ -205,7 +209,7 @@ module Spidr
205
209
  def links
206
210
  urls = []
207
211
 
208
- if html?
212
+ if (html? && self.doc)
209
213
  self.doc.search('a[@href]').each do |a|
210
214
  url = a.get_attribute('href')
211
215
 
@@ -220,7 +224,7 @@ module Spidr
220
224
  # Returns all links from the HTML page as absolute URLs.
221
225
  #
222
226
  def urls
223
- links.map { |link| to_absolute(link) }
227
+ links.map { |link| to_absolute(link) }.compact
224
228
  end
225
229
 
226
230
  protected
@@ -233,20 +237,24 @@ module Spidr
233
237
  # clean the link
234
238
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
235
239
 
236
- relative = URI(link)
237
- absolute = @url.merge(relative)
238
-
239
- if absolute.path
240
- if absolute.path.empty?
241
- # default the absolute path to '/'
242
- absolute.path = '/'
243
- else
244
- # make sure the path does not contain any .. or . directories.
245
- absolute.path = File.expand_path(absolute.path)
240
+ begin
241
+ relative = URI(link)
242
+ absolute = @url.merge(relative)
243
+
244
+ if absolute.path
245
+ if absolute.path.empty?
246
+ # default the absolute path to '/'
247
+ absolute.path = '/'
248
+ else
249
+ # make sure the path does not contain any .. or . directories.
250
+ absolute.path = File.expand_path(absolute.path)
251
+ end
246
252
  end
247
- end
248
253
 
249
- return absolute
254
+ return absolute
255
+ rescue URI::InvalidURIError => e
256
+ return nil
257
+ end
250
258
  end
251
259
 
252
260
  #
data/lib/spidr/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Spidr
2
- VERSION = '0.1.4'
2
+ VERSION = '0.1.5'
3
3
  end
@@ -1 +1 @@
1
- [{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>","message":"should follow relative links","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","link":".\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","message":"should follow relative links to files in the current 
directory","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","link":"..\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","link":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","link":"","example":"<a>should not follow links with no href attributes<\/a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","link":" ","example":"<a href=\" \">should ignore links with blank href attributes<\/a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>","message":"should not follow remote links to the same 
page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>","message":"should not follow absolute links to the current page","behavior":"nofollow"}]
1
+ [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]
data/tasks/course.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'hpricot'
1
+ require 'nokogiri'
2
2
  require 'json'
3
3
 
4
4
  namespace :course do
@@ -14,13 +14,11 @@ namespace :course do
14
14
  specs = []
15
15
 
16
16
  Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
17
- doc = Hpricot(open(page))
17
+ doc = Nokogiri::HTML(open(page))
18
18
  page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
19
19
 
20
- link_to_spec = lambda { |container,spec_data|
21
- link = container.at('a')
22
-
23
- relative_url = link['href'].to_s
20
+ link_to_spec = lambda { |link,spec_data|
21
+ relative_url = (link.get_attribute('href') || '')
24
22
  absolute_url = page_url.merge(URI.encode(relative_url))
25
23
 
26
24
  if absolute_url.path
@@ -35,15 +33,15 @@ namespace :course do
35
33
  )
36
34
  }
37
35
 
38
- doc.search('.follow[a]') do |follow|
36
+ doc.search('.follow//a').each do |follow|
39
37
  specs << link_to_spec.call(follow, :behavior => :follow)
40
38
  end
41
39
 
42
- doc.search('.nofollow[a]') do |nofollow|
40
+ doc.search('.nofollow//a').each do |nofollow|
43
41
  specs << link_to_spec.call(nofollow, :behavior => :nofollow)
44
42
  end
45
43
 
46
- doc.search('.ignore[a]') do |ignore|
44
+ doc.search('.ignore//a').each do |ignore|
47
45
  specs << link_to_spec.call(ignore, :behavior => :ignore)
48
46
  end
49
47
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidr
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Postmodern
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-01-15 00:00:00 -08:00
12
+ date: 2009-03-22 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: 1.8.2
33
+ version: 1.11.0
34
34
  version:
35
35
  description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
36
36
  email: