RubyGems - spidr - Versions diffs - 0.1.4 → 0.1.5 - Mend

spidr 0.1.4 → 0.1.5

Files changed (7) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,8 @@
+=== 0.1.5 / 2009-03-22
+* Catch malformed URIs in Page#to_absolute and return +nil+.
+* Filter out +nil+ URIs in Page#urls.
 === 0.1.4 / 2009-01-15
 * Use Nokogiri for HTML and XML parsing.

data/README.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 = Spidr
 * http://spidr.rubyforge.org/
+* http://github.com/postmodern/spidr/
 * Postmodern (postmodern.mod3 at gmail.com)
 == DESCRIPTION:
@@ -25,7 +26,7 @@ and easy to use.
 == REQUIREMENTS:
-* nokogiri
+* {nokogiri}[http://nokogiri.rubyforge.org/]
 == INSTALL:

data/lib/spidr/page.rb CHANGED Viewed

@@ -192,10 +192,14 @@ module Spidr
     # returned.
     #
     def doc
-      if html?
-        return @doc ||= Nokogiri::HTML(body)
-      elsif xml?
-        return @doc ||= Nokogiri::XML(body)
+      begin
+        if html?
+          return @doc ||= Nokogiri::HTML(body)
+        elsif xml?
+          return @doc ||= Nokogiri::XML(body)
+        end
+      rescue
+        return nil
       end
     end
@@ -205,7 +209,7 @@ module Spidr
     def links
       urls = []
-      if html?
+      if (html? && self.doc)
         self.doc.search('a[@href]').each do |a|
           url = a.get_attribute('href')
@@ -220,7 +224,7 @@ module Spidr
     # Returns all links from the HTML page as absolute URLs.
     #
     def urls
-      links.map { |link| to_absolute(link) }
+      links.map { |link| to_absolute(link) }.compact
     end
     protected
@@ -233,20 +237,24 @@ module Spidr
       # clean the link
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
-      relative = URI(link)
-      absolute = @url.merge(relative)
-      if absolute.path
-        if absolute.path.empty?
-          # default the absolute path to '/'
-          absolute.path = '/'
-        else
-          # make sure the path does not contain any .. or . directories.
-          absolute.path = File.expand_path(absolute.path)
+      begin
+        relative = URI(link)
+        absolute = @url.merge(relative)
+        if absolute.path
+          if absolute.path.empty?
+            # default the absolute path to '/'
+            absolute.path = '/'
+          else
+            # make sure the path does not contain any .. or . directories.
+            absolute.path = File.expand_path(absolute.path)
+          end
         end
-      end
-      return absolute
+        return absolute
+      rescue URI::InvalidURIError => e
+        return nil
+      end
     end
     #

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Spidr
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end

data/static/course/specs.json CHANGED Viewed

	@@ -1 +1 @@
1	- [{"~~url~~":"javascript:fail();","~~link~~":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","~~message~~":"~~should~~ ignore ~~links beginning with \~~"~~javascript:\""~~,"~~behavior~~":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","~~link~~":"~~#","example":"<a href=\"#\" onclick=\"fail();\">~~should ignore links with an onclick attribute and a href pointing to the page~~.<\/a>~~","~~message~~":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","~~link~~":"~~next.html~~","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","~~message~~":"~~should follow links pointing to other pages~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","~~link~~":"~~start.html~~","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","~~message~~":"~~should not follow links pointing to the current page~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","~~link~~":"~~start.html~~","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","~~message~~":"~~should not~~ follow ~~links to previously visited pages~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","~~link~~":"~~normal.html~~","example":"<a href=\"normal.html\">should follow relative links<\/a>","~~message~~":"~~should~~ follow ~~relative links~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","~~link~~":"~~.\/current_directory.html~~","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","~~message~~":"~~should~~ follow ~~relative links 
to files in the current directory~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","~~link~~":"~~..\/relative\/same_directory.html~~","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","~~message~~":"~~should follow links that transverse directories~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","~~link~~":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","~~message~~":"~~should ignore in-page links~~","~~behavior~~":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","~~link~~":"~~","example":"<a>~~should not follow links with no href attributes~~<\/a>~~","~~message~~":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","~~link~~":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","~~message~~":"~~should not follow links with empty href attributes~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","~~link~~":" ","example":"<a href=\" \">should ignore links with blank href attributes<\/a>","~~message":"should ignore links with blank href attributes","~~behavior":"~~ignore~~"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","~~message":"should follow remote links to unvisited pages","~~behavior":"~~follow~~"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>","~~message":"should not follow remote links to the same page","~~behavior":"nofollow"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/..\/~~remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","~~message~~":"~~should not~~ follow ~~remote links with a relative path to the same page~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","~~link~~":"~~\/course\/~~absolute~~\/next.html~~","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","~~message~~":"~~should follow absolute links to unvisited pages~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","~~link~~":"~~\/course\/~~absolute~~\/start.html~~","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"~~,"message":"should not follow absolute links to the current page","behavior":"nofollow"~~}]
1	+ [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]

data/tasks/course.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-require 'hpricot'
+require 'nokogiri'
 require 'json'
 namespace :course do
@@ -14,13 +14,11 @@ namespace :course do
       specs = []
       Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
-        doc = Hpricot(open(page))
+        doc = Nokogiri::HTML(open(page))
         page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
-        link_to_spec = lambda { |container,spec_data|
-          link = container.at('a')
-          relative_url = link['href'].to_s
+        link_to_spec = lambda { |link,spec_data|
+          relative_url = (link.get_attribute('href') || '')
           absolute_url = page_url.merge(URI.encode(relative_url))
           if absolute_url.path
@@ -35,15 +33,15 @@ namespace :course do
           )
         }
-        doc.search('.follow[a]') do |follow|
+        doc.search('.follow//a').each do |follow|
           specs << link_to_spec.call(follow, :behavior => :follow)
         end
-        doc.search('.nofollow[a]') do |nofollow|
+        doc.search('.nofollow//a').each do |nofollow|
           specs << link_to_spec.call(nofollow, :behavior => :nofollow)
         end
-        doc.search('.ignore[a]') do |ignore|
+        doc.search('.ignore//a').each do |ignore|
           specs << link_to_spec.call(ignore, :behavior => :ignore)
         end
       end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Postmodern
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-01-15 00:00:00 -08:00
+date: 2009-03-22 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.8.2
+        version: 1.11.0
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email: