RubyGems - spidr - Versions diffs - 0.1.4 → 0.1.5 - Mend

spidr 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,8 @@
+=== 0.1.5 / 2009-03-22
+* Catch malformed URIs in Page#to_absolute and return +nil+.
+* Filter out +nil+ URIs in Page#urls.
 === 0.1.4 / 2009-01-15
 * Use Nokogiri for HTML and XML parsing.

data/README.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 = Spidr
 * http://spidr.rubyforge.org/
+* http://github.com/postmodern/spidr/
 * Postmodern (postmodern.mod3 at gmail.com)
 == DESCRIPTION:
@@ -25,7 +26,7 @@ and easy to use.
 == REQUIREMENTS:
-* nokogiri
+* {nokogiri}[http://nokogiri.rubyforge.org/]
 == INSTALL:

data/lib/spidr/page.rb CHANGED Viewed

@@ -192,10 +192,14 @@ module Spidr
     # returned.
     #
     def doc
-      if html?
-        return @doc ||= Nokogiri::HTML(body)
-      elsif xml?
-        return @doc ||= Nokogiri::XML(body)
+      begin
+        if html?
+          return @doc ||= Nokogiri::HTML(body)
+        elsif xml?
+          return @doc ||= Nokogiri::XML(body)
+        end
+      rescue
+        return nil
       end
     end
@@ -205,7 +209,7 @@ module Spidr
     def links
       urls = []
-      if html?
+      if (html? && self.doc)
         self.doc.search('a[@href]').each do |a|
           url = a.get_attribute('href')
@@ -220,7 +224,7 @@ module Spidr
     # Returns all links from the HTML page as absolute URLs.
     #
     def urls
-      links.map { |link| to_absolute(link) }
+      links.map { |link| to_absolute(link) }.compact
     end
     protected
@@ -233,20 +237,24 @@ module Spidr
       # clean the link
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
-      relative = URI(link)
-      absolute = @url.merge(relative)
-      if absolute.path
-        if absolute.path.empty?
-          # default the absolute path to '/'
-          absolute.path = '/'
-        else
-          # make sure the path does not contain any .. or . directories.
-          absolute.path = File.expand_path(absolute.path)
+      begin
+        relative = URI(link)
+        absolute = @url.merge(relative)
+        if absolute.path
+          if absolute.path.empty?
+            # default the absolute path to '/'
+            absolute.path = '/'
+          else
+            # make sure the path does not contain any .. or . directories.
+            absolute.path = File.expand_path(absolute.path)
+          end
         end
-      end
-      return absolute
+        return absolute
+      rescue URI::InvalidURIError => e
+        return nil
+      end
     end
     #

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Spidr
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end

data/static/course/specs.json CHANGED Viewed

	@@ -1 +1 @@
1	- [{"~~url~~":"javascript:fail();","~~link~~":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>","~~message~~":"~~should~~ ignore ~~links beginning with \~~"~~javascript:\""~~,"~~behavior~~":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","~~link~~":"~~#","example":"<a href=\"#\" onclick=\"fail();\">~~should ignore links with an onclick attribute and a href pointing to the page~~.<\/a>~~","~~message~~":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","~~link~~":"~~next.html~~","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>","~~message~~":"~~should follow links pointing to other pages~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","~~link~~":"~~start.html~~","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>","~~message~~":"~~should not follow links pointing to the current page~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","~~link~~":"~~start.html~~","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>","~~message~~":"~~should not~~ follow ~~links to previously visited pages~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","~~link~~":"~~normal.html~~","example":"<a href=\"normal.html\">should follow relative links<\/a>","~~message~~":"~~should~~ follow ~~relative links~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","~~link~~":"~~.\/current_directory.html~~","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>","~~message~~":"~~should~~ follow ~~relative links 
to files in the current directory~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","~~link~~":"~~..\/relative\/same_directory.html~~","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>","~~message~~":"~~should follow links that transverse directories~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","~~link~~":"#","example":"<a href=\"#\">should ignore in-page links<\/a>","~~message~~":"~~should ignore in-page links~~","~~behavior~~":"~~ignore~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","~~link~~":"~~","example":"<a>~~should not follow links with no href attributes~~<\/a>~~","~~message~~":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","~~link~~":"","example":"<a href=\"\">should not follow links with empty href attributes<\/a>","~~message~~":"~~should not follow links with empty href attributes~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","~~link~~":" ","example":"<a href=\" \">should ignore links with blank href attributes<\/a>","~~message":"should ignore links with blank href attributes","~~behavior":"~~ignore~~"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>","~~message":"should follow remote links to unvisited pages","~~behavior":"~~follow~~"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>","~~message":"should not follow remote links to the same page","~~behavior":"nofollow"},{"~~url~~":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~link~~":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/..\/~~remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>","~~message~~":"~~should not~~ follow ~~remote links with a relative path to the same page~~","~~behavior~~":"~~nofollow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","~~link~~":"~~\/course\/~~absolute~~\/next.html~~","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>","~~message~~":"~~should follow absolute links to unvisited pages~~","~~behavior~~":"~~follow~~"},{"url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","~~link~~":"~~\/course\/~~absolute~~\/start.html~~","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"~~,"message":"should not follow absolute links to the current page","behavior":"nofollow"~~}]
1	+ [{"behavior":"ignore","link":"javascript:fail();","url":"javascript:fail();","message":"should ignore links beginning with \"javascript:\"","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","message":"should ignore links with an onclick attribute and a href pointing to the page.","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"behavior":"follow","link":"next.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","message":"should follow links pointing to other pages","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links pointing to the current page","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"behavior":"nofollow","link":"start.html","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","link":"normal.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","message":"should follow relative links","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"behavior":"follow","link":".\/current_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","message":"should follow relative links to files in the current directory","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current 
directory<\/a>"},{"behavior":"follow","link":"..\/relative\/same_directory.html","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","message":"should follow links that transverse directories","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"behavior":"ignore","link":"#","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","message":"should ignore in-page links","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with no href attributes","example":"<a>should not follow links with no href attributes<\/a>"},{"behavior":"nofollow","link":"","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","message":"should not follow links with empty href attributes","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"behavior":"ignore","link":" ","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","message":"should ignore links with blank href attributes","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"behavior":"follow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same 
page<\/a>"},{"behavior":"nofollow","link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"behavior":"follow","link":"\/course\/absolute\/next.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","message":"should follow absolute links to unvisited pages","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"behavior":"nofollow","link":"\/course\/absolute\/start.html","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","message":"should not follow absolute links to the current page","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"}]

data/tasks/course.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-require 'hpricot'
+require 'nokogiri'
 require 'json'
 namespace :course do
@@ -14,13 +14,11 @@ namespace :course do
       specs = []
       Dir[File.join(COURSE_DIR,'**','*.html')].each do |page|
-        doc = Hpricot(open(page))
+        doc = Nokogiri::HTML(open(page))
         page_url = COURSE_URL.merge(page.sub(STATIC_DIR,''))
-        link_to_spec = lambda { |container,spec_data|
-          link = container.at('a')
-          relative_url = link['href'].to_s
+        link_to_spec = lambda { |link,spec_data|
+          relative_url = (link.get_attribute('href') || '')
           absolute_url = page_url.merge(URI.encode(relative_url))
           if absolute_url.path
@@ -35,15 +33,15 @@ namespace :course do
           )
         }
-        doc.search('.follow[a]') do |follow|
+        doc.search('.follow//a').each do |follow|
           specs << link_to_spec.call(follow, :behavior => :follow)
         end
-        doc.search('.nofollow[a]') do |nofollow|
+        doc.search('.nofollow//a').each do |nofollow|
           specs << link_to_spec.call(nofollow, :behavior => :nofollow)
         end
-        doc.search('.ignore[a]') do |ignore|
+        doc.search('.ignore//a').each do |ignore|
           specs << link_to_spec.call(ignore, :behavior => :ignore)
         end
       end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Postmodern
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-01-15 00:00:00 -08:00
+date: 2009-03-22 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.8.2
+        version: 1.11.0
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email: