RubyGems - spidr - Versions diffs - 0.1.5 → 0.1.6 - Mend

spidr 0.1.5 → 0.1.6

Files changed (9) hide show

data/History.txt +13 -0
data/README.txt +1 -0
data/lib/spidr/agent.rb +108 -34
data/lib/spidr/version.rb +1 -1
data/spec/helpers/course.rb +22 -9
data/static/course/remote/start.html +4 -0
data/static/course/specs.json +1 -1
data/tasks/course.rb +4 -0
metadata +3 -3

data/History.txt CHANGED Viewed

@@ -1,3 +1,16 @@
+=== 0.1.6 / 2009-04-14
+* Added Agent#failures, a list of URLs which could not be visited.
+* Added Agent#failed?.
+* Added Agent#every_failed_url.
+* Added Agent#clear, which clears the history and failures URL lists.
+* Improved fault tolerance in Agent#get_page.
+  * If a Network or HTTP error is encountered, the URL will be added to
+    the failures list and the next URL will be visited.
+* Fixed a typo in Agent#ignore_exts_like.
+* Updated the Web Spider Obstacle Course with links that always fail to be
+  visited.
 === 0.1.5 / 2009-03-22
 * Catch malformed URIs in Page#to_absolute and return +nil+.

data/README.txt CHANGED Viewed

@@ -21,6 +21,7 @@ and easy to use.
   * Every visited Page.
   * Every visited URL.
   * Every visited URL that matches a specified pattern.
+  * Every URL that failed to be visited.
 * Custom User-Agent strings.
 * Custom proxy settings.

data/lib/spidr/agent.rb CHANGED Viewed

@@ -23,7 +23,10 @@ module Spidr
     attr_accessor :delay
     # History containing visited URLs
-    attr_accessor :history
+    attr_reader :history
+    # List of unreachable URLs
+    attr_reader :failures
     #
     # Creates a new Agent object with the given _options_ and _block_.
@@ -70,12 +73,14 @@ module Spidr
       )
       @every_url_blocks = []
+      @every_failed_url_blocks = []
       @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @every_page_blocks = []
       @delay = (options[:delay] || 0)
       @history = []
+      @failures = []
       @queue = []
       if options[:host]
@@ -287,7 +292,7 @@ module Spidr
     # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
     # it will be added to the ignore_exts.
     #
-    def ignore_exts_like(&block)
+    def ignore_exts_like(pattern=nil,&block)
       if pattern
         ignore_exts << pattern
       elsif block
@@ -306,6 +311,15 @@ module Spidr
       return self
     end
+    #
+    # For every URL that the agent is unable to visit, it will be passed
+    # to the specified _block_.
+    #
+    def every_failed_url(&block)
+      @every_failed_url_blocks << block
+      return self
+    end
     #
     # For every URL that the agent visits and matches the specified
     # _pattern_, it will be passed to the specified _block_.
@@ -324,11 +338,21 @@ module Spidr
       return self
     end
+    #
+    # Clears the queue, history and failures of the agent.
+    #
+    def clear
+      @queue.clear
+      @history.clear
+      @failures.clear
+      return self
+    end
     #
     # Clear the history and start spidering at the specified _url_.
     #
     def start_at(url)
-      @history.clear
+      clear
       return run(url)
     end
@@ -366,11 +390,23 @@ module Spidr
     # otherwise.
     #
     def visited?(url)
-      if url.kind_of?(URI)
-        return @history.include?(url)
-      else
-        return @history.include?(URI(url).to_s)
+      unless url.kind_of?(URI)
+        url = URI(url)
       end
+      return @history.include?(url)
+    end
+    #
+    # Returns +true+ if the specified _url_ was unable to be visited,
+    # returns +false+ otherwise.
+    #
+    def failed?(url)
+      unless url.kind_of?(URI)
+        url = URI(url)
+      end
+      return @failures.include?(url)
     end
     #
@@ -392,16 +428,21 @@ module Spidr
       proxy_user = @proxy[:user]
       proxy_password = @proxy[:password]
-      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
-        headers = {}
+      begin
+        Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+          headers = {}
-        headers['User-Agent'] = @user_agent if @user_agent
-        headers['Referer'] = @referer if @referer
+          headers['User-Agent'] = @user_agent if @user_agent
+          headers['Referer'] = @referer if @referer
-        new_page = Page.new(url,sess.get(path,headers))
+          new_page = Page.new(url,sess.get(path,headers))
-        block.call(new_page) if block
-        return new_page
+          block.call(new_page) if block
+          return new_page
+        end
+      rescue SystemCallError, Net::HTTPBadResponse
+        failed(url)
+        return nil
       end
     end
@@ -447,6 +488,50 @@ module Spidr
       @queue.shift
     end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # its scheme, returns +false+ otherwise.
+    #
+    def visit_scheme?(url)
+      if url.scheme
+        return SCHEMES.include?(url.scheme)
+      else
+        return true
+      end
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the host of the _url_, returns +false+ otherwise.
+    #
+    def visit_host?(url)
+      @host_rules.accept?(url.host)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the port of the _url_, returns +false+ otherwise.
+    #
+    def visit_port?(url)
+      @port_rules.accept?(url.port)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the pattern of the _url_, returns +false+ otherwise.
+    #
+    def visit_link?(url)
+      @link_rules.accept?(url.to_s)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the file extension of the _url_, returns +false+ otherwise.
+    #
+    def visit_ext?(url)
+      @ext_rules.accept?(File.extname(url.path)[1..-1])
+    end
     #
     # Returns +true+ if the specified URL should be visited, returns
     # +false+ otherwise.
@@ -477,28 +562,17 @@ module Spidr
       end
     end
-    def visit_scheme?(url)
-      if url.scheme
-        return SCHEMES.include?(url.scheme)
-      else
-        return true
+    #
+    # Adds the specified _url_ to the failures list.
+    #
+    def failed(url)
+      unless url.kind_of?(URI)
+        url = URI(url.to_s)
       end
-    end
-    def visit_host?(url)
-      @host_rules.accept?(url.host)
-    end
-    def visit_port?(url)
-      @port_rules.accept?(url.port)
-    end
-    def visit_link?(url)
-      @link_rules.accept?(url.to_s)
-    end
-    def visit_ext?(url)
-      @ext_rules.accept?(File.extname(url.path)[1..-1])
+      @every_failed_url_blocks.each { |block| block.call(url) }
+      @failures << url
+      return true
     end
   end

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Spidr
-  VERSION = '0.1.5'
+  VERSION = '0.1.6'
 end

data/spec/helpers/course.rb CHANGED Viewed

@@ -15,18 +15,25 @@ module Helpers
           message = spec['message'].to_s.dump
           url = spec['url'].to_s.dump
-          if spec['behavior'] == 'follow'
+          case spec['behavior']
+          when 'follow'
             base.module_eval %{
               it #{message} do
                 should_visit_link(#{url})
               end
             }
-          elsif spec['behavior'] == 'nofollow'
+          when 'nofollow'
             base.module_eval %{
               it #{message} do
                 should_visit_once(#{url})
               end
             }
+          when 'fail'
+            base.module_eval %{
+              it #{message} do
+                should_fail_link(#{url})
+              end
+            }
           else
             link = spec['link'].to_s.dump
@@ -42,7 +49,10 @@ module Helpers
     end
     def run_course
-      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
+      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
+        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
+        agent.every_url { |url| puts url }
+      end
     end
     def visited_once?(link)
@@ -58,13 +68,11 @@ module Helpers
     # +false+ otherwise.
     #
     def visited_link?(link)
-      url = COURSE_URL.merge(URI.encode(link))
-      @agent.visited_urls.each do |visited_url|
-        return true if visited_url == url
-      end
+      @agent.visited?(COURSE_URL.merge(URI.encode(link)))
+    end
-      return false
+    def visit_failed?(link)
+      @agent.failed?(COURSE_URL.merge(URI.encode(link)))
     end
     def should_visit_link(link)
@@ -78,5 +86,10 @@ module Helpers
     def should_visit_once(link)
       visited_once?(link).should == true
     end
+    def should_fail_link(link)
+      visited_link?(link).should == false
+      visit_failed?(link).should == true
+    end
   end
 end

data/static/course/remote/start.html CHANGED Viewed

@@ -18,6 +18,10 @@
       <li class="follow">
         <a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
       </li>
+      <li class="fail">
+        <a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
+      </li>
     </ul>
   </body>
 </html>

data/static/course/specs.json CHANGED Viewed

	@@ -1 +1 @@
1	- [{"~~behavior":"ignore","~~link":"~~javascript:fail();~~","~~url~~":"~~javascript:fail();~~","message":"should ~~ignore~~ links ~~beginning~~ ~~with~~ \"~~javascript:\~~"","example":"<a href=\"~~javascript:fail();\~~">should ~~ignore~~ links ~~beginning~~ ~~with~~ ~~\"javascript:\"~~<\/a>"},{"~~behavior":"ignore","~~link":"#","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/javascript\/%23~~","message":"should ~~ignore~~ ~~links~~ ~~with~~ an ~~onclick attribute and a href pointing~~ to the page.","example":"<a href=\"#\" ~~onclick=\"fail();\"~~>should ~~ignore~~ ~~links~~ ~~with~~ an ~~onclick attribute and a href pointing~~ to the page~~.<\/~~a>"},{"behavior":"~~follow~~","~~link~~":"~~next.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop~~\/~~next~~.html","message":"should follow links ~~pointing~~ to ~~other~~ ~~pages~~","example":"<a href=\"~~next.html~~\">should follow links ~~pointing~~ to ~~other~~ ~~pages~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"~~start.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/start.html~~","~~message~~":"should ~~not~~ ~~follow~~ links ~~pointing~~ to ~~the~~ ~~current~~ ~~page~~","example":"<a href=\"~~start.html\~~">should ~~not~~ ~~follow~~ links ~~pointing~~ to ~~the current page~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"~~start~~.~~html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/~~start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","~~link~~":"~~normal.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~relative~~\/~~normal~~.html","~~message":"should follow relative links","~~example":"<a href=\"~~normal~~.html\">should follow ~~relative~~ ~~links~~<\/a>"},{"~~behavior":"follow","~~link":"~~.\/current_directory~~.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html~~","message":"should follow 
~~relative~~ links to ~~files in~~ the current ~~directory~~","example":"<a href=\"~~.\/current_directory~~.html\">should follow ~~relative~~ links to ~~files in~~ the current ~~directory~~<\/a>"},{"behavior":"follow","~~link~~":"~~..\/~~relative~~\/same_directory.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/~~same_directory~~.html","~~message":"should follow links that transverse directories","~~example":"<a href=\"~~..\/relative\/same_directory~~.html\">should follow links ~~that transverse directories~~<\/a>"},{"~~behavior":"ignore","~~link":"#","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/relative\/%23~~","message":"should ~~ignore~~ ~~in-page~~ links~~","example":"<a~~ ~~href=\"#\">should~~ ~~ignore~~ in~~-page~~ ~~links<\/a>~~"},{"~~behavior":"nofollow","link":"","~~url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/~~start~~.html","~~message~~":"~~should~~ ~~not follow links with no~~ href ~~attributes~~","~~example":"<a~~>should ~~not~~ follow links ~~with~~ no ~~href~~ ~~attributes~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"","url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/~~start~~.html","~~message":"should not follow links with empty href attributes","~~example":"<a href=\"\">should ~~not~~ follow links ~~with~~ ~~empty~~ ~~href attributes~~<\/a>"},{"behavior":"ignore","~~link~~":" ","url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/%20","~~message":"should ignore links with blank href attributes","~~example":"<a href=\"\">should ignore links ~~with blank href attributes~~<\/a>"},{"~~behavior":"follow","~~link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/next.html~~","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited 
pages<\/a>"},{"~~behavior":"nofollow","~~link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/start.html~~","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"~~behavior":"nofollow","~~link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/start.html~~","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"~~behavior":"follow","~~link":"~~\/course\/absolute\/next.html","url":"~~http:\/\/spidr.rubyforge.org\/~~course~~\/~~absolute\/next.html~~","~~message~~":"~~should follow absolute links to unvisited pages~~","~~example~~":"~~<a href=\"\/course\/absolute\/next.html\">~~should ~~follow~~ ~~absolute~~ links to ~~unvisited pages<\/a>~~"},{"~~behavior":"nofollow","link":"\/course\/absolute\/start.html","~~url":"http:\/\/spidr.rubyforge.org\/~~course\/absolute\/start.html~~","~~message":"should not follow absolute links to the current page","~~example":"<a href=\"\/~~course\/absolute\/start.html\~~">should ~~not~~ ~~follow absolute~~ links to ~~the current page~~<\/a>"}]
1	+ [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]

data/tasks/course.rb CHANGED Viewed

@@ -44,6 +44,10 @@ namespace :course do
         doc.search('.ignore//a').each do |ignore|
           specs << link_to_spec.call(ignore, :behavior => :ignore)
         end
+        doc.search('.fail//a').each do |ignore|
+          specs << link_to_spec.call(ignore, :behavior => :fail)
+        end
       end
       spec.write(specs.to_json)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Postmodern
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-03-22 00:00:00 -07:00
+date: 2009-04-14 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.11.0
+        version: 1.12.1
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email: