RubyGems - spidr - Versions diffs - 0.1.5 → 0.1.6 - Mend

spidr 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/History.txt +13 -0
data/README.txt +1 -0
data/lib/spidr/agent.rb +108 -34
data/lib/spidr/version.rb +1 -1
data/spec/helpers/course.rb +22 -9
data/static/course/remote/start.html +4 -0
data/static/course/specs.json +1 -1
data/tasks/course.rb +4 -0
metadata +3 -3

data/History.txt CHANGED Viewed

@@ -1,3 +1,16 @@
+=== 0.1.6 / 2009-04-14
+* Added Agent#failures, a list of URLs which could not be visited.
+* Added Agent#failed?.
+* Added Agent#every_failed_url.
+* Added Agent#clear, which clears the history and failures URL lists.
+* Improved fault tolerance in Agent#get_page.
+  * If a Network or HTTP error is encountered, the URL will be added to
+    the failures list and the next URL will be visited.
+* Fixed a typo in Agent#ignore_exts_like.
+* Updated the Web Spider Obstacle Course with links that always fail to be
+  visited.
 === 0.1.5 / 2009-03-22
 * Catch malformed URIs in Page#to_absolute and return +nil+.

data/README.txt CHANGED Viewed

@@ -21,6 +21,7 @@ and easy to use.
   * Every visited Page.
   * Every visited URL.
   * Every visited URL that matches a specified pattern.
+  * Every URL that failed to be visited.
 * Custom User-Agent strings.
 * Custom proxy settings.

data/lib/spidr/agent.rb CHANGED Viewed

@@ -23,7 +23,10 @@ module Spidr
     attr_accessor :delay
     # History containing visited URLs
-    attr_accessor :history
+    attr_reader :history
+    # List of unreachable URLs
+    attr_reader :failures
     #
     # Creates a new Agent object with the given _options_ and _block_.
@@ -70,12 +73,14 @@ module Spidr
       )
       @every_url_blocks = []
+      @every_failed_url_blocks = []
       @urls_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @every_page_blocks = []
       @delay = (options[:delay] || 0)
       @history = []
+      @failures = []
       @queue = []
       if options[:host]
@@ -287,7 +292,7 @@ module Spidr
     # Adds the given _pattern_ to the ignore_exts. If a _block_ is given,
     # it will be added to the ignore_exts.
     #
-    def ignore_exts_like(&block)
+    def ignore_exts_like(pattern=nil,&block)
       if pattern
         ignore_exts << pattern
       elsif block
@@ -306,6 +311,15 @@ module Spidr
       return self
     end
+    #
+    # For every URL that the agent is unable to visit, it will be passed
+    # to the specified _block_.
+    #
+    def every_failed_url(&block)
+      @every_failed_url_blocks << block
+      return self
+    end
     #
     # For every URL that the agent visits and matches the specified
     # _pattern_, it will be passed to the specified _block_.
@@ -324,11 +338,21 @@ module Spidr
       return self
     end
+    #
+    # Clears the queue, history and failures of the agent.
+    #
+    def clear
+      @queue.clear
+      @history.clear
+      @failures.clear
+      return self
+    end
     #
     # Clear the history and start spidering at the specified _url_.
     #
     def start_at(url)
-      @history.clear
+      clear
       return run(url)
     end
@@ -366,11 +390,23 @@ module Spidr
     # otherwise.
     #
     def visited?(url)
-      if url.kind_of?(URI)
-        return @history.include?(url)
-      else
-        return @history.include?(URI(url).to_s)
+      unless url.kind_of?(URI)
+        url = URI(url)
       end
+      return @history.include?(url)
+    end
+    #
+    # Returns +true+ if the specified _url_ was unable to be visited,
+    # returns +false+ otherwise.
+    #
+    def failed?(url)
+      unless url.kind_of?(URI)
+        url = URI(url)
+      end
+      return @failures.include?(url)
     end
     #
@@ -392,16 +428,21 @@ module Spidr
       proxy_user = @proxy[:user]
       proxy_password = @proxy[:password]
-      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
-        headers = {}
+      begin
+        Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+          headers = {}
-        headers['User-Agent'] = @user_agent if @user_agent
-        headers['Referer'] = @referer if @referer
+          headers['User-Agent'] = @user_agent if @user_agent
+          headers['Referer'] = @referer if @referer
-        new_page = Page.new(url,sess.get(path,headers))
+          new_page = Page.new(url,sess.get(path,headers))
-        block.call(new_page) if block
-        return new_page
+          block.call(new_page) if block
+          return new_page
+        end
+      rescue SystemCallError, Net::HTTPBadResponse
+        failed(url)
+        return nil
       end
     end
@@ -447,6 +488,50 @@ module Spidr
       @queue.shift
     end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # its scheme, returns +false+ otherwise.
+    #
+    def visit_scheme?(url)
+      if url.scheme
+        return SCHEMES.include?(url.scheme)
+      else
+        return true
+      end
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the host of the _url_, returns +false+ otherwise.
+    #
+    def visit_host?(url)
+      @host_rules.accept?(url.host)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the port of the _url_, returns +false+ otherwise.
+    #
+    def visit_port?(url)
+      @port_rules.accept?(url.port)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the pattern of the _url_, returns +false+ otherwise.
+    #
+    def visit_link?(url)
+      @link_rules.accept?(url.to_s)
+    end
+    #
+    # Returns +true+ if the specified _url_ should be visited, based on
+    # the file extension of the _url_, returns +false+ otherwise.
+    #
+    def visit_ext?(url)
+      @ext_rules.accept?(File.extname(url.path)[1..-1])
+    end
     #
     # Returns +true+ if the specified URL should be visited, returns
     # +false+ otherwise.
@@ -477,28 +562,17 @@ module Spidr
       end
     end
-    def visit_scheme?(url)
-      if url.scheme
-        return SCHEMES.include?(url.scheme)
-      else
-        return true
+    #
+    # Adds the specified _url_ to the failures list.
+    #
+    def failed(url)
+      unless url.kind_of?(URI)
+        url = URI(url.to_s)
       end
-    end
-    def visit_host?(url)
-      @host_rules.accept?(url.host)
-    end
-    def visit_port?(url)
-      @port_rules.accept?(url.port)
-    end
-    def visit_link?(url)
-      @link_rules.accept?(url.to_s)
-    end
-    def visit_ext?(url)
-      @ext_rules.accept?(File.extname(url.path)[1..-1])
+      @every_failed_url_blocks.each { |block| block.call(url) }
+      @failures << url
+      return true
     end
   end

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Spidr
-  VERSION = '0.1.5'
+  VERSION = '0.1.6'
 end

data/spec/helpers/course.rb CHANGED Viewed

@@ -15,18 +15,25 @@ module Helpers
           message = spec['message'].to_s.dump
           url = spec['url'].to_s.dump
-          if spec['behavior'] == 'follow'
+          case spec['behavior']
+          when 'follow'
             base.module_eval %{
               it #{message} do
                 should_visit_link(#{url})
               end
             }
-          elsif spec['behavior'] == 'nofollow'
+          when 'nofollow'
             base.module_eval %{
               it #{message} do
                 should_visit_once(#{url})
               end
             }
+          when 'fail'
+            base.module_eval %{
+              it #{message} do
+                should_fail_link(#{url})
+              end
+            }
           else
             link = spec['link'].to_s.dump
@@ -42,7 +49,10 @@ module Helpers
     end
     def run_course
-      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host])
+      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
+        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
+        agent.every_url { |url| puts url }
+      end
     end
     def visited_once?(link)
@@ -58,13 +68,11 @@ module Helpers
     # +false+ otherwise.
     #
     def visited_link?(link)
-      url = COURSE_URL.merge(URI.encode(link))
-      @agent.visited_urls.each do |visited_url|
-        return true if visited_url == url
-      end
+      @agent.visited?(COURSE_URL.merge(URI.encode(link)))
+    end
-      return false
+    def visit_failed?(link)
+      @agent.failed?(COURSE_URL.merge(URI.encode(link)))
     end
     def should_visit_link(link)
@@ -78,5 +86,10 @@ module Helpers
     def should_visit_once(link)
       visited_once?(link).should == true
     end
+    def should_fail_link(link)
+      visited_link?(link).should == false
+      visit_failed?(link).should == true
+    end
   end
 end

data/static/course/remote/start.html CHANGED Viewed

@@ -18,6 +18,10 @@
       <li class="follow">
         <a href="http://spidr.rubyforge.org/course/remote/next.html">should follow remote links to unvisited pages</a>
       </li>
+      <li class="fail">
+        <a href="http://spidr.rubyforge.org:1337/path/">should ignore links that fail</a>
+      </li>
     </ul>
   </body>
 </html>

data/static/course/specs.json CHANGED Viewed

	@@ -1 +1 @@
1	- [{"~~behavior":"ignore","~~link":"~~javascript:fail();~~","~~url~~":"~~javascript:fail();~~","message":"should ~~ignore~~ links ~~beginning~~ ~~with~~ \"~~javascript:\~~"","example":"<a href=\"~~javascript:fail();\~~">should ~~ignore~~ links ~~beginning~~ ~~with~~ ~~\"javascript:\"~~<\/a>"},{"~~behavior":"ignore","~~link":"#","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/javascript\/%23~~","message":"should ~~ignore~~ ~~links~~ ~~with~~ an ~~onclick attribute and a href pointing~~ to the page.","example":"<a href=\"#\" ~~onclick=\"fail();\"~~>should ~~ignore~~ ~~links~~ ~~with~~ an ~~onclick attribute and a href pointing~~ to the page~~.<\/~~a>"},{"behavior":"~~follow~~","~~link~~":"~~next.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop~~\/~~next~~.html","message":"should follow links ~~pointing~~ to ~~other~~ ~~pages~~","example":"<a href=\"~~next.html~~\">should follow links ~~pointing~~ to ~~other~~ ~~pages~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"~~start.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/start.html~~","~~message~~":"should ~~not~~ ~~follow~~ links ~~pointing~~ to ~~the~~ ~~current~~ ~~page~~","example":"<a href=\"~~start.html\~~">should ~~not~~ ~~follow~~ links ~~pointing~~ to ~~the current page~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"~~start~~.~~html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~loop\/~~start.html","message":"should not follow links to previously visited pages","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"behavior":"follow","~~link~~":"~~normal.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/~~relative~~\/~~normal~~.html","~~message":"should follow relative links","~~example":"<a href=\"~~normal~~.html\">should follow ~~relative~~ ~~links~~<\/a>"},{"~~behavior":"follow","~~link":"~~.\/current_directory~~.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html~~","message":"should follow 
~~relative~~ links to ~~files in~~ the current ~~directory~~","example":"<a href=\"~~.\/current_directory~~.html\">should follow ~~relative~~ links to ~~files in~~ the current ~~directory~~<\/a>"},{"behavior":"follow","~~link~~":"~~..\/~~relative~~\/same_directory.html~~","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/~~same_directory~~.html","~~message":"should follow links that transverse directories","~~example":"<a href=\"~~..\/relative\/same_directory~~.html\">should follow links ~~that transverse directories~~<\/a>"},{"~~behavior":"ignore","~~link":"#","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/relative\/%23~~","message":"should ~~ignore~~ ~~in-page~~ links~~","example":"<a~~ ~~href=\"#\">should~~ ~~ignore~~ in~~-page~~ ~~links<\/a>~~"},{"~~behavior":"nofollow","link":"","~~url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/~~start~~.html","~~message~~":"~~should~~ ~~not follow links with no~~ href ~~attributes~~","~~example":"<a~~>should ~~not~~ follow links ~~with~~ no ~~href~~ ~~attributes~~<\/a>"},{"behavior":"~~nofollow~~","~~link~~":"","url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/~~start~~.html","~~message":"should not follow links with empty href attributes","~~example":"<a href=\"\">should ~~not~~ follow links ~~with~~ ~~empty~~ ~~href attributes~~<\/a>"},{"behavior":"ignore","~~link~~":" ","url":"http:\/\/spidr.rubyforge.org\/course\/~~empty~~\/%20","~~message":"should ignore links with blank href attributes","~~example":"<a href=\"\">should ignore links ~~with blank href attributes~~<\/a>"},{"~~behavior":"follow","~~link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/next.html~~","message":"should follow remote links to unvisited pages","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited 
pages<\/a>"},{"~~behavior":"nofollow","~~link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/start.html~~","message":"should not follow remote links to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"~~behavior":"nofollow","~~link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/remote\/start.html~~","message":"should not follow remote links with a relative path to the same page","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"~~behavior":"follow","~~link":"~~\/course\/absolute\/next.html","url":"~~http:\/\/spidr.rubyforge.org\/~~course~~\/~~absolute\/next.html~~","~~message~~":"~~should follow absolute links to unvisited pages~~","~~example~~":"~~<a href=\"\/course\/absolute\/next.html\">~~should ~~follow~~ ~~absolute~~ links to ~~unvisited pages<\/a>~~"},{"~~behavior":"nofollow","link":"\/course\/absolute\/start.html","~~url":"http:\/\/spidr.rubyforge.org\/~~course\/absolute\/start.html~~","~~message":"should not follow absolute links to the current page","~~example":"<a href=\"\/~~course\/absolute\/start.html\~~">should ~~not~~ ~~follow absolute~~ links to ~~the current page~~<\/a>"}]
1	+ [{"link":"\/course\/absolute\/next.html","behavior":"follow","message":"should follow absolute links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages<\/a>"},{"link":"\/course\/absolute\/start.html","behavior":"nofollow","message":"should not follow absolute links to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with no href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a>should not follow links with no href attributes<\/a>"},{"link":"","behavior":"nofollow","message":"should not follow links with empty href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/start.html","example":"<a href=\"\">should not follow links with empty href attributes<\/a>"},{"link":" ","behavior":"ignore","message":"should ignore links with blank href attributes","url":"http:\/\/spidr.rubyforge.org\/course\/empty\/%20","example":"<a href=\"\">should ignore links with blank href attributes<\/a>"},{"link":"javascript:fail();","behavior":"ignore","message":"should ignore links beginning with \"javascript:\"","url":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http:\/\/spidr.rubyforge.org\/course\/javascript\/%23","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links to previously visited 
pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages<\/a>"},{"link":"next.html","behavior":"follow","message":"should follow links pointing to other pages","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/next.html","example":"<a href=\"next.html\">should follow links pointing to other pages<\/a>"},{"link":"start.html","behavior":"nofollow","message":"should not follow links pointing to the current page","url":"http:\/\/spidr.rubyforge.org\/course\/loop\/start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page<\/a>"},{"link":"normal.html","behavior":"follow","message":"should follow relative links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/normal.html","example":"<a href=\"normal.html\">should follow relative links<\/a>"},{"link":".\/current_directory.html","behavior":"follow","message":"should follow relative links to files in the current directory","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/current_directory.html","example":"<a href=\".\/current_directory.html\">should follow relative links to files in the current directory<\/a>"},{"link":"..\/relative\/same_directory.html","behavior":"follow","message":"should follow links that transverse directories","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/same_directory.html","example":"<a href=\"..\/relative\/same_directory.html\">should follow links that transverse directories<\/a>"},{"link":"#","behavior":"ignore","message":"should ignore in-page links","url":"http:\/\/spidr.rubyforge.org\/course\/relative\/%23","example":"<a href=\"#\">should ignore in-page links<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","behavior":"follow","message":"should follow remote links to unvisited pages","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html","example":"<a 
href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html","behavior":"nofollow","message":"should not follow remote links with a relative path to the same page","url":"http:\/\/spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http:\/\/spidr.rubyforge.org\/course\/loop\/..\/remote\/start.html\">should not follow remote links with a relative path to the same page<\/a>"},{"link":"http:\/\/spidr.rubyforge.org:1337\/path\/","behavior":"fail","message":"should ignore links that fail","url":"http:\/\/spidr.rubyforge.org:1337\/path","example":"<a href=\"http:\/\/spidr.rubyforge.org:1337\/path\/\">should ignore links that fail<\/a>"}]

data/tasks/course.rb CHANGED Viewed

@@ -44,6 +44,10 @@ namespace :course do
         doc.search('.ignore//a').each do |ignore|
           specs << link_to_spec.call(ignore, :behavior => :ignore)
         end
+        doc.search('.fail//a').each do |ignore|
+          specs << link_to_spec.call(ignore, :behavior => :fail)
+        end
       end
       spec.write(specs.to_json)

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Postmodern
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-03-22 00:00:00 -07:00
+date: 2009-04-14 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -30,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.11.0
+        version: 1.12.1
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email: