RubyGems - spidr - Versions diffs - 0.1.9 → 0.2.0 - Mend

spidr 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

data.tar.gz.sig +0 -0
data/History.txt +43 -0
data/Manifest.txt +19 -0
data/README.txt +100 -11
data/Rakefile +15 -5
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/actions/actions.rb +79 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions/exceptions/action.rb +6 -0
data/lib/spidr/actions/exceptions/paused.rb +8 -0
data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
data/lib/spidr/agent.rb +385 -444
data/lib/spidr/events.rb +87 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/extensions/uri.rb +45 -0
data/lib/spidr/filters.rb +438 -0
data/lib/spidr/page.rb +211 -70
data/lib/spidr/rules.rb +40 -18
data/lib/spidr/spidr.rb +57 -7
data/lib/spidr/version.rb +2 -1
data/spec/actions_spec.rb +61 -0
data/spec/agent_spec.rb +24 -31
data/spec/extensions/uri_spec.rb +39 -0
data/spec/filters_spec.rb +53 -0
data/spec/helpers/page.rb +8 -0
data/spec/page_examples.rb +17 -0
data/spec/page_spec.rb +81 -0
data/spec/rules_spec.rb +43 -0
data/spec/spec_helper.rb +1 -1
data/spec/spidr_spec.rb +30 -0
data/static/course/specs.json +1 -1
data/tasks/course.rb +8 -1
data/tasks/spec.rb +1 -0
data/tasks/yard.rb +12 -0
metadata +45 -6
metadata.gz.sig +0 -0

data/spec/page_examples.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require 'spidr/page'
+require 'spec_helper'
+shared_examples_for "Page" do
+  it "should have a status code" do
+    @page.code.should be_integer
+  end
+  it "should have a body" do
+    @page.body.should_not be_empty
+  end
+  it "should provide transparent access to the response headers" do
+    @page.content_type.should == @page.content_type
+  end
+end

data/spec/page_spec.rb ADDED Viewed

@@ -0,0 +1,81 @@
+require 'spidr/page'
+require 'spec_helper'
+require 'page_examples'
+require 'helpers/page'
+describe Page do
+  describe "html" do
+    before(:all) do
+      @page = get_page('http://spidr.rubyforge.org/course/start.html')
+    end
+    it_should_behave_like "Page"
+    it "should be OK" do
+      @page.should be_ok
+    end
+    it "should have a content-type" do
+      @page.content_type.should =~ /text\/html/
+    end
+    it "should be a html page" do
+      @page.should be_html
+    end
+    it "should have provide a document" do
+      @page.doc.class.should == Nokogiri::HTML::Document
+    end
+    it "should allow searching the document" do
+      @page.doc.search('//p').length.should == 2
+      @page.doc.at('//p[2]').inner_text.should == 'Ready! Set! Go!'
+    end
+    it "should have a title" do
+      @page.title.should == 'Spidr :: Web-Spider Obstacle Course :: Start'
+    end
+    it "should have links" do
+      @page.links.should_not be_empty
+    end
+  end
+  describe "txt" do
+    before(:all) do
+      @page = get_page('http://www.ruby-lang.org/en/LICENSE.txt')
+    end
+    it_should_behave_like "Page"
+    it "should be OK" do
+      @page.should be_ok
+    end
+    it "should have a content-type" do
+      @page.content_type.should =~ /text\/plain/
+    end
+    it "should be a txt page" do
+      @page.should be_txt
+    end
+    it "should not have provide a document" do
+      @page.doc.should be_nil
+    end
+    it "should not allow searching the document" do
+      @page.search('//p').should be_empty
+      @page.at('//p').should be_nil
+    end
+    it "should not have links" do
+      @page.links.should be_empty
+    end
+    it "should not have a title" do
+      @page.title.should be_nil
+    end
+  end
+end

data/spec/rules_spec.rb ADDED Viewed

@@ -0,0 +1,43 @@
+require 'spidr/rules'
+require 'spec_helper'
+describe Rules do
+  it "should accept data based on acceptance data" do
+    rules = Rules.new(:accept => [1])
+    rules.accept?(1).should == true
+  end
+  it "should accept data based on acceptance regexps" do
+    rules = Rules.new(:accept => [/1/])
+    rules.accept?('1').should == true
+  end
+  it "should match non-Strings using acceptance regexps" do
+    rules = Rules.new(:accept => [/1/])
+    rules.accept?(1).should == true
+  end
+  it "should accept data using acceptance lambdas" do
+    rules = Rules.new(:accept => [lambda { |data| data > 2 }])
+    rules.accept?(3).should == true
+  end
+  it "should reject data that does not match any acceptance patterns" do
+    rules = Rules.new(:accept => [1, 2, 3])
+    rules.accept?(2).should == true
+    rules.accept?(4).should == false
+  end
+  it "should accept data that does not match any rejection patterns" do
+    rules = Rules.new(:reject => [1, 2, 3])
+    rules.accept?(2).should == false
+    rules.accept?(4).should == true
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'rubygems'
-gem 'rspec', '>=1.1.3'
+gem 'rspec', '>=1.2.8'
 require 'spec'
 require 'spidr/version'

data/spec/spidr_spec.rb CHANGED Viewed

@@ -6,4 +6,34 @@ describe Spidr do
   it "should have a VERSION constant" do
     Spidr.const_defined?('VERSION').should == true
   end
+  describe "proxy" do
+    after(:all) do
+      Spidr.disable_proxy!
+    end
+    it "should not have proxy settings by default" do
+      Spidr.proxy[:host].should be_nil
+    end
+    it "should allow setting new proxy settings" do
+      Spidr.proxy = {:host => 'example.com', :port => 8010}
+      Spidr.proxy[:host].should == 'example.com'
+      Spidr.proxy[:port].should == 8010
+    end
+    it "should default the :port option of new proxy settings" do
+      Spidr.proxy = {:host => 'example.com'}
+      Spidr.proxy[:host].should == 'example.com'
+      Spidr.proxy[:port].should == Spidr::COMMON_PROXY_PORT
+    end
+    it "should allow disabling the proxy" do
+      Spidr.disable_proxy!
+      Spidr.proxy[:host].should be_nil
+    end
+  end
 end

data/static/course/specs.json CHANGED Viewed

	@@ -1 +1 @@
1	- [{"link":"\/course\/absolute\/next.html","example":"<a href=\"\/course\/absolute\/next.html\">should follow absolute links to unvisited pages~~<\/~~a>","message":"should follow absolute links to unvisited pages","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/absolute~~\/next~~.html","~~behavior":"follow"},{"~~link":"\/course\/absolute\/start.html","example":"<a href=\"\/course\/absolute\/start.html\">should not follow absolute links to the current page~~<\/~~a>","message":"should not follow absolute links to the current page","url":"http~~:\/\/~~spidr.rubyforge.org\/course~~\/absolute\/~~start.html","~~behavior":"nofollow"},{"~~link":"","example":"<a>should not follow links with no href attributes~~<\/~~a>","message":"should not follow links with no href attributes","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/empty\/start.html","~~behavior":"nofollow"},{"~~link":"","example":"<a href=\"\">should not follow links with empty href attributes~~<\/~~a>","message":"should not follow links with empty href attributes","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/empty~~\/start.html~~","~~behavior":"nofollow"},{"~~link":" ","example":"<a href=\"\">should ignore links with blank href attributes~~<\/~~a>","message":"should ignore links with blank href attributes","~~url~~":"~~http:\/\/spidr.rubyforge.org\/course\/empty\/%20~~","~~behavior~~":"~~ignore~~"},{"link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"~~<\/~~a>","message":"should ignore links beginning with \"javascript:\"","~~url~~":"~~javascript:fail();~~","~~behavior~~":"~~ignore~~"},{"link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page~~.<\/~~a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","url":"http~~:\/\/~~spidr.rubyforge.org\/course~~\/javascript\/%23~~","~~behavior":"ignore"},{"~~link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages~~<\/~~a>","message":"should not follow links to previously visited pages","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/loop~~\/start~~.html","~~behavior":"nofollow"},{"~~link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages~~<\/~~a>","message":"should follow links pointing to other pages","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/loop~~\/next~~.html","~~behavior":"follow"},{"~~link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page~~<\/~~a>","message":"should not follow links pointing to the current page","url":"http~~:\/\/~~spidr.rubyforge.org\/course~~\/loop\/start~~.html","~~behavior":"nofollow"},{"~~link":"normal.html","example":"<a href=\"normal.html\">should follow relative links~~<\/~~a>","message":"should follow relative links","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/relative~~\/normal~~.html","~~behavior":"follow"},{"~~link":"~~.\/~~current_directory.html","example":"<a href=\"~~.\/~~current_directory.html\">should follow relative links to files in the current directory~~<\/~~a>","message":"should follow relative links to files in the current directory","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/relative~~\/current_directory~~.html","~~behavior":"follow"},{"~~link":"~~..\/~~relative\/same_directory.html","example":"<a href=\"~~..\/~~relative\/same_directory.html\">should follow links that transverse directories~~<\/~~a>","message":"should follow links that transverse directories","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/relative~~\/same_directory.html~~","~~behavior":"follow"},{"~~link":"#","example":"<a href=\"#\">should ignore in-page links~~<\/~~a>","message":"should ignore in-page links","url":"http~~:\/\/~~spidr.rubyforge.org\/course~~\/relative\/%23~~","~~behavior":"ignore"},{"~~link":"http~~:\/\/~~spidr.rubyforge.org\/course\/remote\/next.html","example":"<a href=\"http~~:\/\/~~spidr.rubyforge.org\/course\/remote\/next.html\">should follow remote links to unvisited pages~~<\/~~a>","message":"should follow remote links to unvisited pages","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/remote~~\/next~~.html","~~behavior":"follow"},{"~~link":"http~~:\/\/~~spidr.rubyforge.org\/course\/remote\/start.html","example":"<a href=\"http~~:\/\/~~spidr.rubyforge.org\/course\/remote\/start.html\">should not follow remote links to the same page~~<\/~~a>","message":"should not follow remote links to the same page","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/remote\/start.html","~~behavior":"nofollow"},{"~~link":"http~~:\/\/~~spidr.rubyforge.org\/course\/loop~~\/..\/~~remote\/start.html","example":"<a href=\"http~~:\/\/~~spidr.rubyforge.org\/course\/loop~~\/..\/~~remote\/start.html\">should not follow remote links with a relative path to the same page~~<\/~~a>","message":"should not follow remote links with a relative path to the same page","url":"http~~:\/\/~~spidr.rubyforge.org~~\/course\/remote\/start.html","behavior"~~:"~~nofollow"}~~,{"link":"http~~:\/\/~~spidr.rubyforge.org:1337\/path\/","example":"<a href=\"http~~:\/\/~~spidr.rubyforge.org:1337\/path~~\/\~~">should ignore links that fail~~<\/~~a>","message":"should ignore links that fail","url":"http~~:\/\/~~spidr.rubyforge.org~~:1337\/path~~","~~behavior":"fail"},{"~~link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes~~<\/~~a>","message":"should follow links within iframes","url":"http~~:\/\/~~spidr.rubyforge.org\/course\/frames~~\/iframe_next~~.html","~~behavior":"follow"},{"~~link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames~~<\/~~a>","message":"should follow links within frames","~~url":"http:\/\/spidr.rubyforge.org\/course\/frames\/frame_next.html","~~behavior":"follow"}]
1	+ [{"url":"http://spidr.rubyforge.org/course/absolute/next.html","link":"/course/absolute/next.html","example":"<a href=\"/course/absolute/next.html\">should follow absolute links to unvisited pages</a>","message":"should follow absolute links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/absolute/start.html","link":"/course/absolute/start.html","example":"<a href=\"/course/absolute/start.html\">should not follow absolute links to the current page</a>","message":"should not follow absolute links to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a>should not follow links with no href attributes</a>","message":"should not follow links with no href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/start.html","link":"","example":"<a href=\"\">should not follow links with empty href attributes</a>","message":"should not follow links with empty href attributes","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/empty/%20","link":" ","example":"<a href=\"\">should ignore links with blank href attributes</a>","message":"should ignore links with blank href attributes","behavior":"ignore"},{"url":"javascript:fail();","link":"javascript:fail();","example":"<a href=\"javascript:fail();\">should ignore links beginning with \"javascript:\"</a>","message":"should ignore links beginning with \"javascript:\"","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/javascript/%23","link":"#","example":"<a href=\"#\" onclick=\"fail();\">should ignore links with an onclick attribute and a href pointing to the page.</a>","message":"should ignore links with an onclick attribute and a href pointing to the page.","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links to previously visited pages</a>","message":"should not follow links to previously visited pages","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/loop/next.html","link":"next.html","example":"<a href=\"next.html\">should follow links pointing to other pages</a>","message":"should follow links pointing to other pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/loop/start.html","link":"start.html","example":"<a href=\"start.html\">should not follow links pointing to the current page</a>","message":"should not follow links pointing to the current page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/relative/normal.html","link":"normal.html","example":"<a href=\"normal.html\">should follow relative links</a>","message":"should follow relative links","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/current_directory.html","link":"./current_directory.html","example":"<a href=\"./current_directory.html\">should follow relative links to files in the current directory</a>","message":"should follow relative links to files in the current directory","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/same_directory.html","link":"../relative/same_directory.html","example":"<a href=\"../relative/same_directory.html\">should follow links that transverse directories</a>","message":"should follow links that transverse directories","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/relative/%23","link":"#","example":"<a href=\"#\">should ignore in-page links</a>","message":"should ignore in-page links","behavior":"ignore"},{"url":"http://spidr.rubyforge.org/course/remote/next.html","link":"http://spidr.rubyforge.org/course/remote/next.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/next.html\">should follow remote links to unvisited pages</a>","message":"should follow remote links to unvisited pages","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/remote/start.html\">should not follow remote links to the same page</a>","message":"should not follow remote links to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org/course/remote/start.html","link":"http://spidr.rubyforge.org/course/loop/../remote/start.html","example":"<a href=\"http://spidr.rubyforge.org/course/loop/../remote/start.html\">should not follow remote links with a relative path to the same page</a>","message":"should not follow remote links with a relative path to the same page","behavior":"nofollow"},{"url":"http://spidr.rubyforge.org:1337/path/","link":"http://spidr.rubyforge.org:1337/path/","example":"<a href=\"http://spidr.rubyforge.org:1337/path/\">should ignore links that fail</a>","message":"should ignore links that fail","behavior":"fail"},{"url":"http://spidr.rubyforge.org/course/frames/iframe_next.html","link":"iframe_next.html","example":"<a href=\"iframe_next.html\">should follow links within iframes</a>","message":"should follow links within iframes","behavior":"follow"},{"url":"http://spidr.rubyforge.org/course/frames/frame_next.html","link":"frame_next.html","example":"<a href=\"frame_next.html\">should follow links within frames</a>","message":"should follow links within frames","behavior":"follow"}]

data/tasks/course.rb CHANGED Viewed

@@ -1,3 +1,10 @@
+lib_dir = File.expand_path(File.join(File.dirname(__FILE__),'..','lib'))
+unless $LOAD_PATH.include?(lib_dir)
+  $LOAD_PATH.unshift(lib_dir)
+end
+require 'spidr/extensions/uri'
 require 'nokogiri'
 require 'json'
@@ -22,7 +29,7 @@ namespace :course do
           absolute_url = page_url.merge(URI.encode(relative_url))
           if absolute_url.path
-            absolute_url.path = File.expand_path(absolute_url.path)
+            absolute_url.path = URI.expand_path(absolute_url.path)
           end
           spec_data.merge(

data/tasks/spec.rb CHANGED Viewed

@@ -6,4 +6,5 @@ Spec::Rake::SpecTask.new(:spec) do |t|
   t.spec_opts = ['--colour', '--format', 'specdoc']
 end
+task :test => :spec
 task :default => :spec

data/tasks/yard.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require 'yard'
+YARD::Rake::YardocTask.new do |t|
+  t.files   = ['lib/**/*.rb']
+  t.options = [
+    '--protected',
+    '--files', 'History.txt',
+    '--title', 'Spidr'
+  ]
+end
+task :docs => :yardoc

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.9
+  version: 0.2.0
 platform: ruby
 authors:
 - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
   pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
   -----END CERTIFICATE-----
-date: 2009-06-13 00:00:00 -07:00
+date: 2009-10-10 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -41,7 +41,27 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: "0"
+        version: 1.2.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.2.8
+    version:
+- !ruby/object:Gem::Dependency
+  name: yard
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.3.5
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
@@ -51,7 +71,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 2.0.0
+        version: 2.3.3
     version:
 description: |-
   Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -73,15 +93,34 @@ files:
 - README.txt
 - Rakefile
 - lib/spidr.rb
+- lib/spidr/extensions.rb
+- lib/spidr/extensions/uri.rb
 - lib/spidr/page.rb
 - lib/spidr/rules.rb
+- lib/spidr/filters.rb
+- lib/spidr/events.rb
+- lib/spidr/actions.rb
+- lib/spidr/actions/exceptions.rb
+- lib/spidr/actions/exceptions/action.rb
+- lib/spidr/actions/exceptions/paused.rb
+- lib/spidr/actions/exceptions/skip_link.rb
+- lib/spidr/actions/exceptions/skip_page.rb
+- lib/spidr/actions/actions.rb
 - lib/spidr/agent.rb
 - lib/spidr/spidr.rb
 - lib/spidr/version.rb
 - tasks/spec.rb
+- tasks/yard.rb
 - tasks/course.rb
 - spec/spec_helper.rb
 - spec/helpers/course.rb
+- spec/helpers/page.rb
+- spec/extensions/uri_spec.rb
+- spec/page_examples.rb
+- spec/page_spec.rb
+- spec/rules_spec.rb
+- spec/filters_spec.rb
+- spec/actions_spec.rb
 - spec/agent_spec.rb
 - spec/spidr_spec.rb
 - static/course/index.html
@@ -114,7 +153,7 @@ files:
 - static/course/frames/frame.html
 - static/course/frames/frame_next.html
 - static/course/specs.json
-has_rdoc: true
+has_rdoc: yard
 homepage: http://spidr.rubyforge.org/
 licenses: []
@@ -139,7 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: spidr
-rubygems_version: 1.3.4
+rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
 summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely

metadata.gz.sig CHANGED Viewed

Binary file