RubyGems - shingara-anemone - Versions diffs - 0.2.4 - Mend

shingara-anemone 0.2.4

Files changed (22) hide show

data/CHANGELOG.rdoc +27 -0
data/LICENSE.txt +19 -0
data/README.rdoc +24 -0
data/bin/anemone +4 -0
data/lib/anemone.rb +2 -0
data/lib/anemone/cli.rb +24 -0
data/lib/anemone/cli/count.rb +22 -0
data/lib/anemone/cli/cron.rb +90 -0
data/lib/anemone/cli/pagedepth.rb +32 -0
data/lib/anemone/cli/serialize.rb +35 -0
data/lib/anemone/cli/url_list.rb +41 -0
data/lib/anemone/core.rb +256 -0
data/lib/anemone/http.rb +123 -0
data/lib/anemone/page.rb +155 -0
data/lib/anemone/page_hash.rb +142 -0
data/lib/anemone/tentacle.rb +39 -0
data/spec/anemone_spec.rb +15 -0
data/spec/core_spec.rb +203 -0
data/spec/fakeweb_helper.rb +57 -0
data/spec/page_spec.rb +52 -0
data/spec/spec_helper.rb +7 -0
metadata +96 -0

@@ -0,0 +1,15 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe Anemone do
+  it "should have a version" do
+    Anemone.const_defined?('VERSION').should == true
+  end
+  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+    result = Anemone.crawl(SPEC_DOMAIN)
+    result.should be_an_instance_of(Anemone::Core)
+    result.pages.should be_an_instance_of(Anemone::PageHash)
+  end
+end

data/spec/core_spec.rb ADDED

@@ -0,0 +1,203 @@
+require File.dirname(__FILE__) + '/spec_helper'
+module Anemone
+  describe Core do
+    before(:each) do
+      FakeWeb.clean_registry
+    end
+    it "should crawl all the html pages in a domain by following <a> href's" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1', '2'])
+      pages << FakePage.new('1', :links => ['3'])
+      pages << FakePage.new('2')
+      pages << FakePage.new('3')
+      Anemone.crawl(pages[0].url).should have(4).pages
+    end
+    it "should not leave the original domain" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+      pages << FakePage.new('1')
+      core = Anemone.crawl(pages[0].url)
+      core.should have(2).pages
+      core.pages.keys.should_not include('http://www.other.com/')
+    end
+    it "should follow http redirects" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1'])
+      pages << FakePage.new('1', :redirect => '2')
+      pages << FakePage.new('2')
+      Anemone.crawl(pages[0].url).should have(3).pages
+    end
+    it "should accept multiple starting URLs" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1'])
+      pages << FakePage.new('1')
+      pages << FakePage.new('2', :links => ['3'])
+      pages << FakePage.new('3')
+      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
+    end
+    it "should include the query string when following links" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1?foo=1'])
+      pages << FakePage.new('1?foo=1')
+      pages << FakePage.new('1')
+      core = Anemone.crawl(pages[0].url)
+      core.should have(2).pages
+      core.pages.keys.should_not include(pages[2].url)
+    end
+    it "should be able to skip links based on a RegEx" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1', '2'])
+      pages << FakePage.new('1')
+      pages << FakePage.new('2')
+      pages << FakePage.new('3')
+      core = Anemone.crawl(pages[0].url) do |a|
+        a.skip_links_like /1/, /3/
+      end
+      core.should have(2).pages
+      core.pages.keys.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[3].url)
+    end
+    it "should be able to call a block on every page" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1', '2'])
+      pages << FakePage.new('1')
+      pages << FakePage.new('2')
+      count = 0
+      Anemone.crawl(pages[0].url) do |a|
+        a.on_every_page { count += 1 }
+      end
+      count.should == 3
+    end
+    it "should not discard page bodies by default" do
+      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
+    end
+    it "should optionally discard page bodies to conserve memory" do
+      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
+      core.pages.values.first.doc.should be_nil
+    end
+    it "should provide a focus_crawl method to select the links on each page to follow" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1', '2'])
+      pages << FakePage.new('1')
+      pages << FakePage.new('2')
+      core = Anemone.crawl(pages[0].url) do |a|
+        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+      end
+      core.should have(2).pages
+      core.pages.keys.should_not include(pages[1].url)
+    end
+    it "should optionally delay between page requests" do
+      delay = 0.25
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      start = Time.now
+      Anemone.crawl(pages[0].url, :delay => delay)
+      finish = Time.now
+      (finish - start).should satisfy {|t| t > delay * 2}
+    end
+    it "should optionally obey the robots exclusion protocol" do
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      pages << FakePage.new('robots.txt',
+                            :body => "User-agent: *\nDisallow: /1",
+                            :content_type => 'text/plain')
+      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+      urls = core.pages.keys
+      urls.should include(pages[0].url)
+      urls.should_not include(pages[1].url)
+    end
+    describe "many pages" do
+      before(:each) do
+        @pages, size = [], 5
+        size.times do |n|
+          # register this page with a link to the next page
+          link = (n + 1).to_s if n + 1 < size
+          @pages << FakePage.new(n.to_s, :links => Array(link))
+        end
+      end
+      it "should track the page depth and referer" do
+        core = Anemone.crawl(@pages[0].url)
+        previous_page = nil
+        @pages.each_with_index do |page, i|
+          page = core.pages[page.url]
+          page.should be
+          page.depth.should == i
+          if previous_page
+            page.referer.should == previous_page.url
+          else
+            page.referer.should be_nil
+          end
+          previous_page = page
+        end
+      end
+      it "should optionally limit the depth of the crawl" do
+        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+        core.should have(4).pages
+      end
+    end
+    describe "options" do
+      it "should accept options for the crawl" do
+        core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                          :threads => 2,
+                                          :discard_page_bodies => true,
+                                          :user_agent => 'test',
+                                          :obey_robots_txt => true,
+                                          :depth_limit => 3)
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
+      it "should use 1 thread if a delay is requested" do
+        Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+      end
+    end
+  end
+end

data/spec/fakeweb_helper.rb ADDED

@@ -0,0 +1,57 @@
+begin
+  require 'fakeweb'
+rescue LoadError
+  warn "You need the 'fakeweb' gem installed to test Anemone"
+  exit
+end
+FakeWeb.allow_net_connect = false
+module Anemone
+  SPEC_DOMAIN = "http://www.example.com/"
+  class FakePage
+    attr_accessor :links
+    attr_accessor :hrefs
+    attr_accessor :body
+    def initialize(name = '', options = {})
+      @name = name
+      @links = [options[:links]].flatten if options.has_key?(:links)
+      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+      @redirect = options[:redirect] if options.has_key?(:redirect)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
+      create_body unless @body
+      add_to_fakeweb
+    end
+    def url
+      SPEC_DOMAIN + @name
+    end
+    private
+    def create_body
+      @body = "<html><body>"
+      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
+      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
+      @body += "</body></html>"
+    end
+    def add_to_fakeweb
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+      if @redirect
+        options[:status] = [301, "Permanently Moved"]
+        options[:location] = SPEC_DOMAIN + @redirect
+      end
+      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+    end
+  end
+end
+#default root
+Anemone::FakePage.new

data/spec/page_spec.rb ADDED

@@ -0,0 +1,52 @@
+require File.dirname(__FILE__) + '/spec_helper'
+module Anemone
+  describe Page do
+    before(:all) do
+      @http = Anemone::HTTP.new
+    end
+    before(:each) do
+      @page = @http.fetch_page(FakePage.new('home').url)
+    end
+    it "should store the response headers when fetching a page" do
+      @page.headers.should_not be_nil
+      @page.headers.should have_key('content-type')
+    end
+    it "should have an OpenStruct attribute for the developer to store data in" do
+      @page.data.should_not be_nil
+      @page.data.should be_an_instance_of(OpenStruct)
+      @page.data.test = 'test'
+      @page.data.test.should == 'test'
+    end
+    it "should have a Nokogori::HTML::Document attribute for the page body" do
+      @page.doc.should_not be_nil
+      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+    end
+    it "should indicate whether it was fetched after an HTTP redirect" do
+      @page.should respond_to(:redirect?)
+      @page.redirect?.should == false
+      @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+    end
+    it "should have a method to tell if a URI is in the same domain as the page" do
+      @page.should respond_to(:in_domain?)
+      @page.in_domain?(URI(FakePage.new('test').url)).should == true
+      @page.in_domain?(URI('http://www.other.com/')).should == false
+    end
+    it "should include the response time for the HTTP request" do
+      @page.should respond_to(:response_time)
+    end
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,7 @@
+require 'rubygems'
+require File.dirname(__FILE__) + '/fakeweb_helper'
+$:.unshift(File.dirname(__FILE__) + '/../lib/')
+require 'anemone'
+# Top-level alias used by specs written outside `module Anemone`. It reuses
+# the constant defined in fakeweb_helper (required above) instead of repeating
+# the literal, so the two values cannot drift apart.
+SPEC_DOMAIN = Anemone::SPEC_DOMAIN

metadata ADDED

@@ -0,0 +1,96 @@
+--- !ruby/object:Gem::Specification
+name: shingara-anemone
+version: !ruby/object:Gem::Version
+  version: 0.2.4
+platform: ruby
+authors:
+- Chris Kite
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-11-20 00:00:00 +01:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: robots
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+    version:
+description:
+email:
+executables:
+- anemone
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+files:
+- LICENSE.txt
+- CHANGELOG.rdoc
+- README.rdoc
+- bin/anemone
+- lib/anemone.rb
+- lib/anemone/core.rb
+- lib/anemone/http.rb
+- lib/anemone/page.rb
+- lib/anemone/page_hash.rb
+- lib/anemone/tentacle.rb
+- lib/anemone/cli.rb
+- lib/anemone/cli/url_list.rb
+- lib/anemone/cli/cron.rb
+- lib/anemone/cli/count.rb
+- lib/anemone/cli/pagedepth.rb
+- lib/anemone/cli/serialize.rb
+has_rdoc: true
+homepage: http://anemone.rubyforge.org
+licenses: []
+post_install_message:
+rdoc_options:
+- -m
+- README.rdoc
+- -t
+- Anemone
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: anemone
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Anemone web-spider framework
+test_files:
+- spec/anemone_spec.rb
+- spec/core_spec.rb
+- spec/page_spec.rb
+- spec/fakeweb_helper.rb
+- spec/spec_helper.rb