jeremyf-anemone 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,116 @@
1
module Anemone
  #
  # A Hash of URI => Page for every page encountered during a crawl,
  # with graph-style queries (shortest paths, reverse-link lookups)
  # layered on top.
  #
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash.
    #
    # Sets Page#depth on every page reachable from *root* (and on its
    # redirect-aliases) and returns self. Raises if *root* is not a
    # key of this hash.
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      # clear the visited flag left over from any previous traversal
      each_value { |page| page.visited = false if page }

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while !q.empty?
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          # a page and all of its redirect-aliases share the same depth
          aliases = [link].concat(link.aliases.map { |a| self[a] })

          aliases.each do |node|
            # only overwrite when the new path is strictly shorter
            # (or the node has not been assigned a depth yet)
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page. Redirect pages are dropped entirely; kept
    # pages are cloned with an emptied alias list.
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this
        # isn't a redirect page, add this page
        page_added = page.aliases.any? { |a| results.has_key?(a) }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns the first
    # [URI, [Page, ...]] entry for Pages which link to that URL.
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...])
    # of Pages linking to those URLs.
    #
    # NOTE(review): in the single-URL case this returns Hash#first,
    # i.e. a [key, value] pair rather than just the Page array; this
    # long-standing behavior is preserved for callers that rely on it.
    #
    def pages_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      # normalize Strings to URIs, mapping unparseable ones to nil
      urls.map! do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end
      # FIX: previously used non-destructive +compact+ and discarded
      # the result, so nil entries from bad URL strings survived
      urls.compact!

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns the first
    # [URI, [URI, ...]] entry for URLs which link to that URL.
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...])
    # of URLs linking to those URLs.
    #
    def urls_linking_to(urls)
      single = false
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      # collapse each Page list down to just the page URLs
      links.each { |url, pages| links[url] = pages.map { |p| p.url } }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'anemone/page'
2
+
3
+ module Anemone
4
+ class Tentacle
5
+
6
+ #
7
+ # Create a new Tentacle
8
+ #
9
+ def initialize(link_queue, page_queue)
10
+ @link_queue = link_queue
11
+ @page_queue = page_queue
12
+ end
13
+
14
+ #
15
+ # Gets links from @link_queue, and returns the fetched
16
+ # Page objects into @page_queue
17
+ #
18
+ def run
19
+ while true do
20
+ link = @link_queue.deq
21
+
22
+ break if link == :END
23
+
24
+ page = Page.fetch(link)
25
+
26
+ @page_queue.enq(page)
27
+ end
28
+ end
29
+
30
+ end
31
+ end
@@ -0,0 +1,27 @@
1
require File.dirname(__FILE__) + '/spec_helper'

# Specs for the top-level Anemone module API: version/user-agent
# constants, the options accessor, and the return value of
# Anemone.crawl. Pages are served by FakeWeb (see fakeweb_helper).
describe Anemone do

  it "should have a version and user agent" do
    Anemone.const_defined?('VERSION').should == true
    Anemone.const_defined?('USER_AGENT').should == true
  end

  it "should have options" do
    Anemone.should respond_to(:options)
  end

  it "should accept options for the crawl" do
    # options passed to crawl should be reflected in Anemone.options
    Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true)
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
  end

  it "should return a Anemone::Core from the crawl, which has a PageHash" do
    result = Anemone.crawl(SPEC_DOMAIN)
    result.should be_an_instance_of(Anemone::Core)
    result.pages.should be_an_instance_of(Anemone::PageHash)
  end

end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,114 @@
1
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  # Specs for Anemone::Core: crawl traversal, domain restriction,
  # redirect handling, multiple start URLs, link filtering, and
  # per-page callbacks. All pages are fakes registered with FakeWeb,
  # so no real network access occurs.
  describe Core do

    before(:each) do
      # drop any pages registered by a previous example
      FakeWeb.clean_registry
    end

    it "should crawl all the html pages in a domain by following <a> href's" do
      # link graph: 0 -> {1, 2}, 1 -> 3 — all four reachable from 0
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1', :links => ['3'])
      pages << FakePage.new('2')
      pages << FakePage.new('3')

      Anemone.crawl(pages[0].url).should have(4).pages
    end

    it "should not leave the original domain" do
      # page 0 links off-domain via a raw href; that URL must be skipped
      pages = []
      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
    end

    it "should follow http redirects" do
      # page 1 is a 301 redirect to page 2; both end up in the result
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1', :redirect => '2')
      pages << FakePage.new('2')

      Anemone.crawl(pages[0].url).should have(3).pages
    end

    it "should accept multiple starting URLs" do
      # two disconnected graphs: 0 -> 1 and 2 -> 3
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1')
      pages << FakePage.new('2', :links => ['3'])
      pages << FakePage.new('3')

      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
    end

    it "should include the query string when following links" do
      # only '1?foo=1' is linked from 0; bare '1' must not be crawled
      pages = []
      pages << FakePage.new('0', :links => ['1?foo=1'])
      pages << FakePage.new('1?foo=1')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
    end

    it "should be able to skip links based on a RegEx" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.skip_links_like /1/
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should be able to call a block on every page" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      # the on_every_page hook should fire once per crawled page
      count = 0
      Anemone.crawl(pages[0].url) do |a|
        a.on_every_page { count += 1 }
      end

      count.should == 3
    end

    it "should not discard page bodies by default" do
      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
    end

    it "should optionally discard page bodies to conserve memory" do
      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
      core.pages.values.first.doc.should be_nil
    end

    it "should provide a focus_crawl method to select the links on each page to follow" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        # follow only links whose URL does not contain '1'
        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

  end
end
@@ -0,0 +1,55 @@
1
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test Anemone"
  exit
end

FakeWeb.allow_net_connect = false

module Anemone
  SPEC_DOMAIN = "http://www.example.com/"

  #
  # Builds an HTML page and registers it with FakeWeb so that specs
  # can crawl SPEC_DOMAIN without any real network access.
  #
  class FakePage
    attr_accessor :links
    attr_accessor :hrefs

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)

      create_body
      add_to_fakeweb
    end

    # Absolute URL of this fake page.
    def url
      SPEC_DOMAIN + @name
    end

    private

    # Assemble the HTML body: one <a> tag per in-domain link name and
    # one per raw href.
    def create_body
      @body = "<html><body>"
      (@links || []).each { |link| @body << "<a href=\"#{SPEC_DOMAIN}#{link}\"></a>" }
      (@hrefs || []).each { |href| @body << "<a href=\"#{href}\"></a>" }
      @body << "</body></html>"
    end

    # Register this page with FakeWeb — as a 301 redirect when a
    # :redirect target was given, otherwise as a plain 200 response.
    def add_to_fakeweb
      response = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}

      if @redirect
        response[:status] = [301, "Permanently Moved"]
        response[:location] = SPEC_DOMAIN + @redirect
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, response)
    end
  end
end

# default root page so SPEC_DOMAIN itself always resolves
Anemone::FakePage.new
55
+
data/spec/page_spec.rb ADDED
@@ -0,0 +1,49 @@
1
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  # Specs for Anemone::Page: fetching, response headers, per-page data
  # storage, the parsed document, redirect detection, and domain checks.
  describe Page do

    before(:each) do
      @page = Page.fetch(FakePage.new('home').url)
    end

    it "should be able to fetch a page" do
      @page.should_not be_nil
      @page.url.to_s.should include('home')
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    # FIX: example description previously misspelled "Nokogori"
    it "should have a Nokogiri::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

  end
end
@@ -0,0 +1,5 @@
1
# FIX: rubygems must be loaded before any gem-backed requires (anemone
# pulls in nokogiri) on pre-1.9 rubies, so it now comes first.
require 'rubygems'
require File.dirname(__FILE__) + '/../lib/anemone'
require File.dirname(__FILE__) + '/fakeweb_helper'

# Root URL used by the fake pages throughout the specs.
SPEC_DOMAIN = 'http://www.example.com/'
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jeremyf-anemone
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.3
5
+ platform: ruby
6
+ authors:
7
+ - Chris Kite
8
+ - Jeremy Friesen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-08-05 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description:
18
+ email: jeremy.n.friesen@gmail.com
19
+ executables:
20
+ - anemone_count.rb
21
+ - anemone_cron.rb
22
+ - anemone_pagedepth.rb
23
+ - anemone_serialize.rb
24
+ - anemone_url_list.rb
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - LICENSE.txt
29
+ - README.rdoc
30
+ files:
31
+ - LICENSE.txt
32
+ - README.rdoc
33
+ - Rakefile
34
+ - VERSION.yml
35
+ - anemone.gemspec
36
+ - bin/anemone_count.rb
37
+ - bin/anemone_cron.rb
38
+ - bin/anemone_pagedepth.rb
39
+ - bin/anemone_serialize.rb
40
+ - bin/anemone_url_list.rb
41
+ - lib/anemone.rb
42
+ - lib/anemone/anemone.rb
43
+ - lib/anemone/core.rb
44
+ - lib/anemone/http.rb
45
+ - lib/anemone/page.rb
46
+ - lib/anemone/page_hash.rb
47
+ - lib/anemone/tentacle.rb
48
+ - spec/anemone_spec.rb
49
+ - spec/core_spec.rb
50
+ - spec/fakeweb_helper.rb
51
+ - spec/page_spec.rb
52
+ - spec/spec_helper.rb
53
+ has_rdoc: false
54
+ homepage: http://github.com/jeremyf/anemone
55
+ licenses:
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: "0"
66
+ version:
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: "0"
72
+ version:
73
+ requirements: []
74
+
75
+ rubyforge_project:
76
+ rubygems_version: 1.3.5
77
+ signing_key:
78
+ specification_version: 3
79
+ summary: Anemone is a web spider framework that can spider a domain.
80
+ test_files:
81
+ - spec/anemone_spec.rb
82
+ - spec/core_spec.rb
83
+ - spec/fakeweb_helper.rb
84
+ - spec/page_spec.rb
85
+ - spec/spec_helper.rb