RubyGems - spk-anemone - Versions diffs - 0.2.4 → 0.3.0 - Mend

spk-anemone 0.2.4 → 0.3.0

Files changed (18)

data/CHANGELOG.rdoc +10 -0
data/README.rdoc +2 -0
data/lib/anemone/cli/serialize.rb +2 -2
data/lib/anemone/core.rb +43 -53
data/lib/anemone/http.rb +32 -21
data/lib/anemone/page.rb +43 -50
data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
data/lib/anemone/storage.rb +19 -0
data/lib/anemone/storage/pstore.rb +48 -0
data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
data/lib/anemone/tentacle.rb +7 -7
data/spec/anemone_spec.rb +4 -4
data/spec/core_spec.rb +226 -163
data/spec/http_spec.rb +23 -0
data/spec/page_spec.rb +28 -14
data/spec/page_store_spec.rb +128 -0
data/spec/storage_spec.rb +123 -0
metadata +10 -5

data/lib/anemone/{page_hash.rb → page_store.rb} RENAMED

@@ -1,21 +1,52 @@
+require 'forwardable'
 module Anemone
-  class PageHash < Hash
+  class PageStore
+    extend Forwardable
+    def_delegators :@storage, :keys, :values, :size, :each
+    def initialize(storage = {})
+      @storage = storage
+    end
     # We typically index the hash with a URI,
     # but convert it to a String for easier retrieval
     def [](index)
-      super(index.to_s)
+      @storage[index.to_s]
     end
     def []=(index, other)
-      super(index.to_s, other)
+      @storage[index.to_s] = other
+    end
+    def delete(key)
+      @storage.delete key.to_s
     end
     def has_key?(key)
-      super(key.to_s)
+      @storage.has_key? key.to_s
+    end
+    def each_value
+      each { |key, value| yield value }
+    end
+    def values
+      result = []
+      each { |key, value| result << value }
+      result
+    end
+    def touch_key(key)
+      self[key] = Page.new(key)
     end
-    # Does this PageHash contain the specified URL?
+    def touch_keys(keys)
+      @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
+    end
+    # Does this PageStore contain the specified URL?
     # HTTP and HTTPS versions of a URL are considered to be the same page.
     def has_page?(url)
       schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
         return schemes.any? { |s| u.scheme = s; has_key?(u) }
       end
-      has_key?(url)
+      has_key? url
     end
     #
     # Use a breadth-first search to calculate the single-source
-    # shortest paths from *root* to all pages in the PageHash
+    # shortest paths from *root* to all pages in the PageStore
     #
     def shortest_paths!(root)
       root = URI(root) if root.is_a?(String)
       raise "Root node not found" if !has_key?(root)
-      each_value {|p| p.visited = false if p}
       q = Queue.new
-      q.enq(root)
-      self[root].depth = 0
-      self[root].visited = true
-      while(!q.empty?)
-        url = q.deq
-        next if !has_key?(url)
-        page = self[url]
+      q.enq root
+      root_page = self[root]
+      root_page.depth = 0
+      root_page.visited = true
+      self[root] = root_page
+      while !q.empty?
+        page = self[q.deq]
         page.links.each do |u|
-          next if !has_key?(u) or self[u].nil?
-          link = self[u]
-          aliases = [link].concat(link.aliases.map {|a| self[a] })
-          aliases.each do |node|
-            if node.depth.nil? or page.depth + 1 < node.depth
-              node.depth = page.depth + 1
+          begin
+            link = self[u]
+            next if link.nil? || !link.fetched? || link.visited
+            q << u unless link.redirect?
+            link.visited = true
+            link.depth = page.depth + 1
+            self[u] = link
+            if link.redirect?
+              u = link.redirect_to
+              redo
             end
           end
-          q.enq(self[u].url) if !self[u].visited
-          self[u].visited = true
         end
       end
       self
     end
     #
-    # Returns a new PageHash by removing redirect-aliases for each
-    # non-redirect Page
+    # Removes all Pages from storage where redirect? is true
     #
-    def uniq
-      results = PageHash.new
-      each do |url, page|
-        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
-        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
-        if !page.redirect? and !page_added
-          results[url] = page.clone
-          results[url].aliases = []
-        end
-      end
-      results
+    def uniq!
+      each_value { |page| delete page.url if page.redirect? }
+      self
     end
     #
     # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
     # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
     #
     def pages_linking_to(urls)
       unless urls.is_a?(Array)
-        urls = [urls] unless urls.is_a?(Array)
+        urls = [urls]
         single = true
       end
       urls.map! do |url|
-        if url.is_a?(String)
+        unless url.is_a?(URI)
           URI(url) rescue nil
         else
           url
@@ -112,7 +130,7 @@ module Anemone
       end
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
       end
@@ -132,11 +150,11 @@ module Anemone
       links.each { |url, pages| links[url] = pages.map{|p| p.url} }
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
-      end
+      end
     end
   end
-end
+end

data/lib/anemone/storage.rb ADDED

@@ -0,0 +1,19 @@
+module Anemone
+  module Storage
+    def self.Hash(*args)
+      Hash.new(*args)
+    end
+    def self.PStore(*args)
+      require 'anemone/storage/pstore'
+      self::PStore.new(*args)
+    end
+    def self.TokyoCabinet(file)
+      require 'anemone/storage/tokyo_cabinet'
+      self::TokyoCabinet.new(file)
+    end
+  end
+end

data/lib/anemone/storage/pstore.rb ADDED

@@ -0,0 +1,48 @@
+require 'pstore'
+require 'forwardable'
+module Anemone
+  module Storage
+    class PStore
+      extend Forwardable
+      def_delegators :@keys, :has_key?, :keys, :size
+      def initialize(file)
+        File.delete(file) if File.exists?(file)
+        @store = ::PStore.new(file)
+        @keys = {}
+      end
+      def [](key)
+        @store.transaction { |s| s[key] }
+      end
+      def []=(key,value)
+        @keys[key] = nil
+        @store.transaction { |s| s[key] = value }
+      end
+      def delete(key)
+        @keys.delete(key)
+        @store.transaction { |s| s.delete key}
+      end
+      def each
+        @keys.each_key do |key|
+          value = nil
+          @store.transaction { |s| value = s[key] }
+          yield key, value
+        end
+      end
+      def merge!(hash)
+        @store.transaction do |s|
+          hash.each { |key, value| s[key] = value; @keys[key] = nil }
+        end
+        self
+      end
+    end
+  end
+end

data/lib/anemone/storage/tokyo_cabinet.rb ADDED

@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+require 'forwardable'
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+      def_delegators :@db, :close, :size, :keys, :has_key?
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+      private
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+    end
+  end
+end

data/lib/anemone/tentacle.rb CHANGED

@@ -2,7 +2,7 @@ require 'anemone/http'
 module Anemone
   class Tentacle
     #
     # Create a new Tentacle
     #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link, from_page = @link_queue.deq
+        link, referer, depth = @link_queue.deq
         break if link == :END
-        @page_queue << @http.fetch_page(link, from_page)
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private
     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end
   end
-end
+end

data/spec/anemone_spec.rb CHANGED

@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 describe Anemone do
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
-  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::PageHash)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
 end

data/spec/core_spec.rb CHANGED

@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
 module Anemone
   describe Core do
     before(:each) do
       FakeWeb.clean_registry
     end
-    it "should crawl all the html pages in a domain by following <a> href's" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1', :links => ['3'])
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-      Anemone.crawl(pages[0].url).should have(4).pages
-    end
-    it "should not leave the original domain" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
-      pages << FakePage.new('1')
-      core = Anemone.crawl(pages[0].url)
-      core.should have(2).pages
-      core.pages.keys.should_not include('http://www.other.com/')
-    end
-    it "should follow http redirects" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1', :redirect => '2')
-      pages << FakePage.new('2')
-      Anemone.crawl(pages[0].url).should have(3).pages
-    end
-    it "should accept multiple starting URLs" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2', :links => ['3'])
-      pages << FakePage.new('3')
-      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
-    end
-    it "should include the query string when following links" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1?foo=1'])
-      pages << FakePage.new('1?foo=1')
-      pages << FakePage.new('1')
-      core = Anemone.crawl(pages[0].url)
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[2].url)
-    end
-    it "should be able to skip links based on a RegEx" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/, /3/
-      end
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
-      core.pages.keys.should_not include(pages[3].url)
-    end
-    it "should be able to call a block on every page" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      count = 0
-      Anemone.crawl(pages[0].url) do |a|
-        a.on_every_page { count += 1 }
-      end
-      count.should == 3
-    end
-    it "should not discard page bodies by default" do
-      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
-    end
-    it "should optionally discard page bodies to conserve memory" do
-      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
-      core.pages.values.first.doc.should be_nil
-    end
-    it "should provide a focus_crawl method to select the links on each page to follow" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
-      end
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+        core = Anemone.crawl(pages[0].url, @opts)
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+        core = Anemone.crawl(pages[0].url, @opts)
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+        count.should == 3
+      end
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+      it "should optionally delay between page requests" do
+        delay = 0.25
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
     end
-    it "should optionally delay between page requests" do
-      delay = 0.25
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      start = Time.now
-      Anemone.crawl(pages[0].url, :delay => delay)
-      finish = Time.now
-      (finish - start).should satisfy {|t| t > delay * 2}
+    describe Hash do
+      it_should_behave_like "crawl"
+      before(:all) do
+        @opts = {}
+      end
     end
-    it "should optionally obey the robots exclusion protocol" do
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      pages << FakePage.new('robots.txt',
-                            :body => "User-agent: *\nDisallow: /1",
-                            :content_type => 'text/plain')
-      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
-      urls = core.pages.keys
-      urls.should include(pages[0].url)
-      urls.should_not include(pages[1].url)
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-    describe "many pages" do
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
       before(:each) do
-        @pages, size = [], 5
-        size.times do |n|
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
       end
-      it "should optionally limit the depth of the crawl" do
-        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
-        core.should have(4).pages
+      after(:each) do
+        @store.close
+      end
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
       end
     end
@@ -194,6 +238,25 @@ module Anemone
         core.opts[:depth_limit].should == 3
       end
+      it "should accept options via setter methods in the crawl block" do
+        core = Anemone.crawl(SPEC_DOMAIN) do |a|
+          a.verbose = false
+          a.threads = 2
+          a.discard_page_bodies = true
+          a.user_agent = 'test'
+          a.obey_robots_txt = true
+          a.depth_limit = 3
+        end
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
       it "should use 1 thread if a delay is requested" do
         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
       end