anemone 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +6 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +27 -11
- data/lib/anemone/http.rb +35 -17
- data/lib/anemone/page.rb +8 -0
- data/spec/cookie_store_spec.rb +27 -0
- data/spec/core_spec.rb +18 -0
- data/spec/http_spec.rb +3 -2
- data/spec/page_spec.rb +5 -0
- metadata +4 -2
    
        data/CHANGELOG.rdoc
    CHANGED
    
    
        data/lib/anemone/cookie_store.rb
    ADDED
    
| @@ -0,0 +1,35 @@ | |
| 1 | 
            +
            require 'delegate'
         | 
| 2 | 
            +
            require 'webrick/cookie'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            class WEBrick::Cookie
         | 
| 5 | 
            +
              def expired?
         | 
| 6 | 
            +
                !!expires && expires < Time.now
         | 
| 7 | 
            +
              end
         | 
| 8 | 
            +
            end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            module Anemone
         | 
| 11 | 
            +
              class CookieStore < DelegateClass(Hash)
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                def initialize(cookies = nil)
         | 
| 14 | 
            +
                  @cookies = {}
         | 
| 15 | 
            +
                  cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
         | 
| 16 | 
            +
                  super(@cookies)
         | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                def merge!(set_cookie_str)
         | 
| 20 | 
            +
                  begin
         | 
| 21 | 
            +
                    cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
         | 
| 22 | 
            +
                      hash[cookie.name] = cookie if !!cookie
         | 
| 23 | 
            +
                      hash
         | 
| 24 | 
            +
                    end
         | 
| 25 | 
            +
                    @cookies.merge! cookie_hash
         | 
| 26 | 
            +
                  rescue
         | 
| 27 | 
            +
                  end
         | 
| 28 | 
            +
                end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
                def to_s
         | 
| 31 | 
            +
                  @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
              end
         | 
| 35 | 
            +
            end
         | 
    
        data/lib/anemone/core.rb
    CHANGED
    
    | @@ -7,7 +7,7 @@ require 'anemone/storage' | |
| 7 7 |  | 
| 8 8 | 
             
            module Anemone
         | 
| 9 9 |  | 
| 10 | 
            -
              VERSION = '0.3.2';
         | 
| 10 | 
            +
              VERSION = '0.4.0';
         | 
| 11 11 |  | 
| 12 12 | 
             
              #
         | 
| 13 13 | 
             
              # Convenience method to start a crawl
         | 
| @@ -41,13 +41,17 @@ module Anemone | |
| 41 41 | 
             
                  # number of times HTTP redirects will be followed
         | 
| 42 42 | 
             
                  :redirect_limit => 5,
         | 
| 43 43 | 
             
                  # storage engine defaults to Hash in +process_options+ if none specified
         | 
| 44 | 
            -
                  :storage => nil
         | 
| 44 | 
            +
                  :storage => nil,
         | 
| 45 | 
            +
                  # Hash of cookie name => value to send with HTTP requests
         | 
| 46 | 
            +
                  :cookies => nil,
         | 
| 47 | 
            +
                  # accept cookies from the server and send them back?
         | 
| 48 | 
            +
                  :accept_cookies => false
         | 
| 45 49 | 
             
                }
         | 
| 46 50 |  | 
| 47 51 | 
             
                # Create setter methods for all options to be called from the crawl block
         | 
| 48 52 | 
             
                DEFAULT_OPTS.keys.each do |key|
         | 
| 49 | 
            -
                  define_method "#{key}=" do  | 
| 50 | 
            -
                    @opts[key.to_sym] =  | 
| 53 | 
            +
                  define_method "#{key}=" do |value|
         | 
| 54 | 
            +
                    @opts[key.to_sym] = value
         | 
| 51 55 | 
             
                  end
         | 
| 52 56 | 
             
                end
         | 
| 53 57 |  | 
| @@ -173,7 +177,7 @@ module Anemone | |
| 173 177 | 
             
                    end
         | 
| 174 178 | 
             
                  end
         | 
| 175 179 |  | 
| 176 | 
            -
                  @tentacles.each { | | 
| 180 | 
            +
                  @tentacles.each { |thread| thread.join }
         | 
| 177 181 | 
             
                  do_after_crawl_blocks
         | 
| 178 182 | 
             
                  self
         | 
| 179 183 | 
             
                end
         | 
| @@ -185,25 +189,37 @@ module Anemone | |
| 185 189 | 
             
                  @opts[:threads] = 1 if @opts[:delay] > 0
         | 
| 186 190 | 
             
                  @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
         | 
| 187 191 | 
             
                  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                  freeze_options
         | 
| 194 | 
            +
                end
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                #
         | 
| 197 | 
            +
                # Freeze the opts Hash so that no options can be modified
         | 
| 198 | 
            +
                # once the crawl begins
         | 
| 199 | 
            +
                #
         | 
| 200 | 
            +
                def freeze_options
         | 
| 201 | 
            +
                  @opts.freeze
         | 
| 202 | 
            +
                  @opts.each_key { |key| @opts[key].freeze }
         | 
| 203 | 
            +
                  @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
         | 
| 188 204 | 
             
                end
         | 
| 189 205 |  | 
| 190 206 | 
             
                #
         | 
| 191 207 | 
             
                # Execute the after_crawl blocks
         | 
| 192 208 | 
             
                #
         | 
| 193 209 | 
             
                def do_after_crawl_blocks
         | 
| 194 | 
            -
                  @after_crawl_blocks.each { | | 
| 210 | 
            +
                  @after_crawl_blocks.each { |block| block.call(@pages) }
         | 
| 195 211 | 
             
                end
         | 
| 196 212 |  | 
| 197 213 | 
             
                #
         | 
| 198 214 | 
             
                # Execute the on_every_page blocks for *page*
         | 
| 199 215 | 
             
                #
         | 
| 200 216 | 
             
                def do_page_blocks(page)
         | 
| 201 | 
            -
                  @on_every_page_blocks.each do | | 
| 202 | 
            -
                     | 
| 217 | 
            +
                  @on_every_page_blocks.each do |block|
         | 
| 218 | 
            +
                    block.call(page)
         | 
| 203 219 | 
             
                  end
         | 
| 204 220 |  | 
| 205 | 
            -
                  @on_pages_like_blocks.each do |pattern,  | 
| 206 | 
            -
                     | 
| 221 | 
            +
                  @on_pages_like_blocks.each do |pattern, blocks|
         | 
| 222 | 
            +
                    blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
         | 
| 207 223 | 
             
                  end
         | 
| 208 224 | 
             
                end
         | 
| 209 225 |  | 
| @@ -241,7 +257,7 @@ module Anemone | |
| 241 257 | 
             
                # its URL matches a skip_link pattern.
         | 
| 242 258 | 
             
                #
         | 
| 243 259 | 
             
                def skip_link?(link)
         | 
| 244 | 
            -
                  @skip_link_patterns.any? { | | 
| 260 | 
            +
                  @skip_link_patterns.any? { |pattern| link.path =~ pattern }
         | 
| 245 261 | 
             
                end
         | 
| 246 262 |  | 
| 247 263 | 
             
              end
         | 
    
        data/lib/anemone/http.rb
    CHANGED
    
    | @@ -1,14 +1,19 @@ | |
| 1 1 | 
             
            require 'net/https'
         | 
| 2 2 | 
             
            require 'anemone/page'
         | 
| 3 | 
            +
            require 'anemone/cookie_store'
         | 
| 3 4 |  | 
| 4 5 | 
             
            module Anemone
         | 
| 5 6 | 
             
              class HTTP
         | 
| 6 7 | 
             
                # Maximum number of redirects to follow on each get_response
         | 
| 7 8 | 
             
                REDIRECT_LIMIT = 5
         | 
| 8 9 |  | 
| 10 | 
            +
                # CookieStore for this HTTP client
         | 
| 11 | 
            +
                attr_reader :cookie_store
         | 
| 12 | 
            +
             | 
| 9 13 | 
             
                def initialize(opts = {})
         | 
| 10 14 | 
             
                  @connections = {}
         | 
| 11 15 | 
             
                  @opts = opts
         | 
| 16 | 
            +
                  @cookie_store = CookieStore.new(@opts[:cookies])
         | 
| 12 17 | 
             
                end
         | 
| 13 18 |  | 
| 14 19 | 
             
                #
         | 
| @@ -47,6 +52,28 @@ module Anemone | |
| 47 52 | 
             
                  end
         | 
| 48 53 | 
             
                end
         | 
| 49 54 |  | 
| 55 | 
            +
                #
         | 
| 56 | 
            +
                # The maximum number of redirects to follow
         | 
| 57 | 
            +
                #
         | 
| 58 | 
            +
                def redirect_limit
         | 
| 59 | 
            +
                  @opts[:redirect_limit] || REDIRECT_LIMIT
         | 
| 60 | 
            +
                end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                #
         | 
| 63 | 
            +
                # The user-agent string which will be sent with each request,
         | 
| 64 | 
            +
                # or nil if no such option is set
         | 
| 65 | 
            +
                #
         | 
| 66 | 
            +
                def user_agent
         | 
| 67 | 
            +
                  @opts[:user_agent]
         | 
| 68 | 
            +
                end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                #
         | 
| 71 | 
            +
                # Does this HTTP client accept cookies from the server?
         | 
| 72 | 
            +
                #
         | 
| 73 | 
            +
                def accept_cookies?
         | 
| 74 | 
            +
                  @opts[:accept_cookies]
         | 
| 75 | 
            +
                end
         | 
| 76 | 
            +
             | 
| 50 77 | 
             
                private
         | 
| 51 78 |  | 
| 52 79 | 
             
                #
         | 
| @@ -55,22 +82,19 @@ module Anemone | |
| 55 82 | 
             
                # for each response.
         | 
| 56 83 | 
             
                #
         | 
| 57 84 | 
             
                def get(url, referer = nil)
         | 
| 58 | 
            -
                  response, response_time = get_response(url, referer)
         | 
| 59 | 
            -
                  code = Integer(response.code)
         | 
| 60 | 
            -
                  loc = url
         | 
| 61 | 
            -
                  redirect_to = response.is_a?(Net::HTTPRedirection) ?  URI(response['location']) : nil
         | 
| 62 | 
            -
                  yield response, code, loc, redirect_to, response_time
         | 
| 63 | 
            -
             | 
| 64 85 | 
             
                  limit = redirect_limit
         | 
| 65 | 
            -
                   | 
| 66 | 
            -
             | 
| 86 | 
            +
                  loc = url
         | 
| 87 | 
            +
                  begin
         | 
| 88 | 
            +
                      # if redirected to a relative url, merge it with the host of the original
         | 
| 89 | 
            +
                      # request url
         | 
| 67 90 | 
             
                      loc = url.merge(loc) if loc.relative?
         | 
| 91 | 
            +
             | 
| 68 92 | 
             
                      response, response_time = get_response(loc, referer)
         | 
| 69 93 | 
             
                      code = Integer(response.code)
         | 
| 70 94 | 
             
                      redirect_to = response.is_a?(Net::HTTPRedirection) ?  URI(response['location']) : nil
         | 
| 71 95 | 
             
                      yield response, code, loc, redirect_to, response_time
         | 
| 72 96 | 
             
                      limit -= 1
         | 
| 73 | 
            -
                  end
         | 
| 97 | 
            +
                  end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
         | 
| 74 98 | 
             
                end
         | 
| 75 99 |  | 
| 76 100 | 
             
                #
         | 
| @@ -82,6 +106,7 @@ module Anemone | |
| 82 106 | 
             
                  opts = {}
         | 
| 83 107 | 
             
                  opts['User-Agent'] = user_agent if user_agent
         | 
| 84 108 | 
             
                  opts['Referer'] = referer.to_s if referer
         | 
| 109 | 
            +
                  opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
         | 
| 85 110 |  | 
| 86 111 | 
             
                  retries = 0
         | 
| 87 112 | 
             
                  begin
         | 
| @@ -89,6 +114,7 @@ module Anemone | |
| 89 114 | 
             
                    response = connection(url).get(full_path, opts)
         | 
| 90 115 | 
             
                    finish = Time.now()
         | 
| 91 116 | 
             
                    response_time = ((finish - start) * 1000).round
         | 
| 117 | 
            +
                    @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
         | 
| 92 118 | 
             
                    return response, response_time
         | 
| 93 119 | 
             
                  rescue EOFError
         | 
| 94 120 | 
             
                    refresh_connection(url)
         | 
| @@ -116,14 +142,6 @@ module Anemone | |
| 116 142 | 
             
                  @connections[url.host][url.port] = http.start
         | 
| 117 143 | 
             
                end
         | 
| 118 144 |  | 
| 119 | 
            -
                def redirect_limit
         | 
| 120 | 
            -
                  @opts[:redirect_limit] || REDIRECT_LIMIT
         | 
| 121 | 
            -
                end
         | 
| 122 | 
            -
             | 
| 123 | 
            -
                def user_agent
         | 
| 124 | 
            -
                  @opts[:user_agent]
         | 
| 125 | 
            -
                end
         | 
| 126 | 
            -
             | 
| 127 145 | 
             
                def verbose?
         | 
| 128 146 | 
             
                  @opts[:verbose]
         | 
| 129 147 | 
             
                end
         | 
    
        data/lib/anemone/page.rb
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 1 | 
             
            require 'nokogiri'
         | 
| 2 2 | 
             
            require 'ostruct'
         | 
| 3 | 
            +
            require 'webrick/cookie'
         | 
| 3 4 |  | 
| 4 5 | 
             
            module Anemone
         | 
| 5 6 | 
             
              class Page
         | 
| @@ -92,6 +93,13 @@ module Anemone | |
| 92 93 | 
             
                  @fetched
         | 
| 93 94 | 
             
                end
         | 
| 94 95 |  | 
| 96 | 
            +
                #
         | 
| 97 | 
            +
                # Array of cookies received with this page as WEBrick::Cookie objects.
         | 
| 98 | 
            +
                #
         | 
| 99 | 
            +
                def cookies
         | 
| 100 | 
            +
                  WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
         | 
| 101 | 
            +
                end
         | 
| 102 | 
            +
             | 
| 95 103 | 
             
                #
         | 
| 96 104 | 
             
                # The content-type returned by the HTTP request for this page
         | 
| 97 105 | 
             
                #
         | 
    
        data/spec/cookie_store_spec.rb
    ADDED
    
| @@ -0,0 +1,27 @@ | |
| 1 | 
            +
            require File.dirname(__FILE__) + '/spec_helper'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Anemone
         | 
| 4 | 
            +
              describe CookieStore do
         | 
| 5 | 
            +
             | 
| 6 | 
            +
                it "should start out empty if no cookies are specified" do
         | 
| 7 | 
            +
                  CookieStore.new.empty?.should be true
         | 
| 8 | 
            +
                end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                it "should accept a Hash of cookies in the constructor" do
         | 
| 11 | 
            +
                  CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                it "should be able to merge an HTTP cookie string" do
         | 
| 15 | 
            +
                  cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
         | 
| 16 | 
            +
                  cs.merge! "a=A; path=/, c=C; path=/"
         | 
| 17 | 
            +
                  cs['a'].value.should == 'A'
         | 
| 18 | 
            +
                  cs['b'].value.should == 'b'
         | 
| 19 | 
            +
                  cs['c'].value.should == 'C'
         | 
| 20 | 
            +
                end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
         | 
| 23 | 
            +
                  CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
         | 
| 24 | 
            +
                end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              end
         | 
| 27 | 
            +
            end
         | 
    
        data/spec/core_spec.rb
    CHANGED
    
    | @@ -154,6 +154,24 @@ module Anemone | |
| 154 154 | 
             
                    urls.should_not include(pages[1].url)
         | 
| 155 155 | 
             
                  end
         | 
| 156 156 |  | 
| 157 | 
            +
                  it "should be able to set cookies to send with HTTP requests" do
         | 
| 158 | 
            +
                    cookies = {:a => '1', :b => '2'}
         | 
| 159 | 
            +
                    core = Anemone.crawl(FakePage.new('0').url) do |anemone|
         | 
| 160 | 
            +
                      anemone.cookies = cookies
         | 
| 161 | 
            +
                    end
         | 
| 162 | 
            +
                    core.opts[:cookies].should == cookies
         | 
| 163 | 
            +
                  end
         | 
| 164 | 
            +
             | 
| 165 | 
            +
                  it "should freeze the options once the crawl begins" do
         | 
| 166 | 
            +
                    core = Anemone.crawl(FakePage.new('0').url) do |anemone|
         | 
| 167 | 
            +
                      anemone.threads = 4
         | 
| 168 | 
            +
                      anemone.on_every_page do
         | 
| 169 | 
            +
                        lambda {anemone.threads = 2}.should raise_error
         | 
| 170 | 
            +
                      end
         | 
| 171 | 
            +
                    end
         | 
| 172 | 
            +
                    core.opts[:threads].should == 4
         | 
| 173 | 
            +
                  end
         | 
| 174 | 
            +
             | 
| 157 175 | 
             
                  describe "many pages" do
         | 
| 158 176 | 
             
                    before(:each) do
         | 
| 159 177 | 
             
                      @pages, size = [], 5
         | 
    
        data/spec/http_spec.rb
    CHANGED
    
    
    
        data/spec/page_spec.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: anemone
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0.3.2
         | 
| 4 | 
            +
              version: 0.4.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - Chris Kite
         | 
| @@ -9,7 +9,7 @@ autorequire: | |
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 11 |  | 
| 12 | 
            -
            date: 2010- | 
| 12 | 
            +
            date: 2010-04-08 00:00:00 -05:00
         | 
| 13 13 | 
             
            default_executable: 
         | 
| 14 14 | 
             
            dependencies: 
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -46,6 +46,7 @@ files: | |
| 46 46 | 
             
            - README.rdoc
         | 
| 47 47 | 
             
            - bin/anemone
         | 
| 48 48 | 
             
            - lib/anemone.rb
         | 
| 49 | 
            +
            - lib/anemone/cookie_store.rb
         | 
| 49 50 | 
             
            - lib/anemone/core.rb
         | 
| 50 51 | 
             
            - lib/anemone/http.rb
         | 
| 51 52 | 
             
            - lib/anemone/page.rb
         | 
| @@ -93,6 +94,7 @@ specification_version: 3 | |
| 93 94 | 
             
            summary: Anemone web-spider framework
         | 
| 94 95 | 
             
            test_files: 
         | 
| 95 96 | 
             
            - spec/anemone_spec.rb
         | 
| 97 | 
            +
            - spec/cookie_store_spec.rb
         | 
| 96 98 | 
             
            - spec/core_spec.rb
         | 
| 97 99 | 
             
            - spec/page_spec.rb
         | 
| 98 100 | 
             
            - spec/page_store_spec.rb
         |