cobweb 0.0.19 → 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cobweb.rb +32 -23
 - data/lib/cobweb/version.rb +1 -0
 - data/lib/crawl_job.rb +3 -2
 - data/lib/robots.rb +9 -0
 - data/lib/stats.rb +9 -6
 - data/spec/cobweb/cobweb_spec.rb +62 -13
 - data/spec/spec_helper.rb +12 -0
 - metadata +24 -22
 
    
        data/lib/cobweb.rb
    CHANGED
    
    | 
         @@ -29,8 +29,7 @@ class Cobweb 
     | 
|
| 
       29 
29 
     | 
    
         
             
                @options[:debug] = false unless @options.has_key?(:debug)
         
     | 
| 
       30 
30 
     | 
    
         
             
                @options[:cache] = 300 unless @options.has_key?(:cache)
         
     | 
| 
       31 
31 
     | 
    
         
             
                @options[:timeout] = 10 unless @options.has_key?(:timeout)
         
     | 
| 
       32 
     | 
    
         
            -
                @options[:redis_options] = {} unless @options.has_key?(:redis_options)
         
     | 
| 
       33 
     | 
    
         
            -
                
         
     | 
| 
      
 32 
     | 
    
         
            +
                @options[:redis_options] = {} unless @options.has_key?(:redis_options)    
         
     | 
| 
       34 
33 
     | 
    
         
             
              end
         
     | 
| 
       35 
34 
     | 
    
         | 
| 
       36 
35 
     | 
    
         
             
              def start(base_url)
         
     | 
| 
         @@ -41,33 +40,40 @@ class Cobweb 
     | 
|
| 
       41 
40 
     | 
    
         
             
                }  
         
     | 
| 
       42 
41 
     | 
    
         | 
| 
       43 
42 
     | 
    
         
             
                request.merge!(@options)
         
     | 
| 
       44 
     | 
    
         
            -
                redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
         
     | 
| 
       45 
     | 
    
         
            -
                redis.hset "statistics", "queued_at", DateTime.now
         
     | 
| 
      
 43 
     | 
    
         
            +
                @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
         
     | 
| 
      
 44 
     | 
    
         
            +
                @redis.hset "statistics", "queued_at", DateTime.now
         
     | 
| 
       46 
45 
     | 
    
         | 
| 
       47 
46 
     | 
    
         
             
                Resque.enqueue(CrawlJob, request)
         
     | 
| 
       48 
47 
     | 
    
         
             
              end
         
     | 
| 
       49 
48 
     | 
    
         | 
| 
       50 
     | 
    
         
            -
              def get(url,  
     | 
| 
      
 49 
     | 
    
         
            +
              def get(url, options = @options)
         
     | 
| 
       51 
50 
     | 
    
         | 
| 
       52 
     | 
    
         
            -
                raise "url cannot be nil" if url.nil? 
     | 
| 
      
 51 
     | 
    
         
            +
                raise "url cannot be nil" if url.nil?
         
     | 
| 
       53 
52 
     | 
    
         | 
| 
       54 
53 
     | 
    
         
             
                absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
         
     | 
| 
       55 
54 
     | 
    
         | 
| 
       56 
55 
     | 
    
         
             
                # get the unique id for this request
         
     | 
| 
       57 
56 
     | 
    
         
             
                unique_id = Digest::SHA1.hexdigest(url.to_s)
         
     | 
| 
      
 57 
     | 
    
         
            +
                redirect_limit = options[:redirect_limit]
         
     | 
| 
       58 
58 
     | 
    
         | 
| 
       59 
59 
     | 
    
         
             
                # connect to redis
         
     | 
| 
       60 
     | 
    
         
            -
                 
     | 
| 
      
 60 
     | 
    
         
            +
                ap options
         
     | 
| 
      
 61 
     | 
    
         
            +
                if options.has_key? :crawl_id
         
     | 
| 
      
 62 
     | 
    
         
            +
                  redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
         
     | 
| 
      
 63 
     | 
    
         
            +
                else
         
     | 
| 
      
 64 
     | 
    
         
            +
                  redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
         
     | 
| 
      
 65 
     | 
    
         
            +
                end
         
     | 
| 
       61 
66 
     | 
    
         | 
| 
      
 67 
     | 
    
         
            +
                ap "===== HEAD NAMESPACE ====="
         
     | 
| 
      
 68 
     | 
    
         
            +
                ap redis.namespace
         
     | 
| 
      
 69 
     | 
    
         
            +
                ap "===== HEAD NAMESPACE ====="
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
       62 
71 
     | 
    
         
             
                content = {}
         
     | 
| 
       63 
72 
     | 
    
         | 
| 
       64 
73 
     | 
    
         
             
                # check if it has already been cached
         
     | 
| 
       65 
74 
     | 
    
         
             
                if redis.get(unique_id) and @options[:cache]
         
     | 
| 
       66 
75 
     | 
    
         
             
                  puts "Cache hit for #{url}" unless @options[:quiet]
         
     | 
| 
       67 
     | 
    
         
            -
                  content =  
     | 
| 
       68 
     | 
    
         
            -
                  content[:body] = Base64.decode64(content[:body])
         
     | 
| 
       69 
     | 
    
         
            -
                  
         
     | 
| 
       70 
     | 
    
         
            -
                  content
         
     | 
| 
      
 76 
     | 
    
         
            +
                  content = Marshal.load(redis.get(unique_id)).deep_symbolize_keys
         
     | 
| 
       71 
77 
     | 
    
         
             
                else
         
     | 
| 
       72 
78 
     | 
    
         
             
                  # this url is valid for processing so lets get on with it
         
     | 
| 
       73 
79 
     | 
    
         
             
                  uri = Addressable::URI.parse(url.strip)
         
     | 
| 
         @@ -143,8 +149,7 @@ class Cobweb 
     | 
|
| 
       143 
149 
     | 
    
         
             
                    end
         
     | 
| 
       144 
150 
     | 
    
         
             
                    # add content to cache if required
         
     | 
| 
       145 
151 
     | 
    
         
             
                    if @options[:cache]
         
     | 
| 
       146 
     | 
    
         
            -
                       
     | 
| 
       147 
     | 
    
         
            -
                      redis.set(unique_id, content.to_json)
         
     | 
| 
      
 152 
     | 
    
         
            +
                      redis.set(unique_id, Marshal.dump(content))
         
     | 
| 
       148 
153 
     | 
    
         
             
                      redis.expire unique_id, @options[:cache].to_i
         
     | 
| 
       149 
154 
     | 
    
         
             
                    end
         
     | 
| 
       150 
155 
     | 
    
         
             
                  rescue RedirectError => e
         
     | 
| 
         @@ -196,28 +201,32 @@ class Cobweb 
     | 
|
| 
       196 
201 
     | 
    
         
             
                content  
         
     | 
| 
       197 
202 
     | 
    
         
             
              end
         
     | 
| 
       198 
203 
     | 
    
         | 
| 
       199 
     | 
    
         
            -
              def head(url,  
     | 
| 
      
 204 
     | 
    
         
            +
              def head(url, options = @options)
         
     | 
| 
       200 
205 
     | 
    
         
             
                raise "url cannot be nil" if url.nil?    
         
     | 
| 
       201 
206 
     | 
    
         | 
| 
       202 
207 
     | 
    
         
             
                absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         
     | 
| 
       203 
208 
     | 
    
         | 
| 
       204 
209 
     | 
    
         
             
                # get the unique id for this request
         
     | 
| 
       205 
210 
     | 
    
         
             
                unique_id = Digest::SHA1.hexdigest(url)
         
     | 
| 
      
 211 
     | 
    
         
            +
                redirect_limit = options[:redirect_limit]
         
     | 
| 
       206 
212 
     | 
    
         | 
| 
       207 
213 
     | 
    
         
             
                # connect to redis
         
     | 
| 
       208 
     | 
    
         
            -
                 
     | 
| 
      
 214 
     | 
    
         
            +
                if options.has_key? :crawl_id
         
     | 
| 
      
 215 
     | 
    
         
            +
                  redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
         
     | 
| 
      
 216 
     | 
    
         
            +
                else
         
     | 
| 
      
 217 
     | 
    
         
            +
                  redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
         
     | 
| 
      
 218 
     | 
    
         
            +
                end
         
     | 
| 
       209 
219 
     | 
    
         | 
| 
      
 220 
     | 
    
         
            +
                ap "===== HEAD NAMESPACE ====="
         
     | 
| 
      
 221 
     | 
    
         
            +
                ap redis.namespace
         
     | 
| 
      
 222 
     | 
    
         
            +
                ap "===== HEAD NAMESPACE ====="
         
     | 
| 
      
 223 
     | 
    
         
            +
             
     | 
| 
       210 
224 
     | 
    
         
             
                content = {}
         
     | 
| 
       211 
225 
     | 
    
         | 
| 
       212 
226 
     | 
    
         
             
                # check if it has already been cached
         
     | 
| 
       213 
     | 
    
         
            -
                if  
     | 
| 
      
 227 
     | 
    
         
            +
                if redis.get("head-#{unique_id}") and @options[:cache]
         
     | 
| 
       214 
228 
     | 
    
         
             
                  puts "Cache hit for #{url}" unless @options[:quiet]
         
     | 
| 
       215 
     | 
    
         
            -
                   
     | 
| 
       216 
     | 
    
         
            -
                    content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
         
     | 
| 
       217 
     | 
    
         
            -
                  else
         
     | 
| 
       218 
     | 
    
         
            -
                    content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
         
     | 
| 
       219 
     | 
    
         
            -
                  end
         
     | 
| 
       220 
     | 
    
         
            -
                  content
         
     | 
| 
      
 229 
     | 
    
         
            +
                  Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
         
     | 
| 
       221 
230 
     | 
    
         
             
                else
         
     | 
| 
       222 
231 
     | 
    
         
             
                  print "Retrieving #{url }... " unless @options[:quiet]
         
     | 
| 
       223 
232 
     | 
    
         
             
                  uri = Addressable::URI.parse(url.strip)
         
     | 
| 
         @@ -259,7 +268,7 @@ class Cobweb 
     | 
|
| 
       259 
268 
     | 
    
         
             
                      # add content to cache if required
         
     | 
| 
       260 
269 
     | 
    
         
             
                      if @options[:cache]
         
     | 
| 
       261 
270 
     | 
    
         
             
                        puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
         
     | 
| 
       262 
     | 
    
         
            -
                        redis.set("head-#{unique_id}", content 
     | 
| 
      
 271 
     | 
    
         
            +
                        redis.set("head-#{unique_id}", Marshal.dump(content))
         
     | 
| 
       263 
272 
     | 
    
         
             
                        redis.expire "head-#{unique_id}", @options[:cache].to_i
         
     | 
| 
       264 
273 
     | 
    
         
             
                      else
         
     | 
| 
       265 
274 
     | 
    
         
             
                        puts "Not storing in cache as cache disabled" if @options[:debug]
         
     | 
| 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            VERSION = "0.0.20"
         
     | 
    
        data/lib/crawl_job.rb
    CHANGED
    
    | 
         @@ -28,7 +28,8 @@ class CrawlJob 
     | 
|
| 
       28 
28 
     | 
    
         
             
              def self.perform(content_request)
         
     | 
| 
       29 
29 
     | 
    
         
             
                # change all hash keys to symbols    
         
     | 
| 
       30 
30 
     | 
    
         
             
                content_request.deep_symbolize_keys
         
     | 
| 
       31 
     | 
    
         
            -
                redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
         
     | 
| 
      
 31 
     | 
    
         
            +
                redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
         
     | 
| 
      
 32 
     | 
    
         
            +
                ap redis.namespace
         
     | 
| 
       32 
33 
     | 
    
         
             
                @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
         
     | 
| 
       33 
34 
     | 
    
         | 
| 
       34 
35 
     | 
    
         
             
                # check we haven't crawled this url before
         
     | 
| 
         @@ -40,7 +41,7 @@ class CrawlJob 
     | 
|
| 
       40 
41 
     | 
    
         
             
                  redis.incr "crawl-counter"
         
     | 
| 
       41 
42 
     | 
    
         
             
                  crawl_counter += 1
         
     | 
| 
       42 
43 
     | 
    
         
             
                  if crawl_counter <= content_request[:crawl_limit].to_i
         
     | 
| 
       43 
     | 
    
         
            -
                    content = Cobweb.new(content_request).get(content_request[:url])
         
     | 
| 
      
 44 
     | 
    
         
            +
                    content = Cobweb.new(content_request).get(content_request[:url], content_request)
         
     | 
| 
       44 
45 
     | 
    
         | 
| 
       45 
46 
     | 
    
         
             
                    ## update statistics
         
     | 
| 
       46 
47 
     | 
    
         
             
                    if redis.hexists "statistics", "average_response_time"
         
     | 
    
        data/lib/robots.rb
    ADDED
    
    | 
         @@ -0,0 +1,9 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            class Robots
         
     | 
| 
      
 2 
     | 
    
         
            +
              
         
     | 
| 
      
 3 
     | 
    
         
            +
              def initialize(url, file_name="robots.txt")
         
     | 
| 
      
 4 
     | 
    
         
            +
                uri = URI.parse(url)
         
     | 
| 
      
 5 
     | 
    
         
            +
                [uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
         
     | 
| 
      
 6 
     | 
    
         
            +
                Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
         
     | 
| 
      
 7 
     | 
    
         
            +
                
         
     | 
| 
      
 8 
     | 
    
         
            +
              end
         
     | 
| 
      
 9 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/stats.rb
    CHANGED
    
    | 
         @@ -20,12 +20,15 @@ class Stats < Sinatra::Base 
     | 
|
| 
       20 
20 
     | 
    
         
             
                haml :statistics
         
     | 
| 
       21 
21 
     | 
    
         
             
              end
         
     | 
| 
       22 
22 
     | 
    
         | 
| 
       23 
     | 
    
         
            -
            end
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
            thread = Thread.new do
         
     | 
| 
       26 
     | 
    
         
            -
              Stats.run!
         
     | 
| 
       27 
23 
     | 
    
         | 
| 
       28 
     | 
    
         
            -
               
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
      
 24 
     | 
    
         
            +
              def self.start
         
     | 
| 
      
 25 
     | 
    
         
            +
                thread = Thread.new do
         
     | 
| 
      
 26 
     | 
    
         
            +
                  Stats.run!
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
                  ## we need to manually kill the main thread as sinatra traps the interrupts
         
     | 
| 
      
 29 
     | 
    
         
            +
                  Thread.main.kill
         
     | 
| 
      
 30 
     | 
    
         
            +
                end    
         
     | 
| 
      
 31 
     | 
    
         
            +
              end
         
     | 
| 
       30 
32 
     | 
    
         
             
            end
         
     | 
| 
       31 
33 
     | 
    
         | 
| 
      
 34 
     | 
    
         
            +
             
     | 
    
        data/spec/cobweb/cobweb_spec.rb
    CHANGED
    
    | 
         @@ -169,19 +169,68 @@ describe Cobweb do 
     | 
|
| 
       169 
169 
     | 
    
         | 
| 
       170 
170 
     | 
    
         
             
                    end
         
     | 
| 
       171 
171 
     | 
    
         
             
                  end
         
     | 
| 
      
 172 
     | 
    
         
            +
                  
         
     | 
| 
      
 173 
     | 
    
         
            +
                  describe "with cache" do
         
     | 
| 
      
 174 
     | 
    
         
            +
                    
         
     | 
| 
      
 175 
     | 
    
         
            +
                    before(:each) do
         
     | 
| 
      
 176 
     | 
    
         
            +
                      @cobweb = Cobweb.new :quiet => true, :cache => 200
         
     | 
| 
      
 177 
     | 
    
         
            +
                    end
         
     | 
| 
      
 178 
     | 
    
         
            +
                    
         
     | 
| 
      
 179 
     | 
    
         
            +
                    describe "content object" do
         
     | 
| 
      
 180 
     | 
    
         
            +
                      it "should return the url" do
         
     | 
| 
      
 181 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:url].should == @base_url
         
     | 
| 
      
 182 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:url].should == @base_url
         
     | 
| 
      
 183 
     | 
    
         
            +
                      end
         
     | 
| 
      
 184 
     | 
    
         
            +
                      it "should return correct content-type" do
         
     | 
| 
      
 185 
     | 
    
         
            +
                        @mock_http_response.stub!(:content_type).and_return("image/jpeg")
         
     | 
| 
      
 186 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
         
     | 
| 
      
 187 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
         
     | 
| 
      
 188 
     | 
    
         
            +
                      end
         
     | 
| 
      
 189 
     | 
    
         
            +
                      it "should return correct status-code" do
         
     | 
| 
      
 190 
     | 
    
         
            +
                        @mock_http_response.stub!(:code).and_return(404)
         
     | 
| 
      
 191 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:status_code].should == 404
         
     | 
| 
      
 192 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:status_code].should == 404
         
     | 
| 
      
 193 
     | 
    
         
            +
                      end
         
     | 
| 
      
 194 
     | 
    
         
            +
                      it "should return correct status-code" do
         
     | 
| 
      
 195 
     | 
    
         
            +
                        @mock_http_response.stub!(:code).and_return(404)
         
     | 
| 
      
 196 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:status_code].should == 404
         
     | 
| 
      
 197 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:status_code].should == 404
         
     | 
| 
      
 198 
     | 
    
         
            +
                      end
         
     | 
| 
      
 199 
     | 
    
         
            +
                      it "should return correct character_set" do
         
     | 
| 
      
 200 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:character_set].should == "UTF-8"
         
     | 
| 
      
 201 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:character_set].should == "UTF-8"
         
     | 
| 
      
 202 
     | 
    
         
            +
                      end 
         
     | 
| 
      
 203 
     | 
    
         
            +
                      it "should return correct content_length" do
         
     | 
| 
      
 204 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:length].should == 1024
         
     | 
| 
      
 205 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:length].should == 1024
         
     | 
| 
      
 206 
     | 
    
         
            +
                      end
         
     | 
| 
      
 207 
     | 
    
         
            +
                      it "should return correct content_body" do
         
     | 
| 
      
 208 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:body].should == "asdf"
         
     | 
| 
      
 209 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:body].should == "asdf"
         
     | 
| 
      
 210 
     | 
    
         
            +
                      end
         
     | 
| 
      
 211 
     | 
    
         
            +
                      it "should return correct location" do
         
     | 
| 
      
 212 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:location].should == nil
         
     | 
| 
      
 213 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:location].should == nil
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
                        @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
         
     | 
| 
      
 216 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:location].should == "http://google.com/"
         
     | 
| 
      
 217 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:location].should == "http://google.com/"
         
     | 
| 
      
 218 
     | 
    
         
            +
                      end
         
     | 
| 
      
 219 
     | 
    
         
            +
                      it "should return correct headers" do
         
     | 
| 
      
 220 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:headers].should == @default_headers
         
     | 
| 
      
 221 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:headers].should == @default_headers
         
     | 
| 
      
 222 
     | 
    
         
            +
                      end
         
     | 
| 
      
 223 
     | 
    
         
            +
                      it "should return correct a hash of links" do
         
     | 
| 
      
 224 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
         
     | 
| 
      
 225 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
         
     | 
| 
      
 226 
     | 
    
         
            +
                      end 
         
     | 
| 
      
 227 
     | 
    
         
            +
                      it "should return the response time for the url" do
         
     | 
| 
      
 228 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float 
         
     | 
| 
      
 229 
     | 
    
         
            +
                        @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float 
         
     | 
| 
      
 230 
     | 
    
         
            +
                      end
         
     | 
| 
      
 231 
     | 
    
         
            +
                    end
         
     | 
| 
      
 232 
     | 
    
         
            +
                    
         
     | 
| 
      
 233 
     | 
    
         
            +
                  end
         
     | 
| 
       172 
234 
     | 
    
         
             
                end  
         
     | 
| 
       173 
235 
     | 
    
         
             
              end  
         
     | 
| 
       174 
     | 
    
         
            -
             
     | 
| 
       175 
     | 
    
         
            -
              describe "without mock" do
         
     | 
| 
       176 
     | 
    
         
            -
                
         
     | 
| 
       177 
     | 
    
         
            -
                it "should throw exception when server is unavailable" #do
         
     | 
| 
       178 
     | 
    
         
            -
                #  lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
         
     | 
| 
       179 
     | 
    
         
            -
                #end
         
     | 
| 
       180 
     | 
    
         
            -
                
         
     | 
| 
       181 
     | 
    
         
            -
                it "should return a valid content hash when url doesn't exist on a live server" do
         
     | 
| 
       182 
     | 
    
         
            -
                  status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
         
     | 
| 
       183 
     | 
    
         
            -
                  status_code.should == 404
         
     | 
| 
       184 
     | 
    
         
            -
                end
         
     | 
| 
       185 
     | 
    
         
            -
                
         
     | 
| 
       186 
     | 
    
         
            -
              end
         
     | 
| 
       187 
236 
     | 
    
         
             
            end 
         
     | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    | 
         @@ -1 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'mock_redis'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            RSpec.configure do |config|
         
     | 
| 
      
 5 
     | 
    
         
            +
              config.before(:each) {
         
     | 
| 
      
 6 
     | 
    
         
            +
                #redis_mock = double("redis")
         
     | 
| 
      
 7 
     | 
    
         
            +
                #ap redis_mock
         
     | 
| 
      
 8 
     | 
    
         
            +
                #redis_mock.stub(:new).and_return(MockRedis.new)
         
     | 
| 
      
 9 
     | 
    
         
            +
                
         
     | 
| 
      
 10 
     | 
    
         
            +
                Redis.new.flushdb
         
     | 
| 
      
 11 
     | 
    
         
            +
              }
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: cobweb
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.0.19
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.20
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       7 
7 
     | 
    
         
             
            authors:
         
     | 
| 
         @@ -9,11 +9,11 @@ authors: 
     | 
|
| 
       9 
9 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       10 
10 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       11 
11 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       12 
     | 
    
         
            -
            date: 2012-03- 
     | 
| 
      
 12 
     | 
    
         
            +
            date: 2012-03-09 00:00:00.000000000 Z
         
     | 
| 
       13 
13 
     | 
    
         
             
            dependencies:
         
     | 
| 
       14 
14 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       15 
15 
     | 
    
         
             
              name: resque
         
     | 
| 
       16 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 16 
     | 
    
         
            +
              requirement: &70234753393720 !ruby/object:Gem::Requirement
         
     | 
| 
       17 
17 
     | 
    
         
             
                none: false
         
     | 
| 
       18 
18 
     | 
    
         
             
                requirements:
         
     | 
| 
       19 
19 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -21,10 +21,10 @@ dependencies: 
     | 
|
| 
       21 
21 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       22 
22 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       23 
23 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       24 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 24 
     | 
    
         
            +
              version_requirements: *70234753393720
         
     | 
| 
       25 
25 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       26 
26 
     | 
    
         
             
              name: redis
         
     | 
| 
       27 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 27 
     | 
    
         
            +
              requirement: &70234753392840 !ruby/object:Gem::Requirement
         
     | 
| 
       28 
28 
     | 
    
         
             
                none: false
         
     | 
| 
       29 
29 
     | 
    
         
             
                requirements:
         
     | 
| 
       30 
30 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -32,10 +32,10 @@ dependencies: 
     | 
|
| 
       32 
32 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       33 
33 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       34 
34 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       35 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 35 
     | 
    
         
            +
              version_requirements: *70234753392840
         
     | 
| 
       36 
36 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       37 
37 
     | 
    
         
             
              name: absolutize
         
     | 
| 
       38 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 38 
     | 
    
         
            +
              requirement: &70234753392180 !ruby/object:Gem::Requirement
         
     | 
| 
       39 
39 
     | 
    
         
             
                none: false
         
     | 
| 
       40 
40 
     | 
    
         
             
                requirements:
         
     | 
| 
       41 
41 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -43,10 +43,10 @@ dependencies: 
     | 
|
| 
       43 
43 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       44 
44 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       45 
45 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       46 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 46 
     | 
    
         
            +
              version_requirements: *70234753392180
         
     | 
| 
       47 
47 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       48 
48 
     | 
    
         
             
              name: nokogiri
         
     | 
| 
       49 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 49 
     | 
    
         
            +
              requirement: &70234753391520 !ruby/object:Gem::Requirement
         
     | 
| 
       50 
50 
     | 
    
         
             
                none: false
         
     | 
| 
       51 
51 
     | 
    
         
             
                requirements:
         
     | 
| 
       52 
52 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -54,10 +54,10 @@ dependencies: 
     | 
|
| 
       54 
54 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       55 
55 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       56 
56 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       57 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 57 
     | 
    
         
            +
              version_requirements: *70234753391520
         
     | 
| 
       58 
58 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       59 
59 
     | 
    
         
             
              name: addressable
         
     | 
| 
       60 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 60 
     | 
    
         
            +
              requirement: &70234753390840 !ruby/object:Gem::Requirement
         
     | 
| 
       61 
61 
     | 
    
         
             
                none: false
         
     | 
| 
       62 
62 
     | 
    
         
             
                requirements:
         
     | 
| 
       63 
63 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -65,10 +65,10 @@ dependencies: 
     | 
|
| 
       65 
65 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       66 
66 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       67 
67 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       68 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 68 
     | 
    
         
            +
              version_requirements: *70234753390840
         
     | 
| 
       69 
69 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       70 
70 
     | 
    
         
             
              name: rspec
         
     | 
| 
       71 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 71 
     | 
    
         
            +
              requirement: &70234753390200 !ruby/object:Gem::Requirement
         
     | 
| 
       72 
72 
     | 
    
         
             
                none: false
         
     | 
| 
       73 
73 
     | 
    
         
             
                requirements:
         
     | 
| 
       74 
74 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -76,10 +76,10 @@ dependencies: 
     | 
|
| 
       76 
76 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       77 
77 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       78 
78 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       79 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 79 
     | 
    
         
            +
              version_requirements: *70234753390200
         
     | 
| 
       80 
80 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       81 
81 
     | 
    
         
             
              name: awesome_print
         
     | 
| 
       82 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 82 
     | 
    
         
            +
              requirement: &70234753389280 !ruby/object:Gem::Requirement
         
     | 
| 
       83 
83 
     | 
    
         
             
                none: false
         
     | 
| 
       84 
84 
     | 
    
         
             
                requirements:
         
     | 
| 
       85 
85 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -87,10 +87,10 @@ dependencies: 
     | 
|
| 
       87 
87 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       88 
88 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       89 
89 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       90 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 90 
     | 
    
         
            +
              version_requirements: *70234753389280
         
     | 
| 
       91 
91 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       92 
92 
     | 
    
         
             
              name: sinatra
         
     | 
| 
       93 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 93 
     | 
    
         
            +
              requirement: &70234753388360 !ruby/object:Gem::Requirement
         
     | 
| 
       94 
94 
     | 
    
         
             
                none: false
         
     | 
| 
       95 
95 
     | 
    
         
             
                requirements:
         
     | 
| 
       96 
96 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -98,10 +98,10 @@ dependencies: 
     | 
|
| 
       98 
98 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       99 
99 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       100 
100 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       101 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 101 
     | 
    
         
            +
              version_requirements: *70234753388360
         
     | 
| 
       102 
102 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       103 
103 
     | 
    
         
             
              name: thin
         
     | 
| 
       104 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 104 
     | 
    
         
            +
              requirement: &70234753387580 !ruby/object:Gem::Requirement
         
     | 
| 
       105 
105 
     | 
    
         
             
                none: false
         
     | 
| 
       106 
106 
     | 
    
         
             
                requirements:
         
     | 
| 
       107 
107 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -109,10 +109,10 @@ dependencies: 
     | 
|
| 
       109 
109 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       110 
110 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       111 
111 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       112 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 112 
     | 
    
         
            +
              version_requirements: *70234753387580
         
     | 
| 
       113 
113 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       114 
114 
     | 
    
         
             
              name: haml
         
     | 
| 
       115 
     | 
    
         
            -
              requirement: & 
     | 
| 
      
 115 
     | 
    
         
            +
              requirement: &70234753386980 !ruby/object:Gem::Requirement
         
     | 
| 
       116 
116 
     | 
    
         
             
                none: false
         
     | 
| 
       117 
117 
     | 
    
         
             
                requirements:
         
     | 
| 
       118 
118 
     | 
    
         
             
                - - ! '>='
         
     | 
| 
         @@ -120,7 +120,7 @@ dependencies: 
     | 
|
| 
       120 
120 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       121 
121 
     | 
    
         
             
              type: :runtime
         
     | 
| 
       122 
122 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       123 
     | 
    
         
            -
              version_requirements: * 
     | 
| 
      
 123 
     | 
    
         
            +
              version_requirements: *70234753386980
         
     | 
| 
       124 
124 
     | 
    
         
             
            description: 
         
     | 
| 
       125 
125 
     | 
    
         
             
            email: stewart@rockwellcottage.com
         
     | 
| 
       126 
126 
     | 
    
         
             
            executables: []
         
     | 
| 
         @@ -134,6 +134,7 @@ files: 
     | 
|
| 
       134 
134 
     | 
    
         
             
            - spec/samples/sample_html_links.html
         
     | 
| 
       135 
135 
     | 
    
         
             
            - spec/spec.opts
         
     | 
| 
       136 
136 
     | 
    
         
             
            - spec/spec_helper.rb
         
     | 
| 
      
 137 
     | 
    
         
            +
            - lib/cobweb/version.rb
         
     | 
| 
       137 
138 
     | 
    
         
             
            - lib/cobweb.rb
         
     | 
| 
       138 
139 
     | 
    
         
             
            - lib/cobweb_crawler.rb
         
     | 
| 
       139 
140 
     | 
    
         
             
            - lib/cobweb_finished_job.rb
         
     | 
| 
         @@ -143,6 +144,7 @@ files: 
     | 
|
| 
       143 
144 
     | 
    
         
             
            - lib/hash.rb
         
     | 
| 
       144 
145 
     | 
    
         
             
            - lib/namespaced_redis.rb
         
     | 
| 
       145 
146 
     | 
    
         
             
            - lib/redirect_error.rb
         
     | 
| 
      
 147 
     | 
    
         
            +
            - lib/robots.rb
         
     | 
| 
       146 
148 
     | 
    
         
             
            - lib/stats.rb
         
     | 
| 
       147 
149 
     | 
    
         
             
            - views/statistics.haml
         
     | 
| 
       148 
150 
     | 
    
         
             
            - README.textile
         
     |