cobweb 0.0.73 → 0.0.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +6 -6
- data/lib/cobweb_crawl_helper.rb +1 -1
- data/lib/cobweb_crawler.rb +2 -2
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +2 -3
- data/lib/crawl.rb +263 -0
- data/lib/crawl_job.rb +45 -189
- data/lib/crawl_object.rb +30 -0
- data/lib/hash_util.rb +1 -0
- data/lib/server.rb +2 -2
- data/lib/stats.rb +1 -1
- data/spec/cobweb/{crawl_spec.rb → cobweb_crawl_helper_spec.rb} +0 -0
- data/spec/cobweb/cobweb_job_spec.rb +58 -33
- data/spec/cobweb/content_link_parser_spec.rb +3 -5
- data/spec/cobweb/site_test_spec.rb.tmp +101 -0
- metadata +28 -25
    
        data/README.textile
    CHANGED
    
    
    
        data/lib/cobweb.rb
    CHANGED
    
    | @@ -4,7 +4,6 @@ require 'resque' | |
| 4 4 | 
             
            require "addressable/uri"
         | 
| 5 5 | 
             
            require 'digest/sha1'
         | 
| 6 6 | 
             
            require 'base64'
         | 
| 7 | 
            -
            require 'namespaced_redis'
         | 
| 8 7 |  | 
| 9 8 | 
             
            Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
         | 
| 10 9 | 
             
              require file
         | 
| @@ -46,6 +45,7 @@ class Cobweb | |
| 46 45 | 
             
                default_text_mime_types_to                ["text/*", "application/xhtml+xml"]
         | 
| 47 46 | 
             
                default_obey_robots_to                    false
         | 
| 48 47 | 
             
                default_user_agent_to                     "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
         | 
| 48 | 
            +
                default_valid_mime_types_to                ["*/*"]
         | 
| 49 49 |  | 
| 50 50 | 
             
              end
         | 
| 51 51 |  | 
| @@ -65,7 +65,7 @@ class Cobweb | |
| 65 65 | 
             
                end
         | 
| 66 66 |  | 
| 67 67 | 
             
                request.merge!(@options)
         | 
| 68 | 
            -
                @redis =  | 
| 68 | 
            +
                @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
         | 
| 69 69 | 
             
                @redis.set("original_base_url", base_url)
         | 
| 70 70 | 
             
                @redis.hset "statistics", "queued_at", DateTime.now
         | 
| 71 71 | 
             
                @redis.set("crawl-counter", 0)
         | 
| @@ -110,9 +110,9 @@ class Cobweb | |
| 110 110 |  | 
| 111 111 | 
             
                # connect to redis
         | 
| 112 112 | 
             
                if options.has_key? :crawl_id
         | 
| 113 | 
            -
                  redis =  | 
| 113 | 
            +
                  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
         | 
| 114 114 | 
             
                else
         | 
| 115 | 
            -
                  redis =  | 
| 115 | 
            +
                  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
         | 
| 116 116 | 
             
                end
         | 
| 117 117 |  | 
| 118 118 | 
             
                content = {:base_url => url}
         | 
| @@ -269,9 +269,9 @@ class Cobweb | |
| 269 269 |  | 
| 270 270 | 
             
                # connect to redis
         | 
| 271 271 | 
             
                if options.has_key? :crawl_id
         | 
| 272 | 
            -
                  redis =  | 
| 272 | 
            +
                  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
         | 
| 273 273 | 
             
                else
         | 
| 274 | 
            -
                  redis =  | 
| 274 | 
            +
                  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
         | 
| 275 275 | 
             
                end
         | 
| 276 276 |  | 
| 277 277 | 
             
                content = {:base_url => url}
         | 
    
        data/lib/cobweb_crawl_helper.rb
    CHANGED
    
    | @@ -15,7 +15,7 @@ class CobwebCrawlHelper | |
| 15 15 | 
             
                @stats = Stats.new(data)
         | 
| 16 16 | 
             
              end
         | 
| 17 17 |  | 
| 18 | 
            -
              def destroy(options)
         | 
| 18 | 
            +
              def destroy(options={})
         | 
| 19 19 |  | 
| 20 20 | 
             
                options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
         | 
| 21 21 | 
             
                options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
         | 
    
        data/lib/cobweb_crawler.rb
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            require 'digest/md5'
         | 
| 2 2 | 
             
            require 'date'
         | 
| 3 3 | 
             
            require 'ap'
         | 
| 4 | 
            -
             | 
| 4 | 
            +
            require 'redis-namespace'
         | 
| 5 5 |  | 
| 6 6 | 
             
            # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
         | 
| 7 7 | 
             
            class CobwebCrawler
         | 
| @@ -20,7 +20,7 @@ class CobwebCrawler | |
| 20 20 | 
             
                  @options[:crawl_id] = @crawl_id
         | 
| 21 21 | 
             
                end
         | 
| 22 22 |  | 
| 23 | 
            -
                @redis =  | 
| 23 | 
            +
                @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
         | 
| 24 24 | 
             
                @options[:internal_urls] = [] if @options[:internal_urls].nil?
         | 
| 25 25 | 
             
                @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
         | 
| 26 26 | 
             
                @debug = @options[:debug]
         | 
    
        data/lib/cobweb_version.rb
    CHANGED
    
    
    
        data/lib/content_link_parser.rb
    CHANGED
    
    | @@ -6,7 +6,7 @@ class ContentLinkParser | |
| 6 6 |  | 
| 7 7 | 
             
              # Parses the content and absolutizes the urls based on url.  Options can be setup to determine the links that are extracted.
         | 
| 8 8 | 
             
              def initialize(url, content, options = {})
         | 
| 9 | 
            -
                @options = options
         | 
| 9 | 
            +
                @options = {}.merge(options)
         | 
| 10 10 | 
             
                @url = url
         | 
| 11 11 | 
             
                @doc = Nokogiri::HTML(content)
         | 
| 12 12 |  | 
| @@ -61,8 +61,7 @@ class ContentLinkParser | |
| 61 61 | 
             
                  end
         | 
| 62 62 | 
             
                  links.uniq
         | 
| 63 63 | 
             
                else
         | 
| 64 | 
            -
                   | 
| 65 | 
            -
                  []
         | 
| 64 | 
            +
                  super
         | 
| 66 65 | 
             
                end
         | 
| 67 66 | 
             
              end
         | 
| 68 67 |  | 
    
        data/lib/crawl.rb
    ADDED
    
    | @@ -0,0 +1,263 @@ | |
| 1 | 
            +
            module CobwebModule
         | 
| 2 | 
            +
              class Crawl
         | 
| 3 | 
            +
                
         | 
| 4 | 
            +
                def initialize(options={})
         | 
| 5 | 
            +
                  @options = HashUtil.deep_symbolize_keys(options)
         | 
| 6 | 
            +
                  
         | 
| 7 | 
            +
                  setup_defaults
         | 
| 8 | 
            +
                  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
         | 
| 9 | 
            +
                  @stats = Stats.new(@options)
         | 
| 10 | 
            +
                  @debug = @options[:debug]
         | 
| 11 | 
            +
                  @first_to_finish = false
         | 
| 12 | 
            +
                  
         | 
| 13 | 
            +
                end
         | 
| 14 | 
            +
                
         | 
| 15 | 
            +
                # Returns true if the url requested is already in the crawled queue
         | 
| 16 | 
            +
                def already_crawled?(link=@options[:url])
         | 
| 17 | 
            +
                   @redis.sismember "crawled", link
         | 
| 18 | 
            +
                end
         | 
| 19 | 
            +
                
         | 
| 20 | 
            +
                def already_queued?(link)
         | 
| 21 | 
            +
                  @redis.sismember "queued", link
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
                
         | 
| 24 | 
            +
                # Returns true if the crawl count is within limits
         | 
| 25 | 
            +
                def within_crawl_limits?
         | 
| 26 | 
            +
                  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
         | 
| 27 | 
            +
                end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                # Returns true if the processed count is within limits
         | 
| 30 | 
            +
                def within_process_limits?
         | 
| 31 | 
            +
                  @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                # Returns true if the queue count is calculated to be still within limits when complete
         | 
| 35 | 
            +
                def within_queue_limits?
         | 
| 36 | 
            +
                  
         | 
| 37 | 
            +
                  # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
         | 
| 38 | 
            +
                  if @options[:crawl_limit_by_page]
         | 
| 39 | 
            +
                    return true
         | 
| 40 | 
            +
                    
         | 
| 41 | 
            +
                  # if a crawl limit is set, limit queue size to crawled + queue
         | 
| 42 | 
            +
                  elsif @options[:crawl_limit].to_i > 0
         | 
| 43 | 
            +
                    (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
         | 
| 44 | 
            +
                  
         | 
| 45 | 
            +
                  # no crawl limit set so always within queue limit
         | 
| 46 | 
            +
                  else
         | 
| 47 | 
            +
                    true
         | 
| 48 | 
            +
                  end
         | 
| 49 | 
            +
                end
         | 
| 50 | 
            +
                
         | 
| 51 | 
            +
                def retrieve
         | 
| 52 | 
            +
                  unless already_crawled?
         | 
| 53 | 
            +
                    if within_crawl_limits?
         | 
| 54 | 
            +
                      @stats.update_status("Retrieving #{@options[:url]}...")
         | 
| 55 | 
            +
                      @content = Cobweb.new(@options).get(@options[:url], @options)
         | 
| 56 | 
            +
                      if @options[:url] == @redis.get("original_base_url")
         | 
| 57 | 
            +
                         @redis.set("crawled_base_url", @content[:base_url])
         | 
| 58 | 
            +
                      end
         | 
| 59 | 
            +
                      update_queues
         | 
| 60 | 
            +
                  
         | 
| 61 | 
            +
                      if content.permitted_type?
         | 
| 62 | 
            +
                        ## update statistics
         | 
| 63 | 
            +
                      
         | 
| 64 | 
            +
                        @stats.update_statistics(@content)
         | 
| 65 | 
            +
                        return true
         | 
| 66 | 
            +
                      end
         | 
| 67 | 
            +
                    else
         | 
| 68 | 
            +
                      decrement_queue_counter
         | 
| 69 | 
            +
                    end
         | 
| 70 | 
            +
                  else
         | 
| 71 | 
            +
                    decrement_queue_counter
         | 
| 72 | 
            +
                  end
         | 
| 73 | 
            +
                  false
         | 
| 74 | 
            +
                end
         | 
| 75 | 
            +
                
         | 
| 76 | 
            +
                def process_links &block
         | 
| 77 | 
            +
                  
         | 
| 78 | 
            +
                  # set the base url if this is the first page
         | 
| 79 | 
            +
                  set_base_url @redis
         | 
| 80 | 
            +
                  
         | 
| 81 | 
            +
                  @cobweb_links = CobwebLinks.new(@options)
         | 
| 82 | 
            +
                  if within_queue_limits?
         | 
| 83 | 
            +
                    internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
         | 
| 84 | 
            +
                    #get rid of duplicate links in the same page.
         | 
| 85 | 
            +
                    internal_links.uniq!
         | 
| 86 | 
            +
                    # select the link if its internal
         | 
| 87 | 
            +
                    internal_links.select! { |link| @cobweb_links.internal?(link) }
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    # reject the link if we've crawled it or queued it
         | 
| 90 | 
            +
                    internal_links.reject! { |link| @redis.sismember("crawled", link) }
         | 
| 91 | 
            +
                    internal_links.reject! { |link| @redis.sismember("queued", link) }
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                    internal_links.each do |link|
         | 
| 94 | 
            +
                      if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
         | 
| 95 | 
            +
                        if status != CobwebCrawlHelper::CANCELLED
         | 
| 96 | 
            +
                          yield link if block_given?
         | 
| 97 | 
            +
                          unless link.nil?
         | 
| 98 | 
            +
                            @redis.sadd "queued", link
         | 
| 99 | 
            +
                            increment_queue_counter
         | 
| 100 | 
            +
                          end
         | 
| 101 | 
            +
                        else
         | 
| 102 | 
            +
                          puts "Cannot enqueue new content as crawl has been cancelled." if @options[:debug]
         | 
| 103 | 
            +
                        end
         | 
| 104 | 
            +
                      end
         | 
| 105 | 
            +
                    end
         | 
| 106 | 
            +
                  end
         | 
| 107 | 
            +
                end
         | 
| 108 | 
            +
                
         | 
| 109 | 
            +
                def content
         | 
| 110 | 
            +
                  raise "Content is not available" if @content.nil?
         | 
| 111 | 
            +
                  CobwebModule::CrawlObject.new(@content, @options) 
         | 
| 112 | 
            +
                end
         | 
| 113 | 
            +
                
         | 
| 114 | 
            +
                def update_queues
         | 
| 115 | 
            +
                  @redis.multi do
         | 
| 116 | 
            +
                    #@redis.incr "inprogress"
         | 
| 117 | 
            +
                    # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
         | 
| 118 | 
            +
                    @redis.srem "queued", @options[:url]
         | 
| 119 | 
            +
                    @redis.sadd "crawled", @options[:url]
         | 
| 120 | 
            +
                    if content.url != @options[:url]
         | 
| 121 | 
            +
                      @redis.srem "queued", content.url
         | 
| 122 | 
            +
                      @redis.sadd "crawled", content.url
         | 
| 123 | 
            +
                    end
         | 
| 124 | 
            +
                    # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
         | 
| 125 | 
            +
                    if @options[:crawl_limit_by_page]
         | 
| 126 | 
            +
                      ap "#{content.mime_type} - #{content.url}"
         | 
| 127 | 
            +
                      if content.mime_type.match("text/html")
         | 
| 128 | 
            +
                        increment_crawl_counter
         | 
| 129 | 
            +
                      end
         | 
| 130 | 
            +
                    else
         | 
| 131 | 
            +
                      increment_crawl_counter
         | 
| 132 | 
            +
                    end
         | 
| 133 | 
            +
                    decrement_queue_counter
         | 
| 134 | 
            +
                  end
         | 
| 135 | 
            +
                end
         | 
| 136 | 
            +
                
         | 
| 137 | 
            +
                def to_be_processed?
         | 
| 138 | 
            +
                  !finished? || first_to_finish? || within_process_limits?
         | 
| 139 | 
            +
                end
         | 
| 140 | 
            +
                
         | 
| 141 | 
            +
                def process
         | 
| 142 | 
            +
                  if @options[:crawl_limit_by_page]
         | 
| 143 | 
            +
                    if content.mime_type.match("text/html")
         | 
| 144 | 
            +
                      increment_process_counter
         | 
| 145 | 
            +
                    end
         | 
| 146 | 
            +
                  else
         | 
| 147 | 
            +
                    increment_process_counter
         | 
| 148 | 
            +
                  end
         | 
| 149 | 
            +
                end
         | 
| 150 | 
            +
                
         | 
| 151 | 
            +
                def finished?
         | 
| 152 | 
            +
                  print_counters
         | 
| 153 | 
            +
                  # if there's nothing left queued or the crawled limit has been reached
         | 
| 154 | 
            +
                  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
         | 
| 155 | 
            +
                    if queue_counter.to_i == 0
         | 
| 156 | 
            +
                      finished
         | 
| 157 | 
            +
                      return true
         | 
| 158 | 
            +
                    end
         | 
| 159 | 
            +
                  elsif (queue_counter.to_i) == 0 || crawl_counter.to_i >= @options[:crawl_limit].to_i
         | 
| 160 | 
            +
                    finished
         | 
| 161 | 
            +
                    return true
         | 
| 162 | 
            +
                  end
         | 
| 163 | 
            +
                  false
         | 
| 164 | 
            +
                end
         | 
| 165 | 
            +
                
         | 
| 166 | 
            +
                def finished
         | 
| 167 | 
            +
                  set_first_to_finish if !@redis.exists("first_to_finish")
         | 
| 168 | 
            +
                  ap "CRAWL FINISHED  #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if @options[:debug]
         | 
| 169 | 
            +
                  @stats.end_crawl(@options)
         | 
| 170 | 
            +
                end
         | 
| 171 | 
            +
                
         | 
| 172 | 
            +
                def set_first_to_finish
         | 
| 173 | 
            +
                  @redis.watch("first_to_finish") do
         | 
| 174 | 
            +
                    if !@redis.exists("first_to_finish")
         | 
| 175 | 
            +
                      @redis.multi do
         | 
| 176 | 
            +
                        puts "set first to finish"
         | 
| 177 | 
            +
                        @first_to_finish = true
         | 
| 178 | 
            +
                        @redis.set("first_to_finish", 1)
         | 
| 179 | 
            +
                      end
         | 
| 180 | 
            +
                    else
         | 
| 181 | 
            +
                      @redis.unwatch
         | 
| 182 | 
            +
                    end
         | 
| 183 | 
            +
                  end
         | 
| 184 | 
            +
                end
         | 
| 185 | 
            +
                
         | 
| 186 | 
            +
                
         | 
| 187 | 
            +
                def first_to_finish? 
         | 
| 188 | 
            +
                  @first_to_finish
         | 
| 189 | 
            +
                end
         | 
| 190 | 
            +
                
         | 
| 191 | 
            +
                def crawled_base_url
         | 
| 192 | 
            +
                  @redis.get("crawled_base_url")
         | 
| 193 | 
            +
                end
         | 
| 194 | 
            +
                
         | 
| 195 | 
            +
                def statistics
         | 
| 196 | 
            +
                  @stats.get_statistics
         | 
| 197 | 
            +
                end
         | 
| 198 | 
            +
                
         | 
| 199 | 
            +
                def redis
         | 
| 200 | 
            +
                  @redis
         | 
| 201 | 
            +
                end
         | 
| 202 | 
            +
                
         | 
| 203 | 
            +
                private
         | 
| 204 | 
            +
                def setup_defaults
         | 
| 205 | 
            +
                  @options[:redis_options] = {} unless @options.has_key? :redis_options
         | 
| 206 | 
            +
                  @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
         | 
| 207 | 
            +
                  @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
         | 
| 208 | 
            +
                end
         | 
| 209 | 
            +
                
         | 
| 210 | 
            +
                # Increments the queue counter and refreshes crawl counters
         | 
| 211 | 
            +
                def increment_queue_counter
         | 
| 212 | 
            +
                  @redis.incr "queue-counter"
         | 
| 213 | 
            +
                end
         | 
| 214 | 
            +
                # Increments the crawl counter and refreshes crawl counters
         | 
| 215 | 
            +
                def increment_crawl_counter
         | 
| 216 | 
            +
                  @redis.incr "crawl-counter"
         | 
| 217 | 
            +
                end
         | 
| 218 | 
            +
                # Increments the process counter
         | 
| 219 | 
            +
                def increment_process_counter
         | 
| 220 | 
            +
                  @redis.incr "process-counter"
         | 
| 221 | 
            +
                end
         | 
| 222 | 
            +
                # Decrements the queue counter and refreshes crawl counters
         | 
| 223 | 
            +
                def decrement_queue_counter
         | 
| 224 | 
            +
                  @redis.decr "queue-counter"
         | 
| 225 | 
            +
                end
         | 
| 226 | 
            +
                
         | 
| 227 | 
            +
                def crawl_counter
         | 
| 228 | 
            +
                  @redis.get("crawl-counter").to_i
         | 
| 229 | 
            +
                end
         | 
| 230 | 
            +
                def queue_counter
         | 
| 231 | 
            +
                  @redis.get("queue-counter").to_i
         | 
| 232 | 
            +
                end
         | 
| 233 | 
            +
                def process_counter
         | 
| 234 | 
            +
                  @redis.get("process-counter").to_i
         | 
| 235 | 
            +
                end
         | 
| 236 | 
            +
                
         | 
| 237 | 
            +
                def status
         | 
| 238 | 
            +
                  @stats.get_status
         | 
| 239 | 
            +
                end
         | 
| 240 | 
            +
                
         | 
| 241 | 
            +
                def print_counters
         | 
| 242 | 
            +
                  puts counters
         | 
| 243 | 
            +
                end
         | 
| 244 | 
            +
                
         | 
| 245 | 
            +
                def counters
         | 
| 246 | 
            +
                  "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
         | 
| 247 | 
            +
                end
         | 
| 248 | 
            +
                
         | 
| 249 | 
            +
                # Sets the base url in redis.  If the first page is a redirect, it sets the base_url to the destination
         | 
| 250 | 
            +
                def set_base_url(redis)
         | 
| 251 | 
            +
                  if redis.get("base_url").nil?
         | 
| 252 | 
            +
                    unless !defined?(content.redirect_through) || content.redirect_through.empty? || !@options[:first_page_redirect_internal]
         | 
| 253 | 
            +
                      uri = Addressable::URI.parse(content.redirect_through.last)
         | 
| 254 | 
            +
                      redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
         | 
| 255 | 
            +
                    end
         | 
| 256 | 
            +
                    redis.set("base_url", content.url)
         | 
| 257 | 
            +
                  end
         | 
| 258 | 
            +
                end
         | 
| 259 | 
            +
             | 
| 260 | 
            +
             | 
| 261 | 
            +
                
         | 
| 262 | 
            +
              end
         | 
| 263 | 
            +
            end
         | 
    
        data/lib/crawl_job.rb
    CHANGED
    
    | @@ -5,132 +5,58 @@ class CrawlJob | |
| 5 5 | 
             
              require "net/https"  
         | 
| 6 6 | 
             
              require "uri"
         | 
| 7 7 | 
             
              require "redis"
         | 
| 8 | 
            -
               | 
| 9 | 
            -
             | 
| 8 | 
            +
              
         | 
| 10 9 | 
             
              @queue = :cobweb_crawl_job
         | 
| 11 | 
            -
             | 
| 10 | 
            +
              
         | 
| 12 11 | 
             
              # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
         | 
| 13 12 | 
             
              def self.perform(content_request)
         | 
| 14 | 
            -
                # change all hash keys to symbols
         | 
| 15 | 
            -
                content_request = HashUtil.deep_symbolize_keys(content_request)
         | 
| 16 | 
            -
                @content_request = content_request
         | 
| 17 | 
            -
                @crawl = CobwebCrawlHelper.new(content_request)
         | 
| 18 | 
            -
                
         | 
| 19 | 
            -
                content_request[:redis_options] = {} unless content_request.has_key? :redis_options
         | 
| 20 | 
            -
                content_request[:crawl_limit_by_page] = false unless content_request.has_key? :crawl_limit_by_page
         | 
| 21 | 
            -
                content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
         | 
| 22 13 |  | 
| 23 | 
            -
                 | 
| 24 | 
            -
                @ | 
| 14 | 
            +
                # setup the crawl class to manage the crawl of this object
         | 
| 15 | 
            +
                @crawl = CobwebModule::Crawl.new(content_request)
         | 
| 25 16 |  | 
| 26 | 
            -
                 | 
| 17 | 
            +
                # update the counters and then perform the get, returns false if we are outwith limits
         | 
| 18 | 
            +
                if @crawl.retrieve
         | 
| 27 19 |  | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
                     | 
| 33 | 
            -
                      content = Cobweb.new(content_request).get(content_request[:url], content_request)
         | 
| 34 | 
            -
                      if content_request[:url] == @redis.get("original_base_url")
         | 
| 35 | 
            -
                         @redis.set("crawled_base_url", content[:base_url])
         | 
| 36 | 
            -
                      end
         | 
| 37 | 
            -
                      if is_permitted_type(content)
         | 
| 38 | 
            -
                        begin
         | 
| 39 | 
            -
                          @redis.incr "inprogress"
         | 
| 40 | 
            -
                          # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
         | 
| 41 | 
            -
                          @redis.srem "queued", content_request[:url]
         | 
| 42 | 
            -
                          @redis.sadd "crawled", content_request[:url]
         | 
| 43 | 
            -
                          @redis.srem "queued", content[:url]
         | 
| 44 | 
            -
                          @redis.sadd "crawled", content[:url]
         | 
| 45 | 
            -
                          # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
         | 
| 46 | 
            -
                          if content_request[:crawl_limit_by_page]
         | 
| 47 | 
            -
                            if content[:mime_type].match("text/html")
         | 
| 48 | 
            -
                              increment_crawl_started_counter
         | 
| 49 | 
            -
                            end
         | 
| 50 | 
            -
                          else
         | 
| 51 | 
            -
                            increment_crawl_started_counter
         | 
| 52 | 
            -
                          end
         | 
| 53 | 
            -
             | 
| 54 | 
            -
                          ## update statistics
         | 
| 55 | 
            -
                          @stats.update_status("Crawling #{content_request[:url]}...")
         | 
| 56 | 
            -
                          @stats.update_statistics(content)
         | 
| 57 | 
            -
             | 
| 58 | 
            -
                          # set the base url if this is the first page
         | 
| 59 | 
            -
                          set_base_url @redis, content, content_request
         | 
| 60 | 
            -
                        
         | 
| 61 | 
            -
                          @cobweb_links = CobwebLinks.new(content_request)
         | 
| 62 | 
            -
                          if within_queue_limits?(content_request[:crawl_limit])
         | 
| 63 | 
            -
                            internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
         | 
| 64 | 
            -
                            #get rid of duplicate links in the same page.
         | 
| 65 | 
            -
                            internal_links.uniq!
         | 
| 66 | 
            -
                            # select the link if its internal
         | 
| 67 | 
            -
                            internal_links.select! { |link| @cobweb_links.internal?(link) }
         | 
| 68 | 
            -
             | 
| 69 | 
            -
                            # reject the link if we've crawled it or queued it
         | 
| 70 | 
            -
                            internal_links.reject! { |link| @redis.sismember("crawled", link) }
         | 
| 71 | 
            -
                            internal_links.reject! { |link| @redis.sismember("queued", link) }
         | 
| 72 | 
            -
             | 
| 73 | 
            -
                            internal_links.each do |link|
         | 
| 74 | 
            -
                              puts link
         | 
| 75 | 
            -
                              puts "Not enqueuing due to cancelled crawl" if @crawl.status == CobwebCrawlHelper::CANCELLED
         | 
| 76 | 
            -
                              if within_queue_limits?(content_request[:crawl_limit])
         | 
| 77 | 
            -
                                if @crawl.status != CobwebCrawlHelper::CANCELLED
         | 
| 78 | 
            -
                                  enqueue_content(content_request, link) 
         | 
| 79 | 
            -
                                else
         | 
| 80 | 
            -
                                  puts "Cannot enqueue new content as crawl has been cancelled." if content_request[:debug]
         | 
| 81 | 
            -
                                end
         | 
| 82 | 
            -
                              end
         | 
| 83 | 
            -
                            end
         | 
| 84 | 
            -
                          end
         | 
| 85 | 
            -
             | 
| 86 | 
            -
                          # enqueue to processing queue
         | 
| 87 | 
            -
                          send_to_processing_queue(content, content_request)
         | 
| 20 | 
            +
                  # if the crawled object is an object type we are interested
         | 
| 21 | 
            +
                  if @crawl.content.permitted_type?
         | 
| 22 | 
            +
                    
         | 
| 23 | 
            +
                    # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
         | 
| 24 | 
            +
                    @crawl.process_links do |link|
         | 
| 88 25 |  | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
                            current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
         | 
| 93 | 
            -
                            enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
         | 
| 94 | 
            -
                          end
         | 
| 26 | 
            +
                      # enqueue the links to resque
         | 
| 27 | 
            +
                      puts "ENQUEUED LINK: #{link}"
         | 
| 28 | 
            +
                      enqueue_content(content_request, link) 
         | 
| 95 29 |  | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
             | 
| 107 | 
            -
             | 
| 108 | 
            -
             | 
| 109 | 
            -
             | 
| 110 | 
            -
             | 
| 111 | 
            -
                         | 
| 112 | 
            -
             | 
| 113 | 
            -
                        puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
         | 
| 30 | 
            +
                    end
         | 
| 31 | 
            +
                    
         | 
| 32 | 
            +
                    
         | 
| 33 | 
            +
                    if @crawl.to_be_processed?
         | 
| 34 | 
            +
                      @crawl.process
         | 
| 35 | 
            +
                      
         | 
| 36 | 
            +
                      # enqueue to processing queue
         | 
| 37 | 
            +
                      @crawl.redis.incr("crawl_job_enqueued_count")
         | 
| 38 | 
            +
                      puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
         | 
| 39 | 
            +
                      send_to_processing_queue(@crawl.content.to_hash, content_request)
         | 
| 40 | 
            +
                      
         | 
| 41 | 
            +
                      
         | 
| 42 | 
            +
                      #if the enqueue counter has been requested update that
         | 
| 43 | 
            +
                      if content_request.has_key?(:enqueue_counter_key)
         | 
| 44 | 
            +
                        enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
         | 
| 45 | 
            +
                        current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
         | 
| 46 | 
            +
                        enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
         | 
| 114 47 | 
             
                      end
         | 
| 115 48 | 
             
                    else
         | 
| 116 | 
            -
                       | 
| 49 | 
            +
                      ap "@crawl.finished? #{@crawl.finished?}"
         | 
| 50 | 
            +
                      ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
         | 
| 51 | 
            +
                      ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
         | 
| 117 52 | 
             
                    end
         | 
| 118 | 
            -
                  else
         | 
| 119 | 
            -
                    puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
         | 
| 120 53 | 
             
                  end
         | 
| 121 | 
            -
                  
         | 
| 122 | 
            -
                else
         | 
| 123 | 
            -
                  @redis.srem "queued", content_request[:url]
         | 
| 124 | 
            -
                  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
         | 
| 125 54 | 
             
                end
         | 
| 126 | 
            -
             | 
| 127 | 
            -
                 | 
| 128 | 
            -
                 | 
| 129 | 
            -
                 | 
| 130 | 
            -
             | 
| 131 | 
            -
                    finished(content_request)
         | 
| 132 | 
            -
                  end
         | 
| 133 | 
            -
                elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                # test queue and crawl sizes to see if we have completed the crawl
         | 
| 57 | 
            +
                ap "finished? #{@crawl.finished?}"
         | 
| 58 | 
            +
                ap "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
         | 
| 59 | 
            +
                if @crawl.finished? && @crawl.first_to_finish?
         | 
| 134 60 | 
             
                  finished(content_request)
         | 
| 135 61 | 
             
                end
         | 
| 136 62 |  | 
| @@ -138,19 +64,12 @@ class CrawlJob | |
| 138 64 |  | 
| 139 65 | 
             
              # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
         | 
| 140 66 | 
             
              def self.finished(content_request)
         | 
| 141 | 
            -
                 | 
| 142 | 
            -
                 | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
                  additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
         | 
| 148 | 
            -
                  additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
         | 
| 149 | 
            -
                  
         | 
| 150 | 
            -
                  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
         | 
| 151 | 
            -
                else
         | 
| 152 | 
            -
                  # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
         | 
| 153 | 
            -
                end
         | 
| 67 | 
            +
                additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @crawl.crawled_base_url}
         | 
| 68 | 
            +
                additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
         | 
| 69 | 
            +
                additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
         | 
| 70 | 
            +
                
         | 
| 71 | 
            +
                @crawl.redis.incr("crawl_finished_enqueued_count")
         | 
| 72 | 
            +
                Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
         | 
| 154 73 | 
             
              end
         | 
| 155 74 |  | 
| 156 75 | 
             
              # Enqueues the content to the processing queue setup in options
         | 
| @@ -171,34 +90,6 @@ class CrawlJob | |
| 171 90 |  | 
| 172 91 | 
             
              private
         | 
| 173 92 |  | 
| 174 | 
            -
              # Helper method to determine if this content is to be processed or not
         | 
| 175 | 
            -
              def self.is_permitted_type(content)
         | 
| 176 | 
            -
                @content_request[:valid_mime_types].each do |mime_type|
         | 
| 177 | 
            -
                  return true if content[:mime_type].match(Cobweb.escape_pattern_for_regex(mime_type))
         | 
| 178 | 
            -
                end
         | 
| 179 | 
            -
                false
         | 
| 180 | 
            -
              end
         | 
| 181 | 
            -
              
         | 
| 182 | 
            -
              # Returns true if the crawl count is within limits
         | 
| 183 | 
            -
              def self.within_crawl_limits?(crawl_limit)
         | 
| 184 | 
            -
                crawl_limit.nil? or crawl_counter < crawl_limit.to_i
         | 
| 185 | 
            -
              end
         | 
| 186 | 
            -
              
         | 
| 187 | 
            -
              # Returns true if the queue count is calculated to be still within limits when complete
         | 
| 188 | 
            -
              def self.within_queue_limits?(crawl_limit)
         | 
| 189 | 
            -
                (@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
         | 
| 190 | 
            -
              end
         | 
| 191 | 
            -
              
         | 
| 192 | 
            -
              # Sets the base url in redis.  If the first page is a redirect, it sets the base_url to the destination
         | 
| 193 | 
            -
              def self.set_base_url(redis, content, content_request)
         | 
| 194 | 
            -
                if redis.get("base_url").nil?
         | 
| 195 | 
            -
                  unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
         | 
| 196 | 
            -
                    uri = Addressable::URI.parse(content[:redirect_through].last)
         | 
| 197 | 
            -
                    redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
         | 
| 198 | 
            -
                  end
         | 
| 199 | 
            -
                  redis.set("base_url", content[:url])
         | 
| 200 | 
            -
                end
         | 
| 201 | 
            -
              end
         | 
| 202 93 |  | 
| 203 94 | 
             
              # Enqueues content to the crawl_job queue
         | 
| 204 95 | 
             
              def self.enqueue_content(content_request, link)
         | 
| @@ -206,43 +97,8 @@ class CrawlJob | |
| 206 97 | 
             
                new_request[:url] = link
         | 
| 207 98 | 
             
                new_request[:parent] = content_request[:url]
         | 
| 208 99 | 
             
                #to help prevent accidentally double processing a link, let's mark it as queued just before the Resque.enqueue statement, rather than just after.
         | 
| 209 | 
            -
                @redis.sadd "queued", link
         | 
| 210 100 | 
             
                Resque.enqueue(CrawlJob, new_request)
         | 
| 211 | 
            -
                increment_queue_counter
         | 
| 212 101 | 
             
              end
         | 
| 213 102 |  | 
| 214 | 
            -
              # Increments the queue counter and refreshes crawl counters
         | 
| 215 | 
            -
              def self.increment_queue_counter
         | 
| 216 | 
            -
                @redis.incr "queue-counter"
         | 
| 217 | 
            -
              end
         | 
| 218 | 
            -
              # Increments the crawl counter and refreshes crawl counters
         | 
| 219 | 
            -
              def self.increment_crawl_counter
         | 
| 220 | 
            -
                @redis.incr "crawl-counter"
         | 
| 221 | 
            -
              end
         | 
| 222 | 
            -
              def self.increment_crawl_started_counter
         | 
| 223 | 
            -
                @redis.incr "crawl-started-counter"
         | 
| 224 | 
            -
              end
         | 
| 225 | 
            -
              # Decrements the queue counter and refreshes crawl counters
         | 
| 226 | 
            -
              def self.decrement_queue_counter
         | 
| 227 | 
            -
                @redis.decr "queue-counter"
         | 
| 228 | 
            -
              end
         | 
| 229 | 
            -
              
         | 
| 230 | 
            -
              def self.crawl_counter
         | 
| 231 | 
            -
                @redis.get("crawl-counter").to_i
         | 
| 232 | 
            -
              end
         | 
| 233 | 
            -
              def self.crawl_started_counter
         | 
| 234 | 
            -
                @redis.get("crawl-started-counter").to_i
         | 
| 235 | 
            -
              end
         | 
| 236 | 
            -
              def self.queue_counter
         | 
| 237 | 
            -
                @redis.get("queue-counter").to_i
         | 
| 238 | 
            -
              end
         | 
| 239 | 
            -
              
         | 
| 240 | 
            -
              def self.print_counters
         | 
| 241 | 
            -
                puts counters
         | 
| 242 | 
            -
              end
         | 
| 243 | 
            -
             | 
| 244 | 
            -
              def self.counters
         | 
| 245 | 
            -
                "crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
         | 
| 246 | 
            -
              end
         | 
| 247 103 |  | 
| 248 104 | 
             
            end
         | 
    
        data/lib/crawl_object.rb
    ADDED
    
    | @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            module CobwebModule
         | 
| 2 | 
            +
              class CrawlObject
         | 
| 3 | 
            +
                
         | 
| 4 | 
            +
                def initialize(content_hash, options={})
         | 
| 5 | 
            +
                  @content = HashUtil.deep_symbolize_keys(content_hash)
         | 
| 6 | 
            +
                  @options = options
         | 
| 7 | 
            +
                end
         | 
| 8 | 
            +
                
         | 
| 9 | 
            +
                
         | 
| 10 | 
            +
                # Helper method to determine if this content is to be processed or not
         | 
| 11 | 
            +
                def permitted_type?
         | 
| 12 | 
            +
                  @options[:valid_mime_types].each do |valid_mime_type|
         | 
| 13 | 
            +
                    return true if @content[:mime_type].match(Cobweb.escape_pattern_for_regex(valid_mime_type))
         | 
| 14 | 
            +
                  end
         | 
| 15 | 
            +
                  false
         | 
| 16 | 
            +
                end
         | 
| 17 | 
            +
                
         | 
| 18 | 
            +
                def method_missing(m)
         | 
| 19 | 
            +
                  if @content.keys.include? m.to_sym
         | 
| 20 | 
            +
                    @content[m.to_sym]
         | 
| 21 | 
            +
                  else
         | 
| 22 | 
            +
                    super
         | 
| 23 | 
            +
                  end
         | 
| 24 | 
            +
                end
         | 
| 25 | 
            +
                
         | 
| 26 | 
            +
                def to_hash
         | 
| 27 | 
            +
                  @content
         | 
| 28 | 
            +
                end
         | 
| 29 | 
            +
              end
         | 
| 30 | 
            +
            end
         | 
    
        data/lib/hash_util.rb
    CHANGED
    
    
    
        data/lib/server.rb
    CHANGED
    
    | @@ -16,7 +16,7 @@ class Server < Sinatra::Base | |
| 16 16 | 
             
                @crawls = []
         | 
| 17 17 | 
             
                @full_redis.smembers("cobweb_crawls").each do |crawl_id|      
         | 
| 18 18 | 
             
                  version = cobweb_version(crawl_id)
         | 
| 19 | 
            -
                  redis =  | 
| 19 | 
            +
                  redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
         | 
| 20 20 | 
             
                  stats = HashUtil.deep_symbolize_keys({
         | 
| 21 21 | 
             
                    :cobweb_version => version,
         | 
| 22 22 | 
             
                    :crawl_details => redis.hgetall("crawl_details"),
         | 
| @@ -33,7 +33,7 @@ class Server < Sinatra::Base | |
| 33 33 | 
             
              get '/statistics/:crawl_id' do
         | 
| 34 34 |  | 
| 35 35 | 
             
                version = cobweb_version(params[:crawl_id])
         | 
| 36 | 
            -
                redis =  | 
| 36 | 
            +
                redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
         | 
| 37 37 |  | 
| 38 38 | 
             
                @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
         | 
| 39 39 | 
             
                if @statistics[:status_counts].nil?
         | 
    
        data/lib/stats.rb
    CHANGED
    
    | @@ -8,7 +8,7 @@ class Stats | |
| 8 8 | 
             
              def initialize(options)
         | 
| 9 9 | 
             
                options[:redis_options] = {} unless options.has_key? :redis_options
         | 
| 10 10 | 
             
                @full_redis = Redis.new(options[:redis_options])
         | 
| 11 | 
            -
                @redis =  | 
| 11 | 
            +
                @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
         | 
| 12 12 | 
             
              end
         | 
| 13 13 |  | 
| 14 14 | 
             
              # Sets up the crawl in statistics
         | 
| 
            File without changes
         | 
| @@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do | |
| 9 9 | 
             
                # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
         | 
| 10 10 | 
             
                puts "Starting Workers... Please Wait..."
         | 
| 11 11 | 
             
                `mkdir log`
         | 
| 12 | 
            -
                io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT= | 
| 12 | 
            +
                io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
         | 
| 13 13 | 
             
                puts "Workers Started."
         | 
| 14 14 |  | 
| 15 15 | 
             
              end
         | 
| @@ -17,6 +17,7 @@ describe Cobweb, :local_only => true do | |
| 17 17 | 
             
              before(:each) do
         | 
| 18 18 | 
             
                @base_url = "http://localhost:3532/"
         | 
| 19 19 | 
             
                @base_page_count = 77
         | 
| 20 | 
            +
             | 
| 20 21 | 
             
                clear_queues
         | 
| 21 22 | 
             
              end
         | 
| 22 23 |  | 
| @@ -29,6 +30,7 @@ describe Cobweb, :local_only => true do | |
| 29 30 | 
             
                    :debug => false,
         | 
| 30 31 | 
             
                    :cache => nil
         | 
| 31 32 | 
             
                  }
         | 
| 33 | 
            +
                  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
         | 
| 32 34 | 
             
                  @cobweb = Cobweb.new @request
         | 
| 33 35 | 
             
                end
         | 
| 34 36 | 
             
                it "should not crawl anything if nothing has started" do
         | 
| @@ -37,7 +39,7 @@ describe Cobweb, :local_only => true do | |
| 37 39 | 
             
                  crawl_obj.destroy
         | 
| 38 40 | 
             
                  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 39 41 | 
             
                  wait_for_crawl_finished crawl[:crawl_id]
         | 
| 40 | 
            -
                   | 
| 42 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should == 0
         | 
| 41 43 | 
             
                end
         | 
| 42 44 |  | 
| 43 45 | 
             
                it "should not complete the crawl when cancelled" do
         | 
| @@ -47,8 +49,8 @@ describe Cobweb, :local_only => true do | |
| 47 49 | 
             
                  crawl_obj.destroy
         | 
| 48 50 | 
             
                  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 49 51 | 
             
                  wait_for_crawl_finished crawl[:crawl_id]
         | 
| 50 | 
            -
                   | 
| 51 | 
            -
                   | 
| 52 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should > 0
         | 
| 53 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
         | 
| 52 54 | 
             
                end
         | 
| 53 55 |  | 
| 54 56 | 
             
              end
         | 
| @@ -61,22 +63,24 @@ describe Cobweb, :local_only => true do | |
| 61 63 | 
             
                    :debug => false,
         | 
| 62 64 | 
             
                    :cache => nil
         | 
| 63 65 | 
             
                  }
         | 
| 66 | 
            +
                  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
         | 
| 67 | 
            +
             | 
| 64 68 | 
             
                  @cobweb = Cobweb.new @request
         | 
| 65 69 | 
             
                end
         | 
| 66 70 |  | 
| 67 71 | 
             
                it "should crawl entire site" do
         | 
| 68 | 
            -
                  ap Resque.size("cobweb_process_job")
         | 
| 69 72 | 
             
                  crawl = @cobweb.start(@base_url)
         | 
| 70 73 | 
             
                  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 71 74 | 
             
                  wait_for_crawl_finished crawl[:crawl_id]
         | 
| 72 | 
            -
                   | 
| 73 | 
            -
                   | 
| 75 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
         | 
| 76 | 
            +
                  @redis.get("crawl_finished_enqueued_count").to_i.should == 1
         | 
| 74 77 | 
             
                end
         | 
| 75 78 | 
             
                it "detect crawl finished once" do
         | 
| 76 79 | 
             
                  crawl = @cobweb.start(@base_url)
         | 
| 77 80 | 
             
                  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 78 81 | 
             
                  wait_for_crawl_finished crawl[:crawl_id]
         | 
| 79 | 
            -
                   | 
| 82 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
         | 
| 83 | 
            +
                  @redis.get("crawl_finished_enqueued_count").to_i.should == 1
         | 
| 80 84 | 
             
                end
         | 
| 81 85 | 
             
              end
         | 
| 82 86 | 
             
              describe "with limited mime_types" do
         | 
| @@ -87,6 +91,7 @@ describe Cobweb, :local_only => true do | |
| 87 91 | 
             
                    :cache => nil,
         | 
| 88 92 | 
             
                    :valid_mime_types => ["text/html"]
         | 
| 89 93 | 
             
                  }
         | 
| 94 | 
            +
                  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
         | 
| 90 95 | 
             
                  @cobweb = Cobweb.new @request
         | 
| 91 96 | 
             
                end
         | 
| 92 97 |  | 
| @@ -94,7 +99,7 @@ describe Cobweb, :local_only => true do | |
| 94 99 | 
             
                  crawl = @cobweb.start(@base_url)
         | 
| 95 100 | 
             
                  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 96 101 | 
             
                  wait_for_crawl_finished crawl[:crawl_id]
         | 
| 97 | 
            -
                   | 
| 102 | 
            +
                  @redis.get("crawl_job_enqueued_count").to_i.should == 8
         | 
| 98 103 |  | 
| 99 104 | 
             
                  mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
         | 
| 100 105 | 
             
                  mime_types.count.should == 8
         | 
| @@ -110,6 +115,7 @@ describe Cobweb, :local_only => true do | |
| 110 115 | 
             
                    :quiet => true,
         | 
| 111 116 | 
             
                    :cache => nil
         | 
| 112 117 | 
             
                  }
         | 
| 118 | 
            +
                  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
         | 
| 113 119 | 
             
                end
         | 
| 114 120 |  | 
| 115 121 | 
             
                describe "limit to 1" do
         | 
| @@ -122,19 +128,19 @@ describe Cobweb, :local_only => true do | |
| 122 128 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 123 129 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 124 130 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 125 | 
            -
                     | 
| 131 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
         | 
| 126 132 | 
             
                  end
         | 
| 127 133 | 
             
                  it "should only crawl 1 page" do
         | 
| 128 134 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 129 135 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 130 136 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 131 | 
            -
                     | 
| 137 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should == 1
         | 
| 132 138 | 
             
                  end
         | 
| 133 139 | 
             
                  it "should notify of crawl finished once" do
         | 
| 134 140 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 135 141 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 136 142 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 137 | 
            -
                     | 
| 143 | 
            +
                    @redis.get("crawl_finished_enqueued_count").to_i.should == 1
         | 
| 138 144 | 
             
                  end
         | 
| 139 145 | 
             
                end
         | 
| 140 146 |  | 
| @@ -145,6 +151,7 @@ describe Cobweb, :local_only => true do | |
| 145 151 | 
             
                    @cobweb = Cobweb.new @request
         | 
| 146 152 | 
             
                  end
         | 
| 147 153 |  | 
| 154 | 
            +
                  # the following describes when we want all the assets of a page, and the page itself, but we only want 5 pages
         | 
| 148 155 | 
             
                  it "should only use html pages towards the crawl limit" do
         | 
| 149 156 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 150 157 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| @@ -165,19 +172,19 @@ describe Cobweb, :local_only => true do | |
| 165 172 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 166 173 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 167 174 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 168 | 
            -
                     | 
| 175 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
         | 
| 169 176 | 
             
                  end
         | 
| 170 177 | 
             
                  it "should notify of crawl finished once" do
         | 
| 171 178 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 172 179 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 173 180 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 174 | 
            -
                     | 
| 181 | 
            +
                    @redis.get("crawl_finished_enqueued_count").to_i.should == 1
         | 
| 175 182 | 
             
                  end
         | 
| 176 183 | 
             
                  it "should only crawl 10 objects" do
         | 
| 177 184 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 178 185 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 179 186 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 180 | 
            -
                     | 
| 187 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should == 10
         | 
| 181 188 | 
             
                  end
         | 
| 182 189 | 
             
                end
         | 
| 183 190 |  | 
| @@ -191,23 +198,24 @@ describe Cobweb, :local_only => true do | |
| 191 198 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 192 199 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 193 200 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 194 | 
            -
                     | 
| 201 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should == @base_page_count
         | 
| 195 202 | 
             
                  end
         | 
| 196 203 | 
             
                  it "should notify of crawl finished once" do
         | 
| 197 204 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 198 205 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 199 206 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 200 | 
            -
                     | 
| 207 | 
            +
                    @redis.get("crawl_finished_enqueued_count").to_i.should == 1
         | 
| 201 208 | 
             
                  end
         | 
| 202 209 | 
             
                  it "should not crawl 100 pages" do
         | 
| 203 210 | 
             
                    crawl = @cobweb.start(@base_url)
         | 
| 204 211 | 
             
                    @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 205 212 | 
             
                    wait_for_crawl_finished crawl[:crawl_id]
         | 
| 206 | 
            -
                     | 
| 213 | 
            +
                    @redis.get("crawl_job_enqueued_count").to_i.should_not == 100
         | 
| 207 214 | 
             
                  end
         | 
| 208 215 | 
             
                end
         | 
| 209 216 | 
             
              end
         | 
| 210 217 |  | 
| 218 | 
            +
             | 
| 211 219 | 
             
              after(:all) do
         | 
| 212 220 |  | 
| 213 221 | 
             
                @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
         | 
| @@ -220,26 +228,43 @@ describe Cobweb, :local_only => true do | |
| 220 228 | 
             
            end
         | 
| 221 229 |  | 
| 222 230 | 
             
            def wait_for_crawl_finished(crawl_id, timeout=20)
         | 
| 223 | 
            -
              counter = 0
         | 
| 231 | 
            +
              @counter = 0
         | 
| 224 232 | 
             
              start_time = Time.now
         | 
| 225 233 | 
             
              while(running?(crawl_id) && Time.now < start_time + timeout) do
         | 
| 226 | 
            -
             | 
| 227 | 
            -
                end
         | 
| 228 | 
            -
                if Time.now > start_time + timeout
         | 
| 229 | 
            -
                  raise "End of crawl not detected"
         | 
| 230 | 
            -
                end
         | 
| 234 | 
            +
                sleep 0.5
         | 
| 231 235 | 
             
              end
         | 
| 232 | 
            -
             | 
| 233 | 
            -
             | 
| 234 | 
            -
                @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
         | 
| 236 | 
            +
              if Time.now > start_time + timeout
         | 
| 237 | 
            +
                raise "End of crawl not detected"
         | 
| 235 238 | 
             
              end
         | 
| 239 | 
            +
            end
         | 
| 236 240 |  | 
| 237 | 
            -
             | 
| 238 | 
            -
             | 
| 239 | 
            -
             | 
| 241 | 
            +
            def running?(crawl_id)
         | 
| 242 | 
            +
              status = @stat.get_status
         | 
| 243 | 
            +
              result = true
         | 
| 244 | 
            +
              if status == CobwebCrawlHelper::STARTING
         | 
| 245 | 
            +
                result = true
         | 
| 246 | 
            +
              else
         | 
| 247 | 
            +
                if status == @last_stat
         | 
| 248 | 
            +
                  if @counter > 5
         | 
| 249 | 
            +
                    raise "Static status: #{status}"
         | 
| 250 | 
            +
                  else
         | 
| 251 | 
            +
                    @counter += 1
         | 
| 252 | 
            +
                  end
         | 
| 253 | 
            +
                  puts "Static Status.. #{6-@counter}"
         | 
| 254 | 
            +
                else
         | 
| 255 | 
            +
                  result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
         | 
| 240 256 | 
             
                end
         | 
| 257 | 
            +
              end
         | 
| 258 | 
            +
              @last_stat = @stat.get_status
         | 
| 259 | 
            +
              result
         | 
| 260 | 
            +
            end
         | 
| 241 261 |  | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 244 | 
            -
                Resque. | 
| 262 | 
            +
            def clear_queues
         | 
| 263 | 
            +
              Resque.queues.each do |queue|
         | 
| 264 | 
            +
                Resque.remove_queue(queue)
         | 
| 245 265 | 
             
              end
         | 
| 266 | 
            +
             | 
| 267 | 
            +
              Resque.size("cobweb_process_job").should == 0
         | 
| 268 | 
            +
              Resque.size("cobweb_finished_job").should == 0
         | 
| 269 | 
            +
              Resque.peek("cobweb_process_job", 0, 200).should be_empty
         | 
| 270 | 
            +
            end
         | 
| @@ -76,11 +76,9 @@ describe ContentLinkParser do | |
| 76 76 | 
             
                      links.length.should == 3
         | 
| 77 77 | 
             
                    end
         | 
| 78 78 | 
             
                  end
         | 
| 79 | 
            -
                  describe "returning unknown link type" do
         | 
| 79 | 
            +
                  describe "returning unknown link type should raise an error" do
         | 
| 80 80 | 
             
                    it "should return an empty array" do
         | 
| 81 | 
            -
                       | 
| 82 | 
            -
                      links.should_not be_nil
         | 
| 83 | 
            -
                      links.should be_an_instance_of Array
         | 
| 81 | 
            +
                      lambda {@content_parser.asdfasdfsadf}.should raise_error
         | 
| 84 82 | 
             
                    end
         | 
| 85 83 | 
             
                  end
         | 
| 86 84 | 
             
                end
         | 
| @@ -122,7 +120,7 @@ describe ContentLinkParser do | |
| 122 120 | 
             
                describe "ignoring default tags" do
         | 
| 123 121 | 
             
                  it "should not return any links" do
         | 
| 124 122 | 
             
                    parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
         | 
| 125 | 
            -
                    parser.links.should  | 
| 123 | 
            +
                    lambda{parser.links}.should raise_error(NoMethodError)
         | 
| 126 124 | 
             
                  end
         | 
| 127 125 | 
             
                end
         | 
| 128 126 | 
             
              end
         | 
| @@ -0,0 +1,101 @@ | |
| 1 | 
            +
            require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            describe Cobweb, :local_only => true do
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              before(:all) do
         | 
| 6 | 
            +
                #store all existing resque process ids so we don't kill them afterwards
         | 
| 7 | 
            +
                @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
         | 
| 10 | 
            +
                puts "Starting Workers... Please Wait..."
         | 
| 11 | 
            +
                `mkdir log`
         | 
| 12 | 
            +
                io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
         | 
| 13 | 
            +
                puts "Workers Started."
         | 
| 14 | 
            +
             | 
| 15 | 
            +
              end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
              before(:each) do
         | 
| 18 | 
            +
                @base_url = "http://localhost:3532/"
         | 
| 19 | 
            +
                @base_page_count = 77
         | 
| 20 | 
            +
                clear_queues
         | 
| 21 | 
            +
              end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
              describe "with a crawl limit" do
         | 
| 24 | 
            +
                before(:each) do
         | 
| 25 | 
            +
                  @request = {
         | 
| 26 | 
            +
                    :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
         | 
| 27 | 
            +
                    :quiet => true,
         | 
| 28 | 
            +
                    :cache => nil,
         | 
| 29 | 
            +
                    :use_encoding_safe_process_job => true,
         | 
| 30 | 
            +
                    :crawl_limit_by_page => true
         | 
| 31 | 
            +
                  }
         | 
| 32 | 
            +
                end
         | 
| 33 | 
            +
                
         | 
| 34 | 
            +
                describe "on ancestry.com.au" do
         | 
| 35 | 
            +
                  describe "limited to 100" do
         | 
| 36 | 
            +
                    before(:each) do
         | 
| 37 | 
            +
                      @request[:crawl_limit] = 100
         | 
| 38 | 
            +
                      @request[:valid_mime_types] = ["text/html"]
         | 
| 39 | 
            +
                      @cobweb = Cobweb.new @request        
         | 
| 40 | 
            +
                    end
         | 
| 41 | 
            +
                    
         | 
| 42 | 
            +
                    it "should crawl 100 pages" do
         | 
| 43 | 
            +
                      crawl = @cobweb.start("http://www.ancestry.com.au/")
         | 
| 44 | 
            +
                      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 45 | 
            +
                      wait_for_crawl_finished crawl[:crawl_id], 180
         | 
| 46 | 
            +
                      puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
         | 
| 47 | 
            +
                    end
         | 
| 48 | 
            +
                  end
         | 
| 49 | 
            +
                  
         | 
| 50 | 
            +
                  describe "limited to 999" do
         | 
| 51 | 
            +
                    before(:each) do
         | 
| 52 | 
            +
                      @request[:crawl_limit] = 999
         | 
| 53 | 
            +
                      @cobweb = Cobweb.new @request        
         | 
| 54 | 
            +
                    end
         | 
| 55 | 
            +
                  
         | 
| 56 | 
            +
                    it "should crawl 999 pages" do
         | 
| 57 | 
            +
                      crawl = @cobweb.start("http://www.ancestry.com.au/")
         | 
| 58 | 
            +
                      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
         | 
| 59 | 
            +
                      wait_for_crawl_finished crawl[:crawl_id], 720
         | 
| 60 | 
            +
                      puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
         | 
| 61 | 
            +
                    end
         | 
| 62 | 
            +
                  end
         | 
| 63 | 
            +
                __END__
         | 
| 64 | 
            +
                
         | 
| 65 | 
            +
              end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
              after(:all) do
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
         | 
| 70 | 
            +
                command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
         | 
| 71 | 
            +
                IO.popen(command)
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                clear_queues
         | 
| 74 | 
            +
              end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            end
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            def wait_for_crawl_finished(crawl_id, timeout=20)
         | 
| 79 | 
            +
              counter = 0
         | 
| 80 | 
            +
              start_time = Time.now
         | 
| 81 | 
            +
              while(running?(crawl_id) && Time.now < start_time + timeout) do
         | 
| 82 | 
            +
                  sleep 0.5
         | 
| 83 | 
            +
                end
         | 
| 84 | 
            +
                if Time.now > start_time + timeout
         | 
| 85 | 
            +
                  raise "End of crawl not detected"
         | 
| 86 | 
            +
                end
         | 
| 87 | 
            +
              end
         | 
| 88 | 
            +
             | 
| 89 | 
            +
              def running?(crawl_id)
         | 
| 90 | 
            +
                @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
         | 
| 91 | 
            +
              end
         | 
| 92 | 
            +
             | 
| 93 | 
            +
              def clear_queues
         | 
| 94 | 
            +
                Resque.queues.each do |queue|
         | 
| 95 | 
            +
                  Resque.remove_queue(queue)
         | 
| 96 | 
            +
                end
         | 
| 97 | 
            +
             | 
| 98 | 
            +
                Resque.size("cobweb_process_job").should == 0
         | 
| 99 | 
            +
                Resque.size("cobweb_finished_job").should == 0
         | 
| 100 | 
            +
                Resque.peek("cobweb_process_job", 0, 200).should be_empty
         | 
| 101 | 
            +
              end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: cobweb
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0. | 
| 4 | 
            +
              version: 0.0.74
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -9,11 +9,11 @@ authors: | |
| 9 9 | 
             
            autorequire: 
         | 
| 10 10 | 
             
            bindir: bin
         | 
| 11 11 | 
             
            cert_chain: []
         | 
| 12 | 
            -
            date: 2012- | 
| 12 | 
            +
            date: 2012-10-15 00:00:00.000000000 Z
         | 
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: resque
         | 
| 16 | 
            -
              requirement: & | 
| 16 | 
            +
              requirement: &70347429190520 !ruby/object:Gem::Requirement
         | 
| 17 17 | 
             
                none: false
         | 
| 18 18 | 
             
                requirements:
         | 
| 19 19 | 
             
                - - ! '>='
         | 
| @@ -21,10 +21,10 @@ dependencies: | |
| 21 21 | 
             
                    version: '0'
         | 
| 22 22 | 
             
              type: :runtime
         | 
| 23 23 | 
             
              prerelease: false
         | 
| 24 | 
            -
              version_requirements: * | 
| 24 | 
            +
              version_requirements: *70347429190520
         | 
| 25 25 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 26 26 | 
             
              name: redis
         | 
| 27 | 
            -
              requirement: & | 
| 27 | 
            +
              requirement: &70347429190020 !ruby/object:Gem::Requirement
         | 
| 28 28 | 
             
                none: false
         | 
| 29 29 | 
             
                requirements:
         | 
| 30 30 | 
             
                - - ! '>='
         | 
| @@ -32,10 +32,10 @@ dependencies: | |
| 32 32 | 
             
                    version: '0'
         | 
| 33 33 | 
             
              type: :runtime
         | 
| 34 34 | 
             
              prerelease: false
         | 
| 35 | 
            -
              version_requirements: * | 
| 35 | 
            +
              version_requirements: *70347429190020
         | 
| 36 36 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 37 37 | 
             
              name: nokogiri
         | 
| 38 | 
            -
              requirement: & | 
| 38 | 
            +
              requirement: &70347429189540 !ruby/object:Gem::Requirement
         | 
| 39 39 | 
             
                none: false
         | 
| 40 40 | 
             
                requirements:
         | 
| 41 41 | 
             
                - - ! '>='
         | 
| @@ -43,10 +43,10 @@ dependencies: | |
| 43 43 | 
             
                    version: '0'
         | 
| 44 44 | 
             
              type: :runtime
         | 
| 45 45 | 
             
              prerelease: false
         | 
| 46 | 
            -
              version_requirements: * | 
| 46 | 
            +
              version_requirements: *70347429189540
         | 
| 47 47 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 48 48 | 
             
              name: addressable
         | 
| 49 | 
            -
              requirement: & | 
| 49 | 
            +
              requirement: &70347429188880 !ruby/object:Gem::Requirement
         | 
| 50 50 | 
             
                none: false
         | 
| 51 51 | 
             
                requirements:
         | 
| 52 52 | 
             
                - - ! '>='
         | 
| @@ -54,10 +54,10 @@ dependencies: | |
| 54 54 | 
             
                    version: '0'
         | 
| 55 55 | 
             
              type: :runtime
         | 
| 56 56 | 
             
              prerelease: false
         | 
| 57 | 
            -
              version_requirements: * | 
| 57 | 
            +
              version_requirements: *70347429188880
         | 
| 58 58 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 59 59 | 
             
              name: rspec
         | 
| 60 | 
            -
              requirement: & | 
| 60 | 
            +
              requirement: &70347429187340 !ruby/object:Gem::Requirement
         | 
| 61 61 | 
             
                none: false
         | 
| 62 62 | 
             
                requirements:
         | 
| 63 63 | 
             
                - - ! '>='
         | 
| @@ -65,10 +65,10 @@ dependencies: | |
| 65 65 | 
             
                    version: '0'
         | 
| 66 66 | 
             
              type: :runtime
         | 
| 67 67 | 
             
              prerelease: false
         | 
| 68 | 
            -
              version_requirements: * | 
| 68 | 
            +
              version_requirements: *70347429187340
         | 
| 69 69 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 70 70 | 
             
              name: awesome_print
         | 
| 71 | 
            -
              requirement: & | 
| 71 | 
            +
              requirement: &70347429185820 !ruby/object:Gem::Requirement
         | 
| 72 72 | 
             
                none: false
         | 
| 73 73 | 
             
                requirements:
         | 
| 74 74 | 
             
                - - ! '>='
         | 
| @@ -76,10 +76,10 @@ dependencies: | |
| 76 76 | 
             
                    version: '0'
         | 
| 77 77 | 
             
              type: :runtime
         | 
| 78 78 | 
             
              prerelease: false
         | 
| 79 | 
            -
              version_requirements: * | 
| 79 | 
            +
              version_requirements: *70347429185820
         | 
| 80 80 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 81 81 | 
             
              name: sinatra
         | 
| 82 | 
            -
              requirement: & | 
| 82 | 
            +
              requirement: &70347429185040 !ruby/object:Gem::Requirement
         | 
| 83 83 | 
             
                none: false
         | 
| 84 84 | 
             
                requirements:
         | 
| 85 85 | 
             
                - - ! '>='
         | 
| @@ -87,10 +87,10 @@ dependencies: | |
| 87 87 | 
             
                    version: '0'
         | 
| 88 88 | 
             
              type: :runtime
         | 
| 89 89 | 
             
              prerelease: false
         | 
| 90 | 
            -
              version_requirements: * | 
| 90 | 
            +
              version_requirements: *70347429185040
         | 
| 91 91 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 92 92 | 
             
              name: thin
         | 
| 93 | 
            -
              requirement: & | 
| 93 | 
            +
              requirement: &70347429184340 !ruby/object:Gem::Requirement
         | 
| 94 94 | 
             
                none: false
         | 
| 95 95 | 
             
                requirements:
         | 
| 96 96 | 
             
                - - ! '>='
         | 
| @@ -98,10 +98,10 @@ dependencies: | |
| 98 98 | 
             
                    version: '0'
         | 
| 99 99 | 
             
              type: :runtime
         | 
| 100 100 | 
             
              prerelease: false
         | 
| 101 | 
            -
              version_requirements: * | 
| 101 | 
            +
              version_requirements: *70347429184340
         | 
| 102 102 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 103 103 | 
             
              name: haml
         | 
| 104 | 
            -
              requirement: & | 
| 104 | 
            +
              requirement: &70347429183120 !ruby/object:Gem::Requirement
         | 
| 105 105 | 
             
                none: false
         | 
| 106 106 | 
             
                requirements:
         | 
| 107 107 | 
             
                - - ! '>='
         | 
| @@ -109,10 +109,10 @@ dependencies: | |
| 109 109 | 
             
                    version: '0'
         | 
| 110 110 | 
             
              type: :runtime
         | 
| 111 111 | 
             
              prerelease: false
         | 
| 112 | 
            -
              version_requirements: * | 
| 112 | 
            +
              version_requirements: *70347429183120
         | 
| 113 113 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 114 114 | 
             
              name: namespaced_redis
         | 
| 115 | 
            -
              requirement: & | 
| 115 | 
            +
              requirement: &70347429181840 !ruby/object:Gem::Requirement
         | 
| 116 116 | 
             
                none: false
         | 
| 117 117 | 
             
                requirements:
         | 
| 118 118 | 
             
                - - ! '>='
         | 
| @@ -120,10 +120,10 @@ dependencies: | |
| 120 120 | 
             
                    version: 1.0.2
         | 
| 121 121 | 
             
              type: :runtime
         | 
| 122 122 | 
             
              prerelease: false
         | 
| 123 | 
            -
              version_requirements: * | 
| 123 | 
            +
              version_requirements: *70347429181840
         | 
| 124 124 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 125 125 | 
             
              name: json
         | 
| 126 | 
            -
              requirement: & | 
| 126 | 
            +
              requirement: &70347429180860 !ruby/object:Gem::Requirement
         | 
| 127 127 | 
             
                none: false
         | 
| 128 128 | 
             
                requirements:
         | 
| 129 129 | 
             
                - - ! '>='
         | 
| @@ -131,7 +131,7 @@ dependencies: | |
| 131 131 | 
             
                    version: '0'
         | 
| 132 132 | 
             
              type: :runtime
         | 
| 133 133 | 
             
              prerelease: false
         | 
| 134 | 
            -
              version_requirements: * | 
| 134 | 
            +
              version_requirements: *70347429180860
         | 
| 135 135 | 
             
            description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
         | 
| 136 136 | 
             
              crawl extremely large sites which is much more perofmant than multi-threaded crawlers.  It
         | 
| 137 137 | 
             
              is also a standalone crawler that has a sophisticated statistics monitoring interface
         | 
| @@ -142,13 +142,14 @@ extensions: [] | |
| 142 142 | 
             
            extra_rdoc_files:
         | 
| 143 143 | 
             
            - README.textile
         | 
| 144 144 | 
             
            files:
         | 
| 145 | 
            +
            - spec/cobweb/cobweb_crawl_helper_spec.rb
         | 
| 145 146 | 
             
            - spec/cobweb/cobweb_crawler_spec.rb
         | 
| 146 147 | 
             
            - spec/cobweb/cobweb_job_spec.rb
         | 
| 147 148 | 
             
            - spec/cobweb/cobweb_links_spec.rb
         | 
| 148 149 | 
             
            - spec/cobweb/cobweb_spec.rb
         | 
| 149 150 | 
             
            - spec/cobweb/content_link_parser_spec.rb
         | 
| 150 | 
            -
            - spec/cobweb/crawl_spec.rb
         | 
| 151 151 | 
             
            - spec/cobweb/robots_spec.rb
         | 
| 152 | 
            +
            - spec/cobweb/site_test_spec.rb.tmp
         | 
| 152 153 | 
             
            - spec/samples/robots.txt
         | 
| 153 154 | 
             
            - spec/samples/sample_html_links.html
         | 
| 154 155 | 
             
            - spec/samples/sample_server.rb
         | 
| @@ -328,7 +329,9 @@ files: | |
| 328 329 | 
             
            - lib/cobweb_process_job.rb
         | 
| 329 330 | 
             
            - lib/cobweb_version.rb
         | 
| 330 331 | 
             
            - lib/content_link_parser.rb
         | 
| 332 | 
            +
            - lib/crawl.rb
         | 
| 331 333 | 
             
            - lib/crawl_job.rb
         | 
| 334 | 
            +
            - lib/crawl_object.rb
         | 
| 332 335 | 
             
            - lib/encoding_safe_process_job.rb
         | 
| 333 336 | 
             
            - lib/hash_util.rb
         | 
| 334 337 | 
             
            - lib/redirect_error.rb
         |