cobweb 1.0.28 → 1.0.29
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +71 -67
- data/lib/cobweb.rb +41 -41
- data/lib/cobweb_crawl_helper.rb +1 -5
- data/lib/cobweb_version.rb +2 -2
- data/lib/crawl_worker.rb +14 -14
- data/lib/export_command.rb +3 -3
- data/lib/report_command.rb +1 -1
- data/lib/string.rb +4 -9
- data/spec/cobweb/cobweb_crawler_spec.rb +15 -15
- data/spec/cobweb/crawl_job_spec.rb +8 -6
- data/spec/cobweb/crawl_worker_spec.rb +32 -32
- data/spec/samples/sample_site/{boxgrid>withsillyname.html → boxgridwithsillyname.html} +37 -37
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ba10134e0b3da7418f0a1a5772ca196cf5525066
|
4
|
+
data.tar.gz: ae2f27f0036172b001968e277913d0038549220b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 175e8dedf0592c1cc8e9abb50cc0efa835ffb8e1d3c0df3ecb3c3a9900e24a1a2aebc479c5188ea44d57de210b559e0637efb8beac4f90c49f3f6e54bc0492d7
|
7
|
+
data.tar.gz: d4fc557a20e3b4d54daaecaa151b5365e4a483067b6e2cc4429f846e47066d1326a868383f2d1a2031159dc21cfbd053854e2c12ea57cd1eeb2de999f8aec2ab
|
data/README.textile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
h1. Cobweb v1.0.
|
1
|
+
h1. Cobweb v1.0.29
|
2
2
|
|
3
3
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
4
4
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
@@ -34,21 +34,21 @@ h3. Command Line
|
|
34
34
|
h3. Data Returned For Each Page
|
35
35
|
The data available in the returned hash are:
|
36
36
|
|
37
|
-
*
|
38
|
-
*
|
39
|
-
*
|
40
|
-
*
|
41
|
-
*
|
42
|
-
*
|
43
|
-
*
|
44
|
-
*
|
45
|
-
*
|
46
|
-
*
|
47
|
-
**
|
48
|
-
**
|
49
|
-
**
|
50
|
-
**
|
51
|
-
**
|
37
|
+
* @:url@ - url of the resource requested
|
38
|
+
* @:status_code@ - status code of the resource requested
|
39
|
+
* @:mime_type@ - content type of the resource
|
40
|
+
* @:character_set@ - character set of content determined from content type
|
41
|
+
* @:length@ - length of the content returned
|
42
|
+
* @:body@ - content of the resource
|
43
|
+
* @:location@ - location header if returned
|
44
|
+
* @:redirect_through@ - if you're following redirects, any redirects are stored here detailing where you were redirected through to get to the final location
|
45
|
+
* @:headers@ - hash of the headers returned
|
46
|
+
* @:links@ - hash of links on the page split into types
|
47
|
+
** @:links@ - urls from a tags within the resource
|
48
|
+
** @:images@ - urls from img tags within the resource
|
49
|
+
** @:related@ - urls from link tags
|
50
|
+
** @:scripts@ - urls from script tags
|
51
|
+
** @:styles@ - urls from within link tags with rel of stylesheet and from url() directives with stylesheets
|
52
52
|
|
53
53
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as I have time!)
|
54
54
|
|
@@ -58,23 +58,23 @@ h3. Statistics
|
|
58
58
|
|
59
59
|
The data available within statistics is as follows:
|
60
60
|
|
61
|
-
*
|
62
|
-
*
|
63
|
-
*
|
64
|
-
*
|
65
|
-
*
|
66
|
-
*
|
67
|
-
*
|
68
|
-
*
|
69
|
-
*
|
70
|
-
*
|
71
|
-
*
|
72
|
-
*
|
73
|
-
*
|
74
|
-
*
|
75
|
-
*
|
76
|
-
*
|
77
|
-
*
|
61
|
+
* @:average_length@ - average size of each object
|
62
|
+
* @:minimum_length@ - minimum length returned
|
63
|
+
* @:queued_at@ - date and time that the crawl was started at (eg: "2012-09-10T23:10:08+01:00")
|
64
|
+
* @:maximum_length@ - maximum length of object received
|
65
|
+
* @:status_counts@ - hash with the status returned as the key and value as number of pages (eg: {"404" => 1, "200" => 1})
|
66
|
+
* @:mime_counts@ - hash containing the mime type as key and count of pages as value (eg: {"text/html" => 8, "image/jpeg" => 25})
|
67
|
+
* @:queue_counter@ - size of queue waiting to be processed for crawl
|
68
|
+
* @:page_count@ - number of html pages retrieved
|
69
|
+
* @:total_length@ - total size of data received
|
70
|
+
* @:current_status@ - Current status of crawl
|
71
|
+
* @:asset_count@ - count of non-html objects received
|
72
|
+
* @:page_size@ - total size of pages received
|
73
|
+
* @:average_response_time@ - average response time of all objects
|
74
|
+
* @:crawl_counter@ - number of objects that have been crawled
|
75
|
+
* @:minimum_response_time@ - quickest response time of crawl
|
76
|
+
* @:maximum_response_time@ - longest response time of crawl
|
77
|
+
* @:asset_size@ - total size of all non-assets received
|
78
78
|
|
79
79
|
h2. Installation
|
80
80
|
|
@@ -82,6 +82,10 @@ Install crawler as a gem
|
|
82
82
|
|
83
83
|
bc. gem install cobweb
|
84
84
|
|
85
|
+
or in a @Gemfile@
|
86
|
+
|
87
|
+
bc. gem 'cobweb'
|
88
|
+
|
85
89
|
h2. Usage
|
86
90
|
|
87
91
|
h3. Cobweb
|
@@ -90,42 +94,42 @@ h4. new(options)
|
|
90
94
|
|
91
95
|
Creates a new crawler object based on a base_url
|
92
96
|
|
93
|
-
* options -
|
94
|
-
|
95
|
-
**
|
96
|
-
**
|
97
|
-
**
|
98
|
-
**
|
99
|
-
**
|
100
|
-
**
|
101
|
-
**
|
102
|
-
**
|
103
|
-
**
|
104
|
-
**
|
105
|
-
**
|
106
|
-
**
|
107
|
-
**
|
108
|
-
**
|
109
|
-
**
|
110
|
-
**
|
111
|
-
**
|
112
|
-
**
|
113
|
-
**
|
114
|
-
**
|
115
|
-
**
|
116
|
-
**
|
117
|
-
**
|
118
|
-
**
|
119
|
-
**
|
97
|
+
* options - The following hash keys can be defined:
|
98
|
+
|
99
|
+
** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
|
100
|
+
** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
|
101
|
+
** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
|
102
|
+
** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
|
103
|
+
** @:debug@ - enables debug output (Default: false)
|
104
|
+
** @:quiet@ - hides default output (Default: false)
|
105
|
+
** @:cache@ - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
106
|
+
** @:timeout@ - http timeout for requests (Default: 10)
|
107
|
+
** @:redis_options@ - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
|
108
|
+
** @:internal_urls@ - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
|
109
|
+
** @:first_page_redirect_internal@ - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
|
110
|
+
** @:crawl_id@ - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
111
|
+
** @:internal_urls@ - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
112
|
+
** @:external_urls@ - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
|
113
|
+
** @:seed_urls@ - an array of urls that are put into the queue regardless of any other setting, combine with {:external_urls => "*"} to limit to seed urls
|
114
|
+
** @:obey_robots@ - boolean determining if robots.txt should be honoured. (default: false)
|
115
|
+
** @:user_agent@ - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
116
|
+
** @:crawl_limit_by_page@ - sets the crawl counter to only use html page types when counting objects crawled
|
117
|
+
** @:valid_mime_types@ - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
|
118
|
+
** @:direct_call_process_job@ - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
|
119
|
+
** @:raise_exceptions@ - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
|
120
|
+
** @:use_encoding_safe_process_job@ - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
|
121
|
+
** @:proxy_addr@ - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
|
122
|
+
** @:proxy_port@ - port number of the proxy (default: nil)
|
123
|
+
** @:treat_https_as_http@ - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
|
120
124
|
|
121
125
|
|
122
126
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
123
127
|
|
124
128
|
h4. start(base_url)
|
125
129
|
|
126
|
-
Starts a crawl through resque. Requires the
|
130
|
+
Starts a crawl through resque. Requires the @:processing_queue@ to be set to a valid class for the resque job to work with the data retrieved.
|
127
131
|
|
128
|
-
* base_url - the url to start the crawl from
|
132
|
+
* @base_url@ - the url to start the crawl from
|
129
133
|
|
130
134
|
Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
|
131
135
|
|
@@ -135,7 +139,7 @@ h4. get(url)
|
|
135
139
|
|
136
140
|
Simple get that obeys the options supplied in new.
|
137
141
|
|
138
|
-
* url - url requested
|
142
|
+
* @url@ - url requested
|
139
143
|
|
140
144
|
bc. crawler.get("http://www.google.com/")
|
141
145
|
|
@@ -143,14 +147,14 @@ h4. head(url)
|
|
143
147
|
|
144
148
|
Simple get that obeys the options supplied in new.
|
145
149
|
|
146
|
-
* url - url requested
|
150
|
+
* @url@ - url requested
|
147
151
|
|
148
152
|
bc. crawler.head("http://www.google.com/")
|
149
153
|
|
150
154
|
|
151
155
|
h4. Processing Queue
|
152
156
|
|
153
|
-
The
|
157
|
+
The @:processing_queue@ option is used to specify the class that contains the resque perform method to pass the content onto. This class should be defined in your application to perform any tasks you wish to the content. There are two options however, for running this. Firstly, the default settings will push the content crawled onto a resque queue for that class. This allows you the flexibility of running in queues on separate machines etc. The main drawback to this is that all your content is stored in redis within the queue. This can be memory intensive if you are crawling large sites, or have large content that is being crawled. To get around this you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is performed by using the :direct_call_process_job. If you set that option to 'true' then instead of the job being queued, it will be executed within the crawl_job queue.
|
154
158
|
|
155
159
|
|
156
160
|
h3. CobwebCrawler
|
@@ -169,7 +173,7 @@ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:ass
|
|
169
173
|
|
170
174
|
There are some specific options for CobwebCrawler in addition to the normal cobweb options
|
171
175
|
|
172
|
-
* thread_count - specifies the number of threads used by the crawler, defaults to 1
|
176
|
+
* @thread_count@ - specifies the number of threads used by the crawler, defaults to 1
|
173
177
|
|
174
178
|
h3. CobwebCrawlHelper
|
175
179
|
|
@@ -177,7 +181,7 @@ The CobwebCrawlHelper class is a helper class to assist in getting information a
|
|
177
181
|
|
178
182
|
bc. crawl = CobwebCrawlHelper.new(options)
|
179
183
|
|
180
|
-
* options - the hash of options passed into Cobweb.new (must include a
|
184
|
+
* @options@ - the hash of options passed into Cobweb.new (must include a @:crawl_id@)
|
181
185
|
|
182
186
|
|
183
187
|
|
data/lib/cobweb.rb
CHANGED
@@ -8,17 +8,17 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
|
|
8
8
|
require file
|
9
9
|
end
|
10
10
|
|
11
|
-
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
11
|
+
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
12
12
|
|
13
13
|
|
14
14
|
# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
|
15
15
|
class Cobweb
|
16
|
-
|
16
|
+
|
17
17
|
# retrieves current version
|
18
18
|
def self.version
|
19
19
|
CobwebVersion.version
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
# used for setting default options
|
23
23
|
def method_missing(method_sym, *arguments, &block)
|
24
24
|
if method_sym.to_s =~ /^default_(.*)_to$/
|
@@ -28,7 +28,7 @@ class Cobweb
|
|
28
28
|
super
|
29
29
|
end
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
# See readme for more information on options available
|
33
33
|
def initialize(options = {})
|
34
34
|
@options = options
|
@@ -41,7 +41,7 @@ class Cobweb
|
|
41
41
|
default_crawl_finished_queue_to "CobwebFinishedJob"
|
42
42
|
else
|
43
43
|
default_processing_queue_to "CrawlProcessWorker"
|
44
|
-
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
44
|
+
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
45
45
|
end
|
46
46
|
default_quiet_to true
|
47
47
|
default_debug_to false
|
@@ -66,22 +66,22 @@ class Cobweb
|
|
66
66
|
|
67
67
|
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
# This method starts the resque based crawl and enqueues the base_url
|
71
71
|
def start(base_url)
|
72
72
|
raise ":base_url is required" unless base_url
|
73
73
|
request = {
|
74
74
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
75
|
-
:url => base_url
|
76
|
-
}
|
77
|
-
|
75
|
+
:url => base_url
|
76
|
+
}
|
77
|
+
|
78
78
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
79
79
|
uri = Addressable::URI.parse(base_url)
|
80
80
|
@options[:internal_urls] = []
|
81
81
|
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
82
82
|
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
83
83
|
end
|
84
|
-
|
84
|
+
|
85
85
|
request.merge!(@options)
|
86
86
|
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
|
87
87
|
@redis.set("original_base_url", base_url)
|
@@ -90,10 +90,10 @@ class Cobweb
|
|
90
90
|
@redis.set("queue-counter", 1)
|
91
91
|
|
92
92
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
93
|
-
|
93
|
+
|
94
94
|
@stats = Stats.new(request)
|
95
95
|
@stats.start_crawl(request)
|
96
|
-
|
96
|
+
|
97
97
|
# add internal_urls into redis
|
98
98
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
99
99
|
if @options[:queue_system] == :resque
|
@@ -103,10 +103,10 @@ class Cobweb
|
|
103
103
|
else
|
104
104
|
raise "Unknown queue system: #{content_request[:queue_system]}"
|
105
105
|
end
|
106
|
-
|
106
|
+
|
107
107
|
request
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
# Returns array of cookies from content
|
111
111
|
def get_cookies(response)
|
112
112
|
all_cookies = response.get_fields('set-cookie')
|
@@ -134,7 +134,7 @@ class Cobweb
|
|
134
134
|
else
|
135
135
|
redirect_limit = 10
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
# connect to redis
|
139
139
|
if options.has_key? :crawl_id
|
140
140
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
@@ -147,7 +147,7 @@ class Cobweb
|
|
147
147
|
|
148
148
|
# check if it has already been cached
|
149
149
|
if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
150
|
-
if @options[:cache_type] == :crawl_based
|
150
|
+
if @options[:cache_type] == :crawl_based
|
151
151
|
puts "Cache hit in crawl for #{url}" unless @options[:quiet]
|
152
152
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
153
153
|
else
|
@@ -183,7 +183,7 @@ class Cobweb
|
|
183
183
|
if @options[:range]
|
184
184
|
request.set_range(@options[:range])
|
185
185
|
end
|
186
|
-
|
186
|
+
|
187
187
|
response = @http.request request
|
188
188
|
|
189
189
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
@@ -204,11 +204,11 @@ class Cobweb
|
|
204
204
|
content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
|
205
205
|
content[:redirect_through].insert(0, url)
|
206
206
|
content[:url] = content[:redirect_through].last
|
207
|
-
|
207
|
+
|
208
208
|
content[:response_time] = Time.now.to_f - request_time
|
209
209
|
else
|
210
210
|
content[:response_time] = Time.now.to_f - request_time
|
211
|
-
|
211
|
+
|
212
212
|
puts "Retrieved." unless @options[:quiet]
|
213
213
|
|
214
214
|
# create the content container
|
@@ -237,7 +237,7 @@ class Cobweb
|
|
237
237
|
# parse data for links
|
238
238
|
link_parser = ContentLinkParser.new(content[:url], content[:body])
|
239
239
|
content[:links] = link_parser.link_data
|
240
|
-
|
240
|
+
|
241
241
|
end
|
242
242
|
# add content to cache if required
|
243
243
|
if @options[:cache]
|
@@ -252,10 +252,10 @@ class Cobweb
|
|
252
252
|
rescue RedirectError => e
|
253
253
|
if @options[:raise_exceptions]
|
254
254
|
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
-
raise e
|
255
|
+
raise e
|
256
256
|
end
|
257
257
|
puts "ERROR RedirectError: #{e.message}"
|
258
|
-
|
258
|
+
|
259
259
|
## generate a blank content
|
260
260
|
content = {}
|
261
261
|
content[:url] = uri.to_s
|
@@ -267,11 +267,11 @@ class Cobweb
|
|
267
267
|
content[:mime_type] = "error/dnslookup"
|
268
268
|
content[:headers] = {}
|
269
269
|
content[:links] = {}
|
270
|
-
|
270
|
+
|
271
271
|
rescue SocketError => e
|
272
272
|
raise e if @options[:raise_exceptions]
|
273
273
|
puts "ERROR SocketError: #{e.message}"
|
274
|
-
|
274
|
+
|
275
275
|
## generate a blank content
|
276
276
|
content = {}
|
277
277
|
content[:url] = uri.to_s
|
@@ -283,11 +283,11 @@ class Cobweb
|
|
283
283
|
content[:mime_type] = "error/dnslookup"
|
284
284
|
content[:headers] = {}
|
285
285
|
content[:links] = {}
|
286
|
-
|
286
|
+
|
287
287
|
rescue Timeout::Error => e
|
288
288
|
raise e if @options[:raise_exceptions]
|
289
289
|
puts "ERROR Timeout::Error: #{e.message}"
|
290
|
-
|
290
|
+
|
291
291
|
## generate a blank content
|
292
292
|
content = {}
|
293
293
|
content[:url] = uri.to_s
|
@@ -306,7 +306,7 @@ class Cobweb
|
|
306
306
|
|
307
307
|
# Performs a HTTP HEAD request to the specified url applying the options supplied
|
308
308
|
def head(url, options = @options)
|
309
|
-
raise "url cannot be nil" if url.nil?
|
309
|
+
raise "url cannot be nil" if url.nil?
|
310
310
|
uri = Addressable::URI.parse(url)
|
311
311
|
uri.normalize!
|
312
312
|
uri.fragment=nil
|
@@ -319,16 +319,16 @@ class Cobweb
|
|
319
319
|
else
|
320
320
|
redirect_limit = 10
|
321
321
|
end
|
322
|
-
|
322
|
+
|
323
323
|
# connect to redis
|
324
324
|
if options.has_key? :crawl_id
|
325
325
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
326
326
|
else
|
327
327
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
328
328
|
end
|
329
|
-
|
329
|
+
|
330
330
|
content = {:base_url => url}
|
331
|
-
|
331
|
+
|
332
332
|
# check if it has already been cached
|
333
333
|
if @options[:cache] && redis.get("head-#{unique_id}")
|
334
334
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
@@ -386,8 +386,8 @@ class Cobweb
|
|
386
386
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
387
387
|
content[:character_set] = charset
|
388
388
|
end
|
389
|
-
end
|
390
|
-
|
389
|
+
end
|
390
|
+
|
391
391
|
# add content to cache if required
|
392
392
|
if @options[:cache]
|
393
393
|
puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
|
@@ -416,7 +416,7 @@ class Cobweb
|
|
416
416
|
rescue SocketError => e
|
417
417
|
raise e if @options[:raise_exceptions]
|
418
418
|
puts "ERROR SocketError: #{e.message}"
|
419
|
-
|
419
|
+
|
420
420
|
## generate a blank content
|
421
421
|
content = {}
|
422
422
|
content[:url] = uri.to_s
|
@@ -428,11 +428,11 @@ class Cobweb
|
|
428
428
|
content[:mime_type] = "error/dnslookup"
|
429
429
|
content[:headers] = {}
|
430
430
|
content[:links] = {}
|
431
|
-
|
431
|
+
|
432
432
|
rescue Timeout::Error => e
|
433
433
|
raise e if @options[:raise_exceptions]
|
434
434
|
puts "ERROR Timeout::Error: #{e.message}"
|
435
|
-
|
435
|
+
|
436
436
|
## generate a blank content
|
437
437
|
content = {}
|
438
438
|
content[:url] = uri.to_s
|
@@ -445,10 +445,10 @@ class Cobweb
|
|
445
445
|
content[:headers] = {}
|
446
446
|
content[:links] = {}
|
447
447
|
end
|
448
|
-
|
448
|
+
|
449
449
|
content
|
450
450
|
end
|
451
|
-
|
451
|
+
|
452
452
|
end
|
453
453
|
|
454
454
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
@@ -456,7 +456,7 @@ class Cobweb
|
|
456
456
|
pattern = pattern.gsub(".", "\\.")
|
457
457
|
pattern = pattern.gsub("?", "\\?")
|
458
458
|
pattern = pattern.gsub("+", "\\\\+")
|
459
|
-
pattern = pattern.gsub("*", ".*?")
|
459
|
+
pattern = pattern.gsub("*", ".*?")
|
460
460
|
if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
|
461
461
|
pattern = pattern.gsub("http:", "https?:")
|
462
462
|
end
|
@@ -464,9 +464,9 @@ class Cobweb
|
|
464
464
|
end
|
465
465
|
|
466
466
|
def clear_cache
|
467
|
-
|
467
|
+
|
468
468
|
end
|
469
|
-
|
469
|
+
|
470
470
|
private
|
471
471
|
# checks if the mime_type is textual
|
472
472
|
def text_content?(content_type)
|
@@ -475,5 +475,5 @@ class Cobweb
|
|
475
475
|
end
|
476
476
|
false
|
477
477
|
end
|
478
|
-
|
478
|
+
|
479
479
|
end
|