cobweb 1.0.28 → 1.0.29

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
- metadata.gz: 4ee0943ca3fabf5cb097b9d6edb324783cc155d6
- data.tar.gz: d79eca877414244d94d5e66062a368bee4104368
+ metadata.gz: ba10134e0b3da7418f0a1a5772ca196cf5525066
+ data.tar.gz: ae2f27f0036172b001968e277913d0038549220b
 SHA512:
- metadata.gz: 420d1fe0daff99694de78846491f97357d42632a78cdc9a29953f95afe93223c40ecbf2fc09b34bf378321322ff9a7958f28a45b352e629d4750af56cb192a0f
- data.tar.gz: e9e38c6a2c33ffff09e76c04e71f673512523a417b37b001a87142ffd8a01148b40140d1c98fb263b562e2278c0ebdb08e830a65467c41ad3fdb2ef654938012
+ metadata.gz: 175e8dedf0592c1cc8e9abb50cc0efa835ffb8e1d3c0df3ecb3c3a9900e24a1a2aebc479c5188ea44d57de210b559e0637efb8beac4f90c49f3f6e54bc0492d7
+ data.tar.gz: d4fc557a20e3b4d54daaecaa151b5365e4a483067b6e2cc4429f846e47066d1326a868383f2d1a2031159dc21cfbd053854e2c12ea57cd1eeb2de999f8aec2ab
@@ -1,4 +1,4 @@
- h1. Cobweb v1.0.28
+ h1. Cobweb v1.0.29
 
  "@cobweb_gem":https://twitter.com/cobweb_gem
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -34,21 +34,21 @@ h3. Command Line
  h3. Data Returned For Each Page
  The data available in the returned hash are:
 
- * :url - url of the resource requested
- * :status_code - status code of the resource requested
- * :mime_type - content type of the resource
- * :character_set - character set of content determined from content type
- * :length - length of the content returned
- * :body - content of the resource
- * :location - location header if returned
- * :redirect_through - if your following redirects, any redirects are stored here detailing where you were redirected through to get to the final location
- * :headers - hash or the headers returned
- * :links - hash or links on the page split in to types
- ** :links - url's from a tags within the resource
- ** :images - url's from img tags within the resource
- ** :related - url's from link tags
- ** :scripts - url's from script tags
- ** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
+ * @:url@ - url of the resource requested
+ * @:status_code@ - status code of the resource requested
+ * @:mime_type@ - content type of the resource
+ * @:character_set@ - character set of the content, determined from the content type
+ * @:length@ - length of the content returned
+ * @:body@ - content of the resource
+ * @:location@ - location header if returned
+ * @:redirect_through@ - if you're following redirects, any redirects are stored here detailing where you were redirected through to get to the final location
+ * @:headers@ - hash of the headers returned
+ * @:links@ - hash of links on the page split into types
+ ** @:links@ - urls from a tags within the resource
+ ** @:images@ - urls from img tags within the resource
+ ** @:related@ - urls from link tags
+ ** @:scripts@ - urls from script tags
+ ** @:styles@ - urls from link tags with rel of stylesheet and from url() directives within stylesheets
 
  The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as I have time!)
 
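For orientation, a minimal sketch of reading a few of these keys from the hash returned by a blocking call (key names are taken from the list above; the URL and option values are placeholders):

    crawler = Cobweb.new(:follow_redirects => true, :cache => nil)
    content = crawler.get("http://example.com/")

    puts content[:status_code]      # e.g. 200
    puts content[:mime_type]        # e.g. "text/html"
    puts content[:length]           # size of the returned body
    puts content[:links][:images]   # urls taken from img tags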
@@ -58,23 +58,23 @@ h3. Statistics
 
  The data available within statistics is as follows:
 
- * :average_length - average size of each objet
- * :minimum_length - minimum length returned
- * :queued_at - date and time that the crawl was started at (eg: "2012-09-10T23:10:08+01:00")
- * :maximum_length - maximum length of object received
- * :status_counts - hash with the status returned as the key and value as number of pages (eg: {"404" => 1, "200" => 1})
- * :mime_counts - hash containing the mime type as key and count or pages as value (eg: {"text/html" => 8, "image/jpeg" => 25)})
- * :queue_counter - size of queue waiting to be processed for crawl
- * :page_count - number of html pages retrieved
- * :total_length - total size of data received
- * :current_status - Current status of crawl
- * :asset_count - count of non-html objects received
- * :page_size - total size of pages received
- * :average_response_time - average response time of all objects
- * :crawl_counter - number of objects that have been crawled
- * :minimum_response_time - quickest response time of crawl
- * :maximum_response_time - longest response time of crawl
- * :asset_size - total size of all non-assets received
+ * @:average_length@ - average size of each object
+ * @:minimum_length@ - minimum length returned
+ * @:queued_at@ - date and time that the crawl was started (eg: "2012-09-10T23:10:08+01:00")
+ * @:maximum_length@ - maximum length of object received
+ * @:status_counts@ - hash with the status returned as the key and the number of pages as the value (eg: {"404" => 1, "200" => 1})
+ * @:mime_counts@ - hash containing the mime type as key and count of pages as value (eg: {"text/html" => 8, "image/jpeg" => 25})
+ * @:queue_counter@ - size of the queue waiting to be processed for the crawl
+ * @:page_count@ - number of html pages retrieved
+ * @:total_length@ - total size of data received
+ * @:current_status@ - current status of the crawl
+ * @:asset_count@ - count of non-html objects received
+ * @:page_size@ - total size of pages received
+ * @:average_response_time@ - average response time of all objects
+ * @:crawl_counter@ - number of objects that have been crawled
+ * @:minimum_response_time@ - quickest response time of the crawl
+ * @:maximum_response_time@ - longest response time of the crawl
+ * @:asset_size@ - total size of non-html objects received
 
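As a rough sketch, assuming the CobwebCrawler usage shown further down, the statistics hash can be read like this once a crawl finishes (key names from the list above; the URL is a placeholder):

    crawler = CobwebCrawler.new(:cache => 600)
    statistics = crawler.crawl("http://example.com/")

    puts statistics[:page_count]      # html pages retrieved
    puts statistics[:asset_count]     # non-html objects retrieved
    puts statistics[:total_length]    # total bytes received
    puts statistics[:current_status]  # current status of the crawl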
  h2. Installation
 
@@ -82,6 +82,10 @@ Install crawler as a gem
 
  bc. gem install cobweb
 
+ or in a @Gemfile@
+
+ bc. gem 'cobweb'
+
  h2. Usage
 
  h3. Cobweb
@@ -90,42 +94,42 @@ h4. new(options)
 
  Creates a new crawler object based on a base_url
 
- * options - Options are passed in as a hash,
-
- ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
- ** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
- ** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
- ** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
- ** :debug - enables debug output (Default: false)
- ** :quiet - hides default output (Default: false)
- ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
- ** :timeout - http timeout for requests (Default: 10)
- ** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
- ** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
- ** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
- ** :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
- ** :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
- ** :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
- ** :seed_urls - an array of urls that are put into the queue regardless of any other setting, combine with {:external_urls => "*"} to limit to seed urls
- ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
- ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
- ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
- ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
- ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
- ** :raise_exceptions - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
- ** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
- ** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
- ** :proxy_port - port number of the proxy (default: nil)
- ** :treat_https_as_http - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
+ * options - The following hash keys can be defined:
+
+ ** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+ ** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
+ ** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
+ ** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
+ ** @:debug@ - enables debug output (Default: false)
+ ** @:quiet@ - hides default output (Default: false)
+ ** @:cache@ - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
+ ** @:timeout@ - http timeout for requests (Default: 10)
+ ** @:redis_options@ - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
+ ** @:internal_urls@ - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
+ ** @:first_page_redirect_internal@ - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
+ ** @:crawl_id@ - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
+ ** @:internal_urls@ - an array of urls with * wildcards that represent urls internal to the site (i.e. pages within the same domain)
+ ** @:external_urls@ - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
+ ** @:seed_urls@ - an array of urls that are put into the queue regardless of any other setting, combine with {:external_urls => "*"} to limit the crawl to the seed urls
+ ** @:obey_robots@ - boolean determining if robots.txt should be honoured (default: false)
+ ** @:user_agent@ - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
+ ** @:crawl_limit_by_page@ - sets the crawl counter to only use html page types when counting objects crawled
+ ** @:valid_mime_types@ - an array of mime types that takes wildcards (eg 'text/*'), defaults to @['*/*']@
+ ** @:direct_call_process_job@ - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
+ ** @:raise_exceptions@ - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
+ ** @:use_encoding_safe_process_job@ - Base64-encode the body when storing the job in the queue; set to true when you are expecting non-ASCII content (Default: false)
+ ** @:proxy_addr@ - hostname of a proxy to use for crawling (e.g. 'myproxy.example.net', default: nil)
+ ** @:proxy_port@ - port number of the proxy (default: nil)
+ ** @:treat_https_as_http@ - determines whether https and http urls are treated as the same (defaults to true, i.e. treated as the same)
 
 
  bc. crawler = Cobweb.new(:follow_redirects => false)
 
  h4. start(base_url)
 
- Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
+ Starts a crawl through resque. Requires the @:processing_queue@ to be set to a valid class for the resque job to work with the data retrieved.
 
- * base_url - the url to start the crawl from
+ * @base_url@ - the url to start the crawl from
 
  Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain are added to the internal_urls automatically.
 
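Putting a few of the options above together with start - a sketch only; the processing classes named here are placeholders that your application would define (see Processing Queue below):

    crawler = Cobweb.new(
      :follow_redirects     => true,
      :cache                => 600,
      :internal_urls        => ["http://example.com/*"],
      :processing_queue     => "MyContentProcessJob",   # placeholder class name
      :crawl_finished_queue => "MyCrawlFinishedJob",    # placeholder class name
      :quiet                => false
    )
    crawler.start("http://example.com/")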
@@ -135,7 +139,7 @@ h4. get(url)
 
  Simple get that obeys the options supplied in new.
 
- * url - url requested
+ * @url@ - url requested
 
  bc. crawler.get("http://www.google.com/")
 
@@ -143,14 +147,14 @@ h4. head(url)
 
  Simple head that obeys the options supplied in new.
 
- * url - url requested
+ * @url@ - url requested
 
  bc. crawler.head("http://www.google.com/")
 
 
  h4. Processing Queue
 
- The :processing_queue option is used to specify the class that contains the resque perform method to pass the content onto. This class should be defined in your application to perform any tasks you wish to the content. There are two options however, for running this. Firstly, the default settings will push the content crawled onto a resque queue for that class. This allows you the flexibility of running in queues on seperate machines etc. The main drawback to this is that all your content is stored in redis within the queue. This can be memory intensive if you are crawling large sites, or have large content that is being crawled. To get around this you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is performed by using the :direct_call_process_job. If you set that option to 'true' then instead of the job being queued, it will be executed within the crawl_job queue.
+ The @:processing_queue@ option is used to specify the class that contains the resque perform method that the content is passed on to. This class should be defined in your application to perform whatever tasks you wish on the content. There are, however, two ways to run this. Firstly, the default settings push the crawled content onto a resque queue for that class. This gives you the flexibility of running queues on separate machines etc. The main drawback is that all your content is stored in redis within the queue, which can be memory intensive if you are crawling large sites or have large content. To get around this you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is done with the @:direct_call_process_job@ option; if you set it to true then, instead of the job being queued, it will be executed within the crawl_job queue.
 
 
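For illustration, a resque job class wired up as the @:processing_queue@ might look roughly like this (the class name, queue name and body are placeholders, not part of cobweb):

    class MyContentProcessJob
      @queue = :my_content_process_job

      # resque (or a direct call when :direct_call_process_job is true)
      # receives the content hash described under "Data Returned For Each Page";
      # keys may arrive as strings after the round trip through redis
      def self.perform(content)
        puts "Processed #{content['url']} with status #{content['status_code']}"
      end
    end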
  h3. CobwebCrawler
@@ -169,7 +173,7 @@ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:ass
 
  There are some specific options for CobwebCrawler in addition to the normal cobweb options
 
- * thread_count - specifies the number of threads used by the crawler, defaults to 1
+ * @thread_count@ - specifies the number of threads used by the crawler, defaults to 1
 
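A sketch of passing @thread_count@ alongside the usual options (the values here are arbitrary):

    crawler = CobwebCrawler.new(:thread_count => 4, :cache => 600)
    statistics = crawler.crawl("http://example.com/")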
  h3. CobwebCrawlHelper
 
@@ -177,7 +181,7 @@ The CobwebCrawlHelper class is a helper class to assist in getting information a
 
  bc. crawl = CobwebCrawlHelper.new(options)
 
- * options - the hash of options passed into Cobweb.new (must include a :crawl_id)
+ * @options@ - the hash of options passed into Cobweb.new (must include a @:crawl_id@)
 
 
 
@@ -8,17 +8,17 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
  require file
  end
 
- puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
+ puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
 
 
  # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
  class Cobweb
-
+
  # retrieves current version
  def self.version
  CobwebVersion.version
  end
-
+
  # used for setting default options
  def method_missing(method_sym, *arguments, &block)
  if method_sym.to_s =~ /^default_(.*)_to$/
@@ -28,7 +28,7 @@ class Cobweb
  super
  end
  end
-
+
  # See readme for more information on options available
  def initialize(options = {})
  @options = options
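The default_*_to calls in initialize below are routed through this method_missing. As a standalone illustration of the pattern (not the gem's exact body):

    # illustrative sketch only - cobweb's real implementation may differ
    def method_missing(method_sym, *arguments, &block)
      if method_sym.to_s =~ /^default_(.*)_to$/
        # e.g. default_cache_to 300 sets @options[:cache] = 300 unless already set
        key = $1.to_sym
        @options[key] = arguments[0] unless @options.has_key?(key)
      else
        super
      end
    end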
@@ -41,7 +41,7 @@ class Cobweb
  default_crawl_finished_queue_to "CobwebFinishedJob"
  else
  default_processing_queue_to "CrawlProcessWorker"
- default_crawl_finished_queue_to "CrawlFinishedWorker"
+ default_crawl_finished_queue_to "CrawlFinishedWorker"
  end
  default_quiet_to true
  default_debug_to false
@@ -66,22 +66,22 @@ class Cobweb
 
 
  end
-
+
  # This method starts the resque based crawl and enqueues the base_url
  def start(base_url)
  raise ":base_url is required" unless base_url
  request = {
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
- :url => base_url
- }
-
+ :url => base_url
+ }
+
  if @options[:internal_urls].nil? || @options[:internal_urls].empty?
  uri = Addressable::URI.parse(base_url)
  @options[:internal_urls] = []
  @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
  @options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
  end
-
+
  request.merge!(@options)
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
  @redis.set("original_base_url", base_url)
@@ -90,10 +90,10 @@ class Cobweb
  @redis.set("queue-counter", 1)
 
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
-
+
  @stats = Stats.new(request)
  @stats.start_crawl(request)
-
+
  # add internal_urls into redis
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
  if @options[:queue_system] == :resque
@@ -103,10 +103,10 @@ class Cobweb
  else
  raise "Unknown queue system: #{content_request[:queue_system]}"
  end
-
+
  request
  end
-
+
  # Returns array of cookies from content
  def get_cookies(response)
  all_cookies = response.get_fields('set-cookie')
@@ -134,7 +134,7 @@ class Cobweb
  else
  redirect_limit = 10
  end
-
+
  # connect to redis
  if options.has_key? :crawl_id
  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
@@ -147,7 +147,7 @@ class Cobweb
 
  # check if it has already been cached
  if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
- if @options[:cache_type] == :crawl_based
+ if @options[:cache_type] == :crawl_based
  puts "Cache hit in crawl for #{url}" unless @options[:quiet]
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
  else
@@ -183,7 +183,7 @@ class Cobweb
  if @options[:range]
  request.set_range(@options[:range])
  end
-
+
  response = @http.request request
 
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -204,11 +204,11 @@ class Cobweb
  content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
  content[:redirect_through].insert(0, url)
  content[:url] = content[:redirect_through].last
-
+
  content[:response_time] = Time.now.to_f - request_time
  else
  content[:response_time] = Time.now.to_f - request_time
-
+
  puts "Retrieved." unless @options[:quiet]
 
  # create the content container
@@ -237,7 +237,7 @@ class Cobweb
  # parse data for links
  link_parser = ContentLinkParser.new(content[:url], content[:body])
  content[:links] = link_parser.link_data
-
+
  end
  # add content to cache if required
  if @options[:cache]
@@ -252,10 +252,10 @@ class Cobweb
  rescue RedirectError => e
  if @options[:raise_exceptions]
  puts "Re-Raising error #{e.message} on #{uri.to_s}"
- raise e
+ raise e
  end
  puts "ERROR RedirectError: #{e.message}"
-
+
  ## generate a blank content
  content = {}
  content[:url] = uri.to_s
@@ -267,11 +267,11 @@ class Cobweb
  content[:mime_type] = "error/dnslookup"
  content[:headers] = {}
  content[:links] = {}
-
+
  rescue SocketError => e
  raise e if @options[:raise_exceptions]
  puts "ERROR SocketError: #{e.message}"
-
+
  ## generate a blank content
  content = {}
  content[:url] = uri.to_s
@@ -283,11 +283,11 @@ class Cobweb
  content[:mime_type] = "error/dnslookup"
  content[:headers] = {}
  content[:links] = {}
-
+
  rescue Timeout::Error => e
  raise e if @options[:raise_exceptions]
  puts "ERROR Timeout::Error: #{e.message}"
-
+
  ## generate a blank content
  content = {}
  content[:url] = uri.to_s
@@ -306,7 +306,7 @@ class Cobweb
 
  # Performs a HTTP HEAD request to the specified url applying the options supplied
  def head(url, options = @options)
- raise "url cannot be nil" if url.nil?
+ raise "url cannot be nil" if url.nil?
  uri = Addressable::URI.parse(url)
  uri.normalize!
  uri.fragment=nil
@@ -319,16 +319,16 @@ class Cobweb
  else
  redirect_limit = 10
  end
-
+
  # connect to redis
  if options.has_key? :crawl_id
  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
  else
  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
  end
-
+
  content = {:base_url => url}
-
+
  # check if it has already been cached
  if @options[:cache] && redis.get("head-#{unique_id}")
  puts "Cache hit for #{url}" unless @options[:quiet]
@@ -386,8 +386,8 @@ class Cobweb
  charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
  content[:character_set] = charset
  end
- end
-
+ end
+
  # add content to cache if required
  if @options[:cache]
  puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
@@ -416,7 +416,7 @@ class Cobweb
  rescue SocketError => e
  raise e if @options[:raise_exceptions]
  puts "ERROR SocketError: #{e.message}"
-
+
  ## generate a blank content
  content = {}
  content[:url] = uri.to_s
@@ -428,11 +428,11 @@ class Cobweb
  content[:mime_type] = "error/dnslookup"
  content[:headers] = {}
  content[:links] = {}
-
+
  rescue Timeout::Error => e
  raise e if @options[:raise_exceptions]
  puts "ERROR Timeout::Error: #{e.message}"
-
+
  ## generate a blank content
  content = {}
  content[:url] = uri.to_s
@@ -445,10 +445,10 @@ class Cobweb
  content[:headers] = {}
  content[:links] = {}
  end
-
+
  content
  end
-
+
  end
 
  # escapes characters with meaning in regular expressions and adds wildcard expression
@@ -456,7 +456,7 @@ class Cobweb
  pattern = pattern.gsub(".", "\\.")
  pattern = pattern.gsub("?", "\\?")
  pattern = pattern.gsub("+", "\\\\+")
- pattern = pattern.gsub("*", ".*?")
+ pattern = pattern.gsub("*", ".*?")
  if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
  pattern = pattern.gsub("http:", "https?:")
  end
@@ -464,9 +464,9 @@ class Cobweb
  end
 
  def clear_cache
-
+
  end
-
+
  private
  # checks if the mime_type is textual
  def text_content?(content_type)
@@ -475,5 +475,5 @@ class Cobweb
  end
  false
  end
-
+
  end