cobweb 1.0.28 → 1.0.29
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +71 -67
- data/lib/cobweb.rb +41 -41
- data/lib/cobweb_crawl_helper.rb +1 -5
- data/lib/cobweb_version.rb +2 -2
- data/lib/crawl_worker.rb +14 -14
- data/lib/export_command.rb +3 -3
- data/lib/report_command.rb +1 -1
- data/lib/string.rb +4 -9
- data/spec/cobweb/cobweb_crawler_spec.rb +15 -15
- data/spec/cobweb/crawl_job_spec.rb +8 -6
- data/spec/cobweb/crawl_worker_spec.rb +32 -32
- data/spec/samples/sample_site/{boxgrid>withsillyname.html → boxgridwithsillyname.html} +37 -37
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ba10134e0b3da7418f0a1a5772ca196cf5525066
|
4
|
+
data.tar.gz: ae2f27f0036172b001968e277913d0038549220b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 175e8dedf0592c1cc8e9abb50cc0efa835ffb8e1d3c0df3ecb3c3a9900e24a1a2aebc479c5188ea44d57de210b559e0637efb8beac4f90c49f3f6e54bc0492d7
|
7
|
+
data.tar.gz: d4fc557a20e3b4d54daaecaa151b5365e4a483067b6e2cc4429f846e47066d1326a868383f2d1a2031159dc21cfbd053854e2c12ea57cd1eeb2de999f8aec2ab
|
data/README.textile
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
h1. Cobweb v1.0.
|
1
|
+
h1. Cobweb v1.0.29
|
2
2
|
|
3
3
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
4
4
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
@@ -34,21 +34,21 @@ h3. Command Line
|
|
34
34
|
h3. Data Returned For Each Page
|
35
35
|
The data available in the returned hash are:
|
36
36
|
|
37
|
-
*
|
38
|
-
*
|
39
|
-
*
|
40
|
-
*
|
41
|
-
*
|
42
|
-
*
|
43
|
-
*
|
44
|
-
*
|
45
|
-
*
|
46
|
-
*
|
47
|
-
**
|
48
|
-
**
|
49
|
-
**
|
50
|
-
**
|
51
|
-
**
|
37
|
+
* @:url@ - url of the resource requested
|
38
|
+
* @:status_code@ - status code of the resource requested
|
39
|
+
* @:mime_type@ - content type of the resource
|
40
|
+
* @:character_set@ - character set of content determined from content type
|
41
|
+
* @:length@ - length of the content returned
|
42
|
+
* @:body@ - content of the resource
|
43
|
+
* @:location@ - location header if returned
|
44
|
+
* @:redirect_through@ - if you're following redirects, any redirects are stored here detailing where you were redirected through to get to the final location
|
45
|
+
* @:headers@ - hash of the headers returned
|
46
|
+
* @:links@ - hash of links on the page split into types
|
47
|
+
** @:links@ - urls from a tags within the resource
|
48
|
+
** @:images@ - urls from img tags within the resource
|
49
|
+
** @:related@ - urls from link tags
|
50
|
+
** @:scripts@ - urls from script tags
|
51
|
+
** @:styles@ - urls from within link tags with rel of stylesheet and from url() directives with stylesheets
|
52
52
|
|
53
53
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as I have time!)
|
54
54
|
|
@@ -58,23 +58,23 @@ h3. Statistics
|
|
58
58
|
|
59
59
|
The data available within statistics is as follows:
|
60
60
|
|
61
|
-
*
|
62
|
-
*
|
63
|
-
*
|
64
|
-
*
|
65
|
-
*
|
66
|
-
*
|
67
|
-
*
|
68
|
-
*
|
69
|
-
*
|
70
|
-
*
|
71
|
-
*
|
72
|
-
*
|
73
|
-
*
|
74
|
-
*
|
75
|
-
*
|
76
|
-
*
|
77
|
-
*
|
61
|
+
* @:average_length@ - average size of each object
|
62
|
+
* @:minimum_length@ - minimum length returned
|
63
|
+
* @:queued_at@ - date and time that the crawl was started at (eg: "2012-09-10T23:10:08+01:00")
|
64
|
+
* @:maximum_length@ - maximum length of object received
|
65
|
+
* @:status_counts@ - hash with the status returned as the key and value as number of pages (eg: {"404" => 1, "200" => 1})
|
66
|
+
* @:mime_counts@ - hash containing the mime type as key and count of pages as value (eg: {"text/html" => 8, "image/jpeg" => 25})
|
67
|
+
* @:queue_counter@ - size of queue waiting to be processed for crawl
|
68
|
+
* @:page_count@ - number of html pages retrieved
|
69
|
+
* @:total_length@ - total size of data received
|
70
|
+
* @:current_status@ - Current status of crawl
|
71
|
+
* @:asset_count@ - count of non-html objects received
|
72
|
+
* @:page_size@ - total size of pages received
|
73
|
+
* @:average_response_time@ - average response time of all objects
|
74
|
+
* @:crawl_counter@ - number of objects that have been crawled
|
75
|
+
* @:minimum_response_time@ - quickest response time of crawl
|
76
|
+
* @:maximum_response_time@ - longest response time of crawl
|
77
|
+
* @:asset_size@ - total size of all non-assets received
|
78
78
|
|
79
79
|
h2. Installation
|
80
80
|
|
@@ -82,6 +82,10 @@ Install crawler as a gem
|
|
82
82
|
|
83
83
|
bc. gem install cobweb
|
84
84
|
|
85
|
+
or in a @Gemfile@
|
86
|
+
|
87
|
+
bc. gem 'cobweb'
|
88
|
+
|
85
89
|
h2. Usage
|
86
90
|
|
87
91
|
h3. Cobweb
|
@@ -90,42 +94,42 @@ h4. new(options)
|
|
90
94
|
|
91
95
|
Creates a new crawler object based on a base_url
|
92
96
|
|
93
|
-
* options -
|
94
|
-
|
95
|
-
**
|
96
|
-
**
|
97
|
-
**
|
98
|
-
**
|
99
|
-
**
|
100
|
-
**
|
101
|
-
**
|
102
|
-
**
|
103
|
-
**
|
104
|
-
**
|
105
|
-
**
|
106
|
-
**
|
107
|
-
**
|
108
|
-
**
|
109
|
-
**
|
110
|
-
**
|
111
|
-
**
|
112
|
-
**
|
113
|
-
**
|
114
|
-
**
|
115
|
-
**
|
116
|
-
**
|
117
|
-
**
|
118
|
-
**
|
119
|
-
**
|
97
|
+
* options - The following hash keys can be defined:
|
98
|
+
|
99
|
+
** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
|
100
|
+
** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
|
101
|
+
** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
|
102
|
+
** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
|
103
|
+
** @:debug@ - enables debug output (Default: false)
|
104
|
+
** @:quiet@ - hides default output (Default: false)
|
105
|
+
** @:cache@ - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
106
|
+
** @:timeout@ - http timeout for requests (Default: 10)
|
107
|
+
** @:redis_options@ - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
|
108
|
+
** @:internal_urls@ - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
|
109
|
+
** @:first_page_redirect_internal@ - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
|
110
|
+
** @:crawl_id@ - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
111
|
+
** @:internal_urls@ - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
112
|
+
** @:external_urls@ - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
|
113
|
+
** @:seed_urls@ - an array of urls that are put into the queue regardless of any other setting, combine with {:external_urls => "*"} to limit to seed urls
|
114
|
+
** @:obey_robots@ - boolean determining if robots.txt should be honoured. (default: false)
|
115
|
+
** @:user_agent@ - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
116
|
+
** @:crawl_limit_by_page@ - sets the crawl counter to only use html page types when counting objects crawled
|
117
|
+
** @:valid_mime_types@ - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
|
118
|
+
** @:direct_call_process_job@ - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
|
119
|
+
** @:raise_exceptions@ - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
|
120
|
+
** @:use_encoding_safe_process_job@ - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
|
121
|
+
** @:proxy_addr@ - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
|
122
|
+
** @:proxy_port@ - port number of the proxy (default: nil)
|
123
|
+
** @:treat_https_as_http@ - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
|
120
124
|
|
121
125
|
|
122
126
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
123
127
|
|
124
128
|
h4. start(base_url)
|
125
129
|
|
126
|
-
Starts a crawl through resque. Requires the
|
130
|
+
Starts a crawl through resque. Requires the @:processing_queue@ to be set to a valid class for the resque job to work with the data retrieved.
|
127
131
|
|
128
|
-
* base_url - the url to start the crawl from
|
132
|
+
* @base_url@ - the url to start the crawl from
|
129
133
|
|
130
134
|
Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
|
131
135
|
|
@@ -135,7 +139,7 @@ h4. get(url)
|
|
135
139
|
|
136
140
|
Simple get that obeys the options supplied in new.
|
137
141
|
|
138
|
-
* url - url requested
|
142
|
+
* @url@ - url requested
|
139
143
|
|
140
144
|
bc. crawler.get("http://www.google.com/")
|
141
145
|
|
@@ -143,14 +147,14 @@ h4. head(url)
|
|
143
147
|
|
144
148
|
Simple get that obeys the options supplied in new.
|
145
149
|
|
146
|
-
* url - url requested
|
150
|
+
* @url@ - url requested
|
147
151
|
|
148
152
|
bc. crawler.head("http://www.google.com/")
|
149
153
|
|
150
154
|
|
151
155
|
h4. Processing Queue
|
152
156
|
|
153
|
-
The
|
157
|
+
The @:processing_queue@ option is used to specify the class that contains the resque perform method to pass the content onto. This class should be defined in your application to perform any tasks you wish to the content. There are two options however, for running this. Firstly, the default settings will push the content crawled onto a resque queue for that class. This allows you the flexibility of running in queues on separate machines etc. The main drawback to this is that all your content is stored in redis within the queue. This can be memory intensive if you are crawling large sites, or have large content that is being crawled. To get around this you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is performed by using the :direct_call_process_job. If you set that option to 'true' then instead of the job being queued, it will be executed within the crawl_job queue.
|
154
158
|
|
155
159
|
|
156
160
|
h3. CobwebCrawler
|
@@ -169,7 +173,7 @@ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:ass
|
|
169
173
|
|
170
174
|
There are some specific options for CobwebCrawler in addition to the normal cobweb options
|
171
175
|
|
172
|
-
* thread_count - specifies the number of threads used by the crawler, defaults to 1
|
176
|
+
* @thread_count@ - specifies the number of threads used by the crawler, defaults to 1
|
173
177
|
|
174
178
|
h3. CobwebCrawlHelper
|
175
179
|
|
@@ -177,7 +181,7 @@ The CobwebCrawlHelper class is a helper class to assist in getting information a
|
|
177
181
|
|
178
182
|
bc. crawl = CobwebCrawlHelper.new(options)
|
179
183
|
|
180
|
-
* options - the hash of options passed into Cobweb.new (must include a
|
184
|
+
* @options@ - the hash of options passed into Cobweb.new (must include a @:crawl_id@)
|
181
185
|
|
182
186
|
|
183
187
|
|
data/lib/cobweb.rb
CHANGED
@@ -8,17 +8,17 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
|
|
8
8
|
require file
|
9
9
|
end
|
10
10
|
|
11
|
-
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
11
|
+
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
12
12
|
|
13
13
|
|
14
14
|
# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
|
15
15
|
class Cobweb
|
16
|
-
|
16
|
+
|
17
17
|
# retrieves current version
|
18
18
|
def self.version
|
19
19
|
CobwebVersion.version
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
# used for setting default options
|
23
23
|
def method_missing(method_sym, *arguments, &block)
|
24
24
|
if method_sym.to_s =~ /^default_(.*)_to$/
|
@@ -28,7 +28,7 @@ class Cobweb
|
|
28
28
|
super
|
29
29
|
end
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
# See readme for more information on options available
|
33
33
|
def initialize(options = {})
|
34
34
|
@options = options
|
@@ -41,7 +41,7 @@ class Cobweb
|
|
41
41
|
default_crawl_finished_queue_to "CobwebFinishedJob"
|
42
42
|
else
|
43
43
|
default_processing_queue_to "CrawlProcessWorker"
|
44
|
-
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
44
|
+
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
45
45
|
end
|
46
46
|
default_quiet_to true
|
47
47
|
default_debug_to false
|
@@ -66,22 +66,22 @@ class Cobweb
|
|
66
66
|
|
67
67
|
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
# This method starts the resque based crawl and enqueues the base_url
|
71
71
|
def start(base_url)
|
72
72
|
raise ":base_url is required" unless base_url
|
73
73
|
request = {
|
74
74
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
75
|
-
:url => base_url
|
76
|
-
}
|
77
|
-
|
75
|
+
:url => base_url
|
76
|
+
}
|
77
|
+
|
78
78
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
79
79
|
uri = Addressable::URI.parse(base_url)
|
80
80
|
@options[:internal_urls] = []
|
81
81
|
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
82
82
|
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
83
83
|
end
|
84
|
-
|
84
|
+
|
85
85
|
request.merge!(@options)
|
86
86
|
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
|
87
87
|
@redis.set("original_base_url", base_url)
|
@@ -90,10 +90,10 @@ class Cobweb
|
|
90
90
|
@redis.set("queue-counter", 1)
|
91
91
|
|
92
92
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
93
|
-
|
93
|
+
|
94
94
|
@stats = Stats.new(request)
|
95
95
|
@stats.start_crawl(request)
|
96
|
-
|
96
|
+
|
97
97
|
# add internal_urls into redis
|
98
98
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
99
99
|
if @options[:queue_system] == :resque
|
@@ -103,10 +103,10 @@ class Cobweb
|
|
103
103
|
else
|
104
104
|
raise "Unknown queue system: #{content_request[:queue_system]}"
|
105
105
|
end
|
106
|
-
|
106
|
+
|
107
107
|
request
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
# Returns array of cookies from content
|
111
111
|
def get_cookies(response)
|
112
112
|
all_cookies = response.get_fields('set-cookie')
|
@@ -134,7 +134,7 @@ class Cobweb
|
|
134
134
|
else
|
135
135
|
redirect_limit = 10
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
# connect to redis
|
139
139
|
if options.has_key? :crawl_id
|
140
140
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
@@ -147,7 +147,7 @@ class Cobweb
|
|
147
147
|
|
148
148
|
# check if it has already been cached
|
149
149
|
if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
150
|
-
if @options[:cache_type] == :crawl_based
|
150
|
+
if @options[:cache_type] == :crawl_based
|
151
151
|
puts "Cache hit in crawl for #{url}" unless @options[:quiet]
|
152
152
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
153
153
|
else
|
@@ -183,7 +183,7 @@ class Cobweb
|
|
183
183
|
if @options[:range]
|
184
184
|
request.set_range(@options[:range])
|
185
185
|
end
|
186
|
-
|
186
|
+
|
187
187
|
response = @http.request request
|
188
188
|
|
189
189
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
@@ -204,11 +204,11 @@ class Cobweb
|
|
204
204
|
content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
|
205
205
|
content[:redirect_through].insert(0, url)
|
206
206
|
content[:url] = content[:redirect_through].last
|
207
|
-
|
207
|
+
|
208
208
|
content[:response_time] = Time.now.to_f - request_time
|
209
209
|
else
|
210
210
|
content[:response_time] = Time.now.to_f - request_time
|
211
|
-
|
211
|
+
|
212
212
|
puts "Retrieved." unless @options[:quiet]
|
213
213
|
|
214
214
|
# create the content container
|
@@ -237,7 +237,7 @@ class Cobweb
|
|
237
237
|
# parse data for links
|
238
238
|
link_parser = ContentLinkParser.new(content[:url], content[:body])
|
239
239
|
content[:links] = link_parser.link_data
|
240
|
-
|
240
|
+
|
241
241
|
end
|
242
242
|
# add content to cache if required
|
243
243
|
if @options[:cache]
|
@@ -252,10 +252,10 @@ class Cobweb
|
|
252
252
|
rescue RedirectError => e
|
253
253
|
if @options[:raise_exceptions]
|
254
254
|
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
-
raise e
|
255
|
+
raise e
|
256
256
|
end
|
257
257
|
puts "ERROR RedirectError: #{e.message}"
|
258
|
-
|
258
|
+
|
259
259
|
## generate a blank content
|
260
260
|
content = {}
|
261
261
|
content[:url] = uri.to_s
|
@@ -267,11 +267,11 @@ class Cobweb
|
|
267
267
|
content[:mime_type] = "error/dnslookup"
|
268
268
|
content[:headers] = {}
|
269
269
|
content[:links] = {}
|
270
|
-
|
270
|
+
|
271
271
|
rescue SocketError => e
|
272
272
|
raise e if @options[:raise_exceptions]
|
273
273
|
puts "ERROR SocketError: #{e.message}"
|
274
|
-
|
274
|
+
|
275
275
|
## generate a blank content
|
276
276
|
content = {}
|
277
277
|
content[:url] = uri.to_s
|
@@ -283,11 +283,11 @@ class Cobweb
|
|
283
283
|
content[:mime_type] = "error/dnslookup"
|
284
284
|
content[:headers] = {}
|
285
285
|
content[:links] = {}
|
286
|
-
|
286
|
+
|
287
287
|
rescue Timeout::Error => e
|
288
288
|
raise e if @options[:raise_exceptions]
|
289
289
|
puts "ERROR Timeout::Error: #{e.message}"
|
290
|
-
|
290
|
+
|
291
291
|
## generate a blank content
|
292
292
|
content = {}
|
293
293
|
content[:url] = uri.to_s
|
@@ -306,7 +306,7 @@ class Cobweb
|
|
306
306
|
|
307
307
|
# Performs a HTTP HEAD request to the specified url applying the options supplied
|
308
308
|
def head(url, options = @options)
|
309
|
-
raise "url cannot be nil" if url.nil?
|
309
|
+
raise "url cannot be nil" if url.nil?
|
310
310
|
uri = Addressable::URI.parse(url)
|
311
311
|
uri.normalize!
|
312
312
|
uri.fragment=nil
|
@@ -319,16 +319,16 @@ class Cobweb
|
|
319
319
|
else
|
320
320
|
redirect_limit = 10
|
321
321
|
end
|
322
|
-
|
322
|
+
|
323
323
|
# connect to redis
|
324
324
|
if options.has_key? :crawl_id
|
325
325
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
326
326
|
else
|
327
327
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
328
328
|
end
|
329
|
-
|
329
|
+
|
330
330
|
content = {:base_url => url}
|
331
|
-
|
331
|
+
|
332
332
|
# check if it has already been cached
|
333
333
|
if @options[:cache] && redis.get("head-#{unique_id}")
|
334
334
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
@@ -386,8 +386,8 @@ class Cobweb
|
|
386
386
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
387
387
|
content[:character_set] = charset
|
388
388
|
end
|
389
|
-
end
|
390
|
-
|
389
|
+
end
|
390
|
+
|
391
391
|
# add content to cache if required
|
392
392
|
if @options[:cache]
|
393
393
|
puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
|
@@ -416,7 +416,7 @@ class Cobweb
|
|
416
416
|
rescue SocketError => e
|
417
417
|
raise e if @options[:raise_exceptions]
|
418
418
|
puts "ERROR SocketError: #{e.message}"
|
419
|
-
|
419
|
+
|
420
420
|
## generate a blank content
|
421
421
|
content = {}
|
422
422
|
content[:url] = uri.to_s
|
@@ -428,11 +428,11 @@ class Cobweb
|
|
428
428
|
content[:mime_type] = "error/dnslookup"
|
429
429
|
content[:headers] = {}
|
430
430
|
content[:links] = {}
|
431
|
-
|
431
|
+
|
432
432
|
rescue Timeout::Error => e
|
433
433
|
raise e if @options[:raise_exceptions]
|
434
434
|
puts "ERROR Timeout::Error: #{e.message}"
|
435
|
-
|
435
|
+
|
436
436
|
## generate a blank content
|
437
437
|
content = {}
|
438
438
|
content[:url] = uri.to_s
|
@@ -445,10 +445,10 @@ class Cobweb
|
|
445
445
|
content[:headers] = {}
|
446
446
|
content[:links] = {}
|
447
447
|
end
|
448
|
-
|
448
|
+
|
449
449
|
content
|
450
450
|
end
|
451
|
-
|
451
|
+
|
452
452
|
end
|
453
453
|
|
454
454
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
@@ -456,7 +456,7 @@ class Cobweb
|
|
456
456
|
pattern = pattern.gsub(".", "\\.")
|
457
457
|
pattern = pattern.gsub("?", "\\?")
|
458
458
|
pattern = pattern.gsub("+", "\\\\+")
|
459
|
-
pattern = pattern.gsub("*", ".*?")
|
459
|
+
pattern = pattern.gsub("*", ".*?")
|
460
460
|
if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
|
461
461
|
pattern = pattern.gsub("http:", "https?:")
|
462
462
|
end
|
@@ -464,9 +464,9 @@ class Cobweb
|
|
464
464
|
end
|
465
465
|
|
466
466
|
def clear_cache
|
467
|
-
|
467
|
+
|
468
468
|
end
|
469
|
-
|
469
|
+
|
470
470
|
private
|
471
471
|
# checks if the mime_type is textual
|
472
472
|
def text_content?(content_type)
|
@@ -475,5 +475,5 @@ class Cobweb
|
|
475
475
|
end
|
476
476
|
false
|
477
477
|
end
|
478
|
-
|
478
|
+
|
479
479
|
end
|