cobweb 1.0.27 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.textile +76 -68
- data/lib/cobweb.rb +67 -48
- data/lib/cobweb_crawl_helper.rb +29 -23
- data/lib/cobweb_crawler.rb +23 -23
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +2 -2
- data/lib/content_link_parser.rb +5 -4
- data/lib/crawl.rb +48 -42
- data/lib/crawl_worker.rb +19 -18
- data/lib/export_command.rb +3 -3
- data/lib/report_command.rb +1 -1
- data/lib/sidekiq/cobweb_helper.rb +2 -1
- data/lib/string.rb +4 -9
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +1 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +6 -6
- data/spec/cobweb/cobweb_crawler_spec.rb +15 -15
- data/spec/cobweb/cobweb_links_spec.rb +36 -36
- data/spec/cobweb/crawl_job_spec.rb +8 -6
- data/spec/cobweb/crawl_worker_spec.rb +32 -32
- data/spec/cobweb/robots_spec.rb +13 -12
- data/spec/samples/sample_site/{boxgrid>withsillyname.html → boxgridwithsillyname.html} +37 -37
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- data/spec/spec_helper.rb +3 -3
- metadata +152 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2fa16c36a203417d0f90f884634267227698d83839580c59982d15b01bb1f495
|
4
|
+
data.tar.gz: a262f311869ccd9696fd377c1bd2de04ab32f19b1709df1d3321b80e9d4e8517
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b32dce38f7563f75dce48d4e87dfa083c918b1f3702a2f95159939fb09a30742ea27be5dd3e1551451f268df86cbbf9614603f030cfc114cc421fe17cda7950
|
7
|
+
data.tar.gz: dc8dfcdde6e157b73d4ff785d95afbb2d71ff3180169685762683a2b6aeeb37d1b47743608f76e3d2b944f37b12a3cab0cfd0a4bc1bb05c902d447b0f648709e
|
data/README.textile
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
h1. Cobweb v1.0
|
1
|
+
h1. Cobweb v1.1.0
|
2
2
|
|
3
3
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
4
4
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
5
|
+
!https://circleci.com/gh/stewartmckee/cobweb.svg?style=shield&circle-token=07357f0bd17ac67e21ea161fb9abdb35ecac4c2e!
|
5
6
|
!https://gemnasium.com/stewartmckee/cobweb.png!
|
6
7
|
!https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
|
7
8
|
|
8
|
-
|
9
9
|
h2. Intro
|
10
10
|
|
11
11
|
CobWeb has three methods of running. Firstly it is an http client that allows get and head requests, returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls, allowing you to crawl quickly. Lastly you can run the crawler with a block that uses each of the pages found in the crawl.
|
@@ -34,21 +34,22 @@ h3. Command Line
|
|
34
34
|
h3. Data Returned For Each Page
|
35
35
|
The data available in the returned hash are:
|
36
36
|
|
37
|
-
*
|
38
|
-
*
|
39
|
-
*
|
40
|
-
*
|
41
|
-
*
|
42
|
-
*
|
43
|
-
*
|
44
|
-
*
|
45
|
-
*
|
46
|
-
*
|
47
|
-
|
48
|
-
**
|
49
|
-
**
|
50
|
-
**
|
51
|
-
**
|
37
|
+
* @:url@ - url of the resource requested
|
38
|
+
* @:status_code@ - status code of the resource requested
|
39
|
+
* @:response_time@ - response time of the resource requested
|
40
|
+
* @:mime_type@ - content type of the resource
|
41
|
+
* @:character_set@ - character set of content determined from content type
|
42
|
+
* @:length@ - length of the content returned
|
43
|
+
* @:body@ - content of the resource
|
44
|
+
* @:location@ - location header if returned
|
45
|
+
* @:redirect_through@ - if you're following redirects, any redirects are stored here detailing where you were redirected through to get to the final location
|
46
|
+
* @:headers@ - hash of the headers returned
|
47
|
+
* @:links@ - hash of links on the page split into types
|
48
|
+
** @:links@ - urls from a tags within the resource
|
49
|
+
** @:images@ - urls from img tags within the resource
|
50
|
+
** @:related@ - urls from link tags
|
51
|
+
** @:scripts@ - urls from script tags
|
52
|
+
** @:styles@ - urls from within link tags with rel of stylesheet and from url() directives with stylesheets
|
52
53
|
|
53
54
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
|
54
55
|
|
@@ -58,23 +59,23 @@ h3. Statistics
|
|
58
59
|
|
59
60
|
The data available within statistics is as follows:
|
60
61
|
|
61
|
-
*
|
62
|
-
*
|
63
|
-
*
|
64
|
-
*
|
65
|
-
*
|
66
|
-
*
|
67
|
-
*
|
68
|
-
*
|
69
|
-
*
|
70
|
-
*
|
71
|
-
*
|
72
|
-
*
|
73
|
-
*
|
74
|
-
*
|
75
|
-
*
|
76
|
-
*
|
77
|
-
*
|
62
|
+
* @:average_length@ - average size of each object
|
63
|
+
* @:minimum_length@ - minimum length returned
|
64
|
+
* @:queued_at@ - date and time that the crawl was started at (eg: "2012-09-10T23:10:08+01:00")
|
65
|
+
* @:maximum_length@ - maximum length of object received
|
66
|
+
* @:status_counts@ - hash with the status returned as the key and value as number of pages (eg: {"404" => 1, "200" => 1})
|
67
|
+
* @:mime_counts@ - hash containing the mime type as key and count of pages as value (eg: {"text/html" => 8, "image/jpeg" => 25})
|
68
|
+
* @:queue_counter@ - size of queue waiting to be processed for crawl
|
69
|
+
* @:page_count@ - number of html pages retrieved
|
70
|
+
* @:total_length@ - total size of data received
|
71
|
+
* @:current_status@ - Current status of crawl
|
72
|
+
* @:asset_count@ - count of non-html objects received
|
73
|
+
* @:page_size@ - total size of pages received
|
74
|
+
* @:average_response_time@ - average response time of all objects
|
75
|
+
* @:crawl_counter@ - number of objects that have been crawled
|
76
|
+
* @:minimum_response_time@ - quickest response time of crawl
|
77
|
+
* @:maximum_response_time@ - longest response time of crawl
|
78
|
+
* @:asset_size@ - total size of all non-html objects received
|
78
79
|
|
79
80
|
h2. Installation
|
80
81
|
|
@@ -82,6 +83,10 @@ Install crawler as a gem
|
|
82
83
|
|
83
84
|
bc. gem install cobweb
|
84
85
|
|
86
|
+
or in a @Gemfile@
|
87
|
+
|
88
|
+
bc. gem 'cobweb'
|
89
|
+
|
85
90
|
h2. Usage
|
86
91
|
|
87
92
|
h3. Cobweb
|
@@ -90,42 +95,43 @@ h4. new(options)
|
|
90
95
|
|
91
96
|
Creates a new crawler object based on a base_url
|
92
97
|
|
93
|
-
* options -
|
94
|
-
|
95
|
-
**
|
96
|
-
**
|
97
|
-
**
|
98
|
-
**
|
99
|
-
**
|
100
|
-
**
|
101
|
-
**
|
102
|
-
**
|
103
|
-
**
|
104
|
-
**
|
105
|
-
**
|
106
|
-
**
|
107
|
-
**
|
108
|
-
**
|
109
|
-
**
|
110
|
-
**
|
111
|
-
**
|
112
|
-
**
|
113
|
-
**
|
114
|
-
**
|
115
|
-
**
|
116
|
-
**
|
117
|
-
**
|
118
|
-
**
|
119
|
-
**
|
98
|
+
* options - The following hash keys can be defined:
|
99
|
+
|
100
|
+
** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
|
101
|
+
** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
|
102
|
+
** @:queue_system@ - sets the queue system, :resque or :sidekiq (Default: :resque)
|
103
|
+
** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
|
104
|
+
** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
|
105
|
+
** @:debug@ - enables debug output (Default: false)
|
106
|
+
** @:quiet@ - hides default output (Default: false)
|
107
|
+
** @:cache@ - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
108
|
+
** @:timeout@ - http timeout for requests (Default: 10)
|
109
|
+
** @:redis_options@ - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}) (Default: {})
|
110
|
+
** @:internal_urls@ - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*']) (Default: [], although your first url's scheme, host and domain are added)
|
111
|
+
** @:first_page_redirect_internal@ - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com) (Default: true)
|
112
|
+
** @:crawl_id@ - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
113
|
+
** @:internal_urls@ - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
114
|
+
** @:external_urls@ - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
|
115
|
+
** @:seed_urls@ - an array of urls that are put into the queue regardless of any other setting, combine with {:external_urls => "*"} to limit to seed urls
|
116
|
+
** @:obey_robots@ - boolean determining if robots.txt should be honoured. (default: false)
|
117
|
+
** @:user_agent@ - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
118
|
+
** @:crawl_limit_by_page@ - sets the crawl counter to only use html page types when counting objects crawled
|
119
|
+
** @:valid_mime_types@ - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
|
120
|
+
** @:direct_call_process_job@ - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
|
121
|
+
** @:raise_exceptions@ - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
|
122
|
+
** @:use_encoding_safe_process_job@ - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
|
123
|
+
** @:proxy_addr@ - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
|
124
|
+
** @:proxy_port@ - port number of the proxy (default: nil)
|
125
|
+
** @:treat_https_as_http@ - determines whether https and http urls are treated as the same (defaults to true, ie treated as the same)
|
120
126
|
|
121
127
|
|
122
128
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
123
129
|
|
124
130
|
h4. start(base_url)
|
125
131
|
|
126
|
-
Starts a crawl through resque. Requires the
|
132
|
+
Starts a crawl through resque. Requires the @:processing_queue@ to be set to a valid class for the resque job to work with the data retrieved.
|
127
133
|
|
128
|
-
* base_url - the url to start the crawl from
|
134
|
+
* @base_url@ - the url to start the crawl from
|
129
135
|
|
130
136
|
Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
|
131
137
|
|
@@ -135,7 +141,7 @@ h4. get(url)
|
|
135
141
|
|
136
142
|
Simple get that obeys the options supplied in new.
|
137
143
|
|
138
|
-
* url - url requested
|
144
|
+
* @url@ - url requested
|
139
145
|
|
140
146
|
bc. crawler.get("http://www.google.com/")
|
141
147
|
|
@@ -143,14 +149,14 @@ h4. head(url)
|
|
143
149
|
|
144
150
|
Simple head request that obeys the options supplied in new.
|
145
151
|
|
146
|
-
* url - url requested
|
152
|
+
* @url@ - url requested
|
147
153
|
|
148
154
|
bc. crawler.head("http://www.google.com/")
|
149
155
|
|
150
156
|
|
151
157
|
h4. Processing Queue
|
152
158
|
|
153
|
-
The
|
159
|
+
The @:processing_queue@ option is used to specify the class that contains the resque perform method to pass the content onto. This class should be defined in your application to perform any tasks you wish to the content. There are two options, however, for running this. Firstly, the default settings will push the content crawled onto a resque queue for that class. This gives you the flexibility of running in queues on separate machines etc. The main drawback to this is that all your content is stored in redis within the queue. This can be memory intensive if you are crawling large sites, or have large content that is being crawled. To get around this you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is done with the @:direct_call_process_job@ option. If you set that option to 'true' then instead of the job being queued, it will be executed within the crawl_job queue.
|
154
160
|
|
155
161
|
|
156
162
|
h3. CobwebCrawler
|
@@ -169,7 +175,7 @@ puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:ass
|
|
169
175
|
|
170
176
|
There are some specific options for CobwebCrawler in addition to the normal cobweb options
|
171
177
|
|
172
|
-
* thread_count - specifies the number of threads used by the crawler, defaults to 1
|
178
|
+
* @thread_count@ - specifies the number of threads used by the crawler, defaults to 1
|
173
179
|
|
174
180
|
h3. CobwebCrawlHelper
|
175
181
|
|
@@ -177,12 +183,14 @@ The CobwebCrawlHelper class is a helper class to assist in getting information a
|
|
177
183
|
|
178
184
|
bc. crawl = CobwebCrawlHelper.new(options)
|
179
185
|
|
180
|
-
* options - the hash of options passed into Cobweb.new (must include a
|
186
|
+
* @options@ - the hash of options passed into Cobweb.new (must include a @:crawl_id@)
|
181
187
|
|
182
188
|
|
183
189
|
|
184
190
|
h2. Contributing/Testing
|
185
191
|
|
192
|
+
<p>Firstly, you could <a href="https://www.buymeacoffee.com/Z2yRGl3CX" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: auto !important;width: auto !important;" ></a></p>
|
193
|
+
|
186
194
|
Feel free to contribute small or large bits of code, just please make sure that there are rspec tests for the features you're submitting. We also test on travis at http://travis-ci.org/#!/stewartmckee/cobweb if you want to see the state of the project.
|
187
195
|
|
188
196
|
Continuous integration testing is performed by the excellent Travis: http://travis-ci.org/#!/stewartmckee/cobweb
|
data/lib/cobweb.rb
CHANGED
@@ -8,17 +8,16 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
|
|
8
8
|
require file
|
9
9
|
end
|
10
10
|
|
11
|
-
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
12
|
-
|
11
|
+
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
13
12
|
|
14
13
|
# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
|
15
14
|
class Cobweb
|
16
|
-
|
15
|
+
|
17
16
|
# retrieves current version
|
18
17
|
def self.version
|
19
18
|
CobwebVersion.version
|
20
19
|
end
|
21
|
-
|
20
|
+
|
22
21
|
# used for setting default options
|
23
22
|
def method_missing(method_sym, *arguments, &block)
|
24
23
|
if method_sym.to_s =~ /^default_(.*)_to$/
|
@@ -28,7 +27,7 @@ class Cobweb
|
|
28
27
|
super
|
29
28
|
end
|
30
29
|
end
|
31
|
-
|
30
|
+
|
32
31
|
# See readme for more information on options available
|
33
32
|
def initialize(options = {})
|
34
33
|
@options = options
|
@@ -41,7 +40,7 @@ class Cobweb
|
|
41
40
|
default_crawl_finished_queue_to "CobwebFinishedJob"
|
42
41
|
else
|
43
42
|
default_processing_queue_to "CrawlProcessWorker"
|
44
|
-
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
43
|
+
default_crawl_finished_queue_to "CrawlFinishedWorker"
|
45
44
|
end
|
46
45
|
default_quiet_to true
|
47
46
|
default_debug_to false
|
@@ -66,47 +65,65 @@ class Cobweb
|
|
66
65
|
|
67
66
|
|
68
67
|
end
|
69
|
-
|
68
|
+
|
70
69
|
# This method starts the resque based crawl and enqueues the base_url
|
71
70
|
def start(base_url)
|
72
71
|
raise ":base_url is required" unless base_url
|
73
72
|
request = {
|
74
73
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
75
|
-
:url => base_url
|
76
|
-
}
|
77
|
-
|
74
|
+
:url => base_url
|
75
|
+
}
|
76
|
+
|
78
77
|
if @options[:internal_urls].nil? || @options[:internal_urls].empty?
|
79
78
|
uri = Addressable::URI.parse(base_url)
|
80
79
|
@options[:internal_urls] = []
|
81
|
-
|
82
|
-
@options[:
|
80
|
+
|
81
|
+
if @options[:treat_https_as_http_to]
|
82
|
+
@options[:internal_urls] << ["http://", uri.host, "/*"].join
|
83
|
+
@options[:internal_urls] << ["http://", uri.host, ":", uri.inferred_port, "/*"].join
|
84
|
+
@options[:internal_urls] << ["https://", uri.host, "/*"].join
|
85
|
+
@options[:internal_urls] << ["https://", uri.host, ":", uri.inferred_port, "/*"].join
|
86
|
+
else
|
87
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
|
88
|
+
@options[:internal_urls] << [uri.scheme, "://", uri.host, ":", uri.inferred_port, "/*"].join
|
89
|
+
end
|
90
|
+
|
83
91
|
end
|
84
|
-
|
92
|
+
|
85
93
|
request.merge!(@options)
|
86
94
|
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
|
87
95
|
@redis.set("original_base_url", base_url)
|
88
96
|
@redis.hset "statistics", "queued_at", DateTime.now
|
89
97
|
@redis.set("crawl-counter", 0)
|
90
|
-
@
|
98
|
+
queue_counter = @options[:seed_urls].count + 1
|
99
|
+
puts "queue_counter being init to #{queue_counter}"
|
100
|
+
@redis.set("queue-counter", queue_counter)
|
101
|
+
|
91
102
|
|
92
103
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
93
|
-
|
104
|
+
|
94
105
|
@stats = Stats.new(request)
|
95
106
|
@stats.start_crawl(request)
|
96
|
-
|
107
|
+
|
97
108
|
# add internal_urls into redis
|
98
109
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
99
110
|
if @options[:queue_system] == :resque
|
100
111
|
Resque.enqueue(CrawlJob, request)
|
101
112
|
elsif @options[:queue_system] == :sidekiq
|
102
113
|
CrawlWorker.perform_async(request)
|
114
|
+
@options[:seed_urls].map{|url|
|
115
|
+
new_request = request.clone
|
116
|
+
new_request[:url] = url
|
117
|
+
CrawlWorker.perform_async(new_request)
|
118
|
+
}
|
119
|
+
|
103
120
|
else
|
104
121
|
raise "Unknown queue system: #{content_request[:queue_system]}"
|
105
122
|
end
|
106
|
-
|
123
|
+
|
107
124
|
request
|
108
125
|
end
|
109
|
-
|
126
|
+
|
110
127
|
# Returns array of cookies from content
|
111
128
|
def get_cookies(response)
|
112
129
|
all_cookies = response.get_fields('set-cookie')
|
@@ -134,7 +151,7 @@ class Cobweb
|
|
134
151
|
else
|
135
152
|
redirect_limit = 10
|
136
153
|
end
|
137
|
-
|
154
|
+
|
138
155
|
# connect to redis
|
139
156
|
if options.has_key? :crawl_id
|
140
157
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
@@ -147,7 +164,7 @@ class Cobweb
|
|
147
164
|
|
148
165
|
# check if it has already been cached
|
149
166
|
if @options[:cache] && ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id)))
|
150
|
-
if @options[:cache_type] == :crawl_based
|
167
|
+
if @options[:cache_type] == :crawl_based
|
151
168
|
puts "Cache hit in crawl for #{url}" unless @options[:quiet]
|
152
169
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
153
170
|
else
|
@@ -171,7 +188,7 @@ class Cobweb
|
|
171
188
|
begin
|
172
189
|
puts "Retrieving #{uri}... " unless @options[:quiet]
|
173
190
|
request_options={}
|
174
|
-
request_options['Cookie']= options[:cookies] if options[:cookies]
|
191
|
+
request_options['Cookie']= options[:cookies].map{|k,v| [k,v].join("=") }.join("&") if options[:cookies]
|
175
192
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
176
193
|
|
177
194
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
@@ -183,9 +200,11 @@ class Cobweb
|
|
183
200
|
if @options[:range]
|
184
201
|
request.set_range(@options[:range])
|
185
202
|
end
|
186
|
-
|
203
|
+
|
187
204
|
response = @http.request request
|
188
205
|
|
206
|
+
cookies = Hash[get_cookies(response).to_s.split("; ").map{|s| [CGI.unescape(s.split("=")[0]), s.split("=")[1]]}].merge(options[:cookies] || {})
|
207
|
+
|
189
208
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
190
209
|
|
191
210
|
# get location to redirect to
|
@@ -196,24 +215,24 @@ class Cobweb
|
|
196
215
|
redirect_limit = redirect_limit - 1
|
197
216
|
|
198
217
|
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
199
|
-
|
200
|
-
|
218
|
+
|
201
219
|
# get the content from redirect location
|
202
220
|
content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
|
203
221
|
|
204
222
|
content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
|
205
223
|
content[:redirect_through].insert(0, url)
|
206
224
|
content[:url] = content[:redirect_through].last
|
207
|
-
|
225
|
+
|
208
226
|
content[:response_time] = Time.now.to_f - request_time
|
209
227
|
else
|
210
228
|
content[:response_time] = Time.now.to_f - request_time
|
211
|
-
|
229
|
+
|
212
230
|
puts "Retrieved." unless @options[:quiet]
|
213
231
|
|
214
232
|
# create the content container
|
215
233
|
content[:url] = uri.to_s
|
216
234
|
content[:status_code] = response.code.to_i
|
235
|
+
content[:cookies] = cookies
|
217
236
|
content[:mime_type] = ""
|
218
237
|
content[:mime_type] = response.content_type.split(";")[0].strip unless response.content_type.nil?
|
219
238
|
if !response["Content-Type"].nil? && response["Content-Type"].include?(";")
|
@@ -237,7 +256,7 @@ class Cobweb
|
|
237
256
|
# parse data for links
|
238
257
|
link_parser = ContentLinkParser.new(content[:url], content[:body])
|
239
258
|
content[:links] = link_parser.link_data
|
240
|
-
|
259
|
+
|
241
260
|
end
|
242
261
|
# add content to cache if required
|
243
262
|
if @options[:cache]
|
@@ -252,10 +271,10 @@ class Cobweb
|
|
252
271
|
rescue RedirectError => e
|
253
272
|
if @options[:raise_exceptions]
|
254
273
|
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
-
raise e
|
274
|
+
raise e
|
256
275
|
end
|
257
276
|
puts "ERROR RedirectError: #{e.message}"
|
258
|
-
|
277
|
+
|
259
278
|
## generate a blank content
|
260
279
|
content = {}
|
261
280
|
content[:url] = uri.to_s
|
@@ -267,11 +286,11 @@ class Cobweb
|
|
267
286
|
content[:mime_type] = "error/dnslookup"
|
268
287
|
content[:headers] = {}
|
269
288
|
content[:links] = {}
|
270
|
-
|
289
|
+
|
271
290
|
rescue SocketError => e
|
272
291
|
raise e if @options[:raise_exceptions]
|
273
292
|
puts "ERROR SocketError: #{e.message}"
|
274
|
-
|
293
|
+
|
275
294
|
## generate a blank content
|
276
295
|
content = {}
|
277
296
|
content[:url] = uri.to_s
|
@@ -283,11 +302,11 @@ class Cobweb
|
|
283
302
|
content[:mime_type] = "error/dnslookup"
|
284
303
|
content[:headers] = {}
|
285
304
|
content[:links] = {}
|
286
|
-
|
305
|
+
|
287
306
|
rescue Timeout::Error => e
|
288
307
|
raise e if @options[:raise_exceptions]
|
289
308
|
puts "ERROR Timeout::Error: #{e.message}"
|
290
|
-
|
309
|
+
|
291
310
|
## generate a blank content
|
292
311
|
content = {}
|
293
312
|
content[:url] = uri.to_s
|
@@ -306,7 +325,7 @@ class Cobweb
|
|
306
325
|
|
307
326
|
# Performs a HTTP HEAD request to the specified url applying the options supplied
|
308
327
|
def head(url, options = @options)
|
309
|
-
raise "url cannot be nil" if url.nil?
|
328
|
+
raise "url cannot be nil" if url.nil?
|
310
329
|
uri = Addressable::URI.parse(url)
|
311
330
|
uri.normalize!
|
312
331
|
uri.fragment=nil
|
@@ -319,16 +338,16 @@ class Cobweb
|
|
319
338
|
else
|
320
339
|
redirect_limit = 10
|
321
340
|
end
|
322
|
-
|
341
|
+
|
323
342
|
# connect to redis
|
324
343
|
if options.has_key? :crawl_id
|
325
344
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
326
345
|
else
|
327
346
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
328
347
|
end
|
329
|
-
|
348
|
+
|
330
349
|
content = {:base_url => url}
|
331
|
-
|
350
|
+
|
332
351
|
# check if it has already been cached
|
333
352
|
if @options[:cache] && redis.get("head-#{unique_id}")
|
334
353
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
@@ -386,8 +405,8 @@ class Cobweb
|
|
386
405
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
387
406
|
content[:character_set] = charset
|
388
407
|
end
|
389
|
-
end
|
390
|
-
|
408
|
+
end
|
409
|
+
|
391
410
|
# add content to cache if required
|
392
411
|
if @options[:cache]
|
393
412
|
puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
|
@@ -416,7 +435,7 @@ class Cobweb
|
|
416
435
|
rescue SocketError => e
|
417
436
|
raise e if @options[:raise_exceptions]
|
418
437
|
puts "ERROR SocketError: #{e.message}"
|
419
|
-
|
438
|
+
|
420
439
|
## generate a blank content
|
421
440
|
content = {}
|
422
441
|
content[:url] = uri.to_s
|
@@ -428,11 +447,11 @@ class Cobweb
|
|
428
447
|
content[:mime_type] = "error/dnslookup"
|
429
448
|
content[:headers] = {}
|
430
449
|
content[:links] = {}
|
431
|
-
|
450
|
+
|
432
451
|
rescue Timeout::Error => e
|
433
452
|
raise e if @options[:raise_exceptions]
|
434
453
|
puts "ERROR Timeout::Error: #{e.message}"
|
435
|
-
|
454
|
+
|
436
455
|
## generate a blank content
|
437
456
|
content = {}
|
438
457
|
content[:url] = uri.to_s
|
@@ -445,10 +464,10 @@ class Cobweb
|
|
445
464
|
content[:headers] = {}
|
446
465
|
content[:links] = {}
|
447
466
|
end
|
448
|
-
|
467
|
+
|
449
468
|
content
|
450
469
|
end
|
451
|
-
|
470
|
+
|
452
471
|
end
|
453
472
|
|
454
473
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
@@ -456,7 +475,7 @@ class Cobweb
|
|
456
475
|
pattern = pattern.gsub(".", "\\.")
|
457
476
|
pattern = pattern.gsub("?", "\\?")
|
458
477
|
pattern = pattern.gsub("+", "\\\\+")
|
459
|
-
pattern = pattern.gsub("*", ".*?")
|
478
|
+
pattern = pattern.gsub("*", ".*?")
|
460
479
|
if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
|
461
480
|
pattern = pattern.gsub("http:", "https?:")
|
462
481
|
end
|
@@ -464,9 +483,9 @@ class Cobweb
|
|
464
483
|
end
|
465
484
|
|
466
485
|
def clear_cache
|
467
|
-
|
486
|
+
|
468
487
|
end
|
469
|
-
|
488
|
+
|
470
489
|
private
|
471
490
|
# checks if the mime_type is textual
|
472
491
|
def text_content?(content_type)
|
@@ -475,5 +494,5 @@ class Cobweb
|
|
475
494
|
end
|
476
495
|
false
|
477
496
|
end
|
478
|
-
|
497
|
+
|
479
498
|
end
|