cobweb 0.0.22 → 0.0.24
- data/README.textile +13 -9
- data/lib/cobweb.rb +56 -22
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/content_link_parser.rb +0 -1
- data/lib/crawl_job.rb +102 -123
- data/lib/stats.rb +53 -1
- data/spec/cobweb/cobweb_spec.rb +20 -1
- metadata +30 -32
- data/lib/cobweb/version.rb +0 -1
- data/lib/hash.rb +0 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.23
 
 h2. Intro
 
@@ -54,14 +54,16 @@ Creates a new crawler object based on a base_url
 
 * options - Options are passed in as a hash,
 
-** :follow_redirects
-** :redirect_limit
-** :processing_queue
-** :debug
-** :quiet
-** :cache
-** :timeout
-** :redis_options
+** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
+** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
+** :debug - enables debug output (Default: false)
+** :quiet - hides default output (Default: false)
+** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
+** :timeout - http timeout for requests (Default: 10)
+** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
+** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
+** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
@@ -70,6 +72,8 @@ h4. start(base_url)
 Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
 
 * base_url - the url to start the crawl from
+
+Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
 
 bq. crawler.start("http://www.google.com/")
 
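Taken together, the expanded option list maps straight onto the constructor. A minimal usage sketch built only from the options documented in the hunk above; the hostnames and values are placeholders, not part of the gem:

  require 'cobweb'

  # Hypothetical values; every key shown is described in the README diff above.
  crawler = Cobweb.new(
    :follow_redirects => true,                              # default: true
    :redirect_limit   => 10,                                # default: 10
    :cache            => 300,                               # ttl in seconds, nil disables caching
    :redis_options    => {:host => "redis.mydomain.com"},   # placeholder host
    :internal_urls    => ['http://test.com/*', 'http://blog.test.com/*'],
    :first_page_redirect_internal => true
  )

  crawler.start("http://test.com/")   # enqueues a CrawlJob through resque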
data/lib/cobweb.rb
CHANGED
@@ -19,20 +19,33 @@ class Cobweb
   # investigate using event machine for single threaded crawling
 
   def self.version
-    "0.0.
+    "0.0.24"
+  end
+
+  def method_missing(method_sym, *arguments, &block)
+    if method_sym.to_s =~ /^default_(.*)_to$/
+      tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
+      @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
+    else
+      super
+    end
   end
 
   def initialize(options = {})
     @options = options
-
-
-
-
-
-
-
-
-
+
+    default_follow_redirects_to true
+    default_redirect_limit_to 10
+    default_processing_queue_to CobwebProcessJob
+    default_crawl_finished_queue_to CobwebFinishedJob
+    default_quiet_to true
+    default_debug_to false
+    default_cache_to 300
+    default_timeout_to 10
+    default_redis_options_to Hash.new
+    default_internal_urls_to []
+    default_first_page_redirect_internal_to true
+
   end
 
   def start(base_url)
@@ -42,9 +55,20 @@ class Cobweb
      :url => base_url
    }
 
+    if @options[:internal_urls].empty?
+      uri = Addressable::URI.parse(base_url)
+      @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+    end
+
    request.merge!(@options)
    @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
    @redis.hset "statistics", "queued_at", DateTime.now
+    @redis.set("crawl-counter", 0)
+    @redis.set("queue-counter", 1)
+
+
+    # add internal_urls into redis
+    @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
 
    Resque.enqueue(CrawlJob, request)
  end
@@ -70,7 +94,7 @@ class Cobweb
      redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
    end
 
-    content = {}
+    content = {:base_url => url}
 
    # check if it has already been cached
    if redis.get(unique_id) and @options[:cache]
@@ -96,7 +120,7 @@ class Cobweb
      begin
        print "Retrieving #{url }... " unless @options[:quiet]
        request = Net::HTTP::Get.new uri.request_uri
-
+
        response = @http.request request
 
        if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -125,7 +149,7 @@ class Cobweb
        content[:response_time] = Time.now.to_f - request_time
 
        puts "Retrieved." unless @options[:quiet]
-
+
        # create the content container
        content[:url] = uri.to_s
        content[:status_code] = response.code.to_i
@@ -138,12 +162,16 @@ class Cobweb
        end
        content[:length] = response.content_length
        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-
-
+          if response["Content-Encoding"]=="gzip"
+            content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
+          else
+            content[:body] = response.body
+          end
+        else
          content[:body] = Base64.encode64(response.body)
        end
        content[:location] = response["location"]
-        content[:headers] = response.to_hash.
+        content[:headers] = response.to_hash.deep_symbolize_keys
        # parse data for links
        link_parser = ContentLinkParser.new(content[:url], content[:body])
        content[:links] = link_parser.link_data
@@ -170,7 +198,7 @@ class Cobweb
        content[:links] = {}
 
      rescue SocketError => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR: SocketError#{e.message}"
 
        ## generate a blank content
        content = {}
@@ -185,7 +213,7 @@ class Cobweb
        content[:links] = {}
 
      rescue Timeout::Error => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR Timeout::Error: #{e.message}"
 
        ## generate a blank content
        content = {}
@@ -207,10 +235,14 @@ class Cobweb
    raise "url cannot be nil" if url.nil?
 
    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+
    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)
-    redirect_limit
+    if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
+      redirect_limit = options[:redirect_limit].to_i
+    else
+      redirect_limit = 10
+    end
 
    # connect to redis
    if options.has_key? :crawl_id
@@ -224,7 +256,7 @@ class Cobweb
    # check if it has already been cached
    if redis.get("head-#{unique_id}") and @options[:cache]
      puts "Cache hit for #{url}" unless @options[:quiet]
-      Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
+      content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
    else
      print "Retrieving #{url }... " unless @options[:quiet]
      uri = Addressable::URI.parse(url.strip)
@@ -247,7 +279,9 @@ class Cobweb
        puts "redirected... " unless @options[:quiet]
        url = absolutize.url(response['location']).to_s
        redirect_limit = redirect_limit - 1
-
+        options = options.clone
+        options[:redirect_limit]=redirect_limit
+        content = head(url, options)
        content[:url] = uri.to_s
        content[:redirect_through] = [] if content[:redirect_through].nil?
        content[:redirect_through].insert(0, url)
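The new default_*_to calls are not defined methods; the method_missing added above intercepts them and only writes a value when the caller did not already pass that key. A small sketch of the resulting behaviour, using the same instance_eval peek the spec below uses (the timeout value here is just an illustration):

  cobweb = Cobweb.new(:timeout => 30)

  options = cobweb.instance_eval("@options")
  options[:timeout]         # => 30, the caller-supplied value is kept
  options[:redirect_limit]  # => 10, filled in by default_redirect_limit_to 10
  options[:internal_urls]   # => [], seeded later by start(base_url)

start(base_url) then derives a single wildcard pattern from the base url whenever :internal_urls is left empty, e.g. "http://test.com/*" for a crawl started at http://test.com/, and pushes every pattern into the "internal_urls" set in redis.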
data/lib/cobweb_process_job.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -6,163 +6,142 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
-  ## redis params used
-  #
-  # crawl-counter
-  # crawled
-  # queue-counter
-  # statistics[:average_response_time]
-  # statistics[:maximum_response_time]
-  # statistics[:minimum_response_time]
-  # statistics[:average_length]
-  # statistics[:maximum_length]
-  # statistics[:minimum_length]
-  # statistics[:queued_at]
-  # statistics[:started_at]
-  # statistics]:finished_at]
-  # total_pages
-  # total_assets
-  # statistics[:mime_counts]["mime_type"]
-  # statistics[:status_counts][xxx]
-
   def self.perform(content_request)
-
-
-
+
+    # change all hash keys to symbols
+    content_request = content_request.deep_symbolize_keys
+
+    @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 
     @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+    @debug = content_request[:debug]
+
+    refresh_counters
+
     # check we haven't crawled this url before
-
-    queue_counter = redis.get("queue-counter").to_i
-    unless redis.sismember "crawled", content_request[:url]
+    unless @redis.sismember "crawled", content_request[:url]
 
-      #
-
-      crawl_counter += 1
-      if crawl_counter <= content_request[:crawl_limit].to_i
+      # if there is no limit or we're still under it lets get the url
+      if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
        content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
+
        ## update statistics
-
-
-
-
-
-
-
-        if
-
-
+        Stats.set_statistics_in_redis(@redis, content)
+
+        # set the base url if this is the first page
+        set_base_url @redis, content, content_request
+
+        internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject!{|link| @redis.sismember("crawled", link)}
+        internal_links.reject!{|link| @redis.sismember("queued", link)}
+
+        # select the link if its internal
+        internal_links.select!{|link| internal_link?(link)}
+
+        internal_links.each do |link|
+          enqueue_content(content_request, link)
        end
-        redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
-        redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
 
-
-
-
-
-
-
-        mime_counts = {}
-        if redis.hexists "statistics", "mime_counts"
-          mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
-          if mime_counts.has_key? content[:mime_type]
-            mime_counts[content[:mime_type]] += 1
-          else
-            mime_counts[content[:mime_type]] = 1
-          end
-        else
-          mime_counts = {content[:mime_type] => 1}
-        end
-        redis.hset "statistics", "mime_counts", mime_counts.to_json
-
-        status_counts = {}
-        if redis.hexists "statistics", "status_counts"
-          status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
-          if status_counts.has_key? content[:status_code].to_i
-            status_counts[content[:status_code].to_i] += 1
-          else
-            status_counts[content[:status_code].to_i] = 1
-          end
-        else
-          status_counts = {content[:status_code].to_i => 1}
-        end
-        redis.hset "statistics", "status_counts", status_counts.to_json
-
-        redis.srem "queued", content_request[:url]
-        redis.sadd "crawled", content_request[:url]
-        set_base_url redis, content, content_request[:base_url]
-        content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
-          link = link.to_s
-          unless redis.sismember "crawled", link
-            puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
-            if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
-              puts "Matched as #{link} as internal" if content_request[:debug]
-              unless redis.sismember("crawled", link) or redis.sismember("queued", link)
-                if queue_counter <= content_request[:crawl_limit].to_i
-                  new_request = content_request.clone
-                  new_request[:url] = link
-                  new_request[:parent] = content_request[:url]
-                  Resque.enqueue(CrawlJob, new_request)
-                  redis.sadd "queued", link
-                  redis.incr "queue-counter"
-                  queue_counter += 1
-                end
-              end
-            end
-          end
-        end
+        # now that we're done, lets update the queues
+        @redis.srem "queued", content_request[:url]
+        decrement_queue_counter
+        @redis.sadd "crawled", content_request[:url]
+        increment_crawl_counter
 
        # enqueue to processing queue
        Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
        puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
-        puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
-
-
+        puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
+
      else
-        puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
+        puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
      end
    else
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
 
-    #
-
-    if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
+    # if the'res nothing left queued or the crawled limit has been reached
+    if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
 
-      puts "queue_counter: #{queue_counter}"
-      puts "crawl_counter: #{crawl_counter}"
+      puts "queue_counter: #{@queue_counter}"
+      puts "crawl_counter: #{@crawl_counter}"
      puts "crawl_limit: #{content_request[:crawl_limit]}"
 
      # finished
      puts "FINISHED"
-      stats = redis.hgetall "statistics"
-      stats[:total_pages] = redis.get "total_pages"
-      stats[:total_assets] = redis.get "total_assets"
-      stats[:crawl_counter] = redis.get "crawl_counter"
-      stats[:queue_counter] = redis.get "queue_counter"
-      stats[:crawled] = redis.smembers "crawled"
+      stats = @redis.hgetall "statistics"
+      stats[:total_pages] = @redis.get "total_pages"
+      stats[:total_assets] = @redis.get "total_assets"
+      stats[:crawl_counter] = @redis.get "crawl_counter"
+      stats[:queue_counter] = @redis.get "queue_counter"
+      stats[:crawled] = @redis.smembers "crawled"
 
-      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
+      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 
-      ap stats
    end
  end
 
  private
-  def self.set_base_url(redis, content,
+  def self.set_base_url(redis, content, content_request)
    if redis.get("base_url").nil?
-
-
-      redis.
-
-
-
+      unless content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+      end
+      redis.set("base_url", content[:url])
+    end
+  end
+
+  def self.internal_link?(link)
+    puts "Checking for internal link for: #{link}" if @debug
+    @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
+    valid_link = true
+    @internal_patterns.each do |pattern|
+      puts "Matching against #{pattern.source}" if @debug
+      if link.match(pattern)
+        puts "Matched as internal" if @debug
+        return true
      end
    end
+    puts "Didn't match any pattern so marked as not internal" if @debug
+    false
  end
 
+  def self.all_links_from_content(content)
+    content[:links].keys.map{|key| content[:links][key]}.flatten
+  end
 
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    Resque.enqueue(CrawlJob, new_request)
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+  def self.reset_counters
+    @redis.set("crawl-counter", @redis.smembers("crawled").count)
+    @redis.set("queue-counter", @redis.smembers("queued").count)
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
 end
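The new internal_link? compiles each internal_urls entry into an anchored regular expression, turning every * into a non-greedy .*?. The snippet below is a standalone illustration of that conversion, not part of the gem; the example urls are placeholders:

  patterns = ['http://test.com/*', 'http://blog.test.com/*'].map do |pattern|
    Regexp.new("^#{pattern.gsub("*", ".*?")}")
  end

  patterns.any? { |p| "http://test.com/about".match(p) }       # => true,  matched by ^http://test.com/.*?
  patterns.any? { |p| "http://external.com/about".match(p) }   # => false, no pattern matches

Because the compiled patterns are memoised in @internal_patterns, the set is only read from redis once per job rather than once per link.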
data/lib/stats.rb
CHANGED
@@ -11,6 +11,59 @@ class Stats < Sinatra::Base
     @@status = status
   end
 
+  def self.set_statistics_in_redis(redis, content)
+    crawl_counter = redis.get("crawl-counter").to_i
+    queue_counter = redis.get("queue-counter").to_i
+
+    if redis.hexists "statistics", "average_response_time"
+      redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_response_time", content[:response_time].to_f)
+    end
+    redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
+    redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
+    if redis.hexists "statistics", "average_length"
+      redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_length", content[:length].to_i)
+    end
+    redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
+    redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
+
+    if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+      redis.incr "total_pages"
+    else
+      redis.incr "total_assets"
+    end
+
+    mime_counts = {}
+    if redis.hexists "statistics", "mime_counts"
+      mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
+      if mime_counts.has_key? content[:mime_type]
+        mime_counts[content[:mime_type]] += 1
+      else
+        mime_counts[content[:mime_type]] = 1
+      end
+    else
+      mime_counts = {content[:mime_type] => 1}
+    end
+    redis.hset "statistics", "mime_counts", mime_counts.to_json
+
+    status_counts = {}
+    if redis.hexists "statistics", "status_counts"
+      status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
+      if status_counts.has_key? content[:status_code].to_i
+        status_counts[content[:status_code].to_i] += 1
+      else
+        status_counts[content[:status_code].to_i] = 1
+      end
+    else
+      status_counts = {content[:status_code].to_i => 1}
+    end
+    redis.hset "statistics", "status_counts", status_counts.to_json
+
+  end
+
  set :views, settings.root + '/../views'
 
  get '/' do
@@ -19,7 +72,6 @@ class Stats < Sinatra::Base
    haml :statistics
  end
 
-
  def self.start
    thread = Thread.new do
      Stats.run!
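set_statistics_in_redis keeps running averages without storing every sample: the stored average is weighted by the crawl counter and then folded together with the new value. A worked example of that update with hypothetical numbers:

  # new_average = ((old_average * crawl_counter) + new_value) / (crawl_counter + 1)
  old_average   = 0.2   # seconds, value already stored in the "statistics" hash
  crawl_counter = 4     # pages counted so far
  new_value     = 0.7   # response time of the page just fetched

  ((old_average * crawl_counter) + new_value) / (crawl_counter + 1)
  # => 0.3

The same formula is applied to average_length, except that the values are treated as integers.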
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -74,6 +74,25 @@ describe Cobweb do
     Cobweb.new.should be_an_instance_of Cobweb
   end
 
+  it "should setup with defaults" do
+    cobweb = Cobweb.new
+
+    options = cobweb.instance_eval("@options")
+    ap options
+
+    options[:follow_redirects].should == true
+    options[:redirect_limit].should == 10
+    options[:processing_queue].should == CobwebProcessJob
+    options[:crawl_finished_queue].should == CobwebFinishedJob
+    options[:quiet].should == true
+    options[:debug].should == false
+    options[:cache].should == 300
+    options[:timeout].should == 10
+    options[:redis_options].should == {}
+    options[:internal_urls].should == []
+
+  end
+
  describe "get" do
    it "should return a hash with default values" do
      @cobweb.get(@base_url).should be_an_instance_of Hash
@@ -141,7 +160,7 @@ describe Cobweb do
      #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      #
      #content = @cobweb.get(@base_url)
-      #content.should be_an_instance_of
+      #content.should be_an_instance_of HashHelper
      #ap content
      #content[:url].should == "http://redirect-me.com/redirect.html"
      #content[:redirect_through].length.should == 2
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.24
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-
+date: 2012-03-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70268501331520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70268501331100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331100
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: nokogiri
+  requirement: &70268501330680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330680
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: addressable
+  requirement: &70268501330240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330240
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: rspec
+  requirement: &70268501329820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329820
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: awesome_print
+  requirement: &70268501329400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329400
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: sinatra
+  requirement: &70268501328980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328980
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: thin
+  requirement: &70268501328560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328560
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: haml
+  requirement: &70268501328140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328140
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: hashie
+  requirement: &70268501344080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501344080
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -134,14 +134,12 @@ files:
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
-- lib/cobweb/version.rb
 - lib/cobweb.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
-- lib/hash.rb
 - lib/namespaced_redis.rb
 - lib/redirect_error.rb
 - lib/robots.rb
data/lib/cobweb/version.rb
DELETED
@@ -1 +0,0 @@
-VERSION = "0.0.21"
data/lib/hash.rb
DELETED
@@ -1,22 +0,0 @@
-## add symbolize methods to hash
-class Hash
-  def symbolize_keys
-    keys.each do |key|
-      if key.instance_of? String
-        value = self[key]
-        self.delete(key)
-        self[key.to_sym] = value
-      end
-    end
-    self
-  end
-  def deep_symbolize_keys
-    symbolize_keys
-    keys.each do |key|
-      if self[key].instance_of? Hash
-        self[key].deep_symbolize_keys
-      end
-    end
-    self
-  end
-end
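The removed extension added symbolize_keys and deep_symbolize_keys directly to Hash. crawl_job.rb still calls deep_symbolize_keys, so the behaviour is presumably provided elsewhere now (a hashie dependency appears in the metadata above). For reference, this is what the deleted code did; the input hash is a placeholder:

  {"url" => "http://test.com", "redis_options" => {"host" => "localhost"}}.deep_symbolize_keys
  # => {:url => "http://test.com", :redis_options => {:host => "localhost"}}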