cobweb 0.0.22 → 0.0.24
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
- data/README.textile +13 -9
- data/lib/cobweb.rb +56 -22
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/content_link_parser.rb +0 -1
- data/lib/crawl_job.rb +102 -123
- data/lib/stats.rb +53 -1
- data/spec/cobweb/cobweb_spec.rb +20 -1
- metadata +30 -32
- data/lib/cobweb/version.rb +0 -1
- data/lib/hash.rb +0 -22
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.23
 
 h2. Intro
 
@@ -54,14 +54,16 @@ Creates a new crawler object based on a base_url
 
 * options - Options are passed in as a hash,
 
-** :follow_redirects
-** :redirect_limit
-** :processing_queue
-** :debug
-** :quiet
-** :cache
-** :timeout
-** :redis_options
+** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
+** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
+** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
+** :debug - enables debug output (Default: false)
+** :quiet - hides default output (Default: false)
+** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
+** :timeout - http timeout for requests (Default: 10)
+** :redis_options - hash containing the initialization options for redis (e.g. {:host => "redis.mydomain.com"}
+** :internal_urls - array of strings representing internal url forms for your site (eg: ['http://test.com/*', 'http://blog.test.com/*', 'http://externaltest.com/*'])
+** :first_page_redirect_internal - if true and the first page crawled is a redirect, it will add the final destination of redirects to the internal_urls (e.g. http://www.test.com gets redirected to http://test.com)
 
 bq. crawler = CobWeb.new(:follow_redirects => false)
 
@@ -70,6 +72,8 @@ h4. start(base_url)
 Starts a crawl through resque. Requires the :processing_queue to be set to a valid class for the resque job to work with the data retrieved.
 
 * base_url - the url to start the crawl from
+
+Once the crawler starts, if the first page is redirected (eg from http://www.test.com to http://test.com) then the endpoint scheme, host and domain is added to the internal_urls automatically.
 
 bq. crawler.start("http://www.google.com/")
 
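
Note: the two options introduced in this release, :internal_urls and :first_page_redirect_internal, control which discovered links count as internal. A minimal usage sketch; the hostnames and option values are illustrative, not taken from the gem's documentation:

    # Sketch of configuring the new options documented above.
    # Hostnames and values are illustrative.
    crawler = Cobweb.new(
      :internal_urls => ["http://test.com/*", "http://blog.test.com/*"],
      :first_page_redirect_internal => true,
      :cache => nil  # disable page caching
    )
    crawler.start("http://test.com/")
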
data/lib/cobweb.rb
CHANGED
@@ -19,20 +19,33 @@ class Cobweb
   # investigate using event machine for single threaded crawling
 
   def self.version
-    "0.0.
+    "0.0.24"
+  end
+
+  def method_missing(method_sym, *arguments, &block)
+    if method_sym.to_s =~ /^default_(.*)_to$/
+      tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
+      @options[tag_name] = arguments[0] unless @options.has_key?(tag_name)
+    else
+      super
+    end
   end
 
   def initialize(options = {})
     @options = options
-
-
-
-
-
-
-
-
-
+
+    default_follow_redirects_to true
+    default_redirect_limit_to 10
+    default_processing_queue_to CobwebProcessJob
+    default_crawl_finished_queue_to CobwebFinishedJob
+    default_quiet_to true
+    default_debug_to false
+    default_cache_to 300
+    default_timeout_to 10
+    default_redis_options_to Hash.new
+    default_internal_urls_to []
+    default_first_page_redirect_internal_to true
+
   end
 
   def start(base_url)
@@ -42,9 +55,20 @@ class Cobweb
       :url => base_url
     }
 
+    if @options[:internal_urls].empty?
+      uri = Addressable::URI.parse(base_url)
+      @options[:internal_urls] << [uri.scheme, "://", uri.host, "/*"].join
+    end
+
     request.merge!(@options)
     @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
     @redis.hset "statistics", "queued_at", DateTime.now
+    @redis.set("crawl-counter", 0)
+    @redis.set("queue-counter", 1)
+
+
+    # add internal_urls into redis
+    @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
 
     Resque.enqueue(CrawlJob, request)
   end
@@ -70,7 +94,7 @@ class Cobweb
       redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
     end
 
-    content = {}
+    content = {:base_url => url}
 
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
@@ -96,7 +120,7 @@ class Cobweb
      begin
        print "Retrieving #{url }... " unless @options[:quiet]
        request = Net::HTTP::Get.new uri.request_uri
-
+
        response = @http.request request
 
        if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
@@ -125,7 +149,7 @@ class Cobweb
        content[:response_time] = Time.now.to_f - request_time
 
        puts "Retrieved." unless @options[:quiet]
-
+
        # create the content container
        content[:url] = uri.to_s
        content[:status_code] = response.code.to_i
@@ -138,12 +162,16 @@ class Cobweb
        end
        content[:length] = response.content_length
        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
-
-
+          if response["Content-Encoding"]=="gzip"
+            content[:body] = Zlib::GzipReader.new(StringIO.new(response.body)).read
+          else
+            content[:body] = response.body
+          end
+        else
          content[:body] = Base64.encode64(response.body)
        end
        content[:location] = response["location"]
-        content[:headers] = response.to_hash.
+        content[:headers] = response.to_hash.deep_symbolize_keys
        # parse data for links
        link_parser = ContentLinkParser.new(content[:url], content[:body])
        content[:links] = link_parser.link_data
@@ -170,7 +198,7 @@ class Cobweb
        content[:links] = {}
 
      rescue SocketError => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR: SocketError#{e.message}"
 
        ## generate a blank content
        content = {}
@@ -185,7 +213,7 @@ class Cobweb
        content[:links] = {}
 
      rescue Timeout::Error => e
-        puts "ERROR: #{e.message}"
+        puts "ERROR Timeout::Error: #{e.message}"
 
        ## generate a blank content
        content = {}
@@ -207,10 +235,14 @@ class Cobweb
    raise "url cannot be nil" if url.nil?
 
    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+
    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)
-    redirect_limit
+    if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
+      redirect_limit = options[:redirect_limit].to_i
+    else
+      redirect_limit = 10
+    end
 
    # connect to redis
    if options.has_key? :crawl_id
@@ -224,7 +256,7 @@ class Cobweb
    # check if it has already been cached
    if redis.get("head-#{unique_id}") and @options[:cache]
      puts "Cache hit for #{url}" unless @options[:quiet]
-      Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
+      content = Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
    else
      print "Retrieving #{url }... " unless @options[:quiet]
      uri = Addressable::URI.parse(url.strip)
@@ -247,7 +279,9 @@ class Cobweb
        puts "redirected... " unless @options[:quiet]
        url = absolutize.url(response['location']).to_s
        redirect_limit = redirect_limit - 1
-
+        options = options.clone
+        options[:redirect_limit]=redirect_limit
+        content = head(url, options)
        content[:url] = uri.to_s
        content[:redirect_through] = [] if content[:redirect_through].nil?
        content[:redirect_through].insert(0, url)
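
Note: the default_*_to calls added to initialize above are not real methods; the new method_missing intercepts anything matching /^default_(.*)_to$/ and writes the value into @options only when the caller did not supply that key. A standalone sketch of the same pattern; the Settings class and its option names are invented for illustration, and it extracts the key with $1 rather than the split/join the gem uses, which gives the same result here:

    # Standalone sketch of the default_*_to pattern used in Cobweb#initialize.
    # "Settings" and its option names are illustrative, not part of the gem.
    class Settings
      attr_reader :options

      def initialize(options = {})
        @options = options
        default_timeout_to 10      # kept only if :timeout was not passed in
        default_retry_limit_to 3
      end

      def method_missing(method_sym, *arguments, &block)
        if method_sym.to_s =~ /^default_(.*)_to$/
          key = $1.to_sym
          @options[key] = arguments[0] unless @options.has_key?(key)
        else
          super
        end
      end
    end

    Settings.new(:timeout => 30).options  # => {:timeout => 30, :retry_limit => 3}

This keeps caller-supplied options authoritative while letting the defaults read like declarations.
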
data/lib/cobweb_process_job.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -6,163 +6,142 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
-  ## redis params used
-  #
-  # crawl-counter
-  # crawled
-  # queue-counter
-  # statistics[:average_response_time]
-  # statistics[:maximum_response_time]
-  # statistics[:minimum_response_time]
-  # statistics[:average_length]
-  # statistics[:maximum_length]
-  # statistics[:minimum_length]
-  # statistics[:queued_at]
-  # statistics[:started_at]
-  # statistics]:finished_at]
-  # total_pages
-  # total_assets
-  # statistics[:mime_counts]["mime_type"]
-  # statistics[:status_counts][xxx]
-
   def self.perform(content_request)
-
-
-
+
+    # change all hash keys to symbols
+    content_request = content_request.deep_symbolize_keys
+
+    @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
 
     @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
-
+    @debug = content_request[:debug]
+
+    refresh_counters
+
     # check we haven't crawled this url before
-
-    queue_counter = redis.get("queue-counter").to_i
-    unless redis.sismember "crawled", content_request[:url]
+    unless @redis.sismember "crawled", content_request[:url]
 
-      #
-
-      crawl_counter += 1
-      if crawl_counter <= content_request[:crawl_limit].to_i
+      # if there is no limit or we're still under it lets get the url
+      if content_request[:crawl_limit].nil? or @crawl_counter <= content_request[:crawl_limit].to_i
        content = Cobweb.new(content_request).get(content_request[:url], content_request)
-
+
        ## update statistics
-
-
-
-
-
-
-
-        if
-
-
-
+        Stats.set_statistics_in_redis(@redis, content)
+
+        # set the base url if this is the first page
+        set_base_url @redis, content, content_request
+
+        internal_links = all_links_from_content(content).map{|link| link.to_s}
+
+        # reject the link if we've crawled it or queued it
+        internal_links.reject!{|link| @redis.sismember("crawled", link)}
+        internal_links.reject!{|link| @redis.sismember("queued", link)}
+
+        # select the link if its internal
+        internal_links.select!{|link| internal_link?(link)}
+
+        internal_links.each do |link|
+          enqueue_content(content_request, link)
        end
-        redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
-        redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
 
-
-
-
-
-
-
-        mime_counts = {}
-        if redis.hexists "statistics", "mime_counts"
-          mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
-          if mime_counts.has_key? content[:mime_type]
-            mime_counts[content[:mime_type]] += 1
-          else
-            mime_counts[content[:mime_type]] = 1
-          end
-        else
-          mime_counts = {content[:mime_type] => 1}
-        end
-        redis.hset "statistics", "mime_counts", mime_counts.to_json
-
-        status_counts = {}
-        if redis.hexists "statistics", "status_counts"
-          status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
-          if status_counts.has_key? content[:status_code].to_i
-            status_counts[content[:status_code].to_i] += 1
-          else
-            status_counts[content[:status_code].to_i] = 1
-          end
-        else
-          status_counts = {content[:status_code].to_i => 1}
-        end
-        redis.hset "statistics", "status_counts", status_counts.to_json
-
-        redis.srem "queued", content_request[:url]
-        redis.sadd "crawled", content_request[:url]
-        set_base_url redis, content, content_request[:base_url]
-        content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
-          link = link.to_s
-          unless redis.sismember "crawled", link
-            puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
-            if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
-              puts "Matched as #{link} as internal" if content_request[:debug]
-              unless redis.sismember("crawled", link) or redis.sismember("queued", link)
-                if queue_counter <= content_request[:crawl_limit].to_i
-                  new_request = content_request.clone
-                  new_request[:url] = link
-                  new_request[:parent] = content_request[:url]
-                  Resque.enqueue(CrawlJob, new_request)
-                  redis.sadd "queued", link
-                  redis.incr "queue-counter"
-                  queue_counter += 1
-                end
-              end
-            end
-          end
-        end
+        # now that we're done, lets update the queues
+        @redis.srem "queued", content_request[:url]
+        decrement_queue_counter
+        @redis.sadd "crawled", content_request[:url]
+        increment_crawl_counter
 
        # enqueue to processing queue
        Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
        puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
-        puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
-
-
+        puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
+
      else
-        puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
+        puts "Crawl Limit Exceeded by #{@crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
      end
    else
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
 
-    #
-
-    if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
+    # if the'res nothing left queued or the crawled limit has been reached
+    if @queue_counter == 0 || @crawl_counter >= content_request[:crawl_limit].to_i
 
-      puts "queue_counter: #{queue_counter}"
-      puts "crawl_counter: #{crawl_counter}"
+      puts "queue_counter: #{@queue_counter}"
+      puts "crawl_counter: #{@crawl_counter}"
      puts "crawl_limit: #{content_request[:crawl_limit]}"
 
      # finished
      puts "FINISHED"
-      stats = redis.hgetall "statistics"
-      stats[:total_pages] = redis.get "total_pages"
-      stats[:total_assets] = redis.get "total_assets"
-      stats[:crawl_counter] = redis.get "crawl_counter"
-      stats[:queue_counter] = redis.get "queue_counter"
-      stats[:crawled] = redis.smembers "crawled"
+      stats = @redis.hgetall "statistics"
+      stats[:total_pages] = @redis.get "total_pages"
+      stats[:total_assets] = @redis.get "total_assets"
+      stats[:crawl_counter] = @redis.get "crawl_counter"
+      stats[:queue_counter] = @redis.get "queue_counter"
+      stats[:crawled] = @redis.smembers "crawled"
 
-      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
+      Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
 
-      ap stats
    end
  end
 
  private
-  def self.set_base_url(redis, content,
+  def self.set_base_url(redis, content, content_request)
    if redis.get("base_url").nil?
-
-
-      redis.
-
-
-
+      unless content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
+        uri = Addressable::URI.parse(content[:redirect_through].last)
+        redis.sadd("internal_urls", [uri.scheme, "://", uri.host, "/*"].join)
+      end
+      redis.set("base_url", content[:url])
+    end
+  end
+
+  def self.internal_link?(link)
+    puts "Checking for internal link for: #{link}" if @debug
+    @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
+    valid_link = true
+    @internal_patterns.each do |pattern|
+      puts "Matching against #{pattern.source}" if @debug
+      if link.match(pattern)
+        puts "Matched as internal" if @debug
+        return true
      end
    end
+    puts "Didn't match any pattern so marked as not internal" if @debug
+    false
  end
 
+  def self.all_links_from_content(content)
+    content[:links].keys.map{|key| content[:links][key]}.flatten
+  end
 
+  def self.enqueue_content(content_request, link)
+    new_request = content_request.clone
+    new_request[:url] = link
+    new_request[:parent] = content_request[:url]
+    Resque.enqueue(CrawlJob, new_request)
+    @redis.sadd "queued", link
+    increment_queue_counter
+  end
+
+  def self.increment_queue_counter
+    @redis.incr "queue-counter"
+    refresh_counters
+  end
+  def self.increment_crawl_counter
+    @redis.incr "crawl-counter"
+    refresh_counters
+  end
+  def self.decrement_queue_counter
+    @redis.decr "queue-counter"
+    refresh_counters
+  end
+  def self.refresh_counters
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
+  def self.reset_counters
+    @redis.set("crawl-counter", @redis.smembers("crawled").count)
+    @redis.set("queue-counter", @redis.smembers("queued").count)
+    @crawl_counter = @redis.get("crawl-counter").to_i
+    @queue_counter = @redis.get("queue-counter").to_i
+  end
 end
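
Note: the new internal_link? helper compiles the internal_urls wildcards into regexps with pattern.gsub("*", ".*?"). A sketch of that matching in isolation; the patterns and urls are illustrative:

    # Sketch of the wildcard matching performed by CrawlJob.internal_link?,
    # using the same gsub-based compilation. Patterns and urls are illustrative.
    patterns = ["http://test.com/*", "http://blog.test.com/*"].map do |pattern|
      Regexp.new("^#{pattern.gsub("*", ".*?")}")
    end

    ["http://test.com/about", "http://other.com/"].each do |url|
      puts "#{url} internal? #{patterns.any? { |pattern| url.match(pattern) }}"
    end
    # => http://test.com/about internal? true
    # => http://other.com/ internal? false

Since the dots in each pattern are not escaped, the compiled regexps are slightly looser than glob semantics.
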
data/lib/stats.rb
CHANGED
@@ -11,6 +11,59 @@ class Stats < Sinatra::Base
     @@status = status
   end
 
+  def self.set_statistics_in_redis(redis, content)
+    crawl_counter = redis.get("crawl-counter").to_i
+    queue_counter = redis.get("queue-counter").to_i
+
+    if redis.hexists "statistics", "average_response_time"
+      redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_response_time", content[:response_time].to_f)
+    end
+    redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
+    redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
+    if redis.hexists "statistics", "average_length"
+      redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
+    else
+      redis.hset("statistics", "average_length", content[:length].to_i)
+    end
+    redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
+    redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
+
+    if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+      redis.incr "total_pages"
+    else
+      redis.incr "total_assets"
+    end
+
+    mime_counts = {}
+    if redis.hexists "statistics", "mime_counts"
+      mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
+      if mime_counts.has_key? content[:mime_type]
+        mime_counts[content[:mime_type]] += 1
+      else
+        mime_counts[content[:mime_type]] = 1
+      end
+    else
+      mime_counts = {content[:mime_type] => 1}
+    end
+    redis.hset "statistics", "mime_counts", mime_counts.to_json
+
+    status_counts = {}
+    if redis.hexists "statistics", "status_counts"
+      status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
+      if status_counts.has_key? content[:status_code].to_i
+        status_counts[content[:status_code].to_i] += 1
+      else
+        status_counts[content[:status_code].to_i] = 1
+      end
+    else
+      status_counts = {content[:status_code].to_i => 1}
+    end
+    redis.hset "statistics", "status_counts", status_counts.to_json
+
+  end
+
   set :views, settings.root + '/../views'
 
   get '/' do
@@ -19,7 +72,6 @@ class Stats < Sinatra::Base
     haml :statistics
   end
 
-
   def self.start
     thread = Thread.new do
       Stats.run!
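
Note: set_statistics_in_redis keeps running averages without storing the individual samples, applying the update average = ((average * n) + sample) / (n + 1) on each crawled item. A quick check of that rule; the sample values are illustrative:

    # The running-average update used by set_statistics_in_redis,
    # checked against the plain mean. Sample values are illustrative.
    average = 0.0
    [0.2, 0.4, 0.9].each_with_index do |sample, n|
      average = ((average * n) + sample) / (n + 1)
    end
    puts average  # => ~0.5, the same as (0.2 + 0.4 + 0.9) / 3 up to float rounding

The average_length branch runs the same formula on .to_i values, so that average truncates to an integer at every update.
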
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -74,6 +74,25 @@ describe Cobweb do
     Cobweb.new.should be_an_instance_of Cobweb
   end
 
+  it "should setup with defaults" do
+    cobweb = Cobweb.new
+
+    options = cobweb.instance_eval("@options")
+    ap options
+
+    options[:follow_redirects].should == true
+    options[:redirect_limit].should == 10
+    options[:processing_queue].should == CobwebProcessJob
+    options[:crawl_finished_queue].should == CobwebFinishedJob
+    options[:quiet].should == true
+    options[:debug].should == false
+    options[:cache].should == 300
+    options[:timeout].should == 10
+    options[:redis_options].should == {}
+    options[:internal_urls].should == []
+
+  end
+
   describe "get" do
     it "should return a hash with default values" do
       @cobweb.get(@base_url).should be_an_instance_of Hash
@@ -141,7 +160,7 @@ describe Cobweb do
       #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
       #
       #content = @cobweb.get(@base_url)
-      #content.should be_an_instance_of
+      #content.should be_an_instance_of HashHelper
       #ap content
       #content[:url].should == "http://redirect-me.com/redirect.html"
       #content[:redirect_through].length.should == 2
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.24
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-03-
+date: 2012-03-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70268501331520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331520
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70268501331100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501331100
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: nokogiri
+  requirement: &70268501330680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330680
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: addressable
+  requirement: &70268501330240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501330240
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: rspec
+  requirement: &70268501329820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329820
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: awesome_print
+  requirement: &70268501329400 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501329400
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: sinatra
+  requirement: &70268501328980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328980
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: thin
+  requirement: &70268501328560 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328560
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: haml
+  requirement: &70268501328140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501328140
 - !ruby/object:Gem::Dependency
-  name:
-  requirement: &
+  name: hashie
+  requirement: &70268501344080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
       version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70268501344080
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -134,14 +134,12 @@ files:
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
-- lib/cobweb/version.rb
 - lib/cobweb.rb
 - lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
-- lib/hash.rb
 - lib/namespaced_redis.rb
 - lib/redirect_error.rb
 - lib/robots.rb
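
Note: the &70268501331520-style tokens above are YAML anchors, and the matching *70268501331520 aliases re-use the anchored node; that is how RubyGems serializes each dependency's requirement into both the requirement and version_requirements fields. A small demonstration; the anchor name is shortened, and unsafe_load is used where newer Psych versions reject aliases in plain load:

    require 'yaml'

    # Minimal illustration of YAML anchors/aliases as used in the gemspec above.
    yaml = "requirement: &req\n  none: false\nversion_requirements: *req\n"
    doc = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
    puts doc["requirement"].equal?(doc["version_requirements"])  # => true
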
data/lib/cobweb/version.rb
DELETED
@@ -1 +0,0 @@
-VERSION = "0.0.21"
data/lib/hash.rb
DELETED
@@ -1,22 +0,0 @@
-## add symbolize methods to hash
-class Hash
-  def symbolize_keys
-    keys.each do |key|
-      if key.instance_of? String
-        value = self[key]
-        self.delete(key)
-        self[key.to_sym] = value
-      end
-    end
-    self
-  end
-  def deep_symbolize_keys
-    symbolize_keys
-    keys.each do |key|
-      if self[key].instance_of? Hash
-        self[key].deep_symbolize_keys
-      end
-    end
-    self
-  end
-end
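
Note: the deleted lib/hash.rb monkey-patched Hash with symbolize_keys and deep_symbolize_keys, which the new crawl_job.rb still calls (content_request.deep_symbolize_keys), so an equivalent must now come from elsewhere; the hashie dependency listed in the metadata is one plausible source. With the deleted patch loaded, the behaviour was as follows; the sample data is illustrative:

    # Behaviour provided by the deleted lib/hash.rb shown above: string keys
    # become symbols, recursively. Cobweb relies on this for Resque payloads,
    # which arrive with string keys after the JSON round-trip.
    payload = {"url" => "http://test.com/", "redis_options" => {"host" => "localhost"}}
    payload.deep_symbolize_keys
    # => {:url => "http://test.com/", :redis_options => {:host => "localhost"}}
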