cobweb 0.0.13 → 0.0.17
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/cobweb.rb +39 -28
- data/lib/cobweb_crawler.rb +107 -82
- data/lib/crawl_job.rb +1 -1
- data/lib/hash.rb +22 -0
- data/lib/namespaced_redis.rb +12 -0
- data/lib/redirect_error.rb +2 -0
- data/lib/stats.rb +32 -0
- data/spec/cobweb/cobweb_crawler_spec.rb +3 -2
- data/spec/cobweb/cobweb_spec.rb +9 -7
- data/views/statistics.haml +71 -0
- metadata +52 -15
data/lib/cobweb.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
2
3
|
require 'uri'
|
3
4
|
require 'resque'
|
4
5
|
require "addressable/uri"
|
@@ -9,7 +10,7 @@ Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
|
9
10
|
require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
|
10
11
|
end
|
11
12
|
|
12
|
-
class CobWeb
|
13
|
+
class Cobweb
|
13
14
|
|
14
15
|
## TASKS
|
15
16
|
|
@@ -65,7 +66,7 @@ class CobWeb
|
|
65
66
|
if redis.get(unique_id) and @options[:cache]
|
66
67
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
67
68
|
content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
|
68
|
-
content[:body] = Base64.decode64(content[:body])
|
69
|
+
content[:body] = Base64.decode64(content[:body])
|
69
70
|
|
70
71
|
content
|
71
72
|
else
|
@@ -93,13 +94,25 @@ class CobWeb
|
|
93
94
|
|
94
95
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
95
96
|
puts "redirected... " unless @options[:quiet]
|
97
|
+
|
98
|
+
# get location to redirect to
|
96
99
|
url = absolutize.url(response['location']).to_s
|
100
|
+
|
101
|
+
# decrement redirect limit
|
97
102
|
redirect_limit = redirect_limit - 1
|
103
|
+
|
104
|
+
# raise exception if we're being redirected to somewhere we've been redirected to in this content request
|
105
|
+
#raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
|
106
|
+
|
107
|
+
# raise exception if redirect limit has reached 0
|
108
|
+
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
109
|
+
|
110
|
+
# get the content from redirect location
|
98
111
|
content = get(url, redirect_limit)
|
99
112
|
content[:url] = uri.to_s
|
100
113
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
101
114
|
content[:redirect_through].insert(0, url)
|
102
|
-
|
115
|
+
|
103
116
|
content[:response_time] = Time.now.to_f - request_time
|
104
117
|
else
|
105
118
|
content[:response_time] = Time.now.to_f - request_time
|
@@ -109,8 +122,9 @@ class CobWeb
|
|
109
122
|
# create the content container
|
110
123
|
content[:url] = uri.to_s
|
111
124
|
content[:status_code] = response.code.to_i
|
112
|
-
content[:mime_type] =
|
113
|
-
|
125
|
+
content[:mime_type] = ""
|
126
|
+
content[:mime_type] = response.content_type.split(";")[0].strip unless response.content_type.nil?
|
127
|
+
if !response["Content-Type"].nil? && response["Content-Type"].include?(";")
|
114
128
|
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
115
129
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
116
130
|
content[:character_set] = charset
|
@@ -130,10 +144,25 @@ class CobWeb
|
|
130
144
|
end
|
131
145
|
# add content to cache if required
|
132
146
|
if @options[:cache]
|
133
|
-
content[:body] = Base64.encode64(content[:body])
|
147
|
+
content[:body] = Base64.encode64(content[:body])
|
134
148
|
redis.set(unique_id, content.to_json)
|
135
149
|
redis.expire unique_id, @options[:cache].to_i
|
136
150
|
end
|
151
|
+
rescue RedirectError => e
|
152
|
+
puts "ERROR: #{e.message}"
|
153
|
+
|
154
|
+
## generate a blank content
|
155
|
+
content = {}
|
156
|
+
content[:url] = uri.to_s
|
157
|
+
content[:response_time] = Time.now.to_f - request_time
|
158
|
+
content[:status_code] = 0
|
159
|
+
content[:length] = 0
|
160
|
+
content[:body] = ""
|
161
|
+
content[:error] = e.message
|
162
|
+
content[:mime_type] = "error/dnslookup"
|
163
|
+
content[:headers] = {}
|
164
|
+
content[:links] = {}
|
165
|
+
|
137
166
|
rescue SocketError => e
|
138
167
|
puts "ERROR: #{e.message}"
|
139
168
|
|
@@ -273,25 +302,7 @@ class CobWeb
|
|
273
302
|
end
|
274
303
|
end
|
275
304
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if key.instance_of? String
|
281
|
-
value = self[key]
|
282
|
-
self.delete(key)
|
283
|
-
self[key.to_sym] = value
|
284
|
-
end
|
285
|
-
end
|
286
|
-
self
|
287
|
-
end
|
288
|
-
def deep_symbolize_keys
|
289
|
-
symbolize_keys
|
290
|
-
keys.each do |key|
|
291
|
-
if self[key].instance_of? Hash
|
292
|
-
self[key].deep_symbolize_keys
|
293
|
-
end
|
294
|
-
end
|
295
|
-
self
|
296
|
-
end
|
297
|
-
end
|
305
|
+
|
306
|
+
|
307
|
+
|
308
|
+
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -1,13 +1,20 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'date'
|
3
|
+
require 'ap'
|
4
|
+
|
1
5
|
class CobwebCrawler
|
2
6
|
|
3
7
|
def initialize(options={})
|
4
8
|
@options = options
|
5
9
|
|
6
10
|
@statistic = {}
|
7
|
-
@queue = []
|
8
|
-
@crawled = []
|
9
11
|
|
10
|
-
@
|
12
|
+
@options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
|
13
|
+
crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
|
14
|
+
|
15
|
+
@redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{crawl_id}")
|
16
|
+
|
17
|
+
@cobweb = Cobweb.new(@options)
|
11
18
|
end
|
12
19
|
|
13
20
|
def crawl(base_url, crawl_options = {}, &block)
|
@@ -17,104 +24,122 @@ class CobwebCrawler
|
|
17
24
|
|
18
25
|
@absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
19
26
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
while !@queue.empty? && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
25
|
-
|
26
|
-
url = @queue.first
|
27
|
-
@options[:url] = url
|
28
|
-
unless @crawled.include?(url) || url =~ /\/(.+?)\/\1\/\1/
|
29
|
-
begin
|
30
|
-
content = @cobweb.get(@options[:url])
|
27
|
+
@redis.sadd "queued", base_url
|
28
|
+
crawl_counter = @redis.scard("crawled").to_i
|
29
|
+
queue_counter = @redis.scard("queued").to_i
|
31
30
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
32
|
+
thread = Thread.new do
|
33
|
+
|
34
|
+
url = @redis.spop "queued"
|
35
|
+
crawl_counter = @redis.scard("crawled").to_i
|
36
|
+
queue_counter = @redis.scard("queued").to_i
|
37
|
+
|
38
|
+
@options[:url] = url
|
39
|
+
unless @redis.sismember("crawled", url.to_s)
|
40
|
+
begin
|
41
|
+
Stats.update_status("Requesting #{url}...")
|
42
|
+
content = @cobweb.get(url)
|
43
|
+
Stats.update_status("Processing #{url}...")
|
44
|
+
|
45
|
+
if @statistic[:average_response_time].nil?
|
46
|
+
@statistic[:average_response_time] = content[:response_time].to_f
|
47
|
+
else
|
48
|
+
@statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
|
49
|
+
end
|
37
50
|
|
38
|
-
|
39
|
-
|
51
|
+
@statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
|
52
|
+
@statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
|
40
53
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
54
|
+
if @statistic[:average_length]
|
55
|
+
@statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
|
56
|
+
else
|
57
|
+
@statistic[:average_length] = content[:length].to_i
|
58
|
+
end
|
46
59
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
52
|
-
@statistic[:page_count] = @statistic[:page_count].to_i + 1
|
53
|
-
@statistic[:page_size] = @statistic[:page_count].to_i + content[:length].to_i
|
54
|
-
else
|
55
|
-
@statistic[:asset_count] = @statistic[:asset_count].to_i + 1
|
56
|
-
@statistic[:asset_size] = @statistic[:asset_count].to_i + content[:length].to_i
|
57
|
-
end
|
60
|
+
@statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
|
61
|
+
@statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
|
62
|
+
@statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
|
58
63
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
if mime_counts.has_key? content[:mime_type]
|
63
|
-
mime_counts[content[:mime_type]] += 1
|
64
|
+
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
65
|
+
@statistic[:page_count] = @statistic[:page_count].to_i + 1
|
66
|
+
@statistic[:page_size] = @statistic[:page_size].to_i + content[:length].to_i
|
64
67
|
else
|
65
|
-
|
68
|
+
@statistic[:asset_count] = @statistic[:asset_count].to_i + 1
|
69
|
+
@statistic[:asset_size] = @statistic[:asset_size].to_i + content[:length].to_i
|
66
70
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
+
|
72
|
+
@statistic[:total_redirects] = 0 if @statistic[:total_redirects].nil?
|
73
|
+
@statistic[:total_redirects] += content[:redirect_through].count unless content[:redirect_through].nil?
|
74
|
+
|
75
|
+
@statistic[:crawl_counter] = crawl_counter
|
76
|
+
@statistic[:queue_counter] = queue_counter
|
77
|
+
|
78
|
+
mime_counts = {}
|
79
|
+
if @statistic.has_key? :mime_counts
|
80
|
+
mime_counts = @statistic[:mime_counts]
|
81
|
+
if mime_counts.has_key? content[:mime_type]
|
82
|
+
mime_counts[content[:mime_type]] += 1
|
83
|
+
else
|
84
|
+
mime_counts[content[:mime_type]] = 1
|
85
|
+
end
|
86
|
+
else
|
87
|
+
mime_counts = {content[:mime_type] => 1}
|
88
|
+
end
|
89
|
+
@statistic[:mime_counts] = mime_counts
|
71
90
|
|
72
|
-
|
91
|
+
status_counts = {}
|
73
92
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
93
|
+
if @statistic.has_key? :status_counts
|
94
|
+
status_counts = @statistic[:status_counts]
|
95
|
+
if status_counts.has_key? content[:status_code].to_i
|
96
|
+
status_counts[content[:status_code].to_i] += 1
|
97
|
+
else
|
98
|
+
status_counts[content[:status_code].to_i] = 1
|
99
|
+
end
|
78
100
|
else
|
79
|
-
status_counts
|
101
|
+
status_counts = {content[:status_code].to_i => 1}
|
80
102
|
end
|
81
|
-
|
82
|
-
status_counts = {content[:status_code].to_i => 1}
|
83
|
-
end
|
84
|
-
@statistic[:status_counts] = status_counts
|
103
|
+
@statistic[:status_counts] = status_counts
|
85
104
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
105
|
+
@redis.sadd "crawled", url.to_s
|
106
|
+
@redis.incr "crawl-counter"
|
107
|
+
|
108
|
+
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |content_link|
|
109
|
+
link = content_link.to_s
|
110
|
+
unless @redis.sismember("crawled", link)
|
111
|
+
puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
|
112
|
+
if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
|
113
|
+
puts "Matched as #{link} as internal" if @options[:debug]
|
114
|
+
unless @redis.sismember("crawled", link) || @redis.sismember("queued", link)
|
115
|
+
puts "Added #{link.to_s} to queue" if @options[:debug]
|
116
|
+
@redis.sadd "queued", link
|
117
|
+
crawl_counter = @redis.scard("crawled").to_i
|
118
|
+
queue_counter = @redis.scard("queued").to_i
|
119
|
+
end
|
97
120
|
end
|
98
121
|
end
|
99
122
|
end
|
100
|
-
end
|
101
|
-
@queue.uniq!
|
102
123
|
|
103
|
-
|
104
|
-
|
105
|
-
|
124
|
+
crawl_counter = @redis.scard("crawled").to_i
|
125
|
+
queue_counter = @redis.scard("queued").to_i
|
126
|
+
Stats.update_statistics(@statistic)
|
127
|
+
Stats.update_status("Completed #{url}.")
|
128
|
+
puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @options[:debug]
|
129
|
+
|
130
|
+
yield content, @statistic if block_given?
|
106
131
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
132
|
+
rescue => e
|
133
|
+
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
134
|
+
ap e
|
135
|
+
ap e.backtrace
|
136
|
+
end
|
137
|
+
else
|
138
|
+
puts "Already crawled #{@options[:url]}" if @options[:debug]
|
112
139
|
end
|
113
|
-
else
|
114
|
-
puts "Already crawled #{@options[:url]}" if @options[:debug]
|
115
140
|
end
|
141
|
+
thread.join
|
116
142
|
end
|
117
143
|
@statistic
|
118
144
|
end
|
119
|
-
|
120
|
-
end
|
145
|
+
end
|
data/lib/crawl_job.rb
CHANGED
@@ -40,7 +40,7 @@ class CrawlJob
|
|
40
40
|
redis.incr "crawl-counter"
|
41
41
|
crawl_counter += 1
|
42
42
|
if crawl_counter <= content_request[:crawl_limit].to_i
|
43
|
-
content = CobWeb.new(content_request).get(content_request[:url])
|
43
|
+
content = Cobweb.new(content_request).get(content_request[:url])
|
44
44
|
|
45
45
|
## update statistics
|
46
46
|
if redis.hexists "statistics", "average_response_time"
|
data/lib/hash.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
## add symbolize methods to hash
|
2
|
+
class Hash
|
3
|
+
def symbolize_keys
|
4
|
+
keys.each do |key|
|
5
|
+
if key.instance_of? String
|
6
|
+
value = self[key]
|
7
|
+
self.delete(key)
|
8
|
+
self[key.to_sym] = value
|
9
|
+
end
|
10
|
+
end
|
11
|
+
self
|
12
|
+
end
|
13
|
+
def deep_symbolize_keys
|
14
|
+
symbolize_keys
|
15
|
+
keys.each do |key|
|
16
|
+
if self[key].instance_of? Hash
|
17
|
+
self[key].deep_symbolize_keys
|
18
|
+
end
|
19
|
+
end
|
20
|
+
self
|
21
|
+
end
|
22
|
+
end
|
data/lib/namespaced_redis.rb
CHANGED
@@ -17,8 +17,16 @@ class NamespacedRedis
|
|
17
17
|
@redis.srem namespaced(key), member
|
18
18
|
end
|
19
19
|
|
20
|
+
def spop(key)
|
21
|
+
@redis.spop namespaced(key)
|
22
|
+
end
|
23
|
+
|
20
24
|
def smembers(key)
|
21
25
|
@redis.smembers namespaced(key)
|
26
|
+
end
|
27
|
+
|
28
|
+
def scard(key)
|
29
|
+
@redis.scard namespaced(key)
|
22
30
|
end
|
23
31
|
|
24
32
|
def get(key)
|
@@ -29,6 +37,10 @@ class NamespacedRedis
|
|
29
37
|
@redis.incr namespaced(key)
|
30
38
|
end
|
31
39
|
|
40
|
+
def decr(key)
|
41
|
+
@redis.decr namespaced(key)
|
42
|
+
end
|
43
|
+
|
32
44
|
def exist(key)
|
33
45
|
@redis.exist namespaced(key)
|
34
46
|
end
|
data/lib/stats.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'sinatra'
|
3
|
+
require 'haml'
|
4
|
+
|
5
|
+
class Stats < Sinatra::Base
|
6
|
+
|
7
|
+
def self.update_statistics(statistics)
|
8
|
+
@@statistics = statistics
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.update_status(status)
|
12
|
+
@@status = status
|
13
|
+
end
|
14
|
+
|
15
|
+
ap settings.root
|
16
|
+
set :views, settings.root + '/../views'
|
17
|
+
|
18
|
+
get '/' do
|
19
|
+
@statistics = @@statistics
|
20
|
+
@status = @@status
|
21
|
+
haml :statistics
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
thread = Thread.new do
|
27
|
+
Stats.run!
|
28
|
+
|
29
|
+
## we need to manually kill the main thread as sinatra traps the interrupts
|
30
|
+
Thread.main.kill
|
31
|
+
end
|
32
|
+
|
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -31,7 +31,7 @@ describe CobwebCrawler do
|
|
31
31
|
|
32
32
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
33
33
|
|
34
|
-
statistics = crawler.crawl("http://
|
34
|
+
statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
|
35
35
|
|
36
36
|
ap statistics
|
37
37
|
|
@@ -43,8 +43,9 @@ describe CobwebCrawler do
|
|
43
43
|
|
44
44
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
45
45
|
|
46
|
-
statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content|
|
46
|
+
statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
|
47
47
|
ap content[:url]
|
48
|
+
ap statistics[:average_length]
|
48
49
|
end
|
49
50
|
|
50
51
|
ap statistics
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
|
-
describe CobWeb do
|
3
|
+
describe Cobweb do
|
4
4
|
|
5
5
|
before(:each) do
|
6
6
|
|
@@ -15,7 +15,7 @@ describe CobWeb do
|
|
15
15
|
"Server" => "gws",
|
16
16
|
"X-XSS-Protection" => "1; mode=block"}
|
17
17
|
|
18
|
-
@cobweb = CobWeb.new :quiet => true, :cache => nil
|
18
|
+
@cobweb = Cobweb.new :quiet => true, :cache => nil
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "with mock" do
|
@@ -40,7 +40,9 @@ describe CobWeb do
|
|
40
40
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
41
41
|
@mock_http_client.stub!(:read_timeout=).and_return(nil)
|
42
42
|
@mock_http_client.stub!(:open_timeout=).and_return(nil)
|
43
|
-
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
43
|
+
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
44
|
+
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
45
|
+
@mock_http_client.stub!(:port).and_return("80 ")
|
44
46
|
|
45
47
|
@mock_http_response.stub!(:code).and_return(200)
|
46
48
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
@@ -69,7 +71,7 @@ describe CobWeb do
|
|
69
71
|
end
|
70
72
|
|
71
73
|
it "should generate a cobweb object" do
|
72
|
-
|
74
|
+
Cobweb.new.should be_an_instance_of Cobweb
|
73
75
|
end
|
74
76
|
|
75
77
|
describe "get" do
|
@@ -130,7 +132,7 @@ describe CobWeb do
|
|
130
132
|
|
131
133
|
before(:each) do
|
132
134
|
@base_url = "http://redirect-me.com/redirect.html"
|
133
|
-
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
135
|
+
@cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
134
136
|
end
|
135
137
|
|
136
138
|
it "should flow through redirect" #do
|
@@ -155,8 +157,8 @@ describe CobWeb do
|
|
155
157
|
|
156
158
|
#end
|
157
159
|
it "should not follow with redirect disabled" do
|
158
|
-
@cobweb = CobWeb.new(:follow_redirects => false, :cache => nil)
|
159
|
-
@mock_http_client.should_receive(:
|
160
|
+
@cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
|
161
|
+
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
160
162
|
|
161
163
|
content = @cobweb.get(@base_url)
|
162
164
|
content[:url].should == "http://redirect-me.com/redirect.html"
|
data/views/statistics.haml
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
%h1 Cobweb Statistics
|
2
|
+
|
3
|
+
%h4= @status
|
4
|
+
|
5
|
+
%table
|
6
|
+
%tr
|
7
|
+
%th Page Count
|
8
|
+
%th Asset Count
|
9
|
+
%th Redirect Count
|
10
|
+
%tr
|
11
|
+
%td= @statistics[:page_count]
|
12
|
+
%td= @statistics[:asset_count]
|
13
|
+
%td= @statistics[:total_redirects]
|
14
|
+
|
15
|
+
%table
|
16
|
+
%tr
|
17
|
+
%th Total Page Size
|
18
|
+
%th Total Asset Size
|
19
|
+
%tr
|
20
|
+
%td= @statistics[:page_size]
|
21
|
+
%td= @statistics[:asset_size]
|
22
|
+
|
23
|
+
%table
|
24
|
+
%tr
|
25
|
+
%th Crawled
|
26
|
+
%th Queued
|
27
|
+
%tr
|
28
|
+
%td= @statistics[:crawl_counter]
|
29
|
+
%td= @statistics[:queue_counter]
|
30
|
+
|
31
|
+
|
32
|
+
%table
|
33
|
+
%tr
|
34
|
+
%th{:colspan => 2} Response Times
|
35
|
+
%tr
|
36
|
+
%th Average
|
37
|
+
%td= @statistics[:average_response_time]
|
38
|
+
%tr
|
39
|
+
%th Maximum
|
40
|
+
%td= @statistics[:maximum_response_time]
|
41
|
+
%tr
|
42
|
+
%th Minimum
|
43
|
+
%td= @statistics[:minimum_response_time]
|
44
|
+
|
45
|
+
%table
|
46
|
+
%tr
|
47
|
+
%th{:colspan => 2} Content Sizes
|
48
|
+
%tr
|
49
|
+
%th Average
|
50
|
+
%td= @statistics[:average_length]
|
51
|
+
%tr
|
52
|
+
%th Maximum
|
53
|
+
%td= @statistics[:maximum_length]
|
54
|
+
%tr
|
55
|
+
%th Minimum
|
56
|
+
%td= @statistics[:minimum_length]
|
57
|
+
%tr
|
58
|
+
%th Total
|
59
|
+
%td= @statistics[:total_length]
|
60
|
+
|
61
|
+
%table
|
62
|
+
- @statistics[:mime_counts].keys.each do |mime_type|
|
63
|
+
%tr
|
64
|
+
%td= mime_type
|
65
|
+
%td= @statistics[:mime_counts][mime_type]
|
66
|
+
|
67
|
+
%table
|
68
|
+
- @statistics[:status_counts].keys.each do |status|
|
69
|
+
%tr
|
70
|
+
%td= status
|
71
|
+
%td= @statistics[:status_counts][status]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.13
|
4
|
+
version: 0.0.17
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70173021660540 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70173021660540
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70173021659920 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70173021659920
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: absolutize
|
38
|
-
requirement: &
|
38
|
+
requirement: &70173021659420 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70173021659420
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70173021658760 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70173021658760
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: addressable
|
60
|
-
requirement: &
|
60
|
+
requirement: &70173021658080 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70173021658080
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
|
-
requirement: &
|
71
|
+
requirement: &70173021657320 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,40 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70173021657320
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: awesome_print
|
82
|
+
requirement: &70173021654680 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
type: :runtime
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *70173021654680
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: sinatra
|
93
|
+
requirement: &70173021654060 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
type: :runtime
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *70173021654060
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: thin
|
104
|
+
requirement: &70173021653560 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *70173021653560
|
80
113
|
description:
|
81
114
|
email: stewart@rockwellcottage.com
|
82
115
|
executables: []
|
@@ -96,7 +129,11 @@ files:
|
|
96
129
|
- lib/cobweb_process_job.rb
|
97
130
|
- lib/content_link_parser.rb
|
98
131
|
- lib/crawl_job.rb
|
132
|
+
- lib/hash.rb
|
99
133
|
- lib/namespaced_redis.rb
|
134
|
+
- lib/redirect_error.rb
|
135
|
+
- lib/stats.rb
|
136
|
+
- views/statistics.haml
|
100
137
|
- README.textile
|
101
138
|
homepage: http://github.com/stewartmckee/cobweb
|
102
139
|
licenses: []
|
@@ -118,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
155
|
version: '0'
|
119
156
|
requirements: []
|
120
157
|
rubyforge_project:
|
121
|
-
rubygems_version: 1.8.
|
158
|
+
rubygems_version: 1.8.10
|
122
159
|
signing_key:
|
123
160
|
specification_version: 3
|
124
161
|
summary: Web Crawler that uses resque background job engine to allow you to cluster
|