cobweb 0.0.13 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cobweb.rb +39 -28
- data/lib/cobweb_crawler.rb +107 -82
- data/lib/crawl_job.rb +1 -1
- data/lib/hash.rb +22 -0
- data/lib/namespaced_redis.rb +12 -0
- data/lib/redirect_error.rb +2 -0
- data/lib/stats.rb +32 -0
- data/spec/cobweb/cobweb_crawler_spec.rb +3 -2
- data/spec/cobweb/cobweb_spec.rb +9 -7
- data/views/statistics.haml +71 -0
- metadata +52 -15
data/lib/cobweb.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler/setup'
|
2
3
|
require 'uri'
|
3
4
|
require 'resque'
|
4
5
|
require "addressable/uri"
|
@@ -9,7 +10,7 @@ Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
|
9
10
|
require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
|
10
11
|
end
|
11
12
|
|
12
|
-
class CobWeb
|
13
|
+
class Cobweb
|
13
14
|
|
14
15
|
## TASKS
|
15
16
|
|
@@ -65,7 +66,7 @@ class CobWeb
|
|
65
66
|
if redis.get(unique_id) and @options[:cache]
|
66
67
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
67
68
|
content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
|
68
|
-
content[:body] = Base64.decode64(content[:body])
|
69
|
+
content[:body] = Base64.decode64(content[:body])
|
69
70
|
|
70
71
|
content
|
71
72
|
else
|
@@ -93,13 +94,25 @@ class CobWeb
|
|
93
94
|
|
94
95
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
95
96
|
puts "redirected... " unless @options[:quiet]
|
97
|
+
|
98
|
+
# get location to redirect to
|
96
99
|
url = absolutize.url(response['location']).to_s
|
100
|
+
|
101
|
+
# decrement redirect limit
|
97
102
|
redirect_limit = redirect_limit - 1
|
103
|
+
|
104
|
+
# raise exception if we're being redirected to somewhere we've been redirected to in this content request
|
105
|
+
#raise RedirectError("Loop detected in redirect for - #{url}") if content[:redirect_through].include? url
|
106
|
+
|
107
|
+
# raise exception if redirect limit has reached 0
|
108
|
+
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
109
|
+
|
110
|
+
# get the content from redirect location
|
98
111
|
content = get(url, redirect_limit)
|
99
112
|
content[:url] = uri.to_s
|
100
113
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
101
114
|
content[:redirect_through].insert(0, url)
|
102
|
-
|
115
|
+
|
103
116
|
content[:response_time] = Time.now.to_f - request_time
|
104
117
|
else
|
105
118
|
content[:response_time] = Time.now.to_f - request_time
|
@@ -109,8 +122,9 @@ class CobWeb
|
|
109
122
|
# create the content container
|
110
123
|
content[:url] = uri.to_s
|
111
124
|
content[:status_code] = response.code.to_i
|
112
|
-
content[:mime_type] =
|
113
|
-
|
125
|
+
content[:mime_type] = ""
|
126
|
+
content[:mime_type] = response.content_type.split(";")[0].strip unless response.content_type.nil?
|
127
|
+
if !response["Content-Type"].nil? && response["Content-Type"].include?(";")
|
114
128
|
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
115
129
|
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
116
130
|
content[:character_set] = charset
|
@@ -130,10 +144,25 @@ class CobWeb
|
|
130
144
|
end
|
131
145
|
# add content to cache if required
|
132
146
|
if @options[:cache]
|
133
|
-
content[:body] = Base64.encode64(content[:body])
|
147
|
+
content[:body] = Base64.encode64(content[:body])
|
134
148
|
redis.set(unique_id, content.to_json)
|
135
149
|
redis.expire unique_id, @options[:cache].to_i
|
136
150
|
end
|
151
|
+
rescue RedirectError => e
|
152
|
+
puts "ERROR: #{e.message}"
|
153
|
+
|
154
|
+
## generate a blank content
|
155
|
+
content = {}
|
156
|
+
content[:url] = uri.to_s
|
157
|
+
content[:response_time] = Time.now.to_f - request_time
|
158
|
+
content[:status_code] = 0
|
159
|
+
content[:length] = 0
|
160
|
+
content[:body] = ""
|
161
|
+
content[:error] = e.message
|
162
|
+
content[:mime_type] = "error/dnslookup"
|
163
|
+
content[:headers] = {}
|
164
|
+
content[:links] = {}
|
165
|
+
|
137
166
|
rescue SocketError => e
|
138
167
|
puts "ERROR: #{e.message}"
|
139
168
|
|
@@ -273,25 +302,7 @@ class CobWeb
|
|
273
302
|
end
|
274
303
|
end
|
275
304
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if key.instance_of? String
|
281
|
-
value = self[key]
|
282
|
-
self.delete(key)
|
283
|
-
self[key.to_sym] = value
|
284
|
-
end
|
285
|
-
end
|
286
|
-
self
|
287
|
-
end
|
288
|
-
def deep_symbolize_keys
|
289
|
-
symbolize_keys
|
290
|
-
keys.each do |key|
|
291
|
-
if self[key].instance_of? Hash
|
292
|
-
self[key].deep_symbolize_keys
|
293
|
-
end
|
294
|
-
end
|
295
|
-
self
|
296
|
-
end
|
297
|
-
end
|
305
|
+
|
306
|
+
|
307
|
+
|
308
|
+
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -1,13 +1,20 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'date'
|
3
|
+
require 'ap'
|
4
|
+
|
1
5
|
class CobwebCrawler
|
2
6
|
|
3
7
|
def initialize(options={})
|
4
8
|
@options = options
|
5
9
|
|
6
10
|
@statistic = {}
|
7
|
-
@queue = []
|
8
|
-
@crawled = []
|
9
11
|
|
10
|
-
@
|
12
|
+
@options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
|
13
|
+
crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
|
14
|
+
|
15
|
+
@redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{crawl_id}")
|
16
|
+
|
17
|
+
@cobweb = Cobweb.new(@options)
|
11
18
|
end
|
12
19
|
|
13
20
|
def crawl(base_url, crawl_options = {}, &block)
|
@@ -17,104 +24,122 @@ class CobwebCrawler
|
|
17
24
|
|
18
25
|
@absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
19
26
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
while !@queue.empty? && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
25
|
-
|
26
|
-
url = @queue.first
|
27
|
-
@options[:url] = url
|
28
|
-
unless @crawled.include?(url) || url =~ /\/(.+?)\/\1\/\1/
|
29
|
-
begin
|
30
|
-
content = @cobweb.get(@options[:url])
|
27
|
+
@redis.sadd "queued", base_url
|
28
|
+
crawl_counter = @redis.scard("crawled").to_i
|
29
|
+
queue_counter = @redis.scard("queued").to_i
|
31
30
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
31
|
+
while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
32
|
+
thread = Thread.new do
|
33
|
+
|
34
|
+
url = @redis.spop "queued"
|
35
|
+
crawl_counter = @redis.scard("crawled").to_i
|
36
|
+
queue_counter = @redis.scard("queued").to_i
|
37
|
+
|
38
|
+
@options[:url] = url
|
39
|
+
unless @redis.sismember("crawled", url.to_s)
|
40
|
+
begin
|
41
|
+
Stats.update_status("Requesting #{url}...")
|
42
|
+
content = @cobweb.get(url)
|
43
|
+
Stats.update_status("Processing #{url}...")
|
44
|
+
|
45
|
+
if @statistic[:average_response_time].nil?
|
46
|
+
@statistic[:average_response_time] = content[:response_time].to_f
|
47
|
+
else
|
48
|
+
@statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
|
49
|
+
end
|
37
50
|
|
38
|
-
|
39
|
-
|
51
|
+
@statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
|
52
|
+
@statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
|
40
53
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
54
|
+
if @statistic[:average_length]
|
55
|
+
@statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
|
56
|
+
else
|
57
|
+
@statistic[:average_length] = content[:length].to_i
|
58
|
+
end
|
46
59
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
52
|
-
@statistic[:page_count] = @statistic[:page_count].to_i + 1
|
53
|
-
@statistic[:page_size] = @statistic[:page_count].to_i + content[:length].to_i
|
54
|
-
else
|
55
|
-
@statistic[:asset_count] = @statistic[:asset_count].to_i + 1
|
56
|
-
@statistic[:asset_size] = @statistic[:asset_count].to_i + content[:length].to_i
|
57
|
-
end
|
60
|
+
@statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
|
61
|
+
@statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
|
62
|
+
@statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
|
58
63
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
if mime_counts.has_key? content[:mime_type]
|
63
|
-
mime_counts[content[:mime_type]] += 1
|
64
|
+
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
65
|
+
@statistic[:page_count] = @statistic[:page_count].to_i + 1
|
66
|
+
@statistic[:page_size] = @statistic[:page_size].to_i + content[:length].to_i
|
64
67
|
else
|
65
|
-
|
68
|
+
@statistic[:asset_count] = @statistic[:asset_count].to_i + 1
|
69
|
+
@statistic[:asset_size] = @statistic[:asset_size].to_i + content[:length].to_i
|
66
70
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
+
|
72
|
+
@statistic[:total_redirects] = 0 if @statistic[:total_redirects].nil?
|
73
|
+
@statistic[:total_redirects] += content[:redirect_through].count unless content[:redirect_through].nil?
|
74
|
+
|
75
|
+
@statistic[:crawl_counter] = crawl_counter
|
76
|
+
@statistic[:queue_counter] = queue_counter
|
77
|
+
|
78
|
+
mime_counts = {}
|
79
|
+
if @statistic.has_key? :mime_counts
|
80
|
+
mime_counts = @statistic[:mime_counts]
|
81
|
+
if mime_counts.has_key? content[:mime_type]
|
82
|
+
mime_counts[content[:mime_type]] += 1
|
83
|
+
else
|
84
|
+
mime_counts[content[:mime_type]] = 1
|
85
|
+
end
|
86
|
+
else
|
87
|
+
mime_counts = {content[:mime_type] => 1}
|
88
|
+
end
|
89
|
+
@statistic[:mime_counts] = mime_counts
|
71
90
|
|
72
|
-
|
91
|
+
status_counts = {}
|
73
92
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
93
|
+
if @statistic.has_key? :status_counts
|
94
|
+
status_counts = @statistic[:status_counts]
|
95
|
+
if status_counts.has_key? content[:status_code].to_i
|
96
|
+
status_counts[content[:status_code].to_i] += 1
|
97
|
+
else
|
98
|
+
status_counts[content[:status_code].to_i] = 1
|
99
|
+
end
|
78
100
|
else
|
79
|
-
status_counts
|
101
|
+
status_counts = {content[:status_code].to_i => 1}
|
80
102
|
end
|
81
|
-
|
82
|
-
status_counts = {content[:status_code].to_i => 1}
|
83
|
-
end
|
84
|
-
@statistic[:status_counts] = status_counts
|
103
|
+
@statistic[:status_counts] = status_counts
|
85
104
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
105
|
+
@redis.sadd "crawled", url.to_s
|
106
|
+
@redis.incr "crawl-counter"
|
107
|
+
|
108
|
+
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |content_link|
|
109
|
+
link = content_link.to_s
|
110
|
+
unless @redis.sismember("crawled", link)
|
111
|
+
puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
|
112
|
+
if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
|
113
|
+
puts "Matched as #{link} as internal" if @options[:debug]
|
114
|
+
unless @redis.sismember("crawled", link) || @redis.sismember("queued", link)
|
115
|
+
puts "Added #{link.to_s} to queue" if @options[:debug]
|
116
|
+
@redis.sadd "queued", link
|
117
|
+
crawl_counter = @redis.scard("crawled").to_i
|
118
|
+
queue_counter = @redis.scard("queued").to_i
|
119
|
+
end
|
97
120
|
end
|
98
121
|
end
|
99
122
|
end
|
100
|
-
end
|
101
|
-
@queue.uniq!
|
102
123
|
|
103
|
-
|
104
|
-
|
105
|
-
|
124
|
+
crawl_counter = @redis.scard("crawled").to_i
|
125
|
+
queue_counter = @redis.scard("queued").to_i
|
126
|
+
Stats.update_statistics(@statistic)
|
127
|
+
Stats.update_status("Completed #{url}.")
|
128
|
+
puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @options[:debug]
|
129
|
+
|
130
|
+
yield content, @statistic if block_given?
|
106
131
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
132
|
+
rescue => e
|
133
|
+
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
134
|
+
ap e
|
135
|
+
ap e.backtrace
|
136
|
+
end
|
137
|
+
else
|
138
|
+
puts "Already crawled #{@options[:url]}" if @options[:debug]
|
112
139
|
end
|
113
|
-
else
|
114
|
-
puts "Already crawled #{@options[:url]}" if @options[:debug]
|
115
140
|
end
|
141
|
+
thread.join
|
116
142
|
end
|
117
143
|
@statistic
|
118
144
|
end
|
119
|
-
|
120
|
-
end
|
145
|
+
end
|
data/lib/crawl_job.rb
CHANGED
@@ -40,7 +40,7 @@ class CrawlJob
|
|
40
40
|
redis.incr "crawl-counter"
|
41
41
|
crawl_counter += 1
|
42
42
|
if crawl_counter <= content_request[:crawl_limit].to_i
|
43
|
-
content =
|
43
|
+
content = Cobweb.new(content_request).get(content_request[:url])
|
44
44
|
|
45
45
|
## update statistics
|
46
46
|
if redis.hexists "statistics", "average_response_time"
|
data/lib/hash.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
## add symbolize methods to hash
|
2
|
+
class Hash
|
3
|
+
def symbolize_keys
|
4
|
+
keys.each do |key|
|
5
|
+
if key.instance_of? String
|
6
|
+
value = self[key]
|
7
|
+
self.delete(key)
|
8
|
+
self[key.to_sym] = value
|
9
|
+
end
|
10
|
+
end
|
11
|
+
self
|
12
|
+
end
|
13
|
+
def deep_symbolize_keys
|
14
|
+
symbolize_keys
|
15
|
+
keys.each do |key|
|
16
|
+
if self[key].instance_of? Hash
|
17
|
+
self[key].deep_symbolize_keys
|
18
|
+
end
|
19
|
+
end
|
20
|
+
self
|
21
|
+
end
|
22
|
+
end
|
data/lib/namespaced_redis.rb
CHANGED
@@ -17,8 +17,16 @@ class NamespacedRedis
|
|
17
17
|
@redis.srem namespaced(key), member
|
18
18
|
end
|
19
19
|
|
20
|
+
def spop(key)
|
21
|
+
@redis.spop namespaced(key)
|
22
|
+
end
|
23
|
+
|
20
24
|
def smembers(key)
|
21
25
|
@redis.smembers namespaced(key)
|
26
|
+
end
|
27
|
+
|
28
|
+
def scard(key)
|
29
|
+
@redis.scard namespaced(key)
|
22
30
|
end
|
23
31
|
|
24
32
|
def get(key)
|
@@ -29,6 +37,10 @@ class NamespacedRedis
|
|
29
37
|
@redis.incr namespaced(key)
|
30
38
|
end
|
31
39
|
|
40
|
+
def decr(key)
|
41
|
+
@redis.decr namespaced(key)
|
42
|
+
end
|
43
|
+
|
32
44
|
def exist(key)
|
33
45
|
@redis.exist namespaced(key)
|
34
46
|
end
|
data/lib/stats.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'sinatra'
|
3
|
+
require 'haml'
|
4
|
+
|
5
|
+
class Stats < Sinatra::Base
|
6
|
+
|
7
|
+
def self.update_statistics(statistics)
|
8
|
+
@@statistics = statistics
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.update_status(status)
|
12
|
+
@@status = status
|
13
|
+
end
|
14
|
+
|
15
|
+
ap settings.root
|
16
|
+
set :views, settings.root + '/../views'
|
17
|
+
|
18
|
+
get '/' do
|
19
|
+
@statistics = @@statistics
|
20
|
+
@status = @@status
|
21
|
+
haml :statistics
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
thread = Thread.new do
|
27
|
+
Stats.run!
|
28
|
+
|
29
|
+
## we need to manually kill the main thread as sinatra traps the interrupts
|
30
|
+
Thread.main.kill
|
31
|
+
end
|
32
|
+
|
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -31,7 +31,7 @@ describe CobwebCrawler do
|
|
31
31
|
|
32
32
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
33
33
|
|
34
|
-
statistics = crawler.crawl("http://
|
34
|
+
statistics = crawler.crawl("http://rockwellcottage.heroku.com/")
|
35
35
|
|
36
36
|
ap statistics
|
37
37
|
|
@@ -43,8 +43,9 @@ describe CobwebCrawler do
|
|
43
43
|
|
44
44
|
crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
|
45
45
|
|
46
|
-
statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content|
|
46
|
+
statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content, statistics|
|
47
47
|
ap content[:url]
|
48
|
+
ap statistics[:average_length]
|
48
49
|
end
|
49
50
|
|
50
51
|
ap statistics
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
2
|
|
3
|
-
describe CobWeb do
|
3
|
+
describe Cobweb do
|
4
4
|
|
5
5
|
before(:each) do
|
6
6
|
|
@@ -15,7 +15,7 @@ describe CobWeb do
|
|
15
15
|
"Server" => "gws",
|
16
16
|
"X-XSS-Protection" => "1; mode=block"}
|
17
17
|
|
18
|
-
@cobweb =
|
18
|
+
@cobweb = Cobweb.new :quiet => true, :cache => nil
|
19
19
|
end
|
20
20
|
|
21
21
|
describe "with mock" do
|
@@ -40,7 +40,9 @@ describe CobWeb do
|
|
40
40
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
41
41
|
@mock_http_client.stub!(:read_timeout=).and_return(nil)
|
42
42
|
@mock_http_client.stub!(:open_timeout=).and_return(nil)
|
43
|
-
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
43
|
+
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
44
|
+
@mock_http_client.stub!(:address).and_return("www.baseurl.com")
|
45
|
+
@mock_http_client.stub!(:port).and_return("80 ")
|
44
46
|
|
45
47
|
@mock_http_response.stub!(:code).and_return(200)
|
46
48
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
@@ -69,7 +71,7 @@ describe CobWeb do
|
|
69
71
|
end
|
70
72
|
|
71
73
|
it "should generate a cobweb object" do
|
72
|
-
|
74
|
+
Cobweb.new.should be_an_instance_of Cobweb
|
73
75
|
end
|
74
76
|
|
75
77
|
describe "get" do
|
@@ -130,7 +132,7 @@ describe CobWeb do
|
|
130
132
|
|
131
133
|
before(:each) do
|
132
134
|
@base_url = "http://redirect-me.com/redirect.html"
|
133
|
-
@cobweb =
|
135
|
+
@cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
134
136
|
end
|
135
137
|
|
136
138
|
it "should flow through redirect" #do
|
@@ -155,8 +157,8 @@ describe CobWeb do
|
|
155
157
|
|
156
158
|
#end
|
157
159
|
it "should not follow with redirect disabled" do
|
158
|
-
@cobweb =
|
159
|
-
@mock_http_client.should_receive(:
|
160
|
+
@cobweb = Cobweb.new(:follow_redirects => false, :cache => nil)
|
161
|
+
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
160
162
|
|
161
163
|
content = @cobweb.get(@base_url)
|
162
164
|
content[:url].should == "http://redirect-me.com/redirect.html"
|
data/views/statistics.haml
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
%h1 Cobweb Statistics
|
2
|
+
|
3
|
+
%h4= @status
|
4
|
+
|
5
|
+
%table
|
6
|
+
%tr
|
7
|
+
%th Page Count
|
8
|
+
%th Asset Count
|
9
|
+
%th Redirect Count
|
10
|
+
%tr
|
11
|
+
%td= @statistics[:page_count]
|
12
|
+
%td= @statistics[:asset_count]
|
13
|
+
%td= @statistics[:total_redirects]
|
14
|
+
|
15
|
+
%table
|
16
|
+
%tr
|
17
|
+
%th Total Page Size
|
18
|
+
%th Total Asset Size
|
19
|
+
%tr
|
20
|
+
%td= @statistics[:page_size]
|
21
|
+
%td= @statistics[:asset_size]
|
22
|
+
|
23
|
+
%table
|
24
|
+
%tr
|
25
|
+
%th Crawled
|
26
|
+
%th Queued
|
27
|
+
%tr
|
28
|
+
%td= @statistics[:crawl_counter]
|
29
|
+
%td= @statistics[:queue_counter]
|
30
|
+
|
31
|
+
|
32
|
+
%table
|
33
|
+
%tr
|
34
|
+
%th{:colspan => 2} Response Times
|
35
|
+
%tr
|
36
|
+
%th Average
|
37
|
+
%td= @statistics[:average_response_time]
|
38
|
+
%tr
|
39
|
+
%th Maximum
|
40
|
+
%td= @statistics[:maximum_response_time]
|
41
|
+
%tr
|
42
|
+
%th Minimum
|
43
|
+
%td= @statistics[:minimum_response_time]
|
44
|
+
|
45
|
+
%table
|
46
|
+
%tr
|
47
|
+
%th{:colspan => 2} Content Sizes
|
48
|
+
%tr
|
49
|
+
%th Average
|
50
|
+
%td= @statistics[:average_length]
|
51
|
+
%tr
|
52
|
+
%th Maximum
|
53
|
+
%td= @statistics[:maximum_length]
|
54
|
+
%tr
|
55
|
+
%th Minimum
|
56
|
+
%td= @statistics[:minimum_length]
|
57
|
+
%tr
|
58
|
+
%th Total
|
59
|
+
%td= @statistics[:total_length]
|
60
|
+
|
61
|
+
%table
|
62
|
+
- @statistics[:mime_counts].keys.each do |mime_type|
|
63
|
+
%tr
|
64
|
+
%td= mime_type
|
65
|
+
%td= @statistics[:mime_counts][mime_type]
|
66
|
+
|
67
|
+
%table
|
68
|
+
- @statistics[:status_counts].keys.each do |status|
|
69
|
+
%tr
|
70
|
+
%td= status
|
71
|
+
%td= @statistics[:status_counts][status]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.13
|
4
|
+
version: 0.0.17
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70173021660540 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70173021660540
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70173021659920 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70173021659920
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: absolutize
|
38
|
-
requirement: &
|
38
|
+
requirement: &70173021659420 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70173021659420
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70173021658760 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70173021658760
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: addressable
|
60
|
-
requirement: &
|
60
|
+
requirement: &70173021658080 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70173021658080
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
|
-
requirement: &
|
71
|
+
requirement: &70173021657320 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,7 +76,40 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70173021657320
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: awesome_print
|
82
|
+
requirement: &70173021654680 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
type: :runtime
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *70173021654680
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: sinatra
|
93
|
+
requirement: &70173021654060 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '0'
|
99
|
+
type: :runtime
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *70173021654060
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: thin
|
104
|
+
requirement: &70173021653560 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *70173021653560
|
80
113
|
description:
|
81
114
|
email: stewart@rockwellcottage.com
|
82
115
|
executables: []
|
@@ -96,7 +129,11 @@ files:
|
|
96
129
|
- lib/cobweb_process_job.rb
|
97
130
|
- lib/content_link_parser.rb
|
98
131
|
- lib/crawl_job.rb
|
132
|
+
- lib/hash.rb
|
99
133
|
- lib/namespaced_redis.rb
|
134
|
+
- lib/redirect_error.rb
|
135
|
+
- lib/stats.rb
|
136
|
+
- views/statistics.haml
|
100
137
|
- README.textile
|
101
138
|
homepage: http://github.com/stewartmckee/cobweb
|
102
139
|
licenses: []
|
@@ -118,7 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
155
|
version: '0'
|
119
156
|
requirements: []
|
120
157
|
rubyforge_project:
|
121
|
-
rubygems_version: 1.8.
|
158
|
+
rubygems_version: 1.8.10
|
122
159
|
signing_key:
|
123
160
|
specification_version: 3
|
124
161
|
summary: Web Crawler that uses resque background job engine to allow you to cluster
|