cobweb 0.0.65 → 0.0.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +2 -1
- data/lib/cobweb.rb +1 -1
- data/lib/cobweb_crawler.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl_job.rb +22 -21
- data/lib/server.rb +45 -15
- data/lib/stats.rb +0 -1
- data/spec/cobweb/cobweb_job_spec.rb +57 -59
- data/views/home.haml +2 -0
- data/views/statistics.haml +3 -3
- metadata +22 -22
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -45,7 +45,7 @@ class Cobweb
|
|
45
45
|
default_first_page_redirect_internal_to true
|
46
46
|
default_text_mime_types_to ["text/*", "application/xhtml+xml"]
|
47
47
|
default_obey_robots_to false
|
48
|
-
default_user_agent_to "cobweb"
|
48
|
+
default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
|
49
49
|
|
50
50
|
end
|
51
51
|
|
data/lib/cobweb_crawler.rb
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -24,12 +24,11 @@ class CrawlJob
|
|
24
24
|
|
25
25
|
@debug = content_request[:debug]
|
26
26
|
|
27
|
-
decrement_queue_counter
|
28
|
-
|
29
27
|
# check we haven't crawled this url before
|
30
28
|
unless @redis.sismember "crawled", content_request[:url]
|
31
29
|
# if there is no limit or we're still under it lets get the url
|
32
30
|
if within_crawl_limits?(content_request[:crawl_limit])
|
31
|
+
puts "cbpl: #{content_request[:url]}" if content_request[:crawl_limit_by_page]
|
33
32
|
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
34
33
|
if content_request[:url] == @redis.get("original_base_url")
|
35
34
|
@redis.set("crawled_base_url", content[:base_url])
|
@@ -78,7 +77,7 @@ class CrawlJob
|
|
78
77
|
send_to_processing_queue(content, content_request)
|
79
78
|
|
80
79
|
#if the enqueue counter has been requested update that
|
81
|
-
if content_request.has_key?
|
80
|
+
if content_request.has_key?(:enqueue_counter_key)
|
82
81
|
enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
|
83
82
|
current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
|
84
83
|
enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
|
@@ -89,14 +88,16 @@ class CrawlJob
|
|
89
88
|
|
90
89
|
# update the queue and crawl counts -- doing this very late in the piece so that the following transaction all occurs at once.
|
91
90
|
# really we should do this with a lock https://github.com/PatrickTulskie/redis-lock
|
91
|
+
# stewart: i'm looking at the layout of this, think that there is scope for cleaning up the perform method to be more DRY.
|
92
92
|
if content_request[:crawl_limit_by_page]
|
93
93
|
if content[:mime_type].match("text/html")
|
94
94
|
increment_crawl_counter
|
95
|
+
ap "clbp: #{crawl_counter}"
|
95
96
|
end
|
96
97
|
else
|
97
98
|
increment_crawl_counter
|
98
99
|
end
|
99
|
-
puts "Crawled: #{
|
100
|
+
puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter} In Progress: #{crawl_started_counter-crawl_counter}" if @debug
|
100
101
|
end
|
101
102
|
else
|
102
103
|
puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
|
@@ -110,13 +111,15 @@ class CrawlJob
|
|
110
111
|
puts "Already crawled #{content_request[:url]}" if content_request[:debug]
|
111
112
|
end
|
112
113
|
|
114
|
+
decrement_queue_counter
|
115
|
+
puts content_request[:crawl_limit]
|
116
|
+
print_counters
|
113
117
|
# if there's nothing left queued or the crawled limit has been reached
|
114
|
-
refresh_counters
|
115
118
|
if content_request[:crawl_limit].nil? || content_request[:crawl_limit] == 0
|
116
|
-
if
|
119
|
+
if queue_counter + crawl_started_counter - crawl_counter == 0
|
117
120
|
finished(content_request)
|
118
121
|
end
|
119
|
-
elsif (
|
122
|
+
elsif (queue_counter+crawl_started_counter-crawl_counter)== 0 || crawl_counter >= content_request[:crawl_limit].to_i
|
120
123
|
finished(content_request)
|
121
124
|
end
|
122
125
|
|
@@ -167,14 +170,12 @@ class CrawlJob
|
|
167
170
|
|
168
171
|
# Returns true if the crawl count is within limits
|
169
172
|
def self.within_crawl_limits?(crawl_limit)
|
170
|
-
|
171
|
-
crawl_limit.nil? or @crawl_started_counter < crawl_limit.to_i
|
173
|
+
crawl_limit.nil? or crawl_counter < crawl_limit.to_i
|
172
174
|
end
|
173
175
|
|
174
176
|
# Returns true if the queue count is calculated to be still within limits when complete
|
175
177
|
def self.within_queue_limits?(crawl_limit)
|
176
|
-
|
177
|
-
(@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or @crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (@queue_counter + @crawl_counter) < crawl_limit.to_i)
|
178
|
+
(@content_request[:crawl_limit_by_page]&& (crawl_limit.nil? or crawl_counter < crawl_limit.to_i)) || within_crawl_limits?(crawl_limit) && (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
|
178
179
|
end
|
179
180
|
|
180
181
|
# Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
|
@@ -201,27 +202,27 @@ class CrawlJob
|
|
201
202
|
# Increments the queue counter and refreshes crawl counters
|
202
203
|
def self.increment_queue_counter
|
203
204
|
@redis.incr "queue-counter"
|
204
|
-
refresh_counters
|
205
205
|
end
|
206
206
|
# Increments the crawl counter and refreshes crawl counters
|
207
207
|
def self.increment_crawl_counter
|
208
208
|
@redis.incr "crawl-counter"
|
209
|
-
refresh_counters
|
210
209
|
end
|
211
210
|
def self.increment_crawl_started_counter
|
212
211
|
@redis.incr "crawl-started-counter"
|
213
|
-
refresh_counters
|
214
212
|
end
|
215
213
|
# Decrements the queue counter and refreshes crawl counters
|
216
214
|
def self.decrement_queue_counter
|
217
215
|
@redis.decr "queue-counter"
|
218
|
-
refresh_counters
|
219
216
|
end
|
220
|
-
|
221
|
-
def self.
|
222
|
-
@
|
223
|
-
|
224
|
-
|
217
|
+
|
218
|
+
def self.crawl_counter
|
219
|
+
@redis.get("crawl-counter").to_i
|
220
|
+
end
|
221
|
+
def self.crawl_started_counter
|
222
|
+
@redis.get("crawl-started-counter").to_i
|
223
|
+
end
|
224
|
+
def self.queue_counter
|
225
|
+
@redis.get("queue-counter").to_i
|
225
226
|
end
|
226
227
|
|
227
228
|
def self.print_counters
|
@@ -229,7 +230,7 @@ class CrawlJob
|
|
229
230
|
end
|
230
231
|
|
231
232
|
def self.counters
|
232
|
-
"
|
233
|
+
"crawl_counter: #{crawl_counter} crawl_started_counter: #{crawl_started_counter} queue_counter: #{queue_counter}"
|
233
234
|
end
|
234
235
|
|
235
236
|
end
|
data/lib/server.rb
CHANGED
@@ -10,17 +10,18 @@ class Server < Sinatra::Base
|
|
10
10
|
|
11
11
|
# Sinatra Dashboard
|
12
12
|
get '/' do
|
13
|
-
@full_redis = Redis.new
|
14
|
-
|
13
|
+
@full_redis = Redis.new(redis_options)
|
15
14
|
@colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
|
16
15
|
|
17
16
|
@crawls = []
|
18
|
-
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
19
|
-
|
17
|
+
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
18
|
+
version = cobweb_version(crawl_id)
|
19
|
+
redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{crawl_id}")
|
20
20
|
stats = HashUtil.deep_symbolize_keys({
|
21
|
-
:
|
21
|
+
:cobweb_version => version,
|
22
|
+
:crawl_details => redis.hgetall("crawl_details"),
|
22
23
|
:statistics => redis.hgetall("statistics"),
|
23
|
-
:minute_totals => redis.hgetall("minute_totals")
|
24
|
+
:minute_totals => redis.hgetall("minute_totals"),
|
24
25
|
})
|
25
26
|
@crawls << stats
|
26
27
|
end
|
@@ -30,7 +31,9 @@ class Server < Sinatra::Base
|
|
30
31
|
|
31
32
|
# Sinatra Crawl Detail
|
32
33
|
get '/statistics/:crawl_id' do
|
33
|
-
|
34
|
+
|
35
|
+
version = cobweb_version(params[:crawl_id])
|
36
|
+
redis = NamespacedRedis.new(redis_options, "cobweb-#{version}-#{params[:crawl_id]}")
|
34
37
|
|
35
38
|
@statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
|
36
39
|
if @statistics[:status_counts].nil?
|
@@ -44,6 +47,7 @@ class Server < Sinatra::Base
|
|
44
47
|
@statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
|
45
48
|
end
|
46
49
|
@crawl = {
|
50
|
+
:cobweb_version => version,
|
47
51
|
:statistics => @statistics,
|
48
52
|
:crawl_details => HashUtil.deep_symbolize_keys(redis.hgetall("crawl_details")),
|
49
53
|
:minute_totals => HashUtil.deep_symbolize_keys(redis.hgetall("minute_totals")),
|
@@ -56,22 +60,49 @@ class Server < Sinatra::Base
|
|
56
60
|
:pages_count => HashUtil.deep_symbolize_keys(redis.hgetall("pages_count")),
|
57
61
|
:assets_count => HashUtil.deep_symbolize_keys(redis.hgetall("assets_count"))
|
58
62
|
}
|
59
|
-
|
63
|
+
(1..30).each do |minutes|
|
64
|
+
date = (DateTime.now.new_offset(0) - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
65
|
+
end
|
66
|
+
|
60
67
|
haml :statistics
|
61
68
|
end
|
62
69
|
|
70
|
+
def cobweb_version(crawl_id)
|
71
|
+
redis = Redis.new(redis_options)
|
72
|
+
key = redis.keys("cobweb-*-#{crawl_id}-crawl_details").first
|
73
|
+
key =~ /cobweb-(.*?)-(.*?)-crawl_details/
|
74
|
+
cobweb_version = $1
|
75
|
+
end
|
76
|
+
|
77
|
+
def redis_options
|
78
|
+
Server.cobweb_options[:redis_options]
|
79
|
+
end
|
80
|
+
|
63
81
|
# Starts the Sinatra server, and kills the processes when shutdown
|
64
|
-
def self.start
|
82
|
+
def self.start(options={})
|
83
|
+
@options = options
|
84
|
+
@options[:redis_options] = {} unless @options.has_key? :redis_options
|
85
|
+
ap @options
|
65
86
|
unless Server.running?
|
66
|
-
|
67
|
-
puts "Starting Sinatra"
|
87
|
+
if @options[:run_as_server]
|
88
|
+
puts "Starting Sinatra for cobweb v#{Cobweb.version}"
|
68
89
|
Server.run!
|
69
90
|
puts "Stopping crawl..."
|
70
|
-
|
71
|
-
Thread.
|
91
|
+
else
|
92
|
+
thread = Thread.new do
|
93
|
+
puts "Starting Sinatra"
|
94
|
+
Server.run!
|
95
|
+
puts "Stopping crawl..."
|
96
|
+
## we need to manually kill the main thread as sinatra traps the interrupts
|
97
|
+
Thread.main.kill
|
98
|
+
end
|
72
99
|
end
|
73
100
|
end
|
74
|
-
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.cobweb_options
|
104
|
+
@options
|
105
|
+
end
|
75
106
|
|
76
107
|
end
|
77
108
|
|
@@ -81,7 +112,6 @@ class Numeric
|
|
81
112
|
#Returns a human readable format for a number representing a data size
|
82
113
|
def to_human
|
83
114
|
units = %w{B KB MB GB TB}
|
84
|
-
ap self
|
85
115
|
e = 0
|
86
116
|
e = (Math.log(self)/Math.log(1024)).floor unless self==0
|
87
117
|
s = "%.3f" % (to_f / 1024**e)
|
data/lib/stats.rb
CHANGED
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
|
|
9
9
|
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
10
10
|
puts "Starting Workers... Please Wait..."
|
11
11
|
`mkdir log`
|
12
|
-
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=
|
12
|
+
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=1 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
13
|
puts "Workers Started."
|
14
14
|
|
15
15
|
end
|
@@ -17,61 +17,58 @@ describe Cobweb, :local_only => true do
|
|
17
17
|
before(:each) do
|
18
18
|
@base_url = "http://localhost:3532/"
|
19
19
|
@base_page_count = 77
|
20
|
-
|
21
20
|
clear_queues
|
22
21
|
end
|
23
|
-
|
22
|
+
|
24
23
|
describe "with no crawl limit" do
|
25
24
|
before(:each) do
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
25
|
+
@request = {
|
26
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
27
|
+
:crawl_limit => nil,
|
28
|
+
:quiet => false,
|
29
|
+
:debug => false,
|
30
|
+
:cache => nil
|
31
|
+
}
|
32
|
+
@cobweb = Cobweb.new @request
|
34
33
|
end
|
35
|
-
|
34
|
+
|
36
35
|
it "should crawl entire site" do
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
crawl = @cobweb.start(@base_url)
|
37
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
38
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
39
|
+
Resque.size("cobweb_process_job").should == @base_page_count
|
41
40
|
end
|
42
|
-
it "detect crawl finished" do
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
41
|
+
it "detect crawl finished once" do
|
42
|
+
crawl = @cobweb.start(@base_url)
|
43
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
44
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
45
|
+
Resque.size("cobweb_finished_job").should == 1
|
47
46
|
end
|
48
47
|
end
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
end
|
48
|
+
describe "with limited mime_types" do
|
49
|
+
before(:each) do
|
50
|
+
@request = {
|
51
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
52
|
+
:quiet => true,
|
53
|
+
:cache => nil,
|
54
|
+
:valid_mime_types => ["text/html"]
|
55
|
+
}
|
56
|
+
@cobweb = Cobweb.new @request
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should only crawl html pages" do
|
60
|
+
crawl = @cobweb.start(@base_url)
|
61
|
+
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
62
|
+
wait_for_crawl_finished crawl[:crawl_id]
|
63
|
+
Resque.size("cobweb_process_job").should == 8
|
64
|
+
|
65
|
+
mime_types = Resque.peek("cobweb_process_job", 0, 100).map{|job| job["args"][0]["mime_type"]}
|
66
|
+
mime_types.count.should == 8
|
67
|
+
mime_types.map{|m| m.should == "text/html"}
|
68
|
+
mime_types.select{|m| m=="text/html"}.count.should == 8
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
75
72
|
describe "with a crawl limit" do
|
76
73
|
before(:each) do
|
77
74
|
@request = {
|
@@ -86,7 +83,7 @@ describe Cobweb, :local_only => true do
|
|
86
83
|
@request[:crawl_limit] = 1
|
87
84
|
@cobweb = Cobweb.new @request
|
88
85
|
end
|
89
|
-
|
86
|
+
|
90
87
|
it "should not crawl the entire site" do
|
91
88
|
crawl = @cobweb.start(@base_url)
|
92
89
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
@@ -99,13 +96,12 @@ describe Cobweb, :local_only => true do
|
|
99
96
|
wait_for_crawl_finished crawl[:crawl_id]
|
100
97
|
Resque.size("cobweb_process_job").should == 1
|
101
98
|
end
|
102
|
-
it "should notify of crawl finished" do
|
99
|
+
it "should notify of crawl finished once" do
|
103
100
|
crawl = @cobweb.start(@base_url)
|
104
101
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
105
102
|
wait_for_crawl_finished crawl[:crawl_id]
|
106
103
|
Resque.size("cobweb_finished_job").should == 1
|
107
|
-
end
|
108
|
-
|
104
|
+
end
|
109
105
|
end
|
110
106
|
|
111
107
|
describe "for pages only" do
|
@@ -114,17 +110,17 @@ describe Cobweb, :local_only => true do
|
|
114
110
|
@request[:crawl_limit] = 5
|
115
111
|
@cobweb = Cobweb.new @request
|
116
112
|
end
|
117
|
-
|
113
|
+
|
118
114
|
it "should only use html pages towards the crawl limit" do
|
119
115
|
crawl = @cobweb.start(@base_url)
|
120
116
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
121
117
|
wait_for_crawl_finished crawl[:crawl_id]
|
122
118
|
mime_types = Resque.peek("cobweb_process_job", 0, 200).map{|job| job["args"][0]["mime_type"]}
|
123
|
-
|
119
|
+
Resque.peek("cobweb_process_job", 0, 200).count.should > 5
|
124
120
|
mime_types.select{|m| m=="text/html"}.count.should == 5
|
125
121
|
end
|
126
122
|
end
|
127
|
-
|
123
|
+
|
128
124
|
describe "limit to 10" do
|
129
125
|
before(:each) do
|
130
126
|
@request[:crawl_limit] = 10
|
@@ -137,7 +133,7 @@ describe Cobweb, :local_only => true do
|
|
137
133
|
wait_for_crawl_finished crawl[:crawl_id]
|
138
134
|
Resque.size("cobweb_process_job").should_not == @base_page_count
|
139
135
|
end
|
140
|
-
it "should notify of crawl finished" do
|
136
|
+
it "should notify of crawl finished once" do
|
141
137
|
crawl = @cobweb.start(@base_url)
|
142
138
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
143
139
|
wait_for_crawl_finished crawl[:crawl_id]
|
@@ -150,7 +146,7 @@ describe Cobweb, :local_only => true do
|
|
150
146
|
Resque.size("cobweb_process_job").should == 10
|
151
147
|
end
|
152
148
|
end
|
153
|
-
|
149
|
+
|
154
150
|
describe "limit to 100" do
|
155
151
|
before(:each) do
|
156
152
|
@request[:crawl_limit] = 100
|
@@ -163,7 +159,7 @@ describe Cobweb, :local_only => true do
|
|
163
159
|
wait_for_crawl_finished crawl[:crawl_id]
|
164
160
|
Resque.size("cobweb_process_job").should == @base_page_count
|
165
161
|
end
|
166
|
-
it "should notify of crawl finished" do
|
162
|
+
it "should notify of crawl finished once" do
|
167
163
|
crawl = @cobweb.start(@base_url)
|
168
164
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
169
165
|
wait_for_crawl_finished crawl[:crawl_id]
|
@@ -175,12 +171,13 @@ describe Cobweb, :local_only => true do
|
|
175
171
|
wait_for_crawl_finished crawl[:crawl_id]
|
176
172
|
Resque.size("cobweb_process_job").should_not == 100
|
177
173
|
end
|
178
|
-
end
|
174
|
+
end
|
179
175
|
end
|
180
176
|
|
181
177
|
after(:all) do
|
178
|
+
|
182
179
|
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
183
|
-
command = "kill #{(@all_processes - @existing_processes).join(" ")}"
|
180
|
+
command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
|
184
181
|
IO.popen(command)
|
185
182
|
|
186
183
|
clear_queues
|
@@ -210,6 +207,7 @@ def clear_queues
|
|
210
207
|
|
211
208
|
Resque.size("cobweb_process_job").should == 0
|
212
209
|
Resque.size("cobweb_finished_job").should == 0
|
210
|
+
Resque.peek("cobweb_process_job", 0, 200).should be_empty
|
213
211
|
end
|
214
212
|
|
215
213
|
|
data/views/home.haml
CHANGED
@@ -40,6 +40,7 @@
|
|
40
40
|
%thead
|
41
41
|
%tr
|
42
42
|
%th Base URL
|
43
|
+
%th Cobweb Version
|
43
44
|
%th Total Pages
|
44
45
|
%th Total Assets
|
45
46
|
%th Queued Objects
|
@@ -48,6 +49,7 @@
|
|
48
49
|
%tr
|
49
50
|
%td
|
50
51
|
%a{:href => "/statistics/#{crawl[:crawl_details][:crawl_id]}"}= crawl[:crawl_details][:base_url]
|
52
|
+
%td= crawl[:cobweb_version]
|
51
53
|
%td= crawl[:statistics][:page_count]
|
52
54
|
%td= crawl[:statistics][:asset_count]
|
53
55
|
%td= crawl[:statistics][:queue_counter]
|
data/views/statistics.haml
CHANGED
@@ -40,7 +40,7 @@
|
|
40
40
|
|
41
41
|
%tbody
|
42
42
|
- (1..30).each do |minutes|
|
43
|
-
- date = (DateTime.now - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
43
|
+
- date = (DateTime.now.new_offset(0) - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
44
44
|
%tr
|
45
45
|
%th= minutes
|
46
46
|
%td= @crawl[:pages_count][date]
|
@@ -88,7 +88,7 @@
|
|
88
88
|
|
89
89
|
%tbody
|
90
90
|
- (1..30).each do |minutes|
|
91
|
-
- date = (DateTime.now - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
91
|
+
- date = (DateTime.now.new_offset(0) - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
92
92
|
%tr
|
93
93
|
%th= minutes
|
94
94
|
%td= @crawl[:status_200_count][date]
|
@@ -113,7 +113,7 @@
|
|
113
113
|
|
114
114
|
%tbody
|
115
115
|
- (1..30).each do |minutes|
|
116
|
-
- date = (DateTime.now - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
116
|
+
- date = (DateTime.now.new_offset(0) - (minutes/1440.0)).strftime("%Y-%m-%d %H:%M").to_sym
|
117
117
|
%tr
|
118
118
|
%th= minutes
|
119
119
|
%td= @crawl[:mime_text_count][date]
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.66
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70145280967560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70145280967560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70145280966480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70145280966480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70145280965880 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70145280965880
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70145280964660 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70145280964660
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70145280964040 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70145280964040
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70145280963260 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70145280963260
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70145280962560 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70145280962560
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70145280961780 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70145280961780
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70145280960840 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70145280960840
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70145280960100 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70145280960100
|
124
124
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
125
125
|
crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
|
126
126
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|