cobweb 0.0.74 → 0.0.75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +1 -0
- data/lib/crawl.rb +110 -68
- data/lib/crawl_job.rb +36 -26
- data/spec/cobweb/cobweb_job_spec.rb +36 -15
- metadata +24 -24
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -124,7 +124,7 @@ class Cobweb
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
-        puts "Creating connection to #{uri.host}..."
+        puts "Creating connection to #{uri.host}..." if @options[:debug]
        @http = Net::HTTP.new(uri.host, uri.inferred_port)
      end
      if uri.scheme == "https"
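
The one functional change in cobweb.rb gates the per-connection log line behind the :debug option, so a normal crawl no longer prints a message each time a new Net::HTTP connection is opened. A minimal sketch of the resulting behaviour, assuming a reachable URL; the :debug and :cache option names are taken from the diffs in this release:

    require 'cobweb'

    # Default options: the "Creating connection to ..." message is now suppressed.
    Cobweb.new(:cache => nil).get("http://example.com/")

    # With :debug => true the message is printed again whenever a new
    # connection is created, as it was unconditionally in 0.0.74.
    Cobweb.new(:cache => nil, :debug => true).get("http://example.com/")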
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
data/lib/crawl.rb
CHANGED
[Note: from here on, removed lines that the diff viewer truncated or did not render are reproduced as shown, or collapsed to a bracketed placeholder; removed/added pairs with identical text differ only in trailing whitespace.]
@@ -1,26 +1,26 @@
 module CobwebModule
   class Crawl
-
+
     def initialize(options={})
       @options = HashUtil.deep_symbolize_keys(options)
-
+
       setup_defaults
       @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", Redis.new(@options[:redis_options]))
       @stats = Stats.new(@options)
       @debug = @options[:debug]
       @first_to_finish = false
-
+
     end
-
+
     # Returns true if the url requested is already in the crawled queue
     def already_crawled?(link=@options[:url])
-      @redis.sismember "crawled", link
+      @redis.sismember "crawled", link
     end
-
+
     def already_queued?(link)
       @redis.sismember "queued", link
     end
-
+
     # Returns true if the crawl count is within limits
     def within_crawl_limits?
       @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -33,51 +33,60 @@ module CobwebModule
 
     # Returns true if the queue count is calculated to be still within limits when complete
     def within_queue_limits?
-
+
       # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
       if @options[:crawl_limit_by_page]
         return true
-
-
+
+      # if a crawl limit is set, limit queue size to crawled + queue
       elsif @options[:crawl_limit].to_i > 0
         (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
-
-
+
+      # no crawl limit set so always within queue limit
       else
         true
       end
     end
-
+
     def retrieve
-      [old lines 52-65: previous body of retrieve, not rendered in this view]
+      lock("retrieve") do
+        unless @redis.sismember("currently_running", @options[:url])
+          @redis.sadd("currently_running", @options[:url])
+          unless already_crawled?
+            if within_crawl_limits?
+              @stats.update_status("Retrieving #{@options[:url]}...")
+              @content = Cobweb.new(@options).get(@options[:url], @options)
+              if @options[:url] == @redis.get("original_base_url")
+                @redis.set("crawled_base_url", @content[:base_url])
+              end
+              update_queues
+
+              if content.permitted_type?
+                ## update statistics
+
+                @stats.update_statistics(@content)
+                return true
+              end
+            else
+              decrement_queue_counter
+            end
+          else
+            decrement_queue_counter
           end
         else
+          debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
+          debug_ap @redis.smembers("currently_running")
           decrement_queue_counter
         end
-
-      decrement_queue_counter
+        false
       end
-      false
     end
-
+
     def process_links &block
-
+
       # set the base url if this is the first page
       set_base_url @redis
-
+
       @cobweb_links = CobwebLinks.new(@options)
       if within_queue_limits?
         internal_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
@@ -99,20 +108,20 @@ module CobwebModule
               increment_queue_counter
             end
           else
-            [not rendered]
+            debug_puts "Cannot enqueue new content as crawl has been cancelled."
           end
         end
       end
     end
    end
-
+
    def content
      raise "Content is not available" if @content.nil?
-      CobwebModule::CrawlObject.new(@content, @options)
+      CobwebModule::CrawlObject.new(@content, @options)
    end
-
+
    def update_queues
-      [not rendered]
+      lock("update_queues") do
        #@redis.incr "inprogress"
        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
        @redis.srem "queued", @options[:url]
@@ -123,7 +132,6 @@ module CobwebModule
       end
       # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
       if @options[:crawl_limit_by_page]
-        ap "#{content.mime_type} - #{content.url}"
         if content.mime_type.match("text/html")
           increment_crawl_counter
         end
@@ -133,12 +141,12 @@ module CobwebModule
         decrement_queue_counter
       end
     end
-
+
     def to_be_processed?
-      !finished? ||
+      (!finished? || within_process_limits?) && !@redis.sismember("enqueued", @options[:url])
     end
-
-    def process
+
+    def process(&block)
       if @options[:crawl_limit_by_page]
         if content.mime_type.match("text/html")
           increment_process_counter
@@ -146,34 +154,42 @@ module CobwebModule
       else
         increment_process_counter
       end
+      @redis.sadd "enqueued", @options[:url]
+
+      yield if block_given?
+      @redis.incr("crawl_job_enqueued_count")
     end
-
+
+    def finished_processing
+      @redis.srem "currently_running", @options[:url]
+    end
+
     def finished?
       print_counters
-      # if there's nothing left queued or the crawled limit has been reached
+      # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
       if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
-        if queue_counter
+        if queue_counter == 0 && @redis.smembers("currently_running").empty?
          finished
          return true
        end
-      elsif (queue_counter
+      elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?) || process_counter >= @options[:crawl_limit].to_i
        finished
        return true
      end
      false
    end
-
+
    def finished
-      set_first_to_finish
-
+      set_first_to_finish
+      debug_ap "CRAWL FINISHED #{@options[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}"
      @stats.end_crawl(@options)
    end
-
+
    def set_first_to_finish
      @redis.watch("first_to_finish") do
        if !@redis.exists("first_to_finish")
          @redis.multi do
-            [not rendered]
+            debug_puts "set first to finish"
            @first_to_finish = true
            @redis.set("first_to_finish", 1)
          end
@@ -182,23 +198,51 @@ module CobwebModule
         end
       end
     end
-
-
-    def first_to_finish?
+
+
+    def first_to_finish?
       @first_to_finish
     end
 
     def crawled_base_url
       @redis.get("crawled_base_url")
     end
-
+
     def statistics
       @stats.get_statistics
     end
-
+
     def redis
       @redis
     end
+
+    def lock(key, &block)
+      debug_puts "REQUESTING LOCK [#{key}]"
+      set_nx = @redis.setnx("#{key}_lock", "locked")
+      debug_puts "LOCK:#{key}:#{set_nx}"
+      while !set_nx
+        debug_puts "===== WAITING FOR LOCK [#{key}] ====="
+        sleep 0.01
+        set_nx = @redis.setnx("#{key}_lock", "locked")
+      end
+
+      debug_puts "RECEIVED LOCK [#{key}]"
+      begin
+        result = yield
+      ensure
+        @redis.del("#{key}_lock")
+        debug_puts "LOCK RELEASED [#{key}]"
+      end
+      result
+    end
+
+    def debug_ap(value)
+      ap(value) if @options[:debug]
+    end
+
+    def debug_puts(value)
+      puts(value) if @options[:debug]
+    end
 
     private
     def setup_defaults
@@ -206,7 +250,7 @@ module CobwebModule
       @options[:crawl_limit_by_page] = false unless @options.has_key? :crawl_limit_by_page
       @options[:valid_mime_types] = ["*/*"] unless @options.has_key? :valid_mime_types
     end
-
+
     # Increments the queue counter and refreshes crawl counters
     def increment_queue_counter
       @redis.incr "queue-counter"
@@ -223,7 +267,7 @@ module CobwebModule
     def decrement_queue_counter
       @redis.decr "queue-counter"
     end
-
+
     def crawl_counter
       @redis.get("crawl-counter").to_i
     end
@@ -233,19 +277,19 @@ module CobwebModule
     def process_counter
       @redis.get("process-counter").to_i
     end
-
+
     def status
       @stats.get_status
     end
-
+
     def print_counters
-      [not rendered]
+      debug_puts counters
     end
-
+
     def counters
-      "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]}"
+      "crawl_counter: #{crawl_counter} queue_counter: #{queue_counter} process_counter: #{process_counter} crawl_limit: #{@options[:crawl_limit]} currently_running: #{@redis.smembers("currently_running").count}"
     end
-
+
     # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
     def set_base_url(redis)
       if redis.get("base_url").nil?
@@ -257,7 +301,5 @@ module CobwebModule
       end
     end
 
-
-
   end
-end
+end
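
Much of the churn in crawl.rb above comes from the new lock, debug_puts and debug_ap helpers and the "currently_running" set, which together serialise a crawl's critical sections across Resque workers. The lock method is a busy-wait mutex built on Redis SETNX: spin until the lock key can be set, run the block, and delete the key in an ensure so the lock is released even if the block raises. Below is a standalone sketch of the same pattern, assuming only the redis gem; the method and key names are illustrative, not part of cobweb's API.

    require 'redis'

    # Spin until SETNX succeeds, i.e. until no other worker holds "#{key}_lock",
    # then run the block and always release the lock afterwards.
    def with_redis_lock(redis, key)
      sleep 0.01 until redis.setnx("#{key}_lock", "locked")
      begin
        yield
      ensure
        redis.del("#{key}_lock") # released even when the block raises
      end
    end

    redis = Redis.new
    with_redis_lock(redis, "retrieve") do
      # critical section: at most one worker per key runs this at a time
    end

As in the gem's version, the lock key has no expiry, so a worker killed between acquiring and releasing it would leave the lock held until the key is deleted by hand; adding a TTL would be the usual guard against that.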
data/lib/crawl_job.rb
CHANGED
@@ -24,40 +24,49 @@ class CrawlJob
       @crawl.process_links do |link|
 
         # enqueue the links to resque
-        [not rendered]
+        @crawl.debug_puts "ENQUEUED LINK: #{link}"
         enqueue_content(content_request, link)
 
       end
 
-      [old lines 32-46 not rendered in this view]
+      @crawl.lock("crawl_job_process") do
+        if @crawl.to_be_processed?
+
+          @crawl.process do
+
+            # enqueue to processing queue
+            @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
+            send_to_processing_queue(@crawl.content.to_hash, content_request)
+
+            #if the enqueue counter has been requested update that
+            if content_request.has_key?(:enqueue_counter_key)
+              enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
+              current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
+              enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
+            end
+
+          end
+        else
+          @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
+          @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
+          @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
         end
-
-      ap "@crawl.finished? #{@crawl.finished?}"
-      ap "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
-      ap "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
+
       end
     end
   end
+
+      @crawl.lock("finished") do
+        # let the crawl know we're finished with this object
+        @crawl.finished_processing
 
-      [old lines 56-60 not rendered in this view]
+        # test queue and crawl sizes to see if we have completed the crawl
+        @crawl.debug_puts "finished? #{@crawl.finished?}"
+        @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
+        if @crawl.finished? && @crawl.first_to_finish?
+          @crawl.debug_puts "Calling crawl_job finished"
+          finished(content_request)
+        end
      end
 
    end
@@ -68,6 +77,7 @@ class CrawlJob
     additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
     additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
 
+    @crawl.debug_puts "increment crawl_finished_enqueued_count"
     @crawl.redis.incr("crawl_finished_enqueued_count")
     Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
   end
@@ -85,7 +95,7 @@ class CrawlJob
     else
       Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
     end
-    [not rendered]
+    @crawl.debug_puts "#{content_request[:url]} has been sent for processing. use_encoding_safe_process_job: #{content_request[:use_encoding_safe_process_job]}"
   end
 
   private
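
CrawlJob also gains an optional enqueue counter: when the content request carries an :enqueue_counter_key, every URL sent to the processing queue increments a field of a Redis hash in the caller's own namespace. A sketch of request options that would exercise it; the namespace, key and field values here are illustrative, not defaults:

    # Extra options read from content_request by the new counter code above.
    request = {
      :url                       => "http://example.com/",
      :redis_options             => {},              # passed straight to Redis.new
      :enqueue_counter_namespace => "my_app",        # Redis::Namespace to write into
      :enqueue_counter_key       => "crawl_counts",  # hash key
      :enqueue_counter_field     => "example_crawl"  # hash field that is incremented
    }

The update itself is the hget/hset read-modify-write shown in the hunk; because it runs inside the crawl_job_process lock, the two-step increment cannot race another worker.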
data/spec/cobweb/cobweb_job_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe Cobweb, :local_only => true do
   # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
   puts "Starting Workers... Please Wait..."
   `mkdir log`
-  io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=
+  io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
   puts "Workers Started."
 
 end
@@ -42,16 +42,16 @@ describe Cobweb, :local_only => true do
       @redis.get("crawl_job_enqueued_count").to_i.should == 0
     end
 
-    it "should not complete the crawl when cancelled" do
-      crawl = @cobweb.start(@base_url)
-      crawl_obj = CobwebCrawlHelper.new(crawl)
-      sleep 6
-      crawl_obj.destroy
-      @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
-      wait_for_crawl_finished crawl[:crawl_id]
-      @redis.get("crawl_job_enqueued_count").to_i.should > 0
-      @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
-    end
+    # it "should not complete the crawl when cancelled" do
+    #   crawl = @cobweb.start(@base_url)
+    #   crawl_obj = CobwebCrawlHelper.new(crawl)
+    #   sleep 6
+    #   crawl_obj.destroy
+    #   @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+    #   wait_for_crawl_finished crawl[:crawl_id]
+    #   @redis.get("crawl_job_enqueued_count").to_i.should > 0
+    #   @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
+    # end
 
   end
   describe "with no crawl limit" do
@@ -83,11 +83,13 @@ describe Cobweb, :local_only => true do
       @redis.get("crawl_finished_enqueued_count").to_i.should == 1
     end
   end
+
   describe "with limited mime_types" do
     before(:each) do
       @request = {
         :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
-        :quiet =>
+        :quiet => false,
+        :debug => false,
         :cache => nil,
         :valid_mime_types => ["text/html"]
       }
@@ -112,12 +114,26 @@ describe Cobweb, :local_only => true do
     before(:each) do
       @request = {
         :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
-        :quiet =>
+        :quiet => false,
+        :debug => false,
         :cache => nil
       }
       @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", Redis.new)
     end
 
+    # describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
+    #   before(:each) do
+    #     @request[:crawl_limit] = 20
+    #     @cobweb = Cobweb.new @request
+    #   end
+    #   it "should crawl exactly 20" do
+    #     crawl = @cobweb.start("http://yepadeperrors.wordpress.com/")
+    #     @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
+    #     wait_for_crawl_finished crawl[:crawl_id]
+    #     @redis.get("crawl_job_enqueued_count").to_i.should == 20
+    #   end
+    #
+    # end
     describe "limit to 1" do
       before(:each) do
         @request[:crawl_limit] = 1
@@ -245,12 +261,17 @@ def running?(crawl_id)
     result = true
   else
     if status == @last_stat
-      if @counter >
+      if @counter > 20
+        puts ""
         raise "Static status: #{status}"
       else
         @counter += 1
       end
-
+      if @counter == 1
+        print "Static Status.. #{21-@counter}"
+      else
+        print ".#{21-@counter}"
+      end
     else
       result = status != CobwebCrawlHelper::FINISHED && status != CobwebCrawlHelper::CANCELLED
     end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.74
+  version: 0.0.75
 prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-
+date: 2012-10-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70303208832100 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208832100
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70303208831180 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208831180
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70303208830080 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208830080
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70303208829280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208829280
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70303208828000 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208828000
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70303208826740 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208826740
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70303208825020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208825020
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70303208823900 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208823900
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70303208822980 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208822980
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70303208821840 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208821840
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &
+  requirement: &70303208821320 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70303208821320
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more perofmant than multi-threaded crawlers. It
   is also a standalone crawler that has a sophisticated statistics monitoring interface
|