cobweb 1.0.18 → 1.0.19
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +7 -3
- data/lib/cobweb.rb +13 -8
- data/lib/cobweb_crawler.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +55 -28
- data/lib/crawl_helper.rb +2 -2
- data/lib/crawl_job.rb +11 -8
- data/lib/crawl_worker.rb +14 -9
- data/lib/redis_connection.rb +23 -0
- data/lib/server.rb +4 -4
- data/lib/sidekiq/cobweb_helper.rb +3 -2
- data/spec/cobweb/cobweb_spec.rb +21 -11
- data/spec/cobweb/crawl_job_spec.rb +20 -19
- data/spec/cobweb/crawl_worker_spec.rb +8 -7
- metadata +30 -5
- checksums.yaml +0 -15
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v1.0.18
|
2
|
+
h1. Cobweb v1.0.19
|
3
3
|
|
4
4
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
5
5
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
@@ -95,7 +95,8 @@ Creates a new crawler object based on a base_url
|
|
95
95
|
|
96
96
|
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
|
97
97
|
** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
|
98
|
-
** :processing_queue - specifies the processing queue for content to be sent to (Default:
|
98
|
+
** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
|
99
|
+
** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
|
99
100
|
** :debug - enables debug output (Default: false)
|
100
101
|
** :quiet - hides default output (Default: false)
|
101
102
|
** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
|
@@ -110,9 +111,12 @@ Creates a new crawler object based on a base_url
|
|
110
111
|
** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
|
111
112
|
** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
112
113
|
** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
|
113
|
-
** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
|
114
|
+
** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
|
114
115
|
** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
|
115
116
|
** :raise_exceptions - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
|
117
|
+
** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
|
118
|
+
** :proxy_addr - hostname of a proxy to use for crawling (e. g., 'myproxy.example.net', default: nil)
|
119
|
+
** :proxy_port - port number of the proxy (default: nil)
|
116
120
|
|
117
121
|
|
118
122
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
data/lib/cobweb.rb
CHANGED
@@ -9,6 +9,9 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
|
|
9
9
|
require file
|
10
10
|
end
|
11
11
|
|
12
|
+
puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
|
13
|
+
|
14
|
+
|
12
15
|
# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
|
13
16
|
class Cobweb
|
14
17
|
|
@@ -57,6 +60,8 @@ class Cobweb
|
|
57
60
|
default_valid_mime_types_to ["*/*"]
|
58
61
|
default_raise_exceptions_to false
|
59
62
|
default_store_inbound_links_to false
|
63
|
+
default_proxy_addr_to nil
|
64
|
+
default_proxy_port_to nil
|
60
65
|
|
61
66
|
end
|
62
67
|
|
@@ -76,7 +81,7 @@ class Cobweb
|
|
76
81
|
end
|
77
82
|
|
78
83
|
request.merge!(@options)
|
79
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis =>
|
84
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
|
80
85
|
@redis.set("original_base_url", base_url)
|
81
86
|
@redis.hset "statistics", "queued_at", DateTime.now
|
82
87
|
@redis.set("crawl-counter", 0)
|
@@ -130,11 +135,11 @@ class Cobweb
|
|
130
135
|
|
131
136
|
# connect to redis
|
132
137
|
if options.has_key? :crawl_id
|
133
|
-
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis =>
|
138
|
+
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
134
139
|
else
|
135
|
-
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
|
140
|
+
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
136
141
|
end
|
137
|
-
full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
|
142
|
+
full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
138
143
|
|
139
144
|
content = {:base_url => url}
|
140
145
|
|
@@ -151,7 +156,7 @@ class Cobweb
|
|
151
156
|
# retrieve data
|
152
157
|
#unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
153
158
|
puts "Creating connection to #{uri.host}..." if @options[:debug]
|
154
|
-
@http = Net::HTTP.new(uri.host, uri.inferred_port)
|
159
|
+
@http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
|
155
160
|
#end
|
156
161
|
if uri.scheme == "https"
|
157
162
|
@http.use_ssl = true
|
@@ -309,9 +314,9 @@ class Cobweb
|
|
309
314
|
|
310
315
|
# connect to redis
|
311
316
|
if options.has_key? :crawl_id
|
312
|
-
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis =>
|
317
|
+
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
313
318
|
else
|
314
|
-
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
|
319
|
+
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
|
315
320
|
end
|
316
321
|
|
317
322
|
content = {:base_url => url}
|
@@ -324,7 +329,7 @@ class Cobweb
|
|
324
329
|
# retrieve data
|
325
330
|
unless @http && @http.address == uri.host && @http.port == uri.inferred_port
|
326
331
|
puts "Creating connection to #{uri.host}..." unless @options[:quiet]
|
327
|
-
@http = Net::HTTP.new(uri.host, uri.inferred_port)
|
332
|
+
@http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
|
328
333
|
end
|
329
334
|
if uri.scheme == "https"
|
330
335
|
@http.use_ssl = true
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -20,7 +20,7 @@ class CobwebCrawler
|
|
20
20
|
@options[:crawl_id] = @crawl_id
|
21
21
|
end
|
22
22
|
|
23
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis =>
|
23
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
|
24
24
|
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
25
25
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
26
26
|
@options[:seed_urls] = [] if @options[:seed_urls].nil?
|
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -6,7 +6,7 @@ module CobwebModule
|
|
6
6
|
|
7
7
|
setup_defaults
|
8
8
|
|
9
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis =>
|
9
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
|
10
10
|
@stats = Stats.new(@options)
|
11
11
|
@debug = @options[:debug]
|
12
12
|
@first_to_finish = false
|
@@ -22,6 +22,15 @@ module CobwebModule
|
|
22
22
|
@redis.sismember "queued", link
|
23
23
|
end
|
24
24
|
|
25
|
+
def already_running?(link)
|
26
|
+
@redis.sismember "currently_running", link
|
27
|
+
end
|
28
|
+
|
29
|
+
def already_handled?(link)
|
30
|
+
already_crawled?(link) || already_queued?(link) || already_running?(link)
|
31
|
+
end
|
32
|
+
|
33
|
+
|
25
34
|
# Returns true if the crawl count is within limits
|
26
35
|
def within_crawl_limits?
|
27
36
|
@options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
|
@@ -50,16 +59,19 @@ module CobwebModule
|
|
50
59
|
end
|
51
60
|
|
52
61
|
def retrieve
|
53
|
-
|
54
|
-
|
55
|
-
unless already_crawled?
|
62
|
+
|
63
|
+
unless already_running? @options[:url]
|
64
|
+
unless already_crawled? @options[:url]
|
65
|
+
@redis.sadd("currently_running", @options[:url])
|
56
66
|
if within_crawl_limits?
|
57
67
|
@stats.update_status("Retrieving #{@options[:url]}...")
|
58
|
-
|
59
|
-
|
60
|
-
@redis.
|
68
|
+
lock("update_queues") do
|
69
|
+
@content = Cobweb.new(@options).get(@options[:url], @options)
|
70
|
+
if @options[:url] == @redis.get("original_base_url")
|
71
|
+
@redis.set("crawled_base_url", @content[:base_url])
|
72
|
+
end
|
73
|
+
update_queues
|
61
74
|
end
|
62
|
-
update_queues
|
63
75
|
|
64
76
|
if content.permitted_type?
|
65
77
|
## update statistics
|
@@ -128,7 +140,7 @@ module CobwebModule
|
|
128
140
|
end
|
129
141
|
|
130
142
|
def update_queues
|
131
|
-
lock("update_queues") do
|
143
|
+
#lock("update_queues") do
|
132
144
|
#@redis.incr "inprogress"
|
133
145
|
# move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
|
134
146
|
@redis.srem "queued", @options[:url]
|
@@ -146,25 +158,27 @@ module CobwebModule
|
|
146
158
|
increment_crawl_counter
|
147
159
|
end
|
148
160
|
decrement_queue_counter
|
149
|
-
end
|
161
|
+
#end
|
150
162
|
end
|
151
163
|
|
152
164
|
def to_be_processed?
|
153
|
-
|
165
|
+
!finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
|
154
166
|
end
|
155
167
|
|
156
168
|
def process(&block)
|
157
|
-
|
158
|
-
if
|
169
|
+
lock("process") do
|
170
|
+
if @options[:crawl_limit_by_page]
|
171
|
+
if content.mime_type.match("text/html")
|
172
|
+
increment_process_counter
|
173
|
+
end
|
174
|
+
else
|
159
175
|
increment_process_counter
|
160
176
|
end
|
161
|
-
|
162
|
-
increment_process_counter
|
163
|
-
end
|
164
|
-
@redis.sadd "enqueued", @options[:url]
|
177
|
+
#@redis.sadd "queued", @options[:url]
|
165
178
|
|
166
|
-
|
167
|
-
|
179
|
+
yield if block_given?
|
180
|
+
@redis.incr("crawl_job_enqueued_count")
|
181
|
+
end
|
168
182
|
end
|
169
183
|
|
170
184
|
def finished_processing
|
@@ -173,20 +187,33 @@ module CobwebModule
|
|
173
187
|
|
174
188
|
def finished?
|
175
189
|
print_counters
|
190
|
+
debug_puts @stats.get_status
|
191
|
+
if @stats.get_status == CobwebCrawlHelper::FINISHED
|
192
|
+
debug_puts "Already Finished!"
|
193
|
+
end
|
176
194
|
# if there's nothing left queued or the crawled limit has been reached and we're not still processing something
|
177
195
|
if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
|
178
196
|
if queue_counter == 0 && @redis.smembers("currently_running").empty?
|
179
|
-
|
197
|
+
debug_puts "queue_counter is 0 and currently_running is empty so we're done"
|
198
|
+
#finished
|
180
199
|
return true
|
181
200
|
end
|
182
|
-
elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?
|
183
|
-
finished
|
201
|
+
elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
|
202
|
+
#finished
|
203
|
+
debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
|
184
204
|
return true
|
185
205
|
end
|
186
206
|
false
|
187
207
|
end
|
188
208
|
|
189
|
-
def
|
209
|
+
def finish
|
210
|
+
debug_puts ""
|
211
|
+
debug_puts "========================================================================"
|
212
|
+
debug_puts "finished crawl on #{@options[:url]}"
|
213
|
+
print_counters
|
214
|
+
debug_puts "========================================================================"
|
215
|
+
debug_puts ""
|
216
|
+
|
190
217
|
set_first_to_finish
|
191
218
|
@stats.end_crawl(@options)
|
192
219
|
end
|
@@ -223,22 +250,22 @@ module CobwebModule
|
|
223
250
|
end
|
224
251
|
|
225
252
|
def lock(key, &block)
|
226
|
-
debug_puts "REQUESTING LOCK [#{key}]"
|
253
|
+
#debug_puts "REQUESTING LOCK [#{key}]"
|
227
254
|
set_nx = @redis.setnx("#{key}_lock", "locked")
|
228
|
-
debug_puts "LOCK:#{key}:#{set_nx}"
|
255
|
+
#debug_puts "LOCK:#{key}:#{set_nx}"
|
229
256
|
while !set_nx
|
230
|
-
debug_puts "===== WAITING FOR LOCK [#{key}] ====="
|
257
|
+
#debug_puts "===== WAITING FOR LOCK [#{key}] ====="
|
231
258
|
sleep 0.01
|
232
259
|
set_nx = @redis.setnx("#{key}_lock", "locked")
|
233
260
|
end
|
234
261
|
|
235
|
-
debug_puts "RECEIVED LOCK [#{key}]"
|
262
|
+
#debug_puts "RECEIVED LOCK [#{key}]"
|
236
263
|
@redis.expire("#{key}_lock", 10)
|
237
264
|
begin
|
238
265
|
result = yield
|
239
266
|
ensure
|
240
267
|
@redis.del("#{key}_lock")
|
241
|
-
debug_puts "LOCK RELEASED [#{key}]"
|
268
|
+
#debug_puts "LOCK RELEASED [#{key}]"
|
242
269
|
end
|
243
270
|
result
|
244
271
|
end
|
data/lib/crawl_helper.rb
CHANGED
@@ -15,7 +15,7 @@ class CrawlHelper
|
|
15
15
|
content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
|
16
16
|
content_request[:queue_system] = content_request[:queue_system].to_sym
|
17
17
|
|
18
|
-
@redis =
|
18
|
+
@redis = NamespacedRedisConnection.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
|
19
19
|
@stats = Stats.new(content_request)
|
20
20
|
|
21
21
|
@debug = content_request[:debug]
|
@@ -74,7 +74,7 @@ class CrawlHelper
|
|
74
74
|
|
75
75
|
#if the enqueue counter has been requested update that
|
76
76
|
if content_request.has_key? :enqueue_counter_key
|
77
|
-
enqueue_redis =
|
77
|
+
enqueue_redis = NamespacedRedisConnection.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
|
78
78
|
current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
|
79
79
|
enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
|
80
80
|
end
|
data/lib/crawl_job.rb
CHANGED
@@ -23,12 +23,14 @@ class CrawlJob
|
|
23
23
|
# extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
|
24
24
|
@crawl.process_links do |link|
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
26
|
+
if @crawl.within_crawl_limits?
|
27
|
+
# enqueue the links to resque
|
28
|
+
@crawl.debug_puts "ENQUEUED LINK: #{link}"
|
29
|
+
enqueue_content(content_request, link)
|
30
|
+
end
|
29
31
|
|
30
32
|
end
|
31
|
-
|
33
|
+
|
32
34
|
if @crawl.to_be_processed?
|
33
35
|
|
34
36
|
@crawl.process do
|
@@ -39,7 +41,7 @@ class CrawlJob
|
|
39
41
|
|
40
42
|
#if the enqueue counter has been requested update that
|
41
43
|
if content_request.has_key?(:enqueue_counter_key)
|
42
|
-
enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis =>
|
44
|
+
enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
|
43
45
|
current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
|
44
46
|
enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
|
45
47
|
end
|
@@ -60,8 +62,7 @@ class CrawlJob
|
|
60
62
|
|
61
63
|
# test queue and crawl sizes to see if we have completed the crawl
|
62
64
|
@crawl.debug_puts "finished? #{@crawl.finished?}"
|
63
|
-
|
64
|
-
if @crawl.finished? && @crawl.first_to_finish?
|
65
|
+
if @crawl.finished?
|
65
66
|
@crawl.debug_puts "Calling crawl_job finished"
|
66
67
|
finished(content_request)
|
67
68
|
end
|
@@ -75,7 +76,9 @@ class CrawlJob
|
|
75
76
|
additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
|
76
77
|
additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
|
77
78
|
|
78
|
-
@crawl.
|
79
|
+
@crawl.finish
|
80
|
+
|
81
|
+
@crawl.debug_puts "increment crawl_finished_enqueued_count from #{@crawl.redis.get("crawl_finished_enqueued_count")}"
|
79
82
|
@crawl.redis.incr("crawl_finished_enqueued_count")
|
80
83
|
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
|
81
84
|
end
|
data/lib/crawl_worker.rb
CHANGED
@@ -16,6 +16,7 @@ class CrawlWorker
|
|
16
16
|
sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
|
17
17
|
|
18
18
|
def perform(content_request)
|
19
|
+
puts "Performing for #{content_request["url"]}"
|
19
20
|
# setup the crawl class to manage the crawl of this object
|
20
21
|
@crawl = CobwebModule::Crawl.new(content_request)
|
21
22
|
|
@@ -25,12 +26,17 @@ class CrawlWorker
|
|
25
26
|
# if the crawled object is an object type we are interested
|
26
27
|
if @crawl.content.permitted_type?
|
27
28
|
|
28
|
-
|
29
|
-
|
29
|
+
@crawl.lock("queue_links") do
|
30
|
+
# extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
|
31
|
+
@crawl.process_links do |link|
|
30
32
|
|
31
|
-
|
32
|
-
|
33
|
+
if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
|
34
|
+
# enqueue the links to sidekiq
|
35
|
+
@crawl.debug_puts "QUEUED LINK: #{link}"
|
36
|
+
enqueue_content(content_request, link)
|
37
|
+
end
|
33
38
|
|
39
|
+
end
|
34
40
|
end
|
35
41
|
|
36
42
|
if @crawl.to_be_processed?
|
@@ -38,12 +44,12 @@ class CrawlWorker
|
|
38
44
|
@crawl.process do
|
39
45
|
|
40
46
|
# enqueue to processing queue
|
41
|
-
@crawl.debug_puts "
|
47
|
+
@crawl.debug_puts "SENT FOR PROCESSING [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
|
42
48
|
send_to_processing_queue(@crawl.content.to_hash, content_request)
|
43
49
|
|
44
50
|
#if the enqueue counter has been requested update that
|
45
51
|
if content_request.has_key?(:enqueue_counter_key)
|
46
|
-
enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis =>
|
52
|
+
enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
|
47
53
|
current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
|
48
54
|
enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
|
49
55
|
end
|
@@ -64,8 +70,7 @@ class CrawlWorker
|
|
64
70
|
|
65
71
|
# test queue and crawl sizes to see if we have completed the crawl
|
66
72
|
@crawl.debug_puts "finished? #{@crawl.finished?}"
|
67
|
-
|
68
|
-
if @crawl.finished? && @crawl.first_to_finish?
|
73
|
+
if @crawl.finished?
|
69
74
|
@crawl.debug_puts "Calling crawl_job finished"
|
70
75
|
finished(content_request)
|
71
76
|
end
|
@@ -84,7 +89,7 @@ class CrawlWorker
|
|
84
89
|
additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
|
85
90
|
additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
|
86
91
|
|
87
|
-
@crawl.
|
92
|
+
@crawl.finish
|
88
93
|
|
89
94
|
@crawl.debug_puts "increment crawl_finished_enqueued_count"
|
90
95
|
@crawl.redis.incr("crawl_finished_enqueued_count")
|
@@ -0,0 +1,23 @@
|
|
1
|
+
class RedisConnection
|
2
|
+
|
3
|
+
@@redis_connections = {}
|
4
|
+
|
5
|
+
def initialize(options={})
|
6
|
+
key = options.keys.sort.map{|k| "#{k}:#{options[k]}"}.join(",")
|
7
|
+
unless @@redis_connections.has_key?(key)
|
8
|
+
@@redis_connections[key] = Redis.new(options)
|
9
|
+
end
|
10
|
+
@current_connection = @@redis_connections[key]
|
11
|
+
@current_connection
|
12
|
+
end
|
13
|
+
|
14
|
+
def method_missing(m, *args, &block)
|
15
|
+
if @current_connection.respond_to?(m)
|
16
|
+
@current_connection.send(m, *args)
|
17
|
+
else
|
18
|
+
super
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
end
|
data/lib/server.rb
CHANGED
@@ -12,14 +12,14 @@ class Server < Sinatra::Base
|
|
12
12
|
|
13
13
|
# Sinatra Dashboard
|
14
14
|
get '/' do
|
15
|
-
@full_redis =
|
15
|
+
@full_redis = RedisConnection.new(redis_options)
|
16
16
|
@colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
|
17
17
|
|
18
18
|
@crawls = []
|
19
19
|
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
20
20
|
version = cobweb_version(crawl_id)
|
21
21
|
if version == Cobweb.version
|
22
|
-
redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis =>
|
22
|
+
redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => RedisConnection.new(redis_options))
|
23
23
|
stats = HashUtil.deep_symbolize_keys({
|
24
24
|
:cobweb_version => version,
|
25
25
|
:crawl_details => redis.hgetall("crawl_details"),
|
@@ -38,7 +38,7 @@ class Server < Sinatra::Base
|
|
38
38
|
get '/statistics/:crawl_id' do
|
39
39
|
|
40
40
|
version = cobweb_version(params[:crawl_id])
|
41
|
-
redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis =>
|
41
|
+
redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => RedisConnection.new(redis_options))
|
42
42
|
|
43
43
|
@statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
|
44
44
|
if @statistics[:status_counts].nil?
|
@@ -71,7 +71,7 @@ class Server < Sinatra::Base
|
|
71
71
|
end
|
72
72
|
|
73
73
|
def cobweb_version(crawl_id)
|
74
|
-
redis =
|
74
|
+
redis = RedisConnection.new(redis_options)
|
75
75
|
key = redis.keys("cobweb-*-#{crawl_id}:queued").first
|
76
76
|
|
77
77
|
key =~ /cobweb-(.*?)-(.*?):queued/
|
@@ -1,9 +1,10 @@
|
|
1
|
-
|
1
|
+
|
2
|
+
if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
2
3
|
SIDEKIQ_INSTALLED = true
|
3
4
|
require 'sidekiq'
|
4
5
|
else
|
5
6
|
SIDEKIQ_INSTALLED = false
|
6
|
-
puts "
|
7
|
+
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
7
8
|
end
|
8
9
|
|
9
10
|
module Sidekiq
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -31,6 +31,8 @@ describe Cobweb do
|
|
31
31
|
options[:timeout].should == 10
|
32
32
|
options[:redis_options].should == {}
|
33
33
|
options[:internal_urls].should == []
|
34
|
+
options[:proxy_addr].should be_nil
|
35
|
+
options[:proxy_port].should be_nil
|
34
36
|
|
35
37
|
end
|
36
38
|
|
@@ -52,15 +54,15 @@ describe Cobweb do
|
|
52
54
|
@cobweb.get(@base_url)[:url].should == @base_url
|
53
55
|
end
|
54
56
|
it "should return correct content-type" do
|
55
|
-
@mock_http_response.stub
|
57
|
+
@mock_http_response.stub(:content_type).and_return("image/jpeg")
|
56
58
|
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
57
59
|
end
|
58
60
|
it "should return correct status-code" do
|
59
|
-
@mock_http_response.stub
|
61
|
+
@mock_http_response.stub(:code).and_return(404)
|
60
62
|
@cobweb.get(@base_url)[:status_code].should == 404
|
61
63
|
end
|
62
64
|
it "should return correct status-code" do
|
63
|
-
@mock_http_response.stub
|
65
|
+
@mock_http_response.stub(:code).and_return(404)
|
64
66
|
@cobweb.get(@base_url)[:status_code].should == 404
|
65
67
|
end
|
66
68
|
it "should return correct character_set" do
|
@@ -75,7 +77,7 @@ describe Cobweb do
|
|
75
77
|
it "should return correct location" do
|
76
78
|
@cobweb.get(@base_url)[:location].should == nil
|
77
79
|
|
78
|
-
@mock_http_response.stub
|
80
|
+
@mock_http_response.stub(:[]).with("location").and_return("http://google.com/")
|
79
81
|
@cobweb.get(@base_url)[:location].should == "http://google.com/"
|
80
82
|
end
|
81
83
|
it "should return correct headers" do
|
@@ -135,17 +137,17 @@ describe Cobweb do
|
|
135
137
|
@cobweb.get(@base_url)[:url].should == @base_url
|
136
138
|
end
|
137
139
|
it "should return correct content-type" do
|
138
|
-
@mock_http_response.stub
|
140
|
+
@mock_http_response.stub(:content_type).and_return("image/jpeg")
|
139
141
|
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
140
142
|
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
141
143
|
end
|
142
144
|
it "should return correct status-code" do
|
143
|
-
@mock_http_response.stub
|
145
|
+
@mock_http_response.stub(:code).and_return(404)
|
144
146
|
@cobweb.get(@base_url)[:status_code].should == 404
|
145
147
|
@cobweb.get(@base_url)[:status_code].should == 404
|
146
148
|
end
|
147
149
|
it "should return correct status-code" do
|
148
|
-
@mock_http_response.stub
|
150
|
+
@mock_http_response.stub(:code).and_return(404)
|
149
151
|
@cobweb.get(@base_url)[:status_code].should == 404
|
150
152
|
@cobweb.get(@base_url)[:status_code].should == 404
|
151
153
|
end
|
@@ -177,26 +179,34 @@ describe Cobweb do
|
|
177
179
|
end
|
178
180
|
describe "location setting" do
|
179
181
|
it "Get should strip fragments" do
|
180
|
-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
182
|
+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
|
181
183
|
Net::HTTP::Get.should_receive(:new).with("/", @default_options)
|
182
184
|
@cobweb.get("http://www.google.com/#ignore")
|
183
185
|
end
|
184
186
|
it "head should strip fragments" do
|
185
|
-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
187
|
+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
|
186
188
|
Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
|
187
189
|
@cobweb.head("http://www.google.com/#ignore")
|
188
190
|
end
|
189
191
|
it "get should not strip path" do
|
190
|
-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
192
|
+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
|
191
193
|
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
|
192
194
|
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
|
193
195
|
end
|
194
196
|
it "get should not strip query string" do
|
195
|
-
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
197
|
+
Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
|
196
198
|
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
|
197
199
|
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
|
198
200
|
end
|
199
201
|
end
|
202
|
+
describe "with proxy" do
|
203
|
+
it "provides proxy parameters to Net::HTTP" do
|
204
|
+
cobweb = Cobweb.new proxy_addr: 'proxy.example.com', proxy_port: 1234
|
205
|
+
Net::HTTP.should_receive(:new).with("www.google.com", 80, "proxy.example.com", 1234)
|
206
|
+
|
207
|
+
cobweb.get("http://www.google.com/")
|
208
|
+
end
|
209
|
+
end
|
200
210
|
|
201
211
|
end
|
202
212
|
end
|
@@ -9,7 +9,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
9
9
|
|
10
10
|
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
11
11
|
if Resque.workers.count > 0 && @existing_processes.empty?
|
12
|
-
raise "Ghost workers present in resque, please clear before running specs"
|
12
|
+
raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
|
13
13
|
elsif Resque.workers.count == 0 && !@existing_processes.empty?
|
14
14
|
raise "Ghost worker processes present (#{@existing_processes.join(',')})"
|
15
15
|
elsif Resque.workers.count > 0 && !@existing_processes.empty?
|
@@ -23,25 +23,23 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
23
23
|
io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
|
24
24
|
|
25
25
|
counter = 0
|
26
|
-
print "Starting Resque Processes"
|
27
26
|
until counter > 10 || workers_processes_started?
|
28
|
-
print "
|
27
|
+
print "\rStarting Resque Processes... #{10-counter} "
|
29
28
|
counter += 1
|
30
|
-
sleep
|
29
|
+
sleep 1
|
31
30
|
end
|
32
31
|
puts ""
|
33
32
|
|
34
33
|
|
35
34
|
counter = 0
|
36
|
-
|
37
|
-
|
38
|
-
print "."
|
35
|
+
until counter > 30 || workers_running?
|
36
|
+
print "\rWaiting for Resque Workers... #{30-counter} "
|
39
37
|
counter += 1
|
40
|
-
sleep
|
38
|
+
sleep 1
|
41
39
|
end
|
42
40
|
puts ""
|
43
41
|
|
44
|
-
if
|
42
|
+
if workers_running?
|
45
43
|
puts "Workers Running."
|
46
44
|
else
|
47
45
|
raise "Workers didn't appear, please check environment"
|
@@ -62,10 +60,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
62
60
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
63
61
|
:crawl_limit => nil,
|
64
62
|
:quiet => false,
|
65
|
-
:debug =>
|
63
|
+
:debug => true,
|
66
64
|
:cache => nil
|
67
65
|
}
|
68
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
|
66
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
|
69
67
|
@cobweb = Cobweb.new @request
|
70
68
|
end
|
71
69
|
it "should not crawl anything if nothing has started" do
|
@@ -95,10 +93,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
95
93
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
96
94
|
:crawl_limit => nil,
|
97
95
|
:quiet => false,
|
98
|
-
:debug =>
|
96
|
+
:debug => true,
|
99
97
|
:cache => nil
|
100
98
|
}
|
101
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
|
99
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
|
102
100
|
|
103
101
|
@cobweb = Cobweb.new @request
|
104
102
|
end
|
@@ -124,11 +122,11 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
124
122
|
@request = {
|
125
123
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
126
124
|
:quiet => false,
|
127
|
-
:debug =>
|
125
|
+
:debug => true,
|
128
126
|
:cache => nil,
|
129
127
|
:valid_mime_types => ["text/html"]
|
130
128
|
}
|
131
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
|
129
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
|
132
130
|
@cobweb = Cobweb.new @request
|
133
131
|
end
|
134
132
|
|
@@ -150,10 +148,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
150
148
|
@request = {
|
151
149
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
152
150
|
:quiet => false,
|
153
|
-
:debug =>
|
151
|
+
:debug => true,
|
154
152
|
:cache => nil
|
155
153
|
}
|
156
|
-
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
|
154
|
+
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
|
157
155
|
end
|
158
156
|
|
159
157
|
# describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
|
@@ -226,6 +224,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
|
|
226
224
|
@redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
|
227
225
|
end
|
228
226
|
it "should notify of crawl finished once" do
|
227
|
+
@redis.get("crawl_finished_enqueued_count").to_i.should == 0
|
229
228
|
crawl = @cobweb.start(@base_url)
|
230
229
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
231
230
|
wait_for_crawl_finished crawl[:crawl_id]
|
@@ -280,11 +279,13 @@ end
|
|
280
279
|
|
281
280
|
def wait_for_crawl_finished(crawl_id, timeout=20)
|
282
281
|
@counter = 0
|
282
|
+
@timeout = timeout unless @timeout
|
283
283
|
start_time = Time.now
|
284
284
|
while(running?(crawl_id) && Time.now < start_time + timeout) do
|
285
285
|
sleep 1
|
286
286
|
end
|
287
|
-
if Time.now > start_time + timeout
|
287
|
+
if Time.now > start_time + @timeout
|
288
|
+
@timeout = 5
|
288
289
|
raise "End of crawl not detected"
|
289
290
|
end
|
290
291
|
end
|
@@ -296,7 +297,7 @@ def workers_processes_started?
|
|
296
297
|
end
|
297
298
|
|
298
299
|
def workers_running?
|
299
|
-
Resque.workers.count
|
300
|
+
Resque.workers.count == RESQUE_WORKER_COUNT
|
300
301
|
end
|
301
302
|
|
302
303
|
def running?(crawl_id)
|
@@ -8,8 +8,8 @@ describe CrawlWorker, :local_only => true do
|
|
8
8
|
if SIDEKIQ_INSTALLED
|
9
9
|
#store all existing resque process ids so we don't kill them afterwards
|
10
10
|
@existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
|
11
|
-
|
12
|
-
@existing_processes.
|
11
|
+
|
12
|
+
raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
|
13
13
|
|
14
14
|
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
15
15
|
puts "Starting Workers... Please Wait..."
|
@@ -34,7 +34,7 @@ describe CrawlWorker, :local_only => true do
|
|
34
34
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
35
35
|
:crawl_limit => nil,
|
36
36
|
:quiet => false,
|
37
|
-
:debug =>
|
37
|
+
:debug => true,
|
38
38
|
:cache => nil,
|
39
39
|
:queue_system => :sidekiq
|
40
40
|
}
|
@@ -60,6 +60,7 @@ describe CrawlWorker, :local_only => true do
|
|
60
60
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
61
61
|
:quiet => true,
|
62
62
|
:cache => nil,
|
63
|
+
:debug => true,
|
63
64
|
:queue_system => :sidekiq,
|
64
65
|
:valid_mime_types => ["text/html"]
|
65
66
|
}
|
@@ -87,6 +88,7 @@ describe CrawlWorker, :local_only => true do
|
|
87
88
|
@request = {
|
88
89
|
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
89
90
|
:quiet => true,
|
91
|
+
:debug => true,
|
90
92
|
:queue_system => :sidekiq,
|
91
93
|
:cache => nil
|
92
94
|
}
|
@@ -136,7 +138,6 @@ describe CrawlWorker, :local_only => true do
|
|
136
138
|
wait_for_crawl_finished crawl[:crawl_id]
|
137
139
|
|
138
140
|
mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
|
139
|
-
ap mime_types
|
140
141
|
mime_types.select{|m| m=="text/html"}.count.should == 5
|
141
142
|
end
|
142
143
|
end
|
@@ -186,11 +187,11 @@ describe CrawlWorker, :local_only => true do
|
|
186
187
|
wait_for_crawl_finished crawl[:crawl_id]
|
187
188
|
CrawlFinishedWorker.queue_size.should == 1
|
188
189
|
end
|
189
|
-
it "should not crawl 100 pages" do
|
190
|
+
it "should not crawl more than 100 pages" do
|
190
191
|
crawl = @cobweb.start(@base_url)
|
191
192
|
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
192
193
|
wait_for_crawl_finished crawl[:crawl_id]
|
193
|
-
CrawlProcessWorker.queue_size.should_not
|
194
|
+
CrawlProcessWorker.queue_size.should_not > 100
|
194
195
|
end
|
195
196
|
end
|
196
197
|
end
|
@@ -244,7 +245,7 @@ def clear_sidekiq_queues
|
|
244
245
|
conn.srem("queues", queue_name)
|
245
246
|
end
|
246
247
|
end
|
247
|
-
sleep
|
248
|
+
sleep 5
|
248
249
|
|
249
250
|
CrawlProcessWorker.queue_size.should == 0
|
250
251
|
CrawlFinishedWorker.queue_size.should == 0
|
metadata
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.19
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Stewart McKee
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-
|
12
|
+
date: 2013-11-26 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: redis
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
19
|
- - ! '>='
|
18
20
|
- !ruby/object:Gem::Version
|
@@ -20,6 +22,7 @@ dependencies:
|
|
20
22
|
type: :runtime
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
27
|
- - ! '>='
|
25
28
|
- !ruby/object:Gem::Version
|
@@ -27,6 +30,7 @@ dependencies:
|
|
27
30
|
- !ruby/object:Gem::Dependency
|
28
31
|
name: nokogiri
|
29
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
30
34
|
requirements:
|
31
35
|
- - ! '>='
|
32
36
|
- !ruby/object:Gem::Version
|
@@ -34,6 +38,7 @@ dependencies:
|
|
34
38
|
type: :runtime
|
35
39
|
prerelease: false
|
36
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
37
42
|
requirements:
|
38
43
|
- - ! '>='
|
39
44
|
- !ruby/object:Gem::Version
|
@@ -41,6 +46,7 @@ dependencies:
|
|
41
46
|
- !ruby/object:Gem::Dependency
|
42
47
|
name: addressable
|
43
48
|
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
44
50
|
requirements:
|
45
51
|
- - ! '>='
|
46
52
|
- !ruby/object:Gem::Version
|
@@ -48,6 +54,7 @@ dependencies:
|
|
48
54
|
type: :runtime
|
49
55
|
prerelease: false
|
50
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
51
58
|
requirements:
|
52
59
|
- - ! '>='
|
53
60
|
- !ruby/object:Gem::Version
|
@@ -55,6 +62,7 @@ dependencies:
|
|
55
62
|
- !ruby/object:Gem::Dependency
|
56
63
|
name: rspec
|
57
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
58
66
|
requirements:
|
59
67
|
- - ! '>='
|
60
68
|
- !ruby/object:Gem::Version
|
@@ -62,6 +70,7 @@ dependencies:
|
|
62
70
|
type: :runtime
|
63
71
|
prerelease: false
|
64
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
65
74
|
requirements:
|
66
75
|
- - ! '>='
|
67
76
|
- !ruby/object:Gem::Version
|
@@ -69,6 +78,7 @@ dependencies:
|
|
69
78
|
- !ruby/object:Gem::Dependency
|
70
79
|
name: awesome_print
|
71
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
72
82
|
requirements:
|
73
83
|
- - ! '>='
|
74
84
|
- !ruby/object:Gem::Version
|
@@ -76,6 +86,7 @@ dependencies:
|
|
76
86
|
type: :runtime
|
77
87
|
prerelease: false
|
78
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
79
90
|
requirements:
|
80
91
|
- - ! '>='
|
81
92
|
- !ruby/object:Gem::Version
|
@@ -83,6 +94,7 @@ dependencies:
|
|
83
94
|
- !ruby/object:Gem::Dependency
|
84
95
|
name: sinatra
|
85
96
|
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
86
98
|
requirements:
|
87
99
|
- - ! '>='
|
88
100
|
- !ruby/object:Gem::Version
|
@@ -90,6 +102,7 @@ dependencies:
|
|
90
102
|
type: :runtime
|
91
103
|
prerelease: false
|
92
104
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
93
106
|
requirements:
|
94
107
|
- - ! '>='
|
95
108
|
- !ruby/object:Gem::Version
|
@@ -97,6 +110,7 @@ dependencies:
|
|
97
110
|
- !ruby/object:Gem::Dependency
|
98
111
|
name: thin
|
99
112
|
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
100
114
|
requirements:
|
101
115
|
- - ! '>='
|
102
116
|
- !ruby/object:Gem::Version
|
@@ -104,6 +118,7 @@ dependencies:
|
|
104
118
|
type: :runtime
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
107
122
|
requirements:
|
108
123
|
- - ! '>='
|
109
124
|
- !ruby/object:Gem::Version
|
@@ -111,6 +126,7 @@ dependencies:
|
|
111
126
|
- !ruby/object:Gem::Dependency
|
112
127
|
name: haml
|
113
128
|
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
114
130
|
requirements:
|
115
131
|
- - ! '>='
|
116
132
|
- !ruby/object:Gem::Version
|
@@ -118,6 +134,7 @@ dependencies:
|
|
118
134
|
type: :runtime
|
119
135
|
prerelease: false
|
120
136
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
121
138
|
requirements:
|
122
139
|
- - ! '>='
|
123
140
|
- !ruby/object:Gem::Version
|
@@ -125,6 +142,7 @@ dependencies:
|
|
125
142
|
- !ruby/object:Gem::Dependency
|
126
143
|
name: namespaced_redis
|
127
144
|
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
128
146
|
requirements:
|
129
147
|
- - ! '>='
|
130
148
|
- !ruby/object:Gem::Version
|
@@ -132,6 +150,7 @@ dependencies:
|
|
132
150
|
type: :runtime
|
133
151
|
prerelease: false
|
134
152
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
135
154
|
requirements:
|
136
155
|
- - ! '>='
|
137
156
|
- !ruby/object:Gem::Version
|
@@ -139,6 +158,7 @@ dependencies:
|
|
139
158
|
- !ruby/object:Gem::Dependency
|
140
159
|
name: json
|
141
160
|
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
142
162
|
requirements:
|
143
163
|
- - ! '>='
|
144
164
|
- !ruby/object:Gem::Version
|
@@ -146,6 +166,7 @@ dependencies:
|
|
146
166
|
type: :runtime
|
147
167
|
prerelease: false
|
148
168
|
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
149
170
|
requirements:
|
150
171
|
- - ! '>='
|
151
172
|
- !ruby/object:Gem::Version
|
@@ -153,6 +174,7 @@ dependencies:
|
|
153
174
|
- !ruby/object:Gem::Dependency
|
154
175
|
name: slop
|
155
176
|
requirement: !ruby/object:Gem::Requirement
|
177
|
+
none: false
|
156
178
|
requirements:
|
157
179
|
- - ! '>='
|
158
180
|
- !ruby/object:Gem::Version
|
@@ -160,6 +182,7 @@ dependencies:
|
|
160
182
|
type: :runtime
|
161
183
|
prerelease: false
|
162
184
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
163
186
|
requirements:
|
164
187
|
- - ! '>='
|
165
188
|
- !ruby/object:Gem::Version
|
@@ -376,6 +399,7 @@ files:
|
|
376
399
|
- lib/export_command.rb
|
377
400
|
- lib/hash_util.rb
|
378
401
|
- lib/redirect_error.rb
|
402
|
+
- lib/redis_connection.rb
|
379
403
|
- lib/report_command.rb
|
380
404
|
- lib/robots.rb
|
381
405
|
- lib/server.rb
|
@@ -540,26 +564,27 @@ files:
|
|
540
564
|
homepage: http://github.com/stewartmckee/cobweb
|
541
565
|
licenses:
|
542
566
|
- MIT
|
543
|
-
metadata: {}
|
544
567
|
post_install_message:
|
545
568
|
rdoc_options: []
|
546
569
|
require_paths:
|
547
570
|
- lib
|
548
571
|
required_ruby_version: !ruby/object:Gem::Requirement
|
572
|
+
none: false
|
549
573
|
requirements:
|
550
574
|
- - ! '>='
|
551
575
|
- !ruby/object:Gem::Version
|
552
576
|
version: '0'
|
553
577
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
578
|
+
none: false
|
554
579
|
requirements:
|
555
580
|
- - ! '>='
|
556
581
|
- !ruby/object:Gem::Version
|
557
582
|
version: '0'
|
558
583
|
requirements: []
|
559
584
|
rubyforge_project:
|
560
|
-
rubygems_version:
|
585
|
+
rubygems_version: 1.8.25
|
561
586
|
signing_key:
|
562
|
-
specification_version:
|
587
|
+
specification_version: 3
|
563
588
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
564
589
|
crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
|
565
590
|
crawler that has a sophisticated statistics monitoring interface to monitor the
|
checksums.yaml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
---
|
2
|
-
!binary "U0hBMQ==":
|
3
|
-
metadata.gz: !binary |-
|
4
|
-
MTg3ODFiMWE1MmZlYWFjYzZiZjIzZjQ1NmFjZmJmMWU1MDVjZTc5Mg==
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
ZGU4NmFiYTJlNmZlODRiMjRmNTkzZjMwOWQyMzEyZjU4OGQzMWUxMw==
|
7
|
-
!binary "U0hBNTEy":
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
ZDVmN2MwYzBiMjQ1N2E2YjBmYmM0ZTk5ZWJjMGVkN2VmMDM4ODhkNTQ0OTIx
|
10
|
-
ZTg4YzMzMWE0OTY2ZjgyNWRiNzZlZjgyZDlkM2Y4MTQ2OTVmZTg5Zjc1NTA1
|
11
|
-
MTZhYzc2ZmYwNmM2ODRlMmViODljMGFjODYwNTY5OThlNjY2M2Y=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
M2M2YzU4ZTE5YzkxMWVmNmJiNTQ5OWFhNDExZGUwNzkxMGEzY2IyYTFmYTJl
|
14
|
-
YTE0OWI2ZmZhN2I0ZjA2YjU4NWFmNmUwMjY5ZDM4YWQ3ZmJkZmViNzRlNWMw
|
15
|
-
ZWMzNjIwNDkxNDk0NmMxOTE3NzljMGQ5MjlmYzgyODc3ZWQ2ZTY=
|