cobweb 1.0.18 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +7 -3
- data/lib/cobweb.rb +13 -8
- data/lib/cobweb_crawler.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +55 -28
- data/lib/crawl_helper.rb +2 -2
- data/lib/crawl_job.rb +11 -8
- data/lib/crawl_worker.rb +14 -9
- data/lib/redis_connection.rb +23 -0
- data/lib/server.rb +4 -4
- data/lib/sidekiq/cobweb_helper.rb +3 -2
- data/spec/cobweb/cobweb_spec.rb +21 -11
- data/spec/cobweb/crawl_job_spec.rb +20 -19
- data/spec/cobweb/crawl_worker_spec.rb +8 -7
- metadata +30 -5
- checksums.yaml +0 -15
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
-h1. Cobweb v1.0.
+h1. Cobweb v1.0.19

 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -95,7 +95,8 @@ Creates a new crawler object based on a base_url
 ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
 ** :redirect_limit - sets the limit to be used for concurrent redirects (Default: 10)
-** :processing_queue - specifies the processing queue for content to be sent to (Default:
+** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
+** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
 ** :debug - enables debug output (Default: false)
 ** :quiet - hides default output (Default: false)
 ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
@@ -110,9 +111,12 @@ Creates a new crawler object based on a base_url
 ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
 ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
 ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
-** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
+** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
 ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
 ** :raise_exceptions - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
+** :use_encoding_safe_process_job - Base64-encode the body when storing job in queue; set to true when you are expecting non-ASCII content (Default: false)
+** :proxy_addr - hostname of a proxy to use for crawling (e.g. 'myproxy.example.net', default: nil)
+** :proxy_port - port number of the proxy (default: nil)

 bc. crawler = Cobweb.new(:follow_redirects => false)
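The two new proxy options can be exercised as below; this is a hedged sketch based on the option list above and the new spec in this release, with placeholder host and port values:

    require 'cobweb'

    # :proxy_addr/:proxy_port default to nil (no proxy); when set they are
    # passed straight through to Net::HTTP
    crawler = Cobweb.new(
      :proxy_addr => 'proxy.example.com',   # hypothetical proxy host
      :proxy_port => 8080                   # hypothetical proxy port
    )
    crawler.get("http://www.example.com/")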
data/lib/cobweb.rb
CHANGED
@@ -9,6 +9,9 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
 end

+puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
+
+
 # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
 class Cobweb

@@ -57,6 +60,8 @@ class Cobweb
     default_valid_mime_types_to ["*/*"]
     default_raise_exceptions_to false
     default_store_inbound_links_to false
+    default_proxy_addr_to nil
+    default_proxy_port_to nil

   end

@@ -76,7 +81,7 @@ class Cobweb
     end

     request.merge!(@options)
-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
     @redis.set("original_base_url", base_url)
     @redis.hset "statistics", "queued_at", DateTime.now
     @redis.set("crawl-counter", 0)
@@ -130,11 +135,11 @@ class Cobweb

     # connect to redis
     if options.has_key? :crawl_id
-      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis =>
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
     else
-      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
     end
-    full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
+    full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))

     content = {:base_url => url}

@@ -151,7 +156,7 @@ class Cobweb
     # retrieve data
     #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
     puts "Creating connection to #{uri.host}..." if @options[:debug]
-    @http = Net::HTTP.new(uri.host, uri.inferred_port)
+    @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
     #end
     if uri.scheme == "https"
       @http.use_ssl = true
@@ -309,9 +314,9 @@ class Cobweb

     # connect to redis
     if options.has_key? :crawl_id
-      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis =>
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
     else
-      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis =>
+      redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
     end

     content = {:base_url => url}
@@ -324,7 +329,7 @@ class Cobweb
     # retrieve data
     unless @http && @http.address == uri.host && @http.port == uri.inferred_port
       puts "Creating connection to #{uri.host}..." unless @options[:quiet]
-      @http = Net::HTTP.new(uri.host, uri.inferred_port)
+      @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
     end
     if uri.scheme == "https"
       @http.use_ssl = true
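The proxy change works because Ruby's stdlib treats a nil proxy address as "connect directly", so the new four-argument constructor is safe even when no proxy is configured. A minimal illustration of the standard Net::HTTP behaviour (not code from the gem):

    require 'net/http'

    # with proxy_addr nil, Net::HTTP connects directly to the host
    direct  = Net::HTTP.new('www.example.com', 80, nil, nil)
    # with a proxy address and port, requests are routed through the proxy
    proxied = Net::HTTP.new('www.example.com', 80, 'proxy.example.com', 8080)
    direct.proxy?   # => false
    proxied.proxy?  # => true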
data/lib/cobweb_crawler.rb
CHANGED
@@ -20,7 +20,7 @@ class CobwebCrawler
       @options[:crawl_id] = @crawl_id
     end

-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
     @options[:internal_urls] = [] if @options[:internal_urls].nil?
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
     @options[:seed_urls] = [] if @options[:seed_urls].nil?
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -6,7 +6,7 @@ module CobwebModule

       setup_defaults

-      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis =>
+      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
       @stats = Stats.new(@options)
       @debug = @options[:debug]
       @first_to_finish = false
@@ -22,6 +22,15 @@ module CobwebModule
       @redis.sismember "queued", link
     end

+    def already_running?(link)
+      @redis.sismember "currently_running", link
+    end
+
+    def already_handled?(link)
+      already_crawled?(link) || already_queued?(link) || already_running?(link)
+    end
+
+
     # Returns true if the crawl count is within limits
     def within_crawl_limits?
       @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -50,16 +59,19 @@ module CobwebModule
     end

     def retrieve
-
-
-      unless already_crawled?
+
+      unless already_running? @options[:url]
+        unless already_crawled? @options[:url]
+          @redis.sadd("currently_running", @options[:url])
           if within_crawl_limits?
             @stats.update_status("Retrieving #{@options[:url]}...")
-
-
-            @redis.
+            lock("update_queues") do
+              @content = Cobweb.new(@options).get(@options[:url], @options)
+              if @options[:url] == @redis.get("original_base_url")
+                @redis.set("crawled_base_url", @content[:base_url])
+              end
+              update_queues
             end
-            update_queues

             if content.permitted_type?
               ## update statistics
@@ -128,7 +140,7 @@ module CobwebModule
     end

     def update_queues
-      lock("update_queues") do
+      #lock("update_queues") do
         #@redis.incr "inprogress"
         # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
         @redis.srem "queued", @options[:url]
@@ -146,25 +158,27 @@ module CobwebModule
           increment_crawl_counter
         end
         decrement_queue_counter
-      end
+      #end
     end

     def to_be_processed?
-
+      !finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
     end

     def process(&block)
-
-      if
+      lock("process") do
+        if @options[:crawl_limit_by_page]
+          if content.mime_type.match("text/html")
+            increment_process_counter
+          end
+        else
           increment_process_counter
         end
-
-      increment_process_counter
-      end
-      @redis.sadd "enqueued", @options[:url]
+        #@redis.sadd "queued", @options[:url]

-
-
+        yield if block_given?
+        @redis.incr("crawl_job_enqueued_count")
+      end
     end

     def finished_processing
@@ -173,20 +187,33 @@ module CobwebModule

     def finished?
       print_counters
+      debug_puts @stats.get_status
+      if @stats.get_status == CobwebCrawlHelper::FINISHED
+        debug_puts "Already Finished!"
+      end
       # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
       if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
         if queue_counter == 0 && @redis.smembers("currently_running").empty?
-
+          debug_puts "queue_counter is 0 and currently_running is empty so we're done"
+          #finished
           return true
         end
-      elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?
-        finished
+      elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
+        #finished
+        debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
         return true
       end
       false
     end

-    def
+    def finish
+      debug_puts ""
+      debug_puts "========================================================================"
+      debug_puts "finished crawl on #{@options[:url]}"
+      print_counters
+      debug_puts "========================================================================"
+      debug_puts ""
+
       set_first_to_finish
       @stats.end_crawl(@options)
     end
@@ -223,22 +250,22 @@ module CobwebModule
     end

     def lock(key, &block)
-      debug_puts "REQUESTING LOCK [#{key}]"
+      #debug_puts "REQUESTING LOCK [#{key}]"
       set_nx = @redis.setnx("#{key}_lock", "locked")
-      debug_puts "LOCK:#{key}:#{set_nx}"
+      #debug_puts "LOCK:#{key}:#{set_nx}"
       while !set_nx
-        debug_puts "===== WAITING FOR LOCK [#{key}] ====="
+        #debug_puts "===== WAITING FOR LOCK [#{key}] ====="
         sleep 0.01
         set_nx = @redis.setnx("#{key}_lock", "locked")
       end

-      debug_puts "RECEIVED LOCK [#{key}]"
+      #debug_puts "RECEIVED LOCK [#{key}]"
       @redis.expire("#{key}_lock", 10)
       begin
         result = yield
       ensure
         @redis.del("#{key}_lock")
-        debug_puts "LOCK RELEASED [#{key}]"
+        #debug_puts "LOCK RELEASED [#{key}]"
       end
       result
     end
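The lock method above is a simple spin lock: SETNX either creates the lock key (we win) or returns false (someone else holds it), and the 10-second expiry keeps a crashed holder from blocking other workers forever. A standalone sketch of the same pattern, assuming any connected redis-rb client:

    require 'redis'

    # minimal sketch of the SETNX spin-lock used by CobwebModule::Crawl#lock
    def with_lock(redis, key)
      # spin until we are the client that created the lock key
      sleep 0.01 until redis.setnx("#{key}_lock", "locked")
      redis.expire("#{key}_lock", 10)   # crash guard: lock self-destructs
      begin
        yield
      ensure
        redis.del("#{key}_lock")        # always release, even on error
      end
    end

    with_lock(Redis.new, "update_queues") { puts "critical section" }

Note the small race inherent in the pattern: a holder that dies between setnx and expire leaves a permanent lock, which is why later Redis versions recommend SET with NX and EX as a single command.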
data/lib/crawl_helper.rb
CHANGED
@@ -15,7 +15,7 @@ class CrawlHelper
     content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
     content_request[:queue_system] = content_request[:queue_system].to_sym

-    @redis =
+    @redis = NamespacedRedisConnection.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
     @stats = Stats.new(content_request)

     @debug = content_request[:debug]
@@ -74,7 +74,7 @@ class CrawlHelper

     #if the enqueue counter has been requested update that
     if content_request.has_key? :enqueue_counter_key
-      enqueue_redis =
+      enqueue_redis = NamespacedRedisConnection.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
       current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
       enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
     end
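NamespacedRedisConnection is used here with the signature (redis_options, namespace), but its definition is not part of this diff. Assuming it simply combines the new RedisConnection with the redis-namespace gem, an equivalent construction would be:

    # assumption: NamespacedRedisConnection.new(options, ns) is equivalent to
    Redis::Namespace.new("cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}",
      :redis => RedisConnection.new(content_request[:redis_options]))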
data/lib/crawl_job.rb
CHANGED
@@ -23,12 +23,14 @@ class CrawlJob
     # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
     @crawl.process_links do |link|

-
-
-
+      if @crawl.within_crawl_limits?
+        # enqueue the links to resque
+        @crawl.debug_puts "ENQUEUED LINK: #{link}"
+        enqueue_content(content_request, link)
+      end

     end
-
+
     if @crawl.to_be_processed?

       @crawl.process do
@@ -39,7 +41,7 @@ class CrawlJob

         #if the enqueue counter has been requested update that
         if content_request.has_key?(:enqueue_counter_key)
-          enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis =>
+          enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
           current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
           enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
         end
@@ -60,8 +62,7 @@ class CrawlJob

     # test queue and crawl sizes to see if we have completed the crawl
     @crawl.debug_puts "finished? #{@crawl.finished?}"
-
-    if @crawl.finished? && @crawl.first_to_finish?
+    if @crawl.finished?
       @crawl.debug_puts "Calling crawl_job finished"
       finished(content_request)
     end
@@ -75,7 +76,9 @@ class CrawlJob
     additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
     additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?

-    @crawl.
+    @crawl.finish
+
+    @crawl.debug_puts "increment crawl_finished_enqueued_count from #{@crawl.redis.get("crawl_finished_enqueued_count")}"
     @crawl.redis.incr("crawl_finished_enqueued_count")
     Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
   end
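The enqueue counter here is bumped with a read-modify-write (hget, add one, hset), which can drop updates when two workers race between the read and the write. Redis offers an atomic alternative; a sketch using the same keys as the diff (not what the gem actually does):

    # HINCRBY performs the increment server-side in one atomic step,
    # so concurrent workers cannot overwrite each other's updates
    enqueue_redis.hincrby(content_request[:enqueue_counter_key],
                          content_request[:enqueue_counter_field], 1)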
data/lib/crawl_worker.rb
CHANGED
@@ -16,6 +16,7 @@ class CrawlWorker
   sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED

   def perform(content_request)
+    puts "Performing for #{content_request["url"]}"
     # setup the crawl class to manage the crawl of this object
     @crawl = CobwebModule::Crawl.new(content_request)

@@ -25,12 +26,17 @@ class CrawlWorker
     # if the crawled object is an object type we are interested
     if @crawl.content.permitted_type?

-
-
+      @crawl.lock("queue_links") do
+        # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+        @crawl.process_links do |link|

-
-
+          if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
+            # enqueue the links to sidekiq
+            @crawl.debug_puts "QUEUED LINK: #{link}"
+            enqueue_content(content_request, link)
+          end

+        end
       end

       if @crawl.to_be_processed?
@@ -38,12 +44,12 @@ class CrawlWorker
         @crawl.process do

           # enqueue to processing queue
-          @crawl.debug_puts "
+          @crawl.debug_puts "SENT FOR PROCESSING [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
           send_to_processing_queue(@crawl.content.to_hash, content_request)

           #if the enqueue counter has been requested update that
           if content_request.has_key?(:enqueue_counter_key)
-            enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis =>
+            enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
             current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
             enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
           end
@@ -64,8 +70,7 @@ class CrawlWorker

     # test queue and crawl sizes to see if we have completed the crawl
     @crawl.debug_puts "finished? #{@crawl.finished?}"
-
-    if @crawl.finished? && @crawl.first_to_finish?
+    if @crawl.finished?
       @crawl.debug_puts "Calling crawl_job finished"
       finished(content_request)
     end
@@ -84,7 +89,7 @@ class CrawlWorker
     additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
     additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?

-    @crawl.
+    @crawl.finish

     @crawl.debug_puts "increment crawl_finished_enqueued_count"
     @crawl.redis.incr("crawl_finished_enqueued_count")

data/lib/redis_connection.rb
ADDED
@@ -0,0 +1,23 @@
+class RedisConnection
+
+  @@redis_connections = {}
+
+  def initialize(options={})
+    key = options.keys.sort.map{|k| "#{k}:#{options[k]}"}.join(",")
+    unless @@redis_connections.has_key?(key)
+      @@redis_connections[key] = Redis.new(options)
+    end
+    @current_connection = @@redis_connections[key]
+    @current_connection
+  end
+
+  def method_missing(m, *args, &block)
+    if @current_connection.respond_to?(m)
+      @current_connection.send(m, *args)
+    else
+      super
+    end
+  end
+
+end
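RedisConnection memoizes one Redis client per distinct options hash in the @@redis_connections class variable and forwards everything else to it through method_missing, so the many RedisConnection.new calls introduced in this release share connections instead of opening a new socket each time. For example:

    a = RedisConnection.new(:host => "localhost", :port => 6379)
    b = RedisConnection.new(:host => "localhost", :port => 6379)
    # a and b wrap the same underlying Redis client (same options key),
    # so only one connection is opened
    a.set("greeting", "hello")
    b.get("greeting")   # => "hello"

One caveat visible in the code: method_missing forwards with send(m, *args) and drops &block, so block-taking Redis methods such as multi or pipelined would not receive their block through this wrapper.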
data/lib/server.rb
CHANGED
@@ -12,14 +12,14 @@ class Server < Sinatra::Base

   # Sinatra Dashboard
   get '/' do
-    @full_redis =
+    @full_redis = RedisConnection.new(redis_options)
     @colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]

     @crawls = []
     @full_redis.smembers("cobweb_crawls").each do |crawl_id|
       version = cobweb_version(crawl_id)
       if version == Cobweb.version
-        redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis =>
+        redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => RedisConnection.new(redis_options))
         stats = HashUtil.deep_symbolize_keys({
           :cobweb_version => version,
           :crawl_details => redis.hgetall("crawl_details"),
@@ -38,7 +38,7 @@ class Server < Sinatra::Base
   get '/statistics/:crawl_id' do

     version = cobweb_version(params[:crawl_id])
-    redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis =>
+    redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => RedisConnection.new(redis_options))

     @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
     if @statistics[:status_counts].nil?
@@ -71,7 +71,7 @@ class Server < Sinatra::Base
   end

   def cobweb_version(crawl_id)
-    redis =
+    redis = RedisConnection.new(redis_options)
     key = redis.keys("cobweb-*-#{crawl_id}:queued").first

     key =~ /cobweb-(.*?)-(.*?):queued/
data/lib/sidekiq/cobweb_helper.rb
CHANGED
@@ -1,9 +1,10 @@
-
+
+if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
   SIDEKIQ_INSTALLED = true
   require 'sidekiq'
 else
   SIDEKIQ_INSTALLED = false
-  puts "
+  puts "sidekiq gem not installed, skipping crawl_worker specs"
 end

 module Sidekiq
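Gem::Specification.find_all_by_name returns the installed specifications matching a gem name and optional version requirements, and returns an empty array rather than raising when nothing matches, which makes it a safe guard for optional dependencies. The same pattern in isolation:

    # feature-detect an optional gem without rescuing LoadError
    if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").any?
      require 'sidekiq'
      SIDEKIQ_INSTALLED = true
    else
      SIDEKIQ_INSTALLED = false
    end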
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -31,6 +31,8 @@ describe Cobweb do
     options[:timeout].should == 10
     options[:redis_options].should == {}
     options[:internal_urls].should == []
+    options[:proxy_addr].should be_nil
+    options[:proxy_port].should be_nil

   end

@@ -52,15 +54,15 @@ describe Cobweb do
       @cobweb.get(@base_url)[:url].should == @base_url
     end
     it "should return correct content-type" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:content_type).and_return("image/jpeg")
       @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
     end
     it "should return correct status-code" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:code).and_return(404)
       @cobweb.get(@base_url)[:status_code].should == 404
     end
     it "should return correct status-code" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:code).and_return(404)
       @cobweb.get(@base_url)[:status_code].should == 404
     end
     it "should return correct character_set" do
@@ -75,7 +77,7 @@ describe Cobweb do
     it "should return correct location" do
       @cobweb.get(@base_url)[:location].should == nil

-      @mock_http_response.stub
+      @mock_http_response.stub(:[]).with("location").and_return("http://google.com/")
       @cobweb.get(@base_url)[:location].should == "http://google.com/"
     end
     it "should return correct headers" do
@@ -135,17 +137,17 @@ describe Cobweb do
       @cobweb.get(@base_url)[:url].should == @base_url
     end
     it "should return correct content-type" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:content_type).and_return("image/jpeg")
       @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
       @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
     end
     it "should return correct status-code" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:code).and_return(404)
       @cobweb.get(@base_url)[:status_code].should == 404
       @cobweb.get(@base_url)[:status_code].should == 404
     end
     it "should return correct status-code" do
-      @mock_http_response.stub
+      @mock_http_response.stub(:code).and_return(404)
       @cobweb.get(@base_url)[:status_code].should == 404
       @cobweb.get(@base_url)[:status_code].should == 404
     end
@@ -177,26 +179,34 @@ describe Cobweb do
     end
     describe "location setting" do
       it "Get should strip fragments" do
-        Net::HTTP.should_receive(:new).with("www.google.com", 80)
+        Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
         Net::HTTP::Get.should_receive(:new).with("/", @default_options)
         @cobweb.get("http://www.google.com/#ignore")
       end
       it "head should strip fragments" do
-        Net::HTTP.should_receive(:new).with("www.google.com", 80)
+        Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
         Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
         @cobweb.head("http://www.google.com/#ignore")
       end
       it "get should not strip path" do
-        Net::HTTP.should_receive(:new).with("www.google.com", 80)
+        Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
        Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
         @cobweb.get("http://www.google.com/path/to/stuff#ignore")
       end
       it "get should not strip query string" do
-        Net::HTTP.should_receive(:new).with("www.google.com", 80)
+        Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
         Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
         @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
       end
     end
+    describe "with proxy" do
+      it "provides proxy parameters to Net::HTTP" do
+        cobweb = Cobweb.new proxy_addr: 'proxy.example.com', proxy_port: 1234
+        Net::HTTP.should_receive(:new).with("www.google.com", 80, "proxy.example.com", 1234)

+        cobweb.get("http://www.google.com/")
+      end
+    end

   end
 end
data/spec/cobweb/crawl_job_spec.rb
CHANGED
@@ -9,7 +9,7 @@ describe CrawlJob, :local_only => true, :disabled => true do

   @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
   if Resque.workers.count > 0 && @existing_processes.empty?
-    raise "Ghost workers present in resque, please clear before running specs"
+    raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
   elsif Resque.workers.count == 0 && !@existing_processes.empty?
     raise "Ghost worker processes present (#{@existing_processes.join(',')})"
   elsif Resque.workers.count > 0 && !@existing_processes.empty?
@@ -23,25 +23,23 @@ describe CrawlJob, :local_only => true, :disabled => true do
   io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")

   counter = 0
-  print "Starting Resque Processes"
   until counter > 10 || workers_processes_started?
-    print "
+    print "\rStarting Resque Processes... #{10-counter} "
     counter += 1
-    sleep
+    sleep 1
   end
   puts ""


   counter = 0
-
-
-  print "."
+  until counter > 30 || workers_running?
+    print "\rWaiting for Resque Workers... #{30-counter} "
     counter += 1
-    sleep
+    sleep 1
   end
   puts ""

-  if
+  if workers_running?
     puts "Workers Running."
   else
     raise "Workers didn't appear, please check environment"
@@ -62,10 +60,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :crawl_limit => nil,
       :quiet => false,
-      :debug =>
+      :debug => true,
       :cache => nil
     }
-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
     @cobweb = Cobweb.new @request
   end
   it "should not crawl anything if nothing has started" do
@@ -95,10 +93,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :crawl_limit => nil,
       :quiet => false,
-      :debug =>
+      :debug => true,
       :cache => nil
     }
-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)

     @cobweb = Cobweb.new @request
   end
@@ -124,11 +122,11 @@ describe CrawlJob, :local_only => true, :disabled => true do
     @request = {
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :quiet => false,
-      :debug =>
+      :debug => true,
       :cache => nil,
       :valid_mime_types => ["text/html"]
     }
-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
     @cobweb = Cobweb.new @request
   end

@@ -150,10 +148,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
     @request = {
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :quiet => false,
-      :debug =>
+      :debug => true,
       :cache => nil
     }
-    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis =>
+    @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
   end

   # describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
@@ -226,6 +224,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
     @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
   end
   it "should notify of crawl finished once" do
+    @redis.get("crawl_finished_enqueued_count").to_i.should == 0
     crawl = @cobweb.start(@base_url)
     @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
     wait_for_crawl_finished crawl[:crawl_id]
@@ -280,11 +279,13 @@ end

 def wait_for_crawl_finished(crawl_id, timeout=20)
   @counter = 0
+  @timeout = timeout unless @timeout
   start_time = Time.now
   while(running?(crawl_id) && Time.now < start_time + timeout) do
     sleep 1
   end
-  if Time.now > start_time + timeout
+  if Time.now > start_time + @timeout
+    @timeout = 5
     raise "End of crawl not detected"
   end
 end
@@ -296,7 +297,7 @@ def workers_processes_started?
 end

 def workers_running?
-  Resque.workers.count
+  Resque.workers.count == RESQUE_WORKER_COUNT
 end

 def running?(crawl_id)
data/spec/cobweb/crawl_worker_spec.rb
CHANGED
@@ -8,8 +8,8 @@ describe CrawlWorker, :local_only => true do
   if SIDEKIQ_INSTALLED
     #store all existing resque process ids so we don't kill them afterwards
     @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
-
-    @existing_processes.
+
+    raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0

     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
     puts "Starting Workers... Please Wait..."
@@ -34,7 +34,7 @@ describe CrawlWorker, :local_only => true do
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :crawl_limit => nil,
       :quiet => false,
-      :debug =>
+      :debug => true,
       :cache => nil,
       :queue_system => :sidekiq
     }
@@ -60,6 +60,7 @@ describe CrawlWorker, :local_only => true do
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :quiet => true,
       :cache => nil,
+      :debug => true,
       :queue_system => :sidekiq,
       :valid_mime_types => ["text/html"]
     }
@@ -87,6 +88,7 @@ describe CrawlWorker, :local_only => true do
     @request = {
       :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
       :quiet => true,
+      :debug => true,
       :queue_system => :sidekiq,
       :cache => nil
     }
@@ -136,7 +138,6 @@ describe CrawlWorker, :local_only => true do
     wait_for_crawl_finished crawl[:crawl_id]

     mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
-    ap mime_types
     mime_types.select{|m| m=="text/html"}.count.should == 5
   end
 end
@@ -186,11 +187,11 @@ describe CrawlWorker, :local_only => true do
     wait_for_crawl_finished crawl[:crawl_id]
     CrawlFinishedWorker.queue_size.should == 1
   end
-  it "should not crawl 100 pages" do
+  it "should not crawl more than 100 pages" do
     crawl = @cobweb.start(@base_url)
     @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
     wait_for_crawl_finished crawl[:crawl_id]
-    CrawlProcessWorker.queue_size.should_not
+    CrawlProcessWorker.queue_size.should_not > 100
   end
 end
 end
@@ -244,7 +245,7 @@ def clear_sidekiq_queues
       conn.srem("queues", queue_name)
     end
   end
-  sleep
+  sleep 5

   CrawlProcessWorker.queue_size.should == 0
   CrawlFinishedWorker.queue_size.should == 0
metadata
CHANGED
@@ -1,18 +1,20 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.
+  version: 1.0.19
+  prerelease:
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-11-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -20,6 +22,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -27,6 +30,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -34,6 +38,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -41,6 +46,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -48,6 +54,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -55,6 +62,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -62,6 +70,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -69,6 +78,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -76,6 +86,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -83,6 +94,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: sinatra
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -90,6 +102,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -97,6 +110,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -104,6 +118,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -111,6 +126,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: haml
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -118,6 +134,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -125,6 +142,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -132,6 +150,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -139,6 +158,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: json
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -146,6 +166,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -153,6 +174,7 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: slop
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -160,6 +182,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -376,6 +399,7 @@ files:
 - lib/export_command.rb
 - lib/hash_util.rb
 - lib/redirect_error.rb
+- lib/redis_connection.rb
 - lib/report_command.rb
 - lib/robots.rb
 - lib/server.rb
@@ -540,26 +564,27 @@ files:
 homepage: http://github.com/stewartmckee/cobweb
 licenses:
 - MIT
-metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 1.8.25
 signing_key:
-specification_version:
+specification_version: 3
 summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
   crawler that has a sophisticated statistics monitoring interface to monitor the
checksums.yaml
DELETED
@@ -1,15 +0,0 @@
----
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    MTg3ODFiMWE1MmZlYWFjYzZiZjIzZjQ1NmFjZmJmMWU1MDVjZTc5Mg==
-  data.tar.gz: !binary |-
-    ZGU4NmFiYTJlNmZlODRiMjRmNTkzZjMwOWQyMzEyZjU4OGQzMWUxMw==
-!binary "U0hBNTEy":
-  metadata.gz: !binary |-
-    ZDVmN2MwYzBiMjQ1N2E2YjBmYmM0ZTk5ZWJjMGVkN2VmMDM4ODhkNTQ0OTIx
-    ZTg4YzMzMWE0OTY2ZjgyNWRiNzZlZjgyZDlkM2Y4MTQ2OTVmZTg5Zjc1NTA1
-    MTZhYzc2ZmYwNmM2ODRlMmViODljMGFjODYwNTY5OThlNjY2M2Y=
-  data.tar.gz: !binary |-
-    M2M2YzU4ZTE5YzkxMWVmNmJiNTQ5OWFhNDExZGUwNzkxMGEzY2IyYTFmYTJl
-    YTE0OWI2ZmZhN2I0ZjA2YjU4NWFmNmUwMjY5ZDM4YWQ3ZmJkZmViNzRlNWMw
-    ZWMzNjIwNDkxNDk0NmMxOTE3NzljMGQ5MjlmYzgyODc3ZWQ2ZTY=