cobweb 1.0.18 → 1.0.19

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.18
2
+ h1. Cobweb v1.0.19
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -95,7 +95,8 @@ Creates a new crawler object based on a base_url
95
95
 
96
96
  ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
97
97
  ** :redirect_limit - sets the maximum number of consecutive redirects to follow (Default: 10)
98
- ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
98
+ ** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
99
+ ** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
99
100
  ** :debug - enables debug output (Default: false)
100
101
  ** :quiet - hides default output (Default: false)
101
102
  ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
@@ -110,9 +111,12 @@ Creates a new crawler object based on a base_url
110
111
  ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
111
112
  ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
112
113
  ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
113
- ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
114
+ ** :valid_mime_types - an array of mime types that accepts wildcards (e.g. 'text/*'); defaults to @['*/*']@
114
115
  ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
115
116
  ** :raise_exceptions - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
117
+ ** :use_encoding_safe_process_job - Base64-encode the body when storing the job in the queue; set to true when you are expecting non-ASCII content (Default: false)
118
+ ** :proxy_addr - hostname of a proxy to use for crawling (e.g., 'myproxy.example.net', default: nil)
119
+ ** :proxy_port - port number of the proxy (default: nil)
116
120
 
117
121
 
118
122
  bc. crawler = Cobweb.new(:follow_redirects => false)
@@ -9,6 +9,9 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
9
  require file
10
10
  end
11
11
 
12
+ puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
13
+
14
+
12
15
  # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
13
16
  class Cobweb
14
17
 
@@ -57,6 +60,8 @@ class Cobweb
57
60
  default_valid_mime_types_to ["*/*"]
58
61
  default_raise_exceptions_to false
59
62
  default_store_inbound_links_to false
63
+ default_proxy_addr_to nil
64
+ default_proxy_port_to nil
60
65
 
61
66
  end
62
67
 
@@ -76,7 +81,7 @@ class Cobweb
76
81
  end
77
82
 
78
83
  request.merge!(@options)
79
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
84
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
80
85
  @redis.set("original_base_url", base_url)
81
86
  @redis.hset "statistics", "queued_at", DateTime.now
82
87
  @redis.set("crawl-counter", 0)
@@ -130,11 +135,11 @@ class Cobweb
130
135
 
131
136
  # connect to redis
132
137
  if options.has_key? :crawl_id
133
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
138
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
134
139
  else
135
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
140
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
136
141
  end
137
- full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
142
+ full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
138
143
 
139
144
  content = {:base_url => url}
140
145
 
@@ -151,7 +156,7 @@ class Cobweb
151
156
  # retrieve data
152
157
  #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
153
158
  puts "Creating connection to #{uri.host}..." if @options[:debug]
154
- @http = Net::HTTP.new(uri.host, uri.inferred_port)
159
+ @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
155
160
  #end
156
161
  if uri.scheme == "https"
157
162
  @http.use_ssl = true
@@ -309,9 +314,9 @@ class Cobweb
309
314
 
310
315
  # connect to redis
311
316
  if options.has_key? :crawl_id
312
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
317
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
313
318
  else
314
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
319
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
315
320
  end
316
321
 
317
322
  content = {:base_url => url}
@@ -324,7 +329,7 @@ class Cobweb
324
329
  # retrieve data
325
330
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
326
331
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
327
- @http = Net::HTTP.new(uri.host, uri.inferred_port)
332
+ @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
328
333
  end
329
334
  if uri.scheme == "https"
330
335
  @http.use_ssl = true
@@ -20,7 +20,7 @@ class CobwebCrawler
20
20
  @options[:crawl_id] = @crawl_id
21
21
  end
22
22
 
23
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
23
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
24
24
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
25
25
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
26
26
  @options[:seed_urls] = [] if @options[:seed_urls].nil?
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.18"
6
+ "1.0.19"
7
7
  end
8
8
 
9
9
  end
@@ -6,7 +6,7 @@ module CobwebModule
6
6
 
7
7
  setup_defaults
8
8
 
9
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
9
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
10
10
  @stats = Stats.new(@options)
11
11
  @debug = @options[:debug]
12
12
  @first_to_finish = false
@@ -22,6 +22,15 @@ module CobwebModule
22
22
  @redis.sismember "queued", link
23
23
  end
24
24
 
25
+ def already_running?(link)
26
+ @redis.sismember "currently_running", link
27
+ end
28
+
29
+ def already_handled?(link)
30
+ already_crawled?(link) || already_queued?(link) || already_running?(link)
31
+ end
32
+
33
+
25
34
  # Returns true if the crawl count is within limits
26
35
  def within_crawl_limits?
27
36
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -50,16 +59,19 @@ module CobwebModule
50
59
  end
51
60
 
52
61
  def retrieve
53
- unless @redis.sismember("currently_running", @options[:url])
54
- @redis.sadd("currently_running", @options[:url])
55
- unless already_crawled?
62
+
63
+ unless already_running? @options[:url]
64
+ unless already_crawled? @options[:url]
65
+ @redis.sadd("currently_running", @options[:url])
56
66
  if within_crawl_limits?
57
67
  @stats.update_status("Retrieving #{@options[:url]}...")
58
- @content = Cobweb.new(@options).get(@options[:url], @options)
59
- if @options[:url] == @redis.get("original_base_url")
60
- @redis.set("crawled_base_url", @content[:base_url])
68
+ lock("update_queues") do
69
+ @content = Cobweb.new(@options).get(@options[:url], @options)
70
+ if @options[:url] == @redis.get("original_base_url")
71
+ @redis.set("crawled_base_url", @content[:base_url])
72
+ end
73
+ update_queues
61
74
  end
62
- update_queues
63
75
 
64
76
  if content.permitted_type?
65
77
  ## update statistics
@@ -128,7 +140,7 @@ module CobwebModule
128
140
  end
129
141
 
130
142
  def update_queues
131
- lock("update_queues") do
143
+ #lock("update_queues") do
132
144
  #@redis.incr "inprogress"
133
145
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
134
146
  @redis.srem "queued", @options[:url]
@@ -146,25 +158,27 @@ module CobwebModule
146
158
  increment_crawl_counter
147
159
  end
148
160
  decrement_queue_counter
149
- end
161
+ #end
150
162
  end
151
163
 
152
164
  def to_be_processed?
153
- (!finished? || within_process_limits?) && !@redis.sismember("enqueued", @options[:url])
165
+ !finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
154
166
  end
155
167
 
156
168
  def process(&block)
157
- if @options[:crawl_limit_by_page]
158
- if content.mime_type.match("text/html")
169
+ lock("process") do
170
+ if @options[:crawl_limit_by_page]
171
+ if content.mime_type.match("text/html")
172
+ increment_process_counter
173
+ end
174
+ else
159
175
  increment_process_counter
160
176
  end
161
- else
162
- increment_process_counter
163
- end
164
- @redis.sadd "enqueued", @options[:url]
177
+ #@redis.sadd "queued", @options[:url]
165
178
 
166
- yield if block_given?
167
- @redis.incr("crawl_job_enqueued_count")
179
+ yield if block_given?
180
+ @redis.incr("crawl_job_enqueued_count")
181
+ end
168
182
  end
169
183
 
170
184
  def finished_processing
@@ -173,20 +187,33 @@ module CobwebModule
173
187
 
174
188
  def finished?
175
189
  print_counters
190
+ debug_puts @stats.get_status
191
+ if @stats.get_status == CobwebCrawlHelper::FINISHED
192
+ debug_puts "Already Finished!"
193
+ end
176
194
  # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
177
195
  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
178
196
  if queue_counter == 0 && @redis.smembers("currently_running").empty?
179
- finished
197
+ debug_puts "queue_counter is 0 and currently_running is empty so we're done"
198
+ #finished
180
199
  return true
181
200
  end
182
- elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?) || process_counter >= @options[:crawl_limit].to_i
183
- finished
201
+ elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
202
+ #finished
203
+ debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
184
204
  return true
185
205
  end
186
206
  false
187
207
  end
188
208
 
189
- def finished
209
+ def finish
210
+ debug_puts ""
211
+ debug_puts "========================================================================"
212
+ debug_puts "finished crawl on #{@options[:url]}"
213
+ print_counters
214
+ debug_puts "========================================================================"
215
+ debug_puts ""
216
+
190
217
  set_first_to_finish
191
218
  @stats.end_crawl(@options)
192
219
  end
@@ -223,22 +250,22 @@ module CobwebModule
223
250
  end
224
251
 
225
252
  def lock(key, &block)
226
- debug_puts "REQUESTING LOCK [#{key}]"
253
+ #debug_puts "REQUESTING LOCK [#{key}]"
227
254
  set_nx = @redis.setnx("#{key}_lock", "locked")
228
- debug_puts "LOCK:#{key}:#{set_nx}"
255
+ #debug_puts "LOCK:#{key}:#{set_nx}"
229
256
  while !set_nx
230
- debug_puts "===== WAITING FOR LOCK [#{key}] ====="
257
+ #debug_puts "===== WAITING FOR LOCK [#{key}] ====="
231
258
  sleep 0.01
232
259
  set_nx = @redis.setnx("#{key}_lock", "locked")
233
260
  end
234
261
 
235
- debug_puts "RECEIVED LOCK [#{key}]"
262
+ #debug_puts "RECEIVED LOCK [#{key}]"
236
263
  @redis.expire("#{key}_lock", 10)
237
264
  begin
238
265
  result = yield
239
266
  ensure
240
267
  @redis.del("#{key}_lock")
241
- debug_puts "LOCK RELEASED [#{key}]"
268
+ #debug_puts "LOCK RELEASED [#{key}]"
242
269
  end
243
270
  result
244
271
  end
@@ -15,7 +15,7 @@ class CrawlHelper
15
15
  content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
16
16
  content_request[:queue_system] = content_request[:queue_system].to_sym
17
17
 
18
- @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
18
+ @redis = NamespacedRedisConnection.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
19
19
  @stats = Stats.new(content_request)
20
20
 
21
21
  @debug = content_request[:debug]
@@ -74,7 +74,7 @@ class CrawlHelper
74
74
 
75
75
  #if the enqueue counter has been requested update that
76
76
  if content_request.has_key? :enqueue_counter_key
77
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
77
+ enqueue_redis = NamespacedRedisConnection.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
78
78
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
79
79
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
80
80
  end
@@ -23,12 +23,14 @@ class CrawlJob
23
23
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
24
24
  @crawl.process_links do |link|
25
25
 
26
- # enqueue the links to resque
27
- @crawl.debug_puts "ENQUEUED LINK: #{link}"
28
- enqueue_content(content_request, link)
26
+ if @crawl.within_crawl_limits?
27
+ # enqueue the links to resque
28
+ @crawl.debug_puts "ENQUEUED LINK: #{link}"
29
+ enqueue_content(content_request, link)
30
+ end
29
31
 
30
32
  end
31
-
33
+
32
34
  if @crawl.to_be_processed?
33
35
 
34
36
  @crawl.process do
@@ -39,7 +41,7 @@ class CrawlJob
39
41
 
40
42
  #if the enqueue counter has been requested update that
41
43
  if content_request.has_key?(:enqueue_counter_key)
42
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
44
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
43
45
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
44
46
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
45
47
  end
@@ -60,8 +62,7 @@ class CrawlJob
60
62
 
61
63
  # test queue and crawl sizes to see if we have completed the crawl
62
64
  @crawl.debug_puts "finished? #{@crawl.finished?}"
63
- @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
64
- if @crawl.finished? && @crawl.first_to_finish?
65
+ if @crawl.finished?
65
66
  @crawl.debug_puts "Calling crawl_job finished"
66
67
  finished(content_request)
67
68
  end
@@ -75,7 +76,9 @@ class CrawlJob
75
76
  additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
76
77
  additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
77
78
 
78
- @crawl.debug_puts "increment crawl_finished_enqueued_count"
79
+ @crawl.finish
80
+
81
+ @crawl.debug_puts "increment crawl_finished_enqueued_count from #{@crawl.redis.get("crawl_finished_enqueued_count")}"
79
82
  @crawl.redis.incr("crawl_finished_enqueued_count")
80
83
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
81
84
  end
@@ -16,6 +16,7 @@ class CrawlWorker
16
16
  sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
17
17
 
18
18
  def perform(content_request)
19
+ puts "Performing for #{content_request["url"]}"
19
20
  # setup the crawl class to manage the crawl of this object
20
21
  @crawl = CobwebModule::Crawl.new(content_request)
21
22
 
@@ -25,12 +26,17 @@ class CrawlWorker
25
26
  # if the crawled object is an object type we are interested
26
27
  if @crawl.content.permitted_type?
27
28
 
28
- # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
29
- @crawl.process_links do |link|
29
+ @crawl.lock("queue_links") do
30
+ # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
31
+ @crawl.process_links do |link|
30
32
 
31
- @crawl.debug_puts "ENQUEUED LINK: #{link}"
32
- enqueue_content(content_request, link)
33
+ if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
34
+ # enqueue the links to sidekiq
35
+ @crawl.debug_puts "QUEUED LINK: #{link}"
36
+ enqueue_content(content_request, link)
37
+ end
33
38
 
39
+ end
34
40
  end
35
41
 
36
42
  if @crawl.to_be_processed?
@@ -38,12 +44,12 @@ class CrawlWorker
38
44
  @crawl.process do
39
45
 
40
46
  # enqueue to processing queue
41
- @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
47
+ @crawl.debug_puts "SENT FOR PROCESSING [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
42
48
  send_to_processing_queue(@crawl.content.to_hash, content_request)
43
49
 
44
50
  #if the enqueue counter has been requested update that
45
51
  if content_request.has_key?(:enqueue_counter_key)
46
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
52
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
47
53
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
48
54
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
49
55
  end
@@ -64,8 +70,7 @@ class CrawlWorker
64
70
 
65
71
  # test queue and crawl sizes to see if we have completed the crawl
66
72
  @crawl.debug_puts "finished? #{@crawl.finished?}"
67
- @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
68
- if @crawl.finished? && @crawl.first_to_finish?
73
+ if @crawl.finished?
69
74
  @crawl.debug_puts "Calling crawl_job finished"
70
75
  finished(content_request)
71
76
  end
@@ -84,7 +89,7 @@ class CrawlWorker
84
89
  additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
85
90
  additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
86
91
 
87
- @crawl.finished
92
+ @crawl.finish
88
93
 
89
94
  @crawl.debug_puts "increment crawl_finished_enqueued_count"
90
95
  @crawl.redis.incr("crawl_finished_enqueued_count")
@@ -0,0 +1,23 @@
1
+ class RedisConnection
2
+
3
+ @@redis_connections = {}
4
+
5
+ def initialize(options={})
6
+ key = options.keys.sort.map{|k| "#{k}:#{options[k]}"}.join(",")
7
+ unless @@redis_connections.has_key?(key)
8
+ @@redis_connections[key] = Redis.new(options)
9
+ end
10
+ @current_connection = @@redis_connections[key]
11
+ @current_connection
12
+ end
13
+
14
+ def method_missing(m, *args, &block)
15
+ if @current_connection.respond_to?(m)
16
+ @current_connection.send(m, *args)
17
+ else
18
+ super
19
+ end
20
+ end
21
+
22
+
23
+ end
@@ -12,14 +12,14 @@ class Server < Sinatra::Base
12
12
 
13
13
  # Sinatra Dashboard
14
14
  get '/' do
15
- @full_redis = Redis.new(redis_options)
15
+ @full_redis = RedisConnection.new(redis_options)
16
16
  @colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
17
17
 
18
18
  @crawls = []
19
19
  @full_redis.smembers("cobweb_crawls").each do |crawl_id|
20
20
  version = cobweb_version(crawl_id)
21
21
  if version == Cobweb.version
22
- redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
22
+ redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => RedisConnection.new(redis_options))
23
23
  stats = HashUtil.deep_symbolize_keys({
24
24
  :cobweb_version => version,
25
25
  :crawl_details => redis.hgetall("crawl_details"),
@@ -38,7 +38,7 @@ class Server < Sinatra::Base
38
38
  get '/statistics/:crawl_id' do
39
39
 
40
40
  version = cobweb_version(params[:crawl_id])
41
- redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
41
+ redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => RedisConnection.new(redis_options))
42
42
 
43
43
  @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
44
44
  if @statistics[:status_counts].nil?
@@ -71,7 +71,7 @@ class Server < Sinatra::Base
71
71
  end
72
72
 
73
73
  def cobweb_version(crawl_id)
74
- redis = Redis.new(redis_options)
74
+ redis = RedisConnection.new(redis_options)
75
75
  key = redis.keys("cobweb-*-#{crawl_id}:queued").first
76
76
 
77
77
  key =~ /cobweb-(.*?)-(.*?):queued/
@@ -1,9 +1,10 @@
1
- if Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0").count > 1
1
+
2
+ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
2
3
  SIDEKIQ_INSTALLED = true
3
4
  require 'sidekiq'
4
5
  else
5
6
  SIDEKIQ_INSTALLED = false
6
- puts "can't find sidekiq gem"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs"
7
8
  end
8
9
 
9
10
  module Sidekiq
@@ -31,6 +31,8 @@ describe Cobweb do
31
31
  options[:timeout].should == 10
32
32
  options[:redis_options].should == {}
33
33
  options[:internal_urls].should == []
34
+ options[:proxy_addr].should be_nil
35
+ options[:proxy_port].should be_nil
34
36
 
35
37
  end
36
38
 
@@ -52,15 +54,15 @@ describe Cobweb do
52
54
  @cobweb.get(@base_url)[:url].should == @base_url
53
55
  end
54
56
  it "should return correct content-type" do
55
- @mock_http_response.stub!(:content_type).and_return("image/jpeg")
57
+ @mock_http_response.stub(:content_type).and_return("image/jpeg")
56
58
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
57
59
  end
58
60
  it "should return correct status-code" do
59
- @mock_http_response.stub!(:code).and_return(404)
61
+ @mock_http_response.stub(:code).and_return(404)
60
62
  @cobweb.get(@base_url)[:status_code].should == 404
61
63
  end
62
64
  it "should return correct status-code" do
63
- @mock_http_response.stub!(:code).and_return(404)
65
+ @mock_http_response.stub(:code).and_return(404)
64
66
  @cobweb.get(@base_url)[:status_code].should == 404
65
67
  end
66
68
  it "should return correct character_set" do
@@ -75,7 +77,7 @@ describe Cobweb do
75
77
  it "should return correct location" do
76
78
  @cobweb.get(@base_url)[:location].should == nil
77
79
 
78
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
80
+ @mock_http_response.stub(:[]).with("location").and_return("http://google.com/")
79
81
  @cobweb.get(@base_url)[:location].should == "http://google.com/"
80
82
  end
81
83
  it "should return correct headers" do
@@ -135,17 +137,17 @@ describe Cobweb do
135
137
  @cobweb.get(@base_url)[:url].should == @base_url
136
138
  end
137
139
  it "should return correct content-type" do
138
- @mock_http_response.stub!(:content_type).and_return("image/jpeg")
140
+ @mock_http_response.stub(:content_type).and_return("image/jpeg")
139
141
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
140
142
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
141
143
  end
142
144
  it "should return correct status-code" do
143
- @mock_http_response.stub!(:code).and_return(404)
145
+ @mock_http_response.stub(:code).and_return(404)
144
146
  @cobweb.get(@base_url)[:status_code].should == 404
145
147
  @cobweb.get(@base_url)[:status_code].should == 404
146
148
  end
147
149
  it "should return correct status-code" do
148
- @mock_http_response.stub!(:code).and_return(404)
150
+ @mock_http_response.stub(:code).and_return(404)
149
151
  @cobweb.get(@base_url)[:status_code].should == 404
150
152
  @cobweb.get(@base_url)[:status_code].should == 404
151
153
  end
@@ -177,26 +179,34 @@ describe Cobweb do
177
179
  end
178
180
  describe "location setting" do
179
181
  it "Get should strip fragments" do
180
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
182
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
181
183
  Net::HTTP::Get.should_receive(:new).with("/", @default_options)
182
184
  @cobweb.get("http://www.google.com/#ignore")
183
185
  end
184
186
  it "head should strip fragments" do
185
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
187
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
186
188
  Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
187
189
  @cobweb.head("http://www.google.com/#ignore")
188
190
  end
189
191
  it "get should not strip path" do
190
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
192
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
191
193
  Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
192
194
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
193
195
  end
194
196
  it "get should not strip query string" do
195
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
197
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
196
198
  Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
197
199
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
198
200
  end
199
201
  end
202
+ describe "with proxy" do
203
+ it "provides proxy parameters to Net::HTTP" do
204
+ cobweb = Cobweb.new proxy_addr: 'proxy.example.com', proxy_port: 1234
205
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, "proxy.example.com", 1234)
206
+
207
+ cobweb.get("http://www.google.com/")
208
+ end
209
+ end
200
210
 
201
211
  end
202
212
  end
@@ -9,7 +9,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
9
9
 
10
10
  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
11
11
  if Resque.workers.count > 0 && @existing_processes.empty?
12
- raise "Ghost workers present in resque, please clear before running specs"
12
+ raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
13
13
  elsif Resque.workers.count == 0 && !@existing_processes.empty?
14
14
  raise "Ghost worker processes present (#{@existing_processes.join(',')})"
15
15
  elsif Resque.workers.count > 0 && !@existing_processes.empty?
@@ -23,25 +23,23 @@ describe CrawlJob, :local_only => true, :disabled => true do
23
23
  io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
24
24
 
25
25
  counter = 0
26
- print "Starting Resque Processes"
27
26
  until counter > 10 || workers_processes_started?
28
- print "."
27
+ print "\rStarting Resque Processes... #{10-counter} "
29
28
  counter += 1
30
- sleep 0.5
29
+ sleep 1
31
30
  end
32
31
  puts ""
33
32
 
34
33
 
35
34
  counter = 0
36
- print "Waiting for Resque Workers"
37
- until counter > 50 || workers_running?
38
- print "."
35
+ until counter > 30 || workers_running?
36
+ print "\rWaiting for Resque Workers... #{30-counter} "
39
37
  counter += 1
40
- sleep 0.5
38
+ sleep 1
41
39
  end
42
40
  puts ""
43
41
 
44
- if Resque.workers.count == RESQUE_WORKER_COUNT
42
+ if workers_running?
45
43
  puts "Workers Running."
46
44
  else
47
45
  raise "Workers didn't appear, please check environment"
@@ -62,10 +60,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
62
60
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
63
61
  :crawl_limit => nil,
64
62
  :quiet => false,
65
- :debug => false,
63
+ :debug => true,
66
64
  :cache => nil
67
65
  }
68
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
66
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
69
67
  @cobweb = Cobweb.new @request
70
68
  end
71
69
  it "should not crawl anything if nothing has started" do
@@ -95,10 +93,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
95
93
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
96
94
  :crawl_limit => nil,
97
95
  :quiet => false,
98
- :debug => false,
96
+ :debug => true,
99
97
  :cache => nil
100
98
  }
101
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
99
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
102
100
 
103
101
  @cobweb = Cobweb.new @request
104
102
  end
@@ -124,11 +122,11 @@ describe CrawlJob, :local_only => true, :disabled => true do
124
122
  @request = {
125
123
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
126
124
  :quiet => false,
127
- :debug => false,
125
+ :debug => true,
128
126
  :cache => nil,
129
127
  :valid_mime_types => ["text/html"]
130
128
  }
131
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
129
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
132
130
  @cobweb = Cobweb.new @request
133
131
  end
134
132
 
@@ -150,10 +148,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
150
148
  @request = {
151
149
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
152
150
  :quiet => false,
153
- :debug => false,
151
+ :debug => true,
154
152
  :cache => nil
155
153
  }
156
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
154
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
157
155
  end
158
156
 
159
157
  # describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
@@ -226,6 +224,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
226
224
  @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
227
225
  end
228
226
  it "should notify of crawl finished once" do
227
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 0
229
228
  crawl = @cobweb.start(@base_url)
230
229
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
231
230
  wait_for_crawl_finished crawl[:crawl_id]
@@ -280,11 +279,13 @@ end
280
279
 
281
280
  def wait_for_crawl_finished(crawl_id, timeout=20)
282
281
  @counter = 0
282
+ @timeout = timeout unless @timeout
283
283
  start_time = Time.now
284
284
  while(running?(crawl_id) && Time.now < start_time + timeout) do
285
285
  sleep 1
286
286
  end
287
- if Time.now > start_time + timeout
287
+ if Time.now > start_time + @timeout
288
+ @timeout = 5
288
289
  raise "End of crawl not detected"
289
290
  end
290
291
  end
@@ -296,7 +297,7 @@ def workers_processes_started?
296
297
  end
297
298
 
298
299
  def workers_running?
299
- Resque.workers.count > 0
300
+ Resque.workers.count == RESQUE_WORKER_COUNT
300
301
  end
301
302
 
302
303
  def running?(crawl_id)
@@ -8,8 +8,8 @@ describe CrawlWorker, :local_only => true do
8
8
  if SIDEKIQ_INSTALLED
9
9
  #store all existing resque process ids so we don't kill them afterwards
10
10
  @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
11
- puts @existing_processes
12
- @existing_processes.should be_empty
11
+
12
+ raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
13
13
 
14
14
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
15
15
  puts "Starting Workers... Please Wait..."
@@ -34,7 +34,7 @@ describe CrawlWorker, :local_only => true do
34
34
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
35
35
  :crawl_limit => nil,
36
36
  :quiet => false,
37
- :debug => false,
37
+ :debug => true,
38
38
  :cache => nil,
39
39
  :queue_system => :sidekiq
40
40
  }
@@ -60,6 +60,7 @@ describe CrawlWorker, :local_only => true do
60
60
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
61
61
  :quiet => true,
62
62
  :cache => nil,
63
+ :debug => true,
63
64
  :queue_system => :sidekiq,
64
65
  :valid_mime_types => ["text/html"]
65
66
  }
@@ -87,6 +88,7 @@ describe CrawlWorker, :local_only => true do
87
88
  @request = {
88
89
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
89
90
  :quiet => true,
91
+ :debug => true,
90
92
  :queue_system => :sidekiq,
91
93
  :cache => nil
92
94
  }
@@ -136,7 +138,6 @@ describe CrawlWorker, :local_only => true do
136
138
  wait_for_crawl_finished crawl[:crawl_id]
137
139
 
138
140
  mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
139
- ap mime_types
140
141
  mime_types.select{|m| m=="text/html"}.count.should == 5
141
142
  end
142
143
  end
@@ -186,11 +187,11 @@ describe CrawlWorker, :local_only => true do
186
187
  wait_for_crawl_finished crawl[:crawl_id]
187
188
  CrawlFinishedWorker.queue_size.should == 1
188
189
  end
189
- it "should not crawl 100 pages" do
190
+ it "should not crawl more than 100 pages" do
190
191
  crawl = @cobweb.start(@base_url)
191
192
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
192
193
  wait_for_crawl_finished crawl[:crawl_id]
193
- CrawlProcessWorker.queue_size.should_not == 100
194
+ CrawlProcessWorker.queue_size.should_not > 100
194
195
  end
195
196
  end
196
197
  end
@@ -244,7 +245,7 @@ def clear_sidekiq_queues
244
245
  conn.srem("queues", queue_name)
245
246
  end
246
247
  end
247
- sleep 2
248
+ sleep 5
248
249
 
249
250
  CrawlProcessWorker.queue_size.should == 0
250
251
  CrawlFinishedWorker.queue_size.should == 0
metadata CHANGED
@@ -1,18 +1,20 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.18
4
+ version: 1.0.19
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Stewart McKee
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-10-17 00:00:00.000000000 Z
12
+ date: 2013-11-26 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: redis
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
19
  - - ! '>='
18
20
  - !ruby/object:Gem::Version
@@ -20,6 +22,7 @@ dependencies:
20
22
  type: :runtime
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
27
  - - ! '>='
25
28
  - !ruby/object:Gem::Version
@@ -27,6 +30,7 @@ dependencies:
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: nokogiri
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
35
  - - ! '>='
32
36
  - !ruby/object:Gem::Version
@@ -34,6 +38,7 @@ dependencies:
34
38
  type: :runtime
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
43
  - - ! '>='
39
44
  - !ruby/object:Gem::Version
@@ -41,6 +46,7 @@ dependencies:
41
46
  - !ruby/object:Gem::Dependency
42
47
  name: addressable
43
48
  requirement: !ruby/object:Gem::Requirement
49
+ none: false
44
50
  requirements:
45
51
  - - ! '>='
46
52
  - !ruby/object:Gem::Version
@@ -48,6 +54,7 @@ dependencies:
48
54
  type: :runtime
49
55
  prerelease: false
50
56
  version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
51
58
  requirements:
52
59
  - - ! '>='
53
60
  - !ruby/object:Gem::Version
@@ -55,6 +62,7 @@ dependencies:
55
62
  - !ruby/object:Gem::Dependency
56
63
  name: rspec
57
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
58
66
  requirements:
59
67
  - - ! '>='
60
68
  - !ruby/object:Gem::Version
@@ -62,6 +70,7 @@ dependencies:
62
70
  type: :runtime
63
71
  prerelease: false
64
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
65
74
  requirements:
66
75
  - - ! '>='
67
76
  - !ruby/object:Gem::Version
@@ -69,6 +78,7 @@ dependencies:
69
78
  - !ruby/object:Gem::Dependency
70
79
  name: awesome_print
71
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
72
82
  requirements:
73
83
  - - ! '>='
74
84
  - !ruby/object:Gem::Version
@@ -76,6 +86,7 @@ dependencies:
76
86
  type: :runtime
77
87
  prerelease: false
78
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
79
90
  requirements:
80
91
  - - ! '>='
81
92
  - !ruby/object:Gem::Version
@@ -83,6 +94,7 @@ dependencies:
83
94
  - !ruby/object:Gem::Dependency
84
95
  name: sinatra
85
96
  requirement: !ruby/object:Gem::Requirement
97
+ none: false
86
98
  requirements:
87
99
  - - ! '>='
88
100
  - !ruby/object:Gem::Version
@@ -90,6 +102,7 @@ dependencies:
90
102
  type: :runtime
91
103
  prerelease: false
92
104
  version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
93
106
  requirements:
94
107
  - - ! '>='
95
108
  - !ruby/object:Gem::Version
@@ -97,6 +110,7 @@ dependencies:
97
110
  - !ruby/object:Gem::Dependency
98
111
  name: thin
99
112
  requirement: !ruby/object:Gem::Requirement
113
+ none: false
100
114
  requirements:
101
115
  - - ! '>='
102
116
  - !ruby/object:Gem::Version
@@ -104,6 +118,7 @@ dependencies:
104
118
  type: :runtime
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
107
122
  requirements:
108
123
  - - ! '>='
109
124
  - !ruby/object:Gem::Version
@@ -111,6 +126,7 @@ dependencies:
111
126
  - !ruby/object:Gem::Dependency
112
127
  name: haml
113
128
  requirement: !ruby/object:Gem::Requirement
129
+ none: false
114
130
  requirements:
115
131
  - - ! '>='
116
132
  - !ruby/object:Gem::Version
@@ -118,6 +134,7 @@ dependencies:
118
134
  type: :runtime
119
135
  prerelease: false
120
136
  version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
121
138
  requirements:
122
139
  - - ! '>='
123
140
  - !ruby/object:Gem::Version
@@ -125,6 +142,7 @@ dependencies:
125
142
  - !ruby/object:Gem::Dependency
126
143
  name: namespaced_redis
127
144
  requirement: !ruby/object:Gem::Requirement
145
+ none: false
128
146
  requirements:
129
147
  - - ! '>='
130
148
  - !ruby/object:Gem::Version
@@ -132,6 +150,7 @@ dependencies:
132
150
  type: :runtime
133
151
  prerelease: false
134
152
  version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
135
154
  requirements:
136
155
  - - ! '>='
137
156
  - !ruby/object:Gem::Version
@@ -139,6 +158,7 @@ dependencies:
139
158
  - !ruby/object:Gem::Dependency
140
159
  name: json
141
160
  requirement: !ruby/object:Gem::Requirement
161
+ none: false
142
162
  requirements:
143
163
  - - ! '>='
144
164
  - !ruby/object:Gem::Version
@@ -146,6 +166,7 @@ dependencies:
146
166
  type: :runtime
147
167
  prerelease: false
148
168
  version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
149
170
  requirements:
150
171
  - - ! '>='
151
172
  - !ruby/object:Gem::Version
@@ -153,6 +174,7 @@ dependencies:
153
174
  - !ruby/object:Gem::Dependency
154
175
  name: slop
155
176
  requirement: !ruby/object:Gem::Requirement
177
+ none: false
156
178
  requirements:
157
179
  - - ! '>='
158
180
  - !ruby/object:Gem::Version
@@ -160,6 +182,7 @@ dependencies:
160
182
  type: :runtime
161
183
  prerelease: false
162
184
  version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
163
186
  requirements:
164
187
  - - ! '>='
165
188
  - !ruby/object:Gem::Version
@@ -376,6 +399,7 @@ files:
376
399
  - lib/export_command.rb
377
400
  - lib/hash_util.rb
378
401
  - lib/redirect_error.rb
402
+ - lib/redis_connection.rb
379
403
  - lib/report_command.rb
380
404
  - lib/robots.rb
381
405
  - lib/server.rb
@@ -540,26 +564,27 @@ files:
540
564
  homepage: http://github.com/stewartmckee/cobweb
541
565
  licenses:
542
566
  - MIT
543
- metadata: {}
544
567
  post_install_message:
545
568
  rdoc_options: []
546
569
  require_paths:
547
570
  - lib
548
571
  required_ruby_version: !ruby/object:Gem::Requirement
572
+ none: false
549
573
  requirements:
550
574
  - - ! '>='
551
575
  - !ruby/object:Gem::Version
552
576
  version: '0'
553
577
  required_rubygems_version: !ruby/object:Gem::Requirement
578
+ none: false
554
579
  requirements:
555
580
  - - ! '>='
556
581
  - !ruby/object:Gem::Version
557
582
  version: '0'
558
583
  requirements: []
559
584
  rubyforge_project:
560
- rubygems_version: 2.0.3
585
+ rubygems_version: 1.8.25
561
586
  signing_key:
562
- specification_version: 4
587
+ specification_version: 3
563
588
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
564
589
  crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
565
590
  crawler that has a sophisticated statistics monitoring interface to monitor the
checksums.yaml DELETED
@@ -1,15 +0,0 @@
1
- ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MTg3ODFiMWE1MmZlYWFjYzZiZjIzZjQ1NmFjZmJmMWU1MDVjZTc5Mg==
5
- data.tar.gz: !binary |-
6
- ZGU4NmFiYTJlNmZlODRiMjRmNTkzZjMwOWQyMzEyZjU4OGQzMWUxMw==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- ZDVmN2MwYzBiMjQ1N2E2YjBmYmM0ZTk5ZWJjMGVkN2VmMDM4ODhkNTQ0OTIx
10
- ZTg4YzMzMWE0OTY2ZjgyNWRiNzZlZjgyZDlkM2Y4MTQ2OTVmZTg5Zjc1NTA1
11
- MTZhYzc2ZmYwNmM2ODRlMmViODljMGFjODYwNTY5OThlNjY2M2Y=
12
- data.tar.gz: !binary |-
13
- M2M2YzU4ZTE5YzkxMWVmNmJiNTQ5OWFhNDExZGUwNzkxMGEzY2IyYTFmYTJl
14
- YTE0OWI2ZmZhN2I0ZjA2YjU4NWFmNmUwMjY5ZDM4YWQ3ZmJkZmViNzRlNWMw
15
- ZWMzNjIwNDkxNDk0NmMxOTE3NzljMGQ5MjlmYzgyODc3ZWQ2ZTY=