cobweb 1.0.18 → 1.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.18
2
+ h1. Cobweb v1.0.19
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -95,7 +95,8 @@ Creates a new crawler object based on a base_url
95
95
 
96
96
  ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
97
97
  ** :redirect_limit - sets the maximum number of consecutive redirects to follow (Default: 10)
98
- ** :processing_queue - specifies the processing queue for content to be sent to (Default: ContentProcessJob)
98
+ ** :processing_queue - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
99
+ ** :crawl_finished_queue - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
99
100
  ** :debug - enables debug output (Default: false)
100
101
  ** :quiet - hides default output (Default: false)
101
102
  ** :cache - sets the ttl for caching pages, set to nil to disable caching (Default: 300)
@@ -110,9 +111,12 @@ Creates a new crawler object based on a base_url
110
111
  ** :obey_robots - boolean determining if robots.txt should be honoured. (default: false)
111
112
  ** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
112
113
  ** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
113
- ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
114
+ ** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to @['*/*']@
114
115
  ** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
115
116
  ** :raise_exceptions - defaults to handling exceptions with debug output, setting this to true will raise exceptions in your app
117
+ ** :use_encoding_safe_process_job - Base64-encode the body when storing the job in the queue; set to true when you expect non-ASCII content (Default: false)
118
+ ** :proxy_addr - hostname of a proxy to use for crawling (e.g. 'myproxy.example.net'; Default: nil)
119
+ ** :proxy_port - port number of the proxy (default: nil)
116
120
 
117
121
 
118
122
  bc. crawler = Cobweb.new(:follow_redirects => false)
@@ -9,6 +9,9 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
9
  require file
10
10
  end
11
11
 
12
+ puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
13
+
14
+
12
15
  # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
13
16
  class Cobweb
14
17
 
@@ -57,6 +60,8 @@ class Cobweb
57
60
  default_valid_mime_types_to ["*/*"]
58
61
  default_raise_exceptions_to false
59
62
  default_store_inbound_links_to false
63
+ default_proxy_addr_to nil
64
+ default_proxy_port_to nil
60
65
 
61
66
  end
62
67
 
@@ -76,7 +81,7 @@ class Cobweb
76
81
  end
77
82
 
78
83
  request.merge!(@options)
79
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => Redis.new(request[:redis_options]))
84
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{request[:crawl_id]}", :redis => RedisConnection.new(request[:redis_options]))
80
85
  @redis.set("original_base_url", base_url)
81
86
  @redis.hset "statistics", "queued_at", DateTime.now
82
87
  @redis.set("crawl-counter", 0)
@@ -130,11 +135,11 @@ class Cobweb
130
135
 
131
136
  # connect to redis
132
137
  if options.has_key? :crawl_id
133
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
138
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
134
139
  else
135
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
140
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
136
141
  end
137
- full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
142
+ full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
138
143
 
139
144
  content = {:base_url => url}
140
145
 
@@ -151,7 +156,7 @@ class Cobweb
151
156
  # retrieve data
152
157
  #unless @http && @http.address == uri.host && @http.port == uri.inferred_port
153
158
  puts "Creating connection to #{uri.host}..." if @options[:debug]
154
- @http = Net::HTTP.new(uri.host, uri.inferred_port)
159
+ @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
155
160
  #end
156
161
  if uri.scheme == "https"
157
162
  @http.use_ssl = true
@@ -309,9 +314,9 @@ class Cobweb
309
314
 
310
315
  # connect to redis
311
316
  if options.has_key? :crawl_id
312
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
317
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
313
318
  else
314
- redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
319
+ redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => RedisConnection.new(@options[:redis_options]))
315
320
  end
316
321
 
317
322
  content = {:base_url => url}
@@ -324,7 +329,7 @@ class Cobweb
324
329
  # retrieve data
325
330
  unless @http && @http.address == uri.host && @http.port == uri.inferred_port
326
331
  puts "Creating connection to #{uri.host}..." unless @options[:quiet]
327
- @http = Net::HTTP.new(uri.host, uri.inferred_port)
332
+ @http = Net::HTTP.new(uri.host, uri.inferred_port, @options[:proxy_addr], @options[:proxy_port])
328
333
  end
329
334
  if uri.scheme == "https"
330
335
  @http.use_ssl = true
@@ -20,7 +20,7 @@ class CobwebCrawler
20
20
  @options[:crawl_id] = @crawl_id
21
21
  end
22
22
 
23
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
23
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
24
24
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
25
25
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
26
26
  @options[:seed_urls] = [] if @options[:seed_urls].nil?
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.18"
6
+ "1.0.19"
7
7
  end
8
8
 
9
9
  end
@@ -6,7 +6,7 @@ module CobwebModule
6
6
 
7
7
  setup_defaults
8
8
 
9
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
9
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
10
10
  @stats = Stats.new(@options)
11
11
  @debug = @options[:debug]
12
12
  @first_to_finish = false
@@ -22,6 +22,15 @@ module CobwebModule
22
22
  @redis.sismember "queued", link
23
23
  end
24
24
 
25
+ def already_running?(link)
26
+ @redis.sismember "currently_running", link
27
+ end
28
+
29
+ def already_handled?(link)
30
+ already_crawled?(link) || already_queued?(link) || already_running?(link)
31
+ end
32
+
33
+
25
34
  # Returns true if the crawl count is within limits
26
35
  def within_crawl_limits?
27
36
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -50,16 +59,19 @@ module CobwebModule
50
59
  end
51
60
 
52
61
  def retrieve
53
- unless @redis.sismember("currently_running", @options[:url])
54
- @redis.sadd("currently_running", @options[:url])
55
- unless already_crawled?
62
+
63
+ unless already_running? @options[:url]
64
+ unless already_crawled? @options[:url]
65
+ @redis.sadd("currently_running", @options[:url])
56
66
  if within_crawl_limits?
57
67
  @stats.update_status("Retrieving #{@options[:url]}...")
58
- @content = Cobweb.new(@options).get(@options[:url], @options)
59
- if @options[:url] == @redis.get("original_base_url")
60
- @redis.set("crawled_base_url", @content[:base_url])
68
+ lock("update_queues") do
69
+ @content = Cobweb.new(@options).get(@options[:url], @options)
70
+ if @options[:url] == @redis.get("original_base_url")
71
+ @redis.set("crawled_base_url", @content[:base_url])
72
+ end
73
+ update_queues
61
74
  end
62
- update_queues
63
75
 
64
76
  if content.permitted_type?
65
77
  ## update statistics
@@ -128,7 +140,7 @@ module CobwebModule
128
140
  end
129
141
 
130
142
  def update_queues
131
- lock("update_queues") do
143
+ #lock("update_queues") do
132
144
  #@redis.incr "inprogress"
133
145
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
134
146
  @redis.srem "queued", @options[:url]
@@ -146,25 +158,27 @@ module CobwebModule
146
158
  increment_crawl_counter
147
159
  end
148
160
  decrement_queue_counter
149
- end
161
+ #end
150
162
  end
151
163
 
152
164
  def to_be_processed?
153
- (!finished? || within_process_limits?) && !@redis.sismember("enqueued", @options[:url])
165
+ !finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
154
166
  end
155
167
 
156
168
  def process(&block)
157
- if @options[:crawl_limit_by_page]
158
- if content.mime_type.match("text/html")
169
+ lock("process") do
170
+ if @options[:crawl_limit_by_page]
171
+ if content.mime_type.match("text/html")
172
+ increment_process_counter
173
+ end
174
+ else
159
175
  increment_process_counter
160
176
  end
161
- else
162
- increment_process_counter
163
- end
164
- @redis.sadd "enqueued", @options[:url]
177
+ #@redis.sadd "queued", @options[:url]
165
178
 
166
- yield if block_given?
167
- @redis.incr("crawl_job_enqueued_count")
179
+ yield if block_given?
180
+ @redis.incr("crawl_job_enqueued_count")
181
+ end
168
182
  end
169
183
 
170
184
  def finished_processing
@@ -173,20 +187,33 @@ module CobwebModule
173
187
 
174
188
  def finished?
175
189
  print_counters
190
+ debug_puts @stats.get_status
191
+ if @stats.get_status == CobwebCrawlHelper::FINISHED
192
+ debug_puts "Already Finished!"
193
+ end
176
194
  # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
177
195
  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
178
196
  if queue_counter == 0 && @redis.smembers("currently_running").empty?
179
- finished
197
+ debug_puts "queue_counter is 0 and currently_running is empty so we're done"
198
+ #finished
180
199
  return true
181
200
  end
182
- elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?) || process_counter >= @options[:crawl_limit].to_i
183
- finished
201
+ elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
202
+ #finished
203
+ debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
184
204
  return true
185
205
  end
186
206
  false
187
207
  end
188
208
 
189
- def finished
209
+ def finish
210
+ debug_puts ""
211
+ debug_puts "========================================================================"
212
+ debug_puts "finished crawl on #{@options[:url]}"
213
+ print_counters
214
+ debug_puts "========================================================================"
215
+ debug_puts ""
216
+
190
217
  set_first_to_finish
191
218
  @stats.end_crawl(@options)
192
219
  end
@@ -223,22 +250,22 @@ module CobwebModule
223
250
  end
224
251
 
225
252
  def lock(key, &block)
226
- debug_puts "REQUESTING LOCK [#{key}]"
253
+ #debug_puts "REQUESTING LOCK [#{key}]"
227
254
  set_nx = @redis.setnx("#{key}_lock", "locked")
228
- debug_puts "LOCK:#{key}:#{set_nx}"
255
+ #debug_puts "LOCK:#{key}:#{set_nx}"
229
256
  while !set_nx
230
- debug_puts "===== WAITING FOR LOCK [#{key}] ====="
257
+ #debug_puts "===== WAITING FOR LOCK [#{key}] ====="
231
258
  sleep 0.01
232
259
  set_nx = @redis.setnx("#{key}_lock", "locked")
233
260
  end
234
261
 
235
- debug_puts "RECEIVED LOCK [#{key}]"
262
+ #debug_puts "RECEIVED LOCK [#{key}]"
236
263
  @redis.expire("#{key}_lock", 10)
237
264
  begin
238
265
  result = yield
239
266
  ensure
240
267
  @redis.del("#{key}_lock")
241
- debug_puts "LOCK RELEASED [#{key}]"
268
+ #debug_puts "LOCK RELEASED [#{key}]"
242
269
  end
243
270
  result
244
271
  end
@@ -15,7 +15,7 @@ class CrawlHelper
15
15
  content_request[:valid_mime_types] = ["*/*"] unless content_request.has_key? :valid_mime_types
16
16
  content_request[:queue_system] = content_request[:queue_system].to_sym
17
17
 
18
- @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
18
+ @redis = NamespacedRedisConnection.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
19
19
  @stats = Stats.new(content_request)
20
20
 
21
21
  @debug = content_request[:debug]
@@ -74,7 +74,7 @@ class CrawlHelper
74
74
 
75
75
  #if the enqueue counter has been requested update that
76
76
  if content_request.has_key? :enqueue_counter_key
77
- enqueue_redis = NamespacedRedis.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
77
+ enqueue_redis = NamespacedRedisConnection.new(content_request[:redis_options], content_request[:enqueue_counter_namespace].to_s)
78
78
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
79
79
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
80
80
  end
@@ -23,12 +23,14 @@ class CrawlJob
23
23
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
24
24
  @crawl.process_links do |link|
25
25
 
26
- # enqueue the links to resque
27
- @crawl.debug_puts "ENQUEUED LINK: #{link}"
28
- enqueue_content(content_request, link)
26
+ if @crawl.within_crawl_limits?
27
+ # enqueue the links to resque
28
+ @crawl.debug_puts "ENQUEUED LINK: #{link}"
29
+ enqueue_content(content_request, link)
30
+ end
29
31
 
30
32
  end
31
-
33
+
32
34
  if @crawl.to_be_processed?
33
35
 
34
36
  @crawl.process do
@@ -39,7 +41,7 @@ class CrawlJob
39
41
 
40
42
  #if the enqueue counter has been requested update that
41
43
  if content_request.has_key?(:enqueue_counter_key)
42
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
44
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
43
45
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
44
46
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
45
47
  end
@@ -60,8 +62,7 @@ class CrawlJob
60
62
 
61
63
  # test queue and crawl sizes to see if we have completed the crawl
62
64
  @crawl.debug_puts "finished? #{@crawl.finished?}"
63
- @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
64
- if @crawl.finished? && @crawl.first_to_finish?
65
+ if @crawl.finished?
65
66
  @crawl.debug_puts "Calling crawl_job finished"
66
67
  finished(content_request)
67
68
  end
@@ -75,7 +76,9 @@ class CrawlJob
75
76
  additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
76
77
  additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
77
78
 
78
- @crawl.debug_puts "increment crawl_finished_enqueued_count"
79
+ @crawl.finish
80
+
81
+ @crawl.debug_puts "increment crawl_finished_enqueued_count from #{@crawl.redis.get("crawl_finished_enqueued_count")}"
79
82
  @crawl.redis.incr("crawl_finished_enqueued_count")
80
83
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @crawl.statistics.merge(additional_stats))
81
84
  end
@@ -16,6 +16,7 @@ class CrawlWorker
16
16
  sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
17
17
 
18
18
  def perform(content_request)
19
+ puts "Performing for #{content_request["url"]}"
19
20
  # setup the crawl class to manage the crawl of this object
20
21
  @crawl = CobwebModule::Crawl.new(content_request)
21
22
 
@@ -25,12 +26,17 @@ class CrawlWorker
25
26
  # if the crawled object is an object type we are interested
26
27
  if @crawl.content.permitted_type?
27
28
 
28
- # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
29
- @crawl.process_links do |link|
29
+ @crawl.lock("queue_links") do
30
+ # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
31
+ @crawl.process_links do |link|
30
32
 
31
- @crawl.debug_puts "ENQUEUED LINK: #{link}"
32
- enqueue_content(content_request, link)
33
+ if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
34
+ # enqueue the links to sidekiq
35
+ @crawl.debug_puts "QUEUED LINK: #{link}"
36
+ enqueue_content(content_request, link)
37
+ end
33
38
 
39
+ end
34
40
  end
35
41
 
36
42
  if @crawl.to_be_processed?
@@ -38,12 +44,12 @@ class CrawlWorker
38
44
  @crawl.process do
39
45
 
40
46
  # enqueue to processing queue
41
- @crawl.debug_puts "ENQUEUED [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
47
+ @crawl.debug_puts "SENT FOR PROCESSING [#{@crawl.redis.get("crawl_job_enqueued_count")}] URL: #{@crawl.content.url}"
42
48
  send_to_processing_queue(@crawl.content.to_hash, content_request)
43
49
 
44
50
  #if the enqueue counter has been requested update that
45
51
  if content_request.has_key?(:enqueue_counter_key)
46
- enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => Redis.new(content_request[:redis_options]))
52
+ enqueue_redis = Redis::Namespace.new(content_request[:enqueue_counter_namespace].to_s, :redis => RedisConnection.new(content_request[:redis_options]))
47
53
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
48
54
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
49
55
  end
@@ -64,8 +70,7 @@ class CrawlWorker
64
70
 
65
71
  # test queue and crawl sizes to see if we have completed the crawl
66
72
  @crawl.debug_puts "finished? #{@crawl.finished?}"
67
- @crawl.debug_puts "first_to_finish? #{@crawl.first_to_finish?}" if @crawl.finished?
68
- if @crawl.finished? && @crawl.first_to_finish?
73
+ if @crawl.finished?
69
74
  @crawl.debug_puts "Calling crawl_job finished"
70
75
  finished(content_request)
71
76
  end
@@ -84,7 +89,7 @@ class CrawlWorker
84
89
  additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
85
90
  additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
86
91
 
87
- @crawl.finished
92
+ @crawl.finish
88
93
 
89
94
  @crawl.debug_puts "increment crawl_finished_enqueued_count"
90
95
  @crawl.redis.incr("crawl_finished_enqueued_count")
@@ -0,0 +1,23 @@
1
# Thin proxy around a Redis client that reuses one client per distinct set of
# connection options instead of opening a new connection on every call.
#
# Instances forward every method the underlying client responds to, so a
# RedisConnection can be used wherever the code previously built the client
# with Redis.new(options).
class RedisConnection

  # Cache of already-created clients, keyed by a string built from the options.
  @@redis_connections = {}

  # Guards the check-then-insert on the cache above.
  # NOTE(review): added because the gem also runs under Sidekiq, which is
  # assumed to execute workers on several threads - confirm.
  CONNECTIONS_LOCK = Mutex.new

  # Looks up (or creates and caches) the client for the given options.
  #
  # @param options [Hash, nil] options handed to Redis.new; nil is treated as
  #   an empty hash
  def initialize(options={})
    options ||= {}
    # Sort by the string form so hashes mixing Symbol and String keys do not
    # raise while the cache key is being built.
    key = options.keys.sort_by(&:to_s).map { |k| "#{k}:#{options[k]}" }.join(",")
    @current_connection = CONNECTIONS_LOCK.synchronize do
      @@redis_connections[key] ||= Redis.new(options)
    end
  end

  # Forwards any public method of the wrapped client, including the caller's
  # block, so block-taking client methods keep working through the proxy.
  #
  # @raise [NoMethodError] when the wrapped client does not respond to +m+
  def method_missing(m, *args, &block)
    if @current_connection.respond_to?(m)
      @current_connection.public_send(m, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? truthful for the forwarded methods.
  def respond_to_missing?(m, include_private = false)
    @current_connection.respond_to?(m, include_private) || super
  end

end
@@ -12,14 +12,14 @@ class Server < Sinatra::Base
12
12
 
13
13
  # Sinatra Dashboard
14
14
  get '/' do
15
- @full_redis = Redis.new(redis_options)
15
+ @full_redis = RedisConnection.new(redis_options)
16
16
  @colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
17
17
 
18
18
  @crawls = []
19
19
  @full_redis.smembers("cobweb_crawls").each do |crawl_id|
20
20
  version = cobweb_version(crawl_id)
21
21
  if version == Cobweb.version
22
- redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
22
+ redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => RedisConnection.new(redis_options))
23
23
  stats = HashUtil.deep_symbolize_keys({
24
24
  :cobweb_version => version,
25
25
  :crawl_details => redis.hgetall("crawl_details"),
@@ -38,7 +38,7 @@ class Server < Sinatra::Base
38
38
  get '/statistics/:crawl_id' do
39
39
 
40
40
  version = cobweb_version(params[:crawl_id])
41
- redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => Redis.new(redis_options))
41
+ redis = Redis::Namespace.new("cobweb-#{version}-#{params[:crawl_id]}", :redis => RedisConnection.new(redis_options))
42
42
 
43
43
  @statistics = HashUtil.deep_symbolize_keys(redis.hgetall("statistics"))
44
44
  if @statistics[:status_counts].nil?
@@ -71,7 +71,7 @@ class Server < Sinatra::Base
71
71
  end
72
72
 
73
73
  def cobweb_version(crawl_id)
74
- redis = Redis.new(redis_options)
74
+ redis = RedisConnection.new(redis_options)
75
75
  key = redis.keys("cobweb-*-#{crawl_id}:queued").first
76
76
 
77
77
  key =~ /cobweb-(.*?)-(.*?):queued/
@@ -1,9 +1,10 @@
1
- if Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0").count > 1
1
+
2
+ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
2
3
  SIDEKIQ_INSTALLED = true
3
4
  require 'sidekiq'
4
5
  else
5
6
  SIDEKIQ_INSTALLED = false
6
- puts "can't find sidekiq gem"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs"
7
8
  end
8
9
 
9
10
  module Sidekiq
@@ -31,6 +31,8 @@ describe Cobweb do
31
31
  options[:timeout].should == 10
32
32
  options[:redis_options].should == {}
33
33
  options[:internal_urls].should == []
34
+ options[:proxy_addr].should be_nil
35
+ options[:proxy_port].should be_nil
34
36
 
35
37
  end
36
38
 
@@ -52,15 +54,15 @@ describe Cobweb do
52
54
  @cobweb.get(@base_url)[:url].should == @base_url
53
55
  end
54
56
  it "should return correct content-type" do
55
- @mock_http_response.stub!(:content_type).and_return("image/jpeg")
57
+ @mock_http_response.stub(:content_type).and_return("image/jpeg")
56
58
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
57
59
  end
58
60
  it "should return correct status-code" do
59
- @mock_http_response.stub!(:code).and_return(404)
61
+ @mock_http_response.stub(:code).and_return(404)
60
62
  @cobweb.get(@base_url)[:status_code].should == 404
61
63
  end
62
64
  it "should return correct status-code" do
63
- @mock_http_response.stub!(:code).and_return(404)
65
+ @mock_http_response.stub(:code).and_return(404)
64
66
  @cobweb.get(@base_url)[:status_code].should == 404
65
67
  end
66
68
  it "should return correct character_set" do
@@ -75,7 +77,7 @@ describe Cobweb do
75
77
  it "should return correct location" do
76
78
  @cobweb.get(@base_url)[:location].should == nil
77
79
 
78
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
80
+ @mock_http_response.stub(:[]).with("location").and_return("http://google.com/")
79
81
  @cobweb.get(@base_url)[:location].should == "http://google.com/"
80
82
  end
81
83
  it "should return correct headers" do
@@ -135,17 +137,17 @@ describe Cobweb do
135
137
  @cobweb.get(@base_url)[:url].should == @base_url
136
138
  end
137
139
  it "should return correct content-type" do
138
- @mock_http_response.stub!(:content_type).and_return("image/jpeg")
140
+ @mock_http_response.stub(:content_type).and_return("image/jpeg")
139
141
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
140
142
  @cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
141
143
  end
142
144
  it "should return correct status-code" do
143
- @mock_http_response.stub!(:code).and_return(404)
145
+ @mock_http_response.stub(:code).and_return(404)
144
146
  @cobweb.get(@base_url)[:status_code].should == 404
145
147
  @cobweb.get(@base_url)[:status_code].should == 404
146
148
  end
147
149
  it "should return correct status-code" do
148
- @mock_http_response.stub!(:code).and_return(404)
150
+ @mock_http_response.stub(:code).and_return(404)
149
151
  @cobweb.get(@base_url)[:status_code].should == 404
150
152
  @cobweb.get(@base_url)[:status_code].should == 404
151
153
  end
@@ -177,26 +179,34 @@ describe Cobweb do
177
179
  end
178
180
  describe "location setting" do
179
181
  it "Get should strip fragments" do
180
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
182
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
181
183
  Net::HTTP::Get.should_receive(:new).with("/", @default_options)
182
184
  @cobweb.get("http://www.google.com/#ignore")
183
185
  end
184
186
  it "head should strip fragments" do
185
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
187
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
186
188
  Net::HTTP::Head.should_receive(:new).with("/", {}).and_return(@mock_http_request)
187
189
  @cobweb.head("http://www.google.com/#ignore")
188
190
  end
189
191
  it "get should not strip path" do
190
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
192
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
191
193
  Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", @default_options)
192
194
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
193
195
  end
194
196
  it "get should not strip query string" do
195
- Net::HTTP.should_receive(:new).with("www.google.com", 80)
197
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, nil, nil)
196
198
  Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", @default_options)
197
199
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
198
200
  end
199
201
  end
202
+ describe "with proxy" do
203
+ it "provides proxy parameters to Net::HTTP" do
204
+ cobweb = Cobweb.new proxy_addr: 'proxy.example.com', proxy_port: 1234
205
+ Net::HTTP.should_receive(:new).with("www.google.com", 80, "proxy.example.com", 1234)
206
+
207
+ cobweb.get("http://www.google.com/")
208
+ end
209
+ end
200
210
 
201
211
  end
202
212
  end
@@ -9,7 +9,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
9
9
 
10
10
  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
11
11
  if Resque.workers.count > 0 && @existing_processes.empty?
12
- raise "Ghost workers present in resque, please clear before running specs"
12
+ raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
13
13
  elsif Resque.workers.count == 0 && !@existing_processes.empty?
14
14
  raise "Ghost worker processes present (#{@existing_processes.join(',')})"
15
15
  elsif Resque.workers.count > 0 && !@existing_processes.empty?
@@ -23,25 +23,23 @@ describe CrawlJob, :local_only => true, :disabled => true do
23
23
  io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
24
24
 
25
25
  counter = 0
26
- print "Starting Resque Processes"
27
26
  until counter > 10 || workers_processes_started?
28
- print "."
27
+ print "\rStarting Resque Processes... #{10-counter} "
29
28
  counter += 1
30
- sleep 0.5
29
+ sleep 1
31
30
  end
32
31
  puts ""
33
32
 
34
33
 
35
34
  counter = 0
36
- print "Waiting for Resque Workers"
37
- until counter > 50 || workers_running?
38
- print "."
35
+ until counter > 30 || workers_running?
36
+ print "\rWaiting for Resque Workers... #{30-counter} "
39
37
  counter += 1
40
- sleep 0.5
38
+ sleep 1
41
39
  end
42
40
  puts ""
43
41
 
44
- if Resque.workers.count == RESQUE_WORKER_COUNT
42
+ if workers_running?
45
43
  puts "Workers Running."
46
44
  else
47
45
  raise "Workers didn't appear, please check environment"
@@ -62,10 +60,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
62
60
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
63
61
  :crawl_limit => nil,
64
62
  :quiet => false,
65
- :debug => false,
63
+ :debug => true,
66
64
  :cache => nil
67
65
  }
68
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
66
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
69
67
  @cobweb = Cobweb.new @request
70
68
  end
71
69
  it "should not crawl anything if nothing has started" do
@@ -95,10 +93,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
95
93
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
96
94
  :crawl_limit => nil,
97
95
  :quiet => false,
98
- :debug => false,
96
+ :debug => true,
99
97
  :cache => nil
100
98
  }
101
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
99
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
102
100
 
103
101
  @cobweb = Cobweb.new @request
104
102
  end
@@ -124,11 +122,11 @@ describe CrawlJob, :local_only => true, :disabled => true do
124
122
  @request = {
125
123
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
126
124
  :quiet => false,
127
- :debug => false,
125
+ :debug => true,
128
126
  :cache => nil,
129
127
  :valid_mime_types => ["text/html"]
130
128
  }
131
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
129
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
132
130
  @cobweb = Cobweb.new @request
133
131
  end
134
132
 
@@ -150,10 +148,10 @@ describe CrawlJob, :local_only => true, :disabled => true do
150
148
  @request = {
151
149
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
152
150
  :quiet => false,
153
- :debug => false,
151
+ :debug => true,
154
152
  :cache => nil
155
153
  }
156
- @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => Redis.new)
154
+ @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@request[:crawl_id]}", :redis => RedisConnection.new)
157
155
  end
158
156
 
159
157
  # describe "crawling http://yepadeperrors.wordpress.com/ with limit of 20" do
@@ -226,6 +224,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
226
224
  @redis.get("crawl_job_enqueued_count").to_i.should_not == @base_page_count
227
225
  end
228
226
  it "should notify of crawl finished once" do
227
+ @redis.get("crawl_finished_enqueued_count").to_i.should == 0
229
228
  crawl = @cobweb.start(@base_url)
230
229
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
231
230
  wait_for_crawl_finished crawl[:crawl_id]
@@ -280,11 +279,13 @@ end
280
279
 
281
280
  def wait_for_crawl_finished(crawl_id, timeout=20)
282
281
  @counter = 0
282
+ @timeout = timeout unless @timeout
283
283
  start_time = Time.now
284
284
  while(running?(crawl_id) && Time.now < start_time + timeout) do
285
285
  sleep 1
286
286
  end
287
- if Time.now > start_time + timeout
287
+ if Time.now > start_time + @timeout
288
+ @timeout = 5
288
289
  raise "End of crawl not detected"
289
290
  end
290
291
  end
@@ -296,7 +297,7 @@ def workers_processes_started?
296
297
  end
297
298
 
298
299
  def workers_running?
299
- Resque.workers.count > 0
300
+ Resque.workers.count == RESQUE_WORKER_COUNT
300
301
  end
301
302
 
302
303
  def running?(crawl_id)
@@ -8,8 +8,8 @@ describe CrawlWorker, :local_only => true do
8
8
  if SIDEKIQ_INSTALLED
9
9
  #store all existing resque process ids so we don't kill them afterwards
10
10
  @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
11
- puts @existing_processes
12
- @existing_processes.should be_empty
11
+
12
+ raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
13
13
 
14
14
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
15
15
  puts "Starting Workers... Please Wait..."
@@ -34,7 +34,7 @@ describe CrawlWorker, :local_only => true do
34
34
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
35
35
  :crawl_limit => nil,
36
36
  :quiet => false,
37
- :debug => false,
37
+ :debug => true,
38
38
  :cache => nil,
39
39
  :queue_system => :sidekiq
40
40
  }
@@ -60,6 +60,7 @@ describe CrawlWorker, :local_only => true do
60
60
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
61
61
  :quiet => true,
62
62
  :cache => nil,
63
+ :debug => true,
63
64
  :queue_system => :sidekiq,
64
65
  :valid_mime_types => ["text/html"]
65
66
  }
@@ -87,6 +88,7 @@ describe CrawlWorker, :local_only => true do
87
88
  @request = {
88
89
  :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
89
90
  :quiet => true,
91
+ :debug => true,
90
92
  :queue_system => :sidekiq,
91
93
  :cache => nil
92
94
  }
@@ -136,7 +138,6 @@ describe CrawlWorker, :local_only => true do
136
138
  wait_for_crawl_finished crawl[:crawl_id]
137
139
 
138
140
  mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
139
- ap mime_types
140
141
  mime_types.select{|m| m=="text/html"}.count.should == 5
141
142
  end
142
143
  end
@@ -186,11 +187,11 @@ describe CrawlWorker, :local_only => true do
186
187
  wait_for_crawl_finished crawl[:crawl_id]
187
188
  CrawlFinishedWorker.queue_size.should == 1
188
189
  end
189
- it "should not crawl 100 pages" do
190
+ it "should not crawl more than 100 pages" do
190
191
  crawl = @cobweb.start(@base_url)
191
192
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
192
193
  wait_for_crawl_finished crawl[:crawl_id]
193
- CrawlProcessWorker.queue_size.should_not == 100
194
+ CrawlProcessWorker.queue_size.should_not > 100
194
195
  end
195
196
  end
196
197
  end
@@ -244,7 +245,7 @@ def clear_sidekiq_queues
244
245
  conn.srem("queues", queue_name)
245
246
  end
246
247
  end
247
- sleep 2
248
+ sleep 5
248
249
 
249
250
  CrawlProcessWorker.queue_size.should == 0
250
251
  CrawlFinishedWorker.queue_size.should == 0
metadata CHANGED
@@ -1,18 +1,20 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.18
4
+ version: 1.0.19
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Stewart McKee
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-10-17 00:00:00.000000000 Z
12
+ date: 2013-11-26 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: redis
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
19
  - - ! '>='
18
20
  - !ruby/object:Gem::Version
@@ -20,6 +22,7 @@ dependencies:
20
22
  type: :runtime
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
27
  - - ! '>='
25
28
  - !ruby/object:Gem::Version
@@ -27,6 +30,7 @@ dependencies:
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: nokogiri
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
35
  - - ! '>='
32
36
  - !ruby/object:Gem::Version
@@ -34,6 +38,7 @@ dependencies:
34
38
  type: :runtime
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
43
  - - ! '>='
39
44
  - !ruby/object:Gem::Version
@@ -41,6 +46,7 @@ dependencies:
41
46
  - !ruby/object:Gem::Dependency
42
47
  name: addressable
43
48
  requirement: !ruby/object:Gem::Requirement
49
+ none: false
44
50
  requirements:
45
51
  - - ! '>='
46
52
  - !ruby/object:Gem::Version
@@ -48,6 +54,7 @@ dependencies:
48
54
  type: :runtime
49
55
  prerelease: false
50
56
  version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
51
58
  requirements:
52
59
  - - ! '>='
53
60
  - !ruby/object:Gem::Version
@@ -55,6 +62,7 @@ dependencies:
55
62
  - !ruby/object:Gem::Dependency
56
63
  name: rspec
57
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
58
66
  requirements:
59
67
  - - ! '>='
60
68
  - !ruby/object:Gem::Version
@@ -62,6 +70,7 @@ dependencies:
62
70
  type: :runtime
63
71
  prerelease: false
64
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
65
74
  requirements:
66
75
  - - ! '>='
67
76
  - !ruby/object:Gem::Version
@@ -69,6 +78,7 @@ dependencies:
69
78
  - !ruby/object:Gem::Dependency
70
79
  name: awesome_print
71
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
72
82
  requirements:
73
83
  - - ! '>='
74
84
  - !ruby/object:Gem::Version
@@ -76,6 +86,7 @@ dependencies:
76
86
  type: :runtime
77
87
  prerelease: false
78
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
79
90
  requirements:
80
91
  - - ! '>='
81
92
  - !ruby/object:Gem::Version
@@ -83,6 +94,7 @@ dependencies:
83
94
  - !ruby/object:Gem::Dependency
84
95
  name: sinatra
85
96
  requirement: !ruby/object:Gem::Requirement
97
+ none: false
86
98
  requirements:
87
99
  - - ! '>='
88
100
  - !ruby/object:Gem::Version
@@ -90,6 +102,7 @@ dependencies:
90
102
  type: :runtime
91
103
  prerelease: false
92
104
  version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
93
106
  requirements:
94
107
  - - ! '>='
95
108
  - !ruby/object:Gem::Version
@@ -97,6 +110,7 @@ dependencies:
97
110
  - !ruby/object:Gem::Dependency
98
111
  name: thin
99
112
  requirement: !ruby/object:Gem::Requirement
113
+ none: false
100
114
  requirements:
101
115
  - - ! '>='
102
116
  - !ruby/object:Gem::Version
@@ -104,6 +118,7 @@ dependencies:
104
118
  type: :runtime
105
119
  prerelease: false
106
120
  version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
107
122
  requirements:
108
123
  - - ! '>='
109
124
  - !ruby/object:Gem::Version
@@ -111,6 +126,7 @@ dependencies:
111
126
  - !ruby/object:Gem::Dependency
112
127
  name: haml
113
128
  requirement: !ruby/object:Gem::Requirement
129
+ none: false
114
130
  requirements:
115
131
  - - ! '>='
116
132
  - !ruby/object:Gem::Version
@@ -118,6 +134,7 @@ dependencies:
118
134
  type: :runtime
119
135
  prerelease: false
120
136
  version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
121
138
  requirements:
122
139
  - - ! '>='
123
140
  - !ruby/object:Gem::Version
@@ -125,6 +142,7 @@ dependencies:
125
142
  - !ruby/object:Gem::Dependency
126
143
  name: namespaced_redis
127
144
  requirement: !ruby/object:Gem::Requirement
145
+ none: false
128
146
  requirements:
129
147
  - - ! '>='
130
148
  - !ruby/object:Gem::Version
@@ -132,6 +150,7 @@ dependencies:
132
150
  type: :runtime
133
151
  prerelease: false
134
152
  version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
135
154
  requirements:
136
155
  - - ! '>='
137
156
  - !ruby/object:Gem::Version
@@ -139,6 +158,7 @@ dependencies:
139
158
  - !ruby/object:Gem::Dependency
140
159
  name: json
141
160
  requirement: !ruby/object:Gem::Requirement
161
+ none: false
142
162
  requirements:
143
163
  - - ! '>='
144
164
  - !ruby/object:Gem::Version
@@ -146,6 +166,7 @@ dependencies:
146
166
  type: :runtime
147
167
  prerelease: false
148
168
  version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
149
170
  requirements:
150
171
  - - ! '>='
151
172
  - !ruby/object:Gem::Version
@@ -153,6 +174,7 @@ dependencies:
153
174
  - !ruby/object:Gem::Dependency
154
175
  name: slop
155
176
  requirement: !ruby/object:Gem::Requirement
177
+ none: false
156
178
  requirements:
157
179
  - - ! '>='
158
180
  - !ruby/object:Gem::Version
@@ -160,6 +182,7 @@ dependencies:
160
182
  type: :runtime
161
183
  prerelease: false
162
184
  version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
163
186
  requirements:
164
187
  - - ! '>='
165
188
  - !ruby/object:Gem::Version
@@ -376,6 +399,7 @@ files:
376
399
  - lib/export_command.rb
377
400
  - lib/hash_util.rb
378
401
  - lib/redirect_error.rb
402
+ - lib/redis_connection.rb
379
403
  - lib/report_command.rb
380
404
  - lib/robots.rb
381
405
  - lib/server.rb
@@ -540,26 +564,27 @@ files:
540
564
  homepage: http://github.com/stewartmckee/cobweb
541
565
  licenses:
542
566
  - MIT
543
- metadata: {}
544
567
  post_install_message:
545
568
  rdoc_options: []
546
569
  require_paths:
547
570
  - lib
548
571
  required_ruby_version: !ruby/object:Gem::Requirement
572
+ none: false
549
573
  requirements:
550
574
  - - ! '>='
551
575
  - !ruby/object:Gem::Version
552
576
  version: '0'
553
577
  required_rubygems_version: !ruby/object:Gem::Requirement
578
+ none: false
554
579
  requirements:
555
580
  - - ! '>='
556
581
  - !ruby/object:Gem::Version
557
582
  version: '0'
558
583
  requirements: []
559
584
  rubyforge_project:
560
- rubygems_version: 2.0.3
585
+ rubygems_version: 1.8.25
561
586
  signing_key:
562
- specification_version: 4
587
+ specification_version: 3
563
588
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
564
589
  crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
565
590
  crawler that has a sophisticated statistics monitoring interface to monitor the
checksums.yaml DELETED
@@ -1,15 +0,0 @@
1
- ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MTg3ODFiMWE1MmZlYWFjYzZiZjIzZjQ1NmFjZmJmMWU1MDVjZTc5Mg==
5
- data.tar.gz: !binary |-
6
- ZGU4NmFiYTJlNmZlODRiMjRmNTkzZjMwOWQyMzEyZjU4OGQzMWUxMw==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- ZDVmN2MwYzBiMjQ1N2E2YjBmYmM0ZTk5ZWJjMGVkN2VmMDM4ODhkNTQ0OTIx
10
- ZTg4YzMzMWE0OTY2ZjgyNWRiNzZlZjgyZDlkM2Y4MTQ2OTVmZTg5Zjc1NTA1
11
- MTZhYzc2ZmYwNmM2ODRlMmViODljMGFjODYwNTY5OThlNjY2M2Y=
12
- data.tar.gz: !binary |-
13
- M2M2YzU4ZTE5YzkxMWVmNmJiNTQ5OWFhNDExZGUwNzkxMGEzY2IyYTFmYTJl
14
- YTE0OWI2ZmZhN2I0ZjA2YjU4NWFmNmUwMjY5ZDM4YWQ3ZmJkZmViNzRlNWMw
15
- ZWMzNjIwNDkxNDk0NmMxOTE3NzljMGQ5MjlmYzgyODc3ZWQ2ZTY=