cobweb 1.0.19 → 1.0.20

checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     MDE5MzU3NzI2MTRhYzM5NzIwMDZlMTJjMTg5NzNiMzAyMjFkMjcxOQ==
+   data.tar.gz: !binary |-
+     MWE3ZTAwYjExZjc4NzU2MDYzOTlhOTQwMTNlNTcyZjNmZTYwNmU3Zg==
+ SHA512:
+   metadata.gz: !binary |-
+     N2U2MDk1MmI3ZTU3OTFmZDI5YjY4YTEyNDNkMTE4MmJjOTFkNTZiYzNhY2Q4
+     Mjk0ZjM1YThhNzhkMGNjNjJiZTJkNjM1OWQ1MGMzZmVlMDI5MzUyOTU5YTRk
+     NzEzZjBiZjM2OTUxZTc2NzZjZDIyOWQ4ZmVlYzYyOGViMDIyYzY=
+   data.tar.gz: !binary |-
+     ZTJiYmRlNDY0M2FkNTdlN2I0ZjNiODYxOGQyN2MxZGZlMGViMWIxZDA4YmY1
+     ZDc2ZDU3NDc4ODg1YmExYjFmYjMyY2U0MDU4MGQ0OTJkZjRmNjAyYmQ3NWVl
+     ODY0ZTE5MGUzNzAzZWFlMzdmZmY1YzNhMmEzNWE1NzVkYzAwZDE=
data/README.textile CHANGED
@@ -1,5 +1,4 @@
- 
- h1. Cobweb v1.0.19
+ h1. Cobweb v1.0.20
  
  "@cobweb_gem":https://twitter.com/cobweb_gem
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -155,7 +154,7 @@ The :processing_queue option is used to specify the class that contains the resq
  
  h3. CobwebCrawler
  
- CobwebCrawler is the standalone crawling class. If you don't want to use redis and just want to crawl the site within your ruby process, you can use this class.
+ CobwebCrawler is the standalone crawling class. If you don't want to use resque or sidekiq and just want to crawl the site within your ruby process, you can use this class.
  
  bc. crawler = CobwebCrawler.new(:cache => 600)
  statistics = crawler.crawl("http://www.pepsico.com")
@@ -207,7 +206,7 @@ h2. License
  
  h3. The MIT License
  
- Copyright (c) 2010 6Central Limited
+ Copyright (c) 2013 Active Information Design
  
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
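
Note: the README section above documents CobwebCrawler's standalone mode. A minimal usage sketch, assuming the block form the README describes elsewhere (the URL and option values here are illustrative only):

    require 'cobweb'

    # Standalone crawl -- runs inside the current ruby process,
    # no resque or sidekiq workers required.
    crawler = CobwebCrawler.new(:cache => 600, :quiet => true)

    # crawl also accepts a block that is called for each retrieved page
    statistics = crawler.crawl("http://example.com") do |content, stats|
      puts "Crawled #{content[:url]}"
    end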
data/lib/cobweb.rb CHANGED
@@ -1,6 +1,5 @@
  require 'rubygems'
  require 'uri'
- require 'resque'
  require "addressable/uri"
  require 'digest/sha1'
  require 'base64'
@@ -178,6 +177,9 @@ class Cobweb
        raise ":username and :password are required if using basic authentication" unless @options[:username] && @options[:password]
        request.basic_auth @options[:username], @options[:password]
      end
+     if @options[:range]
+       request.set_range(@options[:range])
+     end
  
      response = @http.request request
  
@@ -451,6 +453,10 @@ class Cobweb
      pattern = pattern.gsub("*", ".*?")
      pattern
    end
+ 
+   def clear_cache
+ 
+   end
  
    private
    # checks if the mime_type is textual
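
Note: the new :range option above is handed straight to Net::HTTP's set_range, which sets the HTTP Range header so only part of a resource is fetched. The same call in isolation (URL and byte range are made up):

    require 'net/http'

    uri = URI("http://example.com/large-file.bin")
    Net::HTTP.start(uri.host, uri.port) do |http|
      request = Net::HTTP::Get.new(uri)
      request.set_range(0..1023)   # ask for the first 1024 bytes only
      response = http.request(request)
      puts response.code           # "206" if the server honours Range requests
    end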
data/lib/cobweb_crawl_helper.rb CHANGED
@@ -15,15 +15,18 @@ class CobwebCrawlHelper
      @stats = Stats.new(data)
    end
  
-   def destroy(options={})
- 
+   def destroy
+     options = @data
      options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
-     options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
+     if RESQUE_INSTALLED
+       options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
+     end
  
      # set status as cancelled now so that we don't enqueue any further pages
      self.statistics.end_crawl(@data, true)
  
-     if options[:finished_resque_queue]
+ 
+     if options[:finished_resque_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
  
        additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
        additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
@@ -38,15 +41,17 @@ class CobwebCrawlHelper
        sleep 1
        counter += 1
      end
-     position = Resque.size(options[:queue_name])
-     until position == 0
-       position-=BATCH_SIZE
-       position = 0 if position < 0
-       job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
-       job_items.each do |item|
-         if item["args"][0]["crawl_id"] == id
-           # remove this job from the queue
-           Resque.dequeue(CrawlJob, item["args"][0])
+     if options[:queue_system] == :resque && RESQUE_INSTALLED
+       position = Resque.size(options[:queue_name])
+       until position == 0
+         position-=BATCH_SIZE
+         position = 0 if position < 0
+         job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
+         job_items.each do |item|
+           if item["args"][0]["crawl_id"] == id
+             # remove this job from the queue
+             Resque.dequeue(CrawlJob, item["args"][0])
+           end
          end
        end
      end
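
Note: destroy now pages through the Resque queue in batches: Resque.peek reads jobs without popping them, so jobs belonging to this crawl can be picked out and removed with Resque.dequeue. The loop in isolation (the batch size and crawl id are placeholders, not cobweb's values):

    require 'resque'

    BATCH_SIZE = 200           # placeholder; cobweb defines its own constant
    crawl_id   = "crawl_0_id"  # placeholder

    position = Resque.size("cobweb_crawl_job")
    until position == 0
      position -= BATCH_SIZE
      position = 0 if position < 0
      # peek returns job hashes ({"class" => ..., "args" => [...]}) without dequeuing
      Resque.peek("cobweb_crawl_job", position, BATCH_SIZE).each do |item|
        Resque.dequeue(CrawlJob, item["args"][0]) if item["args"][0]["crawl_id"] == crawl_id
      end
    end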
data/lib/cobweb_crawler.rb CHANGED
@@ -122,7 +122,7 @@ class CobwebCrawler
  
      if @options[:store_inbound_links]
        document_links.each do |target_link|
-         target_uri = UriHelper.parse(target_link)
+         target_uri = UriHelper.parse(target_link).normalize
          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
        end
      end
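
Note: normalizing the target URI before hashing means equivalent spellings of the same URL collapse into a single inbound-links set. A sketch with Addressable (assuming, as the code above suggests, that UriHelper wraps Addressable::URI; the example URLs are made up):

    require 'addressable/uri'
    require 'digest/md5'

    a = Addressable::URI.parse("HTTP://Example.com:80/a/../b").normalize
    b = Addressable::URI.parse("http://example.com/b")

    # both spellings now hash to the same redis key suffix
    Digest::MD5.hexdigest(a.to_s) == Digest::MD5.hexdigest(b.to_s)  # => true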
data/lib/cobweb_version.rb CHANGED
@@ -3,7 +3,7 @@ class CobwebVersion
  
    # Returns a string of the current version
    def self.version
-     "1.0.19"
+     "1.0.20"
    end
  
  end
data/lib/crawl.rb CHANGED
@@ -30,7 +30,6 @@ module CobwebModule
      already_crawled?(link) || already_queued?(link) || already_running?(link)
    end
  
- 
    # Returns true if the crawl count is within limits
    def within_crawl_limits?
      @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -62,15 +61,15 @@ module CobwebModule
  
      unless already_running? @options[:url]
        unless already_crawled? @options[:url]
-         @redis.sadd("currently_running", @options[:url])
+         update_queues
          if within_crawl_limits?
+           @redis.sadd("currently_running", @options[:url])
            @stats.update_status("Retrieving #{@options[:url]}...")
-           lock("update_queues") do
-             @content = Cobweb.new(@options).get(@options[:url], @options)
-             if @options[:url] == @redis.get("original_base_url")
-               @redis.set("crawled_base_url", @content[:base_url])
-             end
-             update_queues
+           @content = Cobweb.new(@options).get(@options[:url], @options)
+           update_counters
+ 
+           if @options[:url] == @redis.get("original_base_url")
+             @redis.set("crawled_base_url", @content[:base_url])
            end
  
            if content.permitted_type?
@@ -80,9 +79,15 @@ module CobwebModule
              return true
            end
          else
+           puts "======================================="
+           puts "OUTWITH CRAWL LIMITS"
+           puts "======================================="
            decrement_queue_counter
          end
        else
+         puts "======================================="
+         puts "ALREADY CRAWLED"
+         puts "======================================="
          decrement_queue_counter
        end
      else
@@ -108,26 +113,28 @@ module CobwebModule
      internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
  
      # reject the link if we've crawled it or queued it
-     internal_links.reject! { |link| @redis.sismember("crawled", link) }
-     internal_links.reject! { |link| @redis.sismember("queued", link) }
- 
-     internal_links.each do |link|
-       if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
-         if status != CobwebCrawlHelper::CANCELLED
-           yield link if block_given?
-           unless link.nil?
-             @redis.sadd "queued", link
-             increment_queue_counter
+ 
+     internal_links.reject! { |link| already_handled?(link)}
+ 
+     lock("internal-links") do
+       internal_links.each do |link|
+         if within_queue_limits? && !already_handled?(link)
+           if status != CobwebCrawlHelper::CANCELLED
+             yield link if block_given?
+             unless link.nil?
+               @redis.sadd "queued", link
+               increment_queue_counter
+             end
+           else
+             debug_puts "Cannot enqueue new content as crawl has been cancelled."
            end
-         else
-           debug_puts "Cannot enqueue new content as crawl has been cancelled."
          end
        end
      end
  
      if @options[:store_inbound_links]
        document_links.each do |link|
-         uri = URI.parse(link)
+         uri = URI.parse(link).normalize
          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
        end
      end
@@ -140,25 +147,25 @@ module CobwebModule
    end
  
    def update_queues
-     #lock("update_queues") do
+     lock("update_queues") do
        #@redis.incr "inprogress"
        # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
        @redis.srem "queued", @options[:url]
        @redis.sadd "crawled", @options[:url]
-       if content.url != @options[:url]
-         @redis.srem "queued", content.url
-         @redis.sadd "crawled", content.url
-       end
+ 
      # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-     if @options[:crawl_limit_by_page]
-       if content.mime_type.match("text/html")
-         increment_crawl_counter
-       end
-     else
+     end
+   end
+ 
+   def update_counters
+     if @options[:crawl_limit_by_page]
+       if content.mime_type.match("text/html")
          increment_crawl_counter
        end
-     decrement_queue_counter
-     #end
+     else
+       increment_crawl_counter
+     end
+     decrement_queue_counter
    end
  
    def to_be_processed?
@@ -166,7 +173,7 @@ module CobwebModule
    end
  
    def process(&block)
-     lock("process") do
+     lock("process-count") do
        if @options[:crawl_limit_by_page]
          if content.mime_type.match("text/html")
            increment_process_counter
@@ -175,10 +182,10 @@ module CobwebModule
          increment_process_counter
        end
        #@redis.sadd "queued", @options[:url]
- 
-       yield if block_given?
-       @redis.incr("crawl_job_enqueued_count")
      end
+ 
+     yield if block_given?
+     @redis.incr("crawl_job_enqueued_count")
    end
  
    def finished_processing
@@ -250,17 +257,17 @@ module CobwebModule
    end
  
    def lock(key, &block)
-     #debug_puts "REQUESTING LOCK [#{key}]"
+     debug_puts "REQUESTING LOCK [#{key}]"
      set_nx = @redis.setnx("#{key}_lock", "locked")
-     #debug_puts "LOCK:#{key}:#{set_nx}"
+     debug_puts "LOCK:#{key}:#{set_nx}"
      while !set_nx
-       #debug_puts "===== WAITING FOR LOCK [#{key}] ====="
+       debug_puts "===== WAITING FOR LOCK [#{key}] ====="
        sleep 0.01
        set_nx = @redis.setnx("#{key}_lock", "locked")
      end
  
-     #debug_puts "RECEIVED LOCK [#{key}]"
-     @redis.expire("#{key}_lock", 10)
+     debug_puts "RECEIVED LOCK [#{key}]"
+     @redis.expire("#{key}_lock", 30)
      begin
        result = yield
      ensure
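
Note: lock above is a simple Redis spin lock: SETNX wins the lock, EXPIRE (raised here from 10 to 30 seconds) bounds how long a crashed worker can hold it, and the ensure path releases it. The pattern in isolation (the helper name is mine, not cobweb's):

    require 'redis'

    def with_redis_lock(redis, key, ttl = 30)
      sleep 0.01 until redis.setnx("#{key}_lock", "locked")  # spin until SETNX succeeds
      redis.expire("#{key}_lock", ttl)                       # safety net if we die mid-block
      begin
        yield
      ensure
        redis.del("#{key}_lock")                             # always release the lock
      end
    end

    with_redis_lock(Redis.new, "update_queues") { puts "critical section" }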
data/lib/crawl_helper.rb CHANGED
@@ -150,7 +150,13 @@ class CrawlHelper
      elsif content_request[:use_encoding_safe_process_job]
        content_to_send[:body] = Base64.encode64(content[:body])
        content_to_send[:processing_queue] = content_request[:processing_queue]
-       Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+       if content_request[:queue_system] == :resque
+         Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+       elsif content_request[:queue_system] == :sidekiq
+         const_get(content_request[:processing_queue]).perform_async(content_to_send)
+       else
+         raise "Unknown queue system: #{content_request[:queue_system]}"
+       end
      else
        if content_request[:queue_system] == :resque
          Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
data/lib/crawl_worker.rb CHANGED
@@ -25,17 +25,15 @@ class CrawlWorker
  
    # if the crawled object is an object type we are interested
    if @crawl.content.permitted_type?
- 
-     @crawl.lock("queue_links") do
-       # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
-       @crawl.process_links do |link|
- 
+ 
+     # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+     @crawl.process_links do |link|
+       @crawl.lock("queue_links") do
          if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
            # enqueue the links to sidekiq
            @crawl.debug_puts "QUEUED LINK: #{link}"
            enqueue_content(content_request, link)
          end
- 
        end
      end
  
@@ -64,7 +62,7 @@ class CrawlWorker
      end
    end
  
-   @crawl.lock("finished") do
+   #@crawl.lock("finished") do
      # let the crawl know we're finished with this object
      @crawl.finished_processing
  
@@ -74,7 +72,7 @@ class CrawlWorker
      @crawl.debug_puts "Calling crawl_job finished"
      finished(content_request)
    end
-   end
+   #end
  end
  def self.jobs
    Sidekiq.redis do |conn|
@@ -99,6 +97,9 @@ class CrawlWorker
    # Enqueues the content to the processing queue setup in options
    def send_to_processing_queue(content, content_request)
      content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+     content_to_send.keys.each do |key|
+       content_to_send[key] = content_to_send[key].force_encoding('UTF-8') if content_to_send[key].kind_of?(String)
+     end
      if content_request[:direct_call_process_job]
        clazz = content_request[:processing_queue].constantize
        clazz.perform(content_to_send)
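
Note: the loop added to send_to_processing_queue forces every string value to UTF-8 before the payload is handed to the queue, avoiding encoding errors when Sidekiq serializes it to JSON. The same idea on a bare hash (sample data is made up):

    content_to_send = {
      :url         => "http://example.com/caf\xC3\xA9".force_encoding("ASCII-8BIT"),
      :status_code => 200
    }

    content_to_send.keys.each do |key|
      value = content_to_send[key]
      content_to_send[key] = value.force_encoding("UTF-8") if value.kind_of?(String)
    end

    content_to_send[:url].encoding  # => #<Encoding:UTF-8>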
data/lib/sidekiq/cobweb_helper.rb CHANGED
@@ -6,6 +6,13 @@ else
    SIDEKIQ_INSTALLED = false
    puts "sidekiq gem not installed, skipping crawl_worker specs"
  end
+ if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
+   RESQUE_INSTALLED = true
+   require 'resque'
+ else
+   RESQUE_INSTALLED = false
+   puts "resque gem not installed, skipping crawl_job specs"
+ end
  
  module Sidekiq
    module Worker
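
Note: Gem::Specification.find_all_by_name probes the installed gem set without raising, which is why it can feature-flag optional backends at load time. The probe generalized (the helper name is mine):

    def gem_installed?(name, requirement = ">= 0")
      Gem::Specification.find_all_by_name(name, requirement).any?
    end

    puts gem_installed?("redis")         # true if the redis gem is available
    puts gem_installed?("no-such-gem")   # false -- returns [], never raises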
data/lib/stats.rb CHANGED
@@ -1,17 +1,21 @@
  # Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
  class Stats
    require 'json'
-
+
    attr_reader :redis
-
+
    # Sets up redis usage for statistics
    def initialize(options)
      options[:redis_options] = {} unless options.has_key? :redis_options
-     @full_redis = Redis.new(options[:redis_options])
+     if options[:redis]
+       @full_redis = options[:redis]
+     else
+       @full_redis = Redis.new(options[:redis_options])
+     end
      @lock = Mutex.new
      @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
    end
-
+
    # Sets up the crawl in statistics
    def start_crawl(options)
      unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
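
Note: accepting a pre-built connection through :redis means callers (and tests) can inject something like MockRedis instead of opening a real connection -- the spec suite already requires mock_redis. Hypothetical usage:

    require 'mock_redis'

    stats = Stats.new(:crawl_id => "crawl_0_id", :redis => MockRedis.new)
    stats.update_status("Testing...")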
@@ -23,7 +27,7 @@ class Stats
      @redis.hset "statistics", "crawl_started_at", DateTime.now
      @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
    end
-
+
    # Removes the crawl from the running crawls and updates status
    def end_crawl(options, cancelled=false)
      #@full_redis.srem "cobweb_crawls", options[:crawl_id]
@@ -35,21 +39,21 @@ class Stats
      @redis.hset "statistics", "crawl_finished_at", DateTime.now
      #@redis.del "crawl_details"
    end
-
+
    def get_crawled
      @redis.smembers "crawled"
    end
  
    def inbound_links_for(url)
-     uri = UriHelper.parse(url)
+     uri = UriHelper.parse(url).normalize
      @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
    end
  
-   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data. 
+   # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
    def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
      @lock.synchronize {
        @statistics = get_statistics
-
+
        if @statistics.has_key? :average_response_time
          @statistics[:average_response_time] = (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
        else
@@ -64,7 +68,7 @@ class Stats
        end
        @statistics[:maximum_length] = content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @statistics[:maximum_length].to_i
        @statistics[:minimum_length] = content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @statistics[:minimum_length].to_i
-
+
        if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
          @statistics[:page_count] = @statistics[:page_count].to_i + 1
          @statistics[:page_size] = @statistics[:page_size].to_i + content[:length].to_i
@@ -74,14 +78,14 @@ class Stats
          @statistics[:asset_size] = @statistics[:asset_size].to_i + content[:length].to_i
          increment_time_stat("assets_count")
        end
-
+
        total_redirects = @statistics[:total_redirects].to_i
        @statistics[:total_redirects] = 0 if total_redirects.nil?
        @statistics[:total_redirects] = total_redirects += content[:redirect_through].count unless content[:redirect_through].nil?
  
        @statistics[:crawl_counter] = crawl_counter
        @statistics[:queue_counter] = queue_counter
-
+
        total_length = @statistics[:total_length].to_i
        @statistics[:total_length] = total_length + content[:length].to_i
  
@@ -117,7 +121,7 @@ class Stats
        elsif content[:mime_type].cobweb_starts_with? "video"
          increment_time_stat("mime_video_count")
        end
-
+
        status_counts = {}
        if @statistics.has_key? :status_counts
          status_counts = @statistics[:status_counts]
@@ -126,11 +130,11 @@ class Stats
            status_counts[status_code] += 1
          else
            status_counts[status_code] = 1
-         end 
+         end
        else
          status_counts = {status_code => 1}
        end
-
+
        # record statistics by status type
        if content[:status_code] >= 200 && content[:status_code] < 300
          increment_time_stat("status_200_count")
@@ -139,21 +143,21 @@ class Stats
        elsif content[:status_code] >= 500 && content[:status_code] < 600
          increment_time_stat("status|_500_count")
        end
-
+
        @statistics[:status_counts] = status_counts.to_json
-
+
        ## time based statistics
        increment_time_stat("minute_totals", "minute", 60)
-
+
        redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key].to_s.gsub("'","''")}'"}.join(", ")}"
        instance_eval redis_command
      }
      @statistics
    end
-
+
    # Returns the statistics hash
    def get_statistics
-
+
      statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
      if statistics[:status_counts].nil?
        statistics[:status_counts]
@@ -167,23 +171,23 @@ class Stats
      end
      statistics
    end
-
+
    # Sets the current status of the crawl
    def update_status(status)
      @redis.hset("statistics", "current_status", status) unless get_status == CobwebCrawlHelper::CANCELLED
    end
-
+
    # Returns the current status of the crawl
    def get_status
      @redis.hget "statistics", "current_status"
    end
-
+
    # Sets totals for the end of the crawl (Not Used)
    def set_totals
      stats = get_statistics
      stats[:crawled] = @redis.smembers "crawled"
    end
-
+
    private
    # Records a time based statistic
    def record_time_stat(stat_name, value, type="minute", duration=60)
@@ -193,7 +197,7 @@ class Stats
      end
      stat_value = @redis.hget(stat_name, key).to_i
      stat_count = @redis.hget("#{stat_name}-count", key).to_i
-
+
      if minute_count.nil?
        @redis.hset stat_name, key, value
        @redis.hset "#{stat_name}-count", key, 1
@@ -202,7 +206,7 @@ class Stats
        @redis.hset "#{stat_name}-count", key, stat_count+1
      end
    end
-
+
    # Increments a time based statistic (eg pages per minute)
    def increment_time_stat(stat_name, type="minute", duration=60)
      key = DateTime.now.strftime("%Y-%m-%d %H:%M")
@@ -218,12 +222,9 @@ class Stats
      #clear up older data
      @redis.hgetall(stat_name).keys.each do |key|
        if DateTime.parse(key) < DateTime.now-(duration/1440.0)
-         puts "Deleting #{stat_name} - #{key}"
          @redis.hdel(stat_name, key)
        end
      end
    end
-
-   end
-
  
+ end
@@ -3,6 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  describe CobwebCrawlHelper do
    include HttpStubs
    before(:each) do
+     pending("not enabled for non resque installs") unless RESQUE_INSTALLED
+ 
      setup_stubs
    end
    # this spec tests the crawl object
@@ -42,14 +44,14 @@ describe CobwebCrawlHelper do
      end
    end
    after(:each) do
-     Resque.remove_queue("cobweb_crawl_job")
+     Resque.remove_queue("cobweb_crawl_job") if RESQUE_INSTALLED
    end
    it "should have a queue length of 210" do
      Resque.size("cobweb_crawl_job").should == 210
    end
    describe "after called" do
      before(:each) do
-       @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
+       @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :resque})
        @crawl.destroy
      end
      it "should delete only the crawl specified" do
@@ -3,6 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  describe CobwebCrawler do
  
    before(:each) do
+     pending("thin not installed") unless THIN_INSTALLED
  
      @base_url = "http://localhost:3532/"
  
@@ -55,12 +56,12 @@ describe CobwebCrawler do
    context "storing inbound links" do
  
      before(:each) do
+       pending("thin not installed") unless THIN_INSTALLED
        @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
        @statistics = @crawler.crawl(@base_url)
      end
  
      it "should store inbound links" do
- 
        @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
        @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
      end
@@ -6,48 +6,51 @@ describe CrawlJob, :local_only => true, :disabled => true do
  
    before(:all) do
      #store all existing resque process ids so we don't kill them afterwards
+     if RESQUE_INSTALLED && THIN_INSTALLED
+ 
+       @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+       if Resque.workers.count > 0 && @existing_processes.empty?
+         raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
+       elsif Resque.workers.count == 0 && !@existing_processes.empty?
+         raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+       elsif Resque.workers.count > 0 && !@existing_processes.empty?
+         raise "Resque workers present, please end other resque processes before running this spec"
+       end
  
-     @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
-     if Resque.workers.count > 0 && @existing_processes.empty?
-       raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
-     elsif Resque.workers.count == 0 && !@existing_processes.empty?
-       raise "Ghost worker processes present (#{@existing_processes.join(',')})"
-     elsif Resque.workers.count > 0 && !@existing_processes.empty?
-       raise "Resque workers present, please end other resque processes before running this spec"
-     end
- 
-     # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
-     `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
-     `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
-     `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
-     io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
- 
-     counter = 0
-     until counter > 10 || workers_processes_started?
-       print "\rStarting Resque Processes... #{10-counter} "
-       counter += 1
-       sleep 1
-     end
-     puts ""
+       # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+       `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+       `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+       `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+       io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+ 
+       counter = 0
+       until counter > 10 || workers_processes_started?
+         print "\rStarting Resque Processes... #{10-counter} "
+         counter += 1
+         sleep 1
+       end
+       puts ""
  
  
-     counter = 0
-     until counter > 30 || workers_running?
-       print "\rWaiting for Resque Workers... #{30-counter} "
-       counter += 1
-       sleep 1
-     end
-     puts ""
+       counter = 0
+       until counter > 30 || workers_running?
+         print "\rWaiting for Resque Workers... #{30-counter} "
+         counter += 1
+         sleep 1
+       end
+       puts ""
  
-     if workers_running?
-       puts "Workers Running."
-     else
-       raise "Workers didn't appear, please check environment"
+       if workers_running?
+         puts "Workers Running."
+       else
+         raise "Workers didn't appear, please check environment"
+       end
      end
- 
    end
  
    before(:each) do
+     pending("Resque not installed") unless RESQUE_INSTALLED
+     pending("thin not installed") unless THIN_INSTALLED
      @base_url = "http://localhost:3532/"
      @base_page_count = 77
  
@@ -4,8 +4,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
  
  describe CrawlWorker, :local_only => true do
  
    before(:all) do
- 
-     if SIDEKIQ_INSTALLED
+     if SIDEKIQ_INSTALLED && THIN_INSTALLED
        #store all existing resque process ids so we don't kill them afterwards
        @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
  
@@ -22,6 +21,7 @@ describe CrawlWorker, :local_only => true do
  
    before(:each) do
      pending("Sidkiq not installed") unless SIDEKIQ_INSTALLED
+     pending("thin not installed") unless THIN_INSTALLED
      @base_url = "http://localhost:3532/"
      @base_page_count = 77
  
@@ -9,6 +9,7 @@ describe Robots do
  
    describe "default user-agent" do
      before(:each) do
+       pending("thin not installed") unless THIN_INSTALLED
        @options = {:url => "http://localhost:3532/"}
      end
  
data/spec/spec_helper.rb CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
  require File.expand_path(File.dirname(__FILE__) + '/../spec/samples/sample_server')
  require File.expand_path(File.dirname(__FILE__) + '/../spec/http_stubs')
  require 'mock_redis'
- require 'thin' if ENV["TRAVIS_RUBY_VERSION"].nil?
+ 
  
  require 'coveralls'
  Coveralls.wear!
@@ -17,9 +17,14 @@ RSpec.configure do |config|
    if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
      config.filter_run_excluding :local_only => true
    end
- 
-   Thread.new do
-     @thin ||= Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+ 
+   THIN_INSTALLED = false
+   if Gem::Specification.find_all_by_name("thin", ">=1.0.0").count >= 1
+     require 'thin'
+     THIN_INSTALLED = true
+     Thread.new do
+       @thin ||= Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+     end
    end
  
    # WAIT FOR START TO COMPLETE
metadata CHANGED
@@ -1,20 +1,18 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
-   version: 1.0.19
-   prerelease:
+   version: 1.0.20
  platform: ruby
  authors:
  - Stewart McKee
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-11-26 00:00:00.000000000 Z
+ date: 2014-08-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: redis
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: nokogiri
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: addressable
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -54,23 +48,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -78,7 +55,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: awesome_print
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -86,7 +62,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -94,23 +69,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: sinatra
    requirement: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :runtime
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     none: false
-     requirements:
-     - - ! '>='
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: thin
-   requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -118,7 +76,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -126,7 +83,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: haml
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -134,15 +90,13 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
          version: '0'
  - !ruby/object:Gem::Dependency
-   name: namespaced_redis
+   name: redis-namespace
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -150,7 +104,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -158,7 +111,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: json
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -166,7 +118,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -174,7 +125,6 @@ dependencies:
  - !ruby/object:Gem::Dependency
    name: slop
    requirement: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -182,7 +132,6 @@ dependencies:
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
-     none: false
      requirements:
      - - ! '>='
        - !ruby/object:Gem::Version
@@ -564,27 +513,26 @@ files:
  homepage: http://github.com/stewartmckee/cobweb
  licenses:
  - MIT
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
      version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
-   none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
      version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.25
+ rubygems_version: 2.1.11
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
  crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
  crawler that has a sophisticated statistics monitoring interface to monitor the