apisonator 3.2.1 → 3.3.3

@@ -6,32 +6,13 @@ module ThreeScale
   respond_with_404('service not found') unless Service.exists?(params[:service_id])
   end
 
-  # This is very slow and needs to be disabled until the performance
-  # issues are solved. In the meanwhile, the job will just return OK.
-  =begin
-  delete '' do |service_id|
-    delete_stats_job_attrs = api_params Stats::DeleteJobDef
-    delete_stats_job_attrs[:service_id] = service_id
-    delete_stats_job_attrs[:from] = delete_stats_job_attrs[:from].to_i
-    delete_stats_job_attrs[:to] = delete_stats_job_attrs[:to].to_i
-    begin
-      Stats::DeleteJobDef.new(delete_stats_job_attrs).run_async
-    rescue DeleteServiceStatsValidationError => e
-      [400, headers, { status: :error, error: e.message }.to_json]
-    else
-      { status: :to_be_deleted }.to_json
-    end
-  =end
-
-  # This is an alternative to the above. It just adds the service to a
-  # Redis set to marked is as "to be deleted".
-  # Later a script can read that set and actually delete the keys.
-  # Read the docs of the Stats::Cleaner class for more details.
+  # This adds the service to a Redis set to mark it as "to be deleted".
+  # Later a script can read that set and actually delete the keys. Read
+  # the docs of the Stats::Cleaner class for more details.
   #
-  # Notice that this method ignores the "from" and "to" parameters. When
-  # system calls this method, they're always interested in deleting all
-  # the keys. They were just passing "from" and "to" to make the
-  # implementation of the option above easier.
+  # Notice that this method ignores the "from" and "to" parameters used in
+  # previous versions. When System calls this method, it is always
+  # interested in deleting all the keys.
 
   delete '' do |service_id|
     Stats::Cleaner.mark_service_to_be_deleted(service_id)
     { status: :to_be_deleted }.to_json
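
The endpoint now only records the intent to delete; the heavy key deletion happens later, out of band, in Stats::Cleaner. Below is a minimal sketch of this mark-and-sweep split, assuming a plain `redis-rb` client. The real set name, `SET_WITH_SERVICES_MARKED_FOR_DELETION`, appears in the Stats::Cleaner hunks further down; the literal used here is illustrative.

```ruby
require 'redis'

# Illustrative set name; the real constant lives in Stats::Cleaner.
MARKED_FOR_DELETION = 'services_marked_for_deletion'.freeze

redis = Redis.new

# "Mark": the API call is a single O(1) SADD, so the endpoint stays fast.
def mark_service_to_be_deleted(redis, service_id)
  redis.sadd(MARKED_FOR_DELETION, service_id)
end

# "Sweep": an out-of-band script reads the set and deletes the actual keys.
def services_to_delete(redis)
  redis.smembers(MARKED_FOR_DELETION)
end

mark_service_to_be_deleted(redis, '42')
services_to_delete(redis) # => ["42"]
```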
@@ -6,11 +6,10 @@ module ThreeScale
 
   # The compacted hour in the params refers to the
   # TimeHacks.to_compact_s method.
-  def alert_keys(service_id, app_id, discrete_utilization, compacted_hour_start)
+  def alert_keys(service_id, app_id, discrete_utilization)
     {
       already_notified: key_already_notified(service_id, app_id, discrete_utilization),
       allowed: key_allowed_set(service_id),
-      current_max: key_current_max(service_id, app_id, compacted_hour_start),
       current_id: key_current_id
     }
   end
@@ -31,11 +30,6 @@ module ThreeScale
     "#{prefix}allowed_set"
   end
 
-  def key_current_max(service_id, app_id, compacted_hour_start)
-    prefix = key_prefix(service_id, app_id)
-    "#{prefix}#{compacted_hour_start}/current_max"
-  end
-
   def key_current_id
     'alerts/current_id'.freeze
   end
@@ -43,6 +37,7 @@ module ThreeScale
 
   extend self
   extend KeyHelpers
+  include Memoizer::Decorator
 
   ALERT_TTL = 24*3600 # 1 day (only one message per day)
   ## zero must be here and sorted, yes or yes
@@ -50,6 +45,16 @@ module ThreeScale
   FIRST_ALERT_BIN = ALERT_BINS.first
   RALERT_BINS = ALERT_BINS.reverse.freeze
 
+  def can_raise_more_alerts?(service_id, app_id)
+    allowed_bins = allowed_set_for_service(service_id).sort
+
+    return false if allowed_bins.empty?
+
+    # If the bin with the highest value has already been notified, there's
+    # no need to notify anything else.
+    not notified?(service_id, app_id, allowed_bins.last)
+  end
+
   def utilization(app_usage_reports)
     max_utilization = -1.0
     max_record = nil
@@ -77,25 +82,12 @@ module ThreeScale
 
   def update_utilization(service_id, app_id, max_utilization, max_record, timestamp)
     discrete = utilization_discrete(max_utilization)
-    max_utilization_i = (max_utilization * 100.0).round
 
-    beginning_of_day = Period::Boundary.day_start(timestamp)
-    period_hour = Period::Boundary.hour_start(timestamp).to_compact_s
-    # UNIX timestamp for key expiration - add 1 day + 5 mins
-    expire_at = (beginning_of_day + 86700).to_i
+    keys = alert_keys(service_id, app_id, discrete)
 
-    keys = alert_keys(service_id, app_id, discrete, period_hour)
-
-    already_alerted, allowed, current_max, _ = storage.pipelined do
+    already_alerted, allowed = storage.pipelined do
       storage.get(keys[:already_notified])
       storage.sismember(keys[:allowed], discrete)
-      storage.get(keys[:current_max])
-      storage.expireat(keys[:current_max], expire_at)
-    end
-
-    ## update the status of utilization
-    if max_utilization_i > current_max.to_i
-      storage.set(keys[:current_max], max_utilization_i)
     end
 
     if already_alerted.nil? && allowed && discrete.to_i > 0
@@ -129,6 +121,16 @@ module ThreeScale
     "#{record.current_value}/#{record.max_value}"
   end
 
+  def allowed_set_for_service(service_id)
+    storage.smembers(key_allowed_set(service_id)).map(&:to_i) # Redis always returns strings
+  end
+  memoize :allowed_set_for_service
+
+  def notified?(service_id, app_id, bin)
+    storage.get(key_already_notified(service_id, app_id, bin))
+  end
+  memoize :notified?
+
   def storage
     Storage.instance
   end
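
Taken together, these Alerts hunks let callers skip the expensive utilization computation: if the highest allowed alert bin has already been notified today, no further alert can possibly fire. Below is a condensed, self-contained sketch of that check, assuming a `redis-rb` client; the key helpers are simplified stand-ins for KeyHelpers, and a plain Hash stands in for Memoizer::Decorator.

```ruby
require 'redis'

# Simplified key helpers; the real layout is defined in KeyHelpers.
def key_allowed_set(service_id)
  "alerts/service_id:#{service_id}/allowed_set"
end

def key_already_notified(service_id, app_id, bin)
  "alerts/service_id:#{service_id}/app_id:#{app_id}/#{bin}/already_notified"
end

CACHE = {} # stands in for Memoizer::Decorator within a single run

def can_raise_more_alerts?(redis, service_id, app_id)
  allowed_bins = CACHE[[:allowed, service_id]] ||=
    redis.smembers(key_allowed_set(service_id)).map(&:to_i) # Redis returns strings

  return false if allowed_bins.empty?

  # If the highest allowed bin was already notified, every lower bin is
  # redundant, so the caller can skip loading limits and usages entirely.
  redis.get(key_already_notified(service_id, app_id, allowed_bins.max)).nil?
end
```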
@@ -40,10 +40,8 @@ module ThreeScale
   private
 
   def self.first_traffic(service_id, application_id)
-    key = Stats::Keys.applications_key_prefix(
-      Stats::Keys.service_key_prefix(service_id)
-    )
-    if storage.sadd(key, encode_key(application_id))
+    if storage.sadd(Stats::Keys.set_of_apps_with_traffic(service_id),
+                    encode_key(application_id))
       EventStorage.store(:first_traffic,
                          { service_id: service_id,
                            application_id: application_id,
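
The rewritten guard leans on SADD's return value: `redis-rb` returns true only when the member was not already in the set, so the first_traffic event fires exactly once per application, even under concurrent reports. A sketch of the idiom, with an illustrative key name:

```ruby
require 'redis'

redis = Redis.new

# SADD is the membership test and the insert in one atomic step, so two
# concurrent reports cannot both fire the event.
def record_traffic(redis, service_id, app_id)
  new_member = redis.sadd("apps_with_traffic/#{service_id}", app_id)
  puts "first_traffic: #{app_id}" if new_member
end

record_traffic(redis, '42', 'app_1') # fires the event
record_traffic(redis, '42', 'app_1') # silent: already a member
```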
@@ -32,8 +32,6 @@ module ThreeScale
 
   CONFIG_DELETE_STATS_BATCH_SIZE = 50
   private_constant :CONFIG_DELETE_STATS_BATCH_SIZE
-  CONFIG_DELETE_STATS_PARTITION_BATCH_SIZE = 1000
-  private_constant :CONFIG_DELETE_STATS_PARTITION_BATCH_SIZE
 
   @configuration = Configuration::Loader.new
@@ -54,13 +52,12 @@ module ThreeScale
   config.add_section(:analytics_redis, :server,
                      :connect_timeout, :read_timeout, :write_timeout)
   config.add_section(:hoptoad, :service, :api_key)
-  config.add_section(:stats, :bucket_size, :delete_batch_size, :delete_partition_batch_size)
+  config.add_section(:stats, :bucket_size, :delete_batch_size)
   config.add_section(:redshift, :host, :port, :dbname, :user, :password)
   config.add_section(:statsd, :host, :port)
   config.add_section(:internal_api, :user, :password)
   config.add_section(:master, :metrics)
   config.add_section(:worker_prometheus_metrics, :enabled, :port)
-  config.add_section(:listener_prometheus_metrics, :enabled, :port)
 
   config.add_section(
     :async_worker,
@@ -125,9 +122,6 @@ module ThreeScale
   config.stats.delete_batch_size = parse_int(config.stats.delete_batch_size,
                                              CONFIG_DELETE_STATS_BATCH_SIZE)
 
-  config.stats.delete_partition_batch_size = parse_int(config.stats.delete_partition_batch_size,
-                                                       CONFIG_DELETE_STATS_PARTITION_BATCH_SIZE)
-
   # often we don't have a log_file setting - generate it here from
   # the log_path setting.
   log_file = config.log_file
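
The remaining setting keeps the same parse-with-fallback shape: whatever the config file provides is coerced to an integer, and the compile-time constant is used when the value is absent or invalid. A hypothetical reimplementation of the `parse_int` helper (the real one may differ):

```ruby
# Take whatever the config file provided (string, integer, or nil) and
# fall back to a default when it does not parse as an integer.
def parse_int(value, default)
  Integer(value.to_s, exception: false) || default
end

parse_int(nil, 50)    # => 50  (setting absent)
parse_int('100', 50)  # => 100 (valid string from the config file)
parse_int('oops', 50) # => 50  (garbage falls back too)
```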
@@ -292,12 +292,6 @@ module ThreeScale
     end
   end
 
-  class DeleteServiceStatsValidationError < Error
-    def initialize(service_id, msg)
-      super "Delete stats job context validation error. Service: #{service_id}. Error: #{msg}"
-    end
-  end
-
   class EndUsersNoLongerSupported < BadRequest
     def initialize
       super 'End-users are no longer supported, do not specify the user_id parameter'.freeze
@@ -32,25 +32,6 @@ module ThreeScale
     DEFAULT_WAIT_BEFORE_FETCHING_MORE_JOBS
   end
 
-  def pop_from_queue
-    begin
-      encoded_job = @redis.blpop(*@queues, timeout: @fetch_timeout)
-    rescue Redis::BaseConnectionError, Errno::ECONNREFUSED, Errno::EPIPE => e
-      raise RedisConnectionError.new(e.message)
-    rescue Redis::CommandError => e
-      # Redis::CommandError from redis-rb can be raised for multiple
-      # reasons, so we need to check the error message to distinguish
-      # connection errors from the rest.
-      if e.message == 'ERR Connection timed out'.freeze
-        raise RedisConnectionError.new(e.message)
-      else
-        raise e
-      end
-    end
-
-    encoded_job
-  end
-
   def fetch
     encoded_job = pop_from_queue
     return nil if encoded_job.nil? || encoded_job.empty?
@@ -99,10 +80,11 @@ module ThreeScale
 
   # Re-instantiate Redis instance. This is needed to recover from
   # Errno::EPIPE, not sure if there are others.
-  @redis = ThreeScale::Backend::QueueStorage.connection(
-    ThreeScale::Backend.environment,
-    ThreeScale::Backend.configuration
+  @redis = Redis::Namespace.new(
+    WorkerAsync.const_get(:RESQUE_REDIS_NAMESPACE),
+    redis: QueueStorage.connection(Backend.environment, Backend.configuration)
   )
+
   # If there is a different kind of error, it's probably a
   # programming error. Like sending an invalid blpop command to
   # Redis. In that case, let the worker crash.
@@ -111,12 +93,36 @@ module ThreeScale
       end
     end
 
+  rescue Exception => e
+    Worker.logger.notify(e)
+  ensure
     job_queue.close
   end
 
   def shutdown
     @shutdown = true
   end
+
+  private
+
+  def pop_from_queue
+    begin
+      encoded_job = @redis.blpop(*@queues, timeout: @fetch_timeout)
+    rescue Redis::BaseConnectionError, Errno::ECONNREFUSED, Errno::EPIPE => e
+      raise RedisConnectionError.new(e.message)
+    rescue Redis::CommandError => e
+      # Redis::CommandError from redis-rb can be raised for multiple
+      # reasons, so we need to check the error message to distinguish
+      # connection errors from the rest.
+      if e.message == 'ERR Connection timed out'.freeze
+        raise RedisConnectionError.new(e.message)
+      else
+        raise e
+      end
+    end
+
+    encoded_job
+  end
 end
 end
 end
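
Together, these fetcher hunks give the worker a simple recovery loop: connection-level failures rebuild the namespaced Redis client and fetching continues, while anything else is reported and crashes the worker. A self-contained sketch of that loop, assuming `redis-rb` and the redis-namespace gem; the queue names and the `process` helper are illustrative:

```ruby
require 'redis'
require 'redis-namespace'

QUEUES = %w[queue:priority queue:main].freeze

def build_redis
  # Same shape as the diff: a namespaced client wrapping the raw connection.
  Redis::Namespace.new(:resque, redis: Redis.new)
end

def process(encoded_job)
  puts "got job: #{encoded_job}"
end

redis = build_redis

loop do
  begin
    _queue, encoded_job = redis.blpop(*QUEUES, timeout: 1)
    process(encoded_job) if encoded_job
  rescue Redis::BaseConnectionError, Errno::ECONNREFUSED, Errno::EPIPE
    # Connection trouble (e.g. Errno::EPIPE): rebuild the client and go on.
    redis = build_redis
  end
end
```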
@@ -17,7 +17,10 @@ module ThreeScale
 
   Backend::Logging::External.setup_rack self
 
-  if Backend.configuration.listener_prometheus_metrics.enabled
+  # Notice that this cannot be specified via config, it needs to be an
+  # ENV because the metric server is started in Puma/Falcon
+  # "before_fork" and the configuration is not loaded at that point.
+  if ENV['CONFIG_LISTENER_PROMETHEUS_METRICS_ENABLED'].to_s.downcase.freeze == 'true'.freeze
     use Rack::Prometheus
   end
 
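Because `before_fork` runs before the application configuration is loaded, the flag has to come straight from the environment. A sketch of how a Puma config file could consume the same variable; `start_metrics_server` is a hypothetical helper, not part of the diff:

```ruby
# puma.rb sketch; the ENV name is the one introduced in the diff.
prometheus_enabled =
  ENV['CONFIG_LISTENER_PROMETHEUS_METRICS_ENABLED'].to_s.downcase == 'true'

before_fork do
  start_metrics_server if prometheus_enabled # hypothetical helper
end
```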
@@ -1,8 +1,4 @@
 require '3scale/backend/stats/codes_commons'
 require '3scale/backend/stats/period_commons'
 require '3scale/backend/stats/aggregator'
-require '3scale/backend/stats/delete_job_def'
-require '3scale/backend/stats/key_generator'
-require '3scale/backend/stats/partition_generator_job'
-require '3scale/backend/stats/partition_eraser_job'
 require '3scale/backend/stats/cleaner'
@@ -145,6 +145,16 @@ module ThreeScale
   application = Backend::Application.load(service_id,
                                           values[:application_id])
 
+  # The app could have been deleted at some point since the job was
+  # enqueued. No need to update alerts in that case.
+  next unless application
+
+  # The operations below are costly. They load all the usage limits
+  # and current usages to find the current utilization levels.
+  # That's why before that, we check if there are any alerts that
+  # can be raised.
+  next unless Alerts.can_raise_more_alerts?(service_id, values[:application_id])
+
   application.load_metric_names
   usage = Usage.application_usage(application, current_timestamp)
   status = Transactor::Status.new(service_id: service_id,
@@ -20,7 +20,14 @@ module ThreeScale
   key = counter_key(prefix_key, granularity.new(timestamp))
   expire_time = Stats::PeriodCommons.expire_time_for_granularity(granularity)
 
-  store_key(cmd, key, value, expire_time)
+  # We don't need to store stats keys set to 0. It wastes Redis
+  # memory because for rate-limiting and stats, a key set to 0
+  # is equivalent to a key that does not exist.
+  if cmd == :set && value == 0
+    storage.del(key)
+  else
+    store_key(cmd, key, value, expire_time)
+  end
 
   unless Stats::PeriodCommons::EXCLUDED_FOR_BUCKETS.include?(granularity)
     keys_for_bucket << key
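
The reasoning behind the branch: a counter that reads as nil is indistinguishable from one set to 0 (`nil.to_i == 0`), so deleting the key frees memory without changing observable behavior. A sketch of the equivalence, assuming a `redis-rb` client:

```ruby
require 'redis'

redis = Redis.new

def write_counter(redis, key, value, ttl)
  if value.zero?
    redis.del(key) # same observable value as SET key 0, but no memory used
  else
    redis.set(key, value, ex: ttl)
  end
end

def read_counter(redis, key)
  redis.get(key).to_i # nil.to_i == 0, so a missing key reads as 0
end

write_counter(redis, 'stats/example', 0, 3600)
read_counter(redis, 'stats/example') # => 0, and the key does not exist
```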
@@ -45,6 +45,12 @@ module ThreeScale
   STATS_KEY_PREFIX = 'stats/'.freeze
   private_constant :STATS_KEY_PREFIX
 
+  REDIS_CONN_ERRORS = [Redis::BaseConnectionError, Errno::ECONNREFUSED, Errno::EPIPE].freeze
+  private_constant :REDIS_CONN_ERRORS
+
+  MAX_RETRIES_REDIS_ERRORS = 3
+  private_constant :MAX_RETRIES_REDIS_ERRORS
+
   class << self
     include Logging
     def mark_service_to_be_deleted(service_id)
@@ -77,37 +83,73 @@ module ThreeScale
 
   logger.info("Going to delete the stats keys for these services: #{services.to_a}")
 
   unless services.empty?
-    delete_successful = true
-    redis_conns.each do |redis_conn|
+    _ok, failed = redis_conns.partition do |redis_conn|
       begin
         delete_keys(redis_conn, services, log_deleted_keys)
-      # If it's a connection error, mark as failed and continue
-      # cleaning other shards. If it's another kind of error, it
-      # could be a bug, so better re-raise.
-      rescue Redis::BaseConnectionError, Errno::ECONNREFUSED, Errno::EPIPE => e
-        logger.error("Error while deleting stats of server #{redis_conn}: #{e}")
-        delete_successful = false
-      rescue Redis::CommandError => e
-        # Redis::CommandError from redis-rb can be raised for multiple
-        # reasons, so we need to check the error message to distinguish
-        # connection errors from the rest.
-        if e.message == 'ERR Connection timed out'.freeze
-          logger.error("Error while deleting stats of server #{redis_conn}: #{e}")
-          delete_successful = false
-        else
-          raise e
-        end
+        true
+      rescue => e
+        handle_redis_exception(e, redis_conn)
+        false
       end
     end
 
-    remove_services_from_delete_set(services) if delete_successful
+    with_retries { remove_services_from_delete_set(services) } if failed.empty?
+
+    failed.each do |failed_conn|
+      logger.error("Error while deleting stats of server #{failed_conn}")
+    end
   end
 
   logger.info("Finished deleting the stats keys for these services: #{services.to_a}")
 end
 
+# Deletes all the stats keys set to 0.
+#
+# Stats keys set to 0 are useless and occupy Redis memory
+# unnecessarily. They were generated due to a bug in previous versions
+# of Apisonator.
+# Ref: https://github.com/3scale/apisonator/pull/247
+#
+# As the .delete function, this one also receives a collection of
+# instantiated Redis clients and those need to connect to Redis
+# servers directly.
+#
+# @param [Array] redis_conns Instantiated Redis clients.
+# @param [IO] log_deleted_keys IO where to write the logs. Defaults to
+#   nil (logs nothing).
+def delete_stats_keys_set_to_0(redis_conns, log_deleted_keys: nil)
+  _ok, failed = redis_conns.partition do |redis_conn|
+    begin
+      delete_stats_keys_with_val_0(redis_conn, log_deleted_keys)
+      true
+    rescue => e
+      handle_redis_exception(e, redis_conn)
+      false
+    end
+  end
+
+  failed.each do |failed_conn|
+    logger.error("Error while deleting stats of server #{failed_conn}")
+  end
+end
+
 private
 
+def handle_redis_exception(exception, redis_conn)
+  # If it's a connection error, do nothing so we can continue with
+  # other shards. If it's another kind of error, it could be caused by
+  # a bug, so better re-raise.
+
+  case exception
+  when *REDIS_CONN_ERRORS
+    # Do nothing.
+  when Redis::CommandError
+    raise exception if exception.message != 'ERR Connection timed out'.freeze
+  else
+    raise exception
+  end
+end
+
 # Returns a set with the services included in the
 # SET_WITH_SERVICES_MARKED_FOR_DELETION Redis set.
 def services_to_delete
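
`Enumerable#partition` replaces the old mutable `delete_successful` flag with a single pass that yields the failed shards directly, and the pending-deletion set is only cleared when every shard succeeded. A minimal sketch of the idiom; `clean!` and `mark_work_done` are hypothetical stand-ins for the per-shard cleanup and the set removal:

```ruby
require 'redis'

def sweep_shards(conns)
  _ok, failed = conns.partition do |conn|
    begin
      clean!(conn) # hypothetical per-shard cleanup
      true
    rescue Redis::BaseConnectionError
      false # a dead shard must not stop the others
    end
  end

  # Only forget the pending work once every shard was cleaned.
  mark_work_done if failed.empty? # hypothetical
  failed.each { |conn| warn "cleanup failed on #{conn}" }
end
```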
@@ -133,19 +175,21 @@ module ThreeScale
   cursor = 0
 
   loop do
-    cursor, keys = redis_conn.scan(cursor, count: SCAN_SLICE)
+    with_retries do
+      cursor, keys = redis_conn.scan(cursor, count: SCAN_SLICE)
 
-    to_delete = keys.select { |key| delete_key?(key, services) }
+      to_delete = keys.select { |key| delete_key?(key, services) }
 
-    unless to_delete.empty?
-      if log_deleted_keys
-        values = redis_conn.mget(*(to_delete.to_a))
-        to_delete.each_with_index do |k, i|
-          log_deleted_keys.puts "#{k} #{values[i]}"
+      unless to_delete.empty?
+        if log_deleted_keys
+          values = redis_conn.mget(*(to_delete.to_a))
+          to_delete.each_with_index do |k, i|
+            log_deleted_keys.puts "#{k} #{values[i]}"
+          end
         end
-      end
 
-      redis_conn.del(to_delete)
+        redis_conn.del(to_delete)
+      end
     end
 
     break if cursor.to_i == 0
@@ -188,6 +232,43 @@ module ThreeScale
   # simply ignore those keys.
   nil
 end
+
+def delete_stats_keys_with_val_0(redis_conn, log_deleted_keys)
+  cursor = 0
+
+  loop do
+    with_retries do
+      cursor, keys = redis_conn.scan(cursor, count: SCAN_SLICE)
+
+      stats_keys = keys.select { |k| is_stats_key?(k) }
+
+      unless stats_keys.empty?
+        values = redis_conn.mget(*stats_keys)
+        to_delete = stats_keys.zip(values).select { |_, v| v == '0'.freeze }.map(&:first)
+
+        unless to_delete.empty?
+          redis_conn.del(to_delete)
+          to_delete.each { |k| log_deleted_keys.puts k } if log_deleted_keys
+        end
+      end
+    end
+
+    break if cursor.to_i == 0
+
+    sleep(SLEEP_BETWEEN_SCANS)
+  end
+end
+
+def with_retries(max = MAX_RETRIES_REDIS_ERRORS)
+  retries = 0
+  begin
+    yield
+  rescue Exception => e
+    retries += 1
+    retry if retries < max
+    raise e
+  end
+end
 end
 end
 end
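
The whole sweep boils down to a cursor-driven SCAN, an MGET to inspect values, and a DEL of the keys that read "0", with a short sleep between slices to keep pressure off the server. A standalone sketch, assuming a `redis-rb` client and filtering with a MATCH pattern instead of the `is_stats_key?` helper; the slice size and sleep are illustrative:

```ruby
require 'redis'

redis = Redis.new
cursor = 0

loop do
  # SCAN never blocks the server the way KEYS would; it returns a new
  # cursor plus a slice of keys.
  cursor, keys = redis.scan(cursor, match: 'stats/*', count: 1000)

  unless keys.empty?
    values = redis.mget(*keys)
    zeroed = keys.zip(values).select { |_, v| v == '0' }.map(&:first)
    redis.del(zeroed) unless zeroed.empty?
  end

  break if cursor.to_i == 0
  sleep 0.1 # breathe between slices, like SLEEP_BETWEEN_SCANS in the diff
end
```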