sidekiq 6.5.0 → 6.5.3

@@ -91,7 +91,7 @@ module Sidekiq
 
       msg = Sidekiq.load_json(jobstr)
       if msg["retry"]
-        attempt_retry(nil, msg, queue, e)
+        process_retry(nil, msg, queue, e)
       else
         Sidekiq.death_handlers.each do |handler|
           handler.call(msg, e)
@@ -128,7 +128,7 @@ module Sidekiq
       end
 
       raise e unless msg["retry"]
-      attempt_retry(jobinst, msg, queue, e)
+      process_retry(jobinst, msg, queue, e)
       # We've handled this error associated with this job, don't
       # need to handle it at the global level
       raise Skip
@@ -139,7 +139,7 @@ module Sidekiq
     # Note that +jobinst+ can be nil here if an error is raised before we can
     # instantiate the job instance. All access must be guarded and
     # best effort.
-    def attempt_retry(jobinst, msg, queue, exception)
+    def process_retry(jobinst, msg, queue, exception)
       max_retry_attempts = retry_attempts_from(msg["retry"], @max_retries)
 
       msg["queue"] = (msg["retry_queue"] || queue)
@@ -170,19 +170,50 @@ module Sidekiq
         msg["error_backtrace"] = compress_backtrace(lines)
       end
 
-      if count < max_retry_attempts
-        delay = delay_for(jobinst, count, exception)
-        # Logging here can break retries if the logging device raises ENOSPC #3979
-        # logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
-        retry_at = Time.now.to_f + delay
-        payload = Sidekiq.dump_json(msg)
-        Sidekiq.redis do |conn|
-          conn.zadd("retry", retry_at.to_s, payload)
-        end
+      # Goodbye dear message, you (re)tried your best I'm sure.
+      return retries_exhausted(jobinst, msg, exception) if count >= max_retry_attempts
+
+      strategy, delay = delay_for(jobinst, count, exception)
+      case strategy
+      when :discard
+        return # poof!
+      when :kill
+        return retries_exhausted(jobinst, msg, exception)
+      end
+
+      # Logging here can break retries if the logging device raises ENOSPC #3979
+      # logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
+      jitter = rand(10) * (count + 1)
+      retry_at = Time.now.to_f + delay + jitter
+      payload = Sidekiq.dump_json(msg)
+      redis do |conn|
+        conn.zadd("retry", retry_at.to_s, payload)
+      end
+    end
+
+    # returns (strategy, seconds)
+    def delay_for(jobinst, count, exception)
+      rv = begin
+        # sidekiq_retry_in can return two different things:
+        # 1. When to retry next, as an integer of seconds
+        # 2. A symbol which re-routes the job elsewhere, e.g. :discard, :kill, :default
+        jobinst&.sidekiq_retry_in_block&.call(count, exception)
+      rescue Exception => e
+        handle_exception(e, {context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{jobinst.class.name}, falling back to default"})
+        nil
+      end
+
+      delay = if Integer === rv && rv > 0
+        rv
+      elsif rv == :discard
+        return [:discard, nil] # do nothing, job goes poof
+      elsif rv == :kill
+        return [:kill, nil]
       else
-        # Goodbye dear message, you (re)tried your best I'm sure.
-        retries_exhausted(jobinst, msg, exception)
+        (count**4) + 15
       end
+
+      [:default, delay]
     end
 
     def retries_exhausted(jobinst, msg, exception)
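With this change a job's `sidekiq_retry_in` block may return either a number of seconds or a routing symbol (`:discard`, `:kill`, or `nil`/`:default` to fall through). A minimal sketch of what that looks like in a job class; the job and exception classes here are hypothetical stand-ins, not part of Sidekiq:

class RateLimited < StandardError; end
class InvalidRecord < StandardError; end
class FatalConfigError < StandardError; end

class FlakyApiJob
  include Sidekiq::Job
  sidekiq_options retry: 5

  # An integer schedules the next retry that many seconds out; :discard drops
  # the job silently; :kill sends it straight to the Dead set; nil falls back
  # to the default delay of (count**4) + 15 seconds plus jitter.
  sidekiq_retry_in do |count, exception|
    case exception
    when RateLimited then 60 * (count + 1)
    when InvalidRecord then :discard
    when FatalConfigError then :kill
    end
  end

  def perform(record_id)
    # ...
  end
end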
@@ -195,7 +226,7 @@ module Sidekiq
 
       send_to_morgue(msg) unless msg["dead"] == false
 
-      Sidekiq.death_handlers.each do |handler|
+      config.death_handlers.each do |handler|
         handler.call(msg, exception)
       rescue => e
         handle_exception(e, {context: "Error calling death handler", job: msg})
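Death handlers are the globally registered callbacks that run once a job has exhausted its retries; they are now read from the server config object rather than the Sidekiq module directly. A minimal sketch of registering one, using only the logger so it stays self-contained:

Sidekiq.configure_server do |config|
  config.death_handlers << ->(job, exception) do
    # job is the payload Hash, exception is the error that exhausted its retries
    Sidekiq.logger.warn("Dead job #{job["class"]} jid=#{job["jid"]}: #{exception.message}")
  end
end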
@@ -216,22 +247,6 @@ module Sidekiq
       end
     end
 
-    def delay_for(jobinst, count, exception)
-      jitter = rand(10) * (count + 1)
-      if jobinst&.sidekiq_retry_in_block
-        custom_retry_in = retry_in(jobinst, count, exception).to_i
-        return custom_retry_in + jitter if custom_retry_in > 0
-      end
-      (count**4) + 15 + jitter
-    end
-
-    def retry_in(jobinst, count, exception)
-      jobinst.sidekiq_retry_in_block.call(count, exception)
-    rescue Exception => e
-      handle_exception(e, {context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{jobinst.class.name}, falling back to default"})
-      nil
-    end
-
     def exception_caused_by_shutdown?(e, checked_causes = [])
       return false unless e.cause
 
@@ -79,6 +79,8 @@ module Sidekiq
     end
 
     def clear_heartbeat
+      flush_stats
+
       # Remove record from Redis since we are shutting down.
       # Note we don't stop the heartbeat thread; if the process
       # doesn't actually exit, it'll reappear in the Web UI.
@@ -98,7 +100,7 @@ module Sidekiq
 
     end
 
-    def self.flush_stats
+    def flush_stats
       fails = Processor::FAILURE.reset
       procd = Processor::PROCESSED.reset
       return if fails + procd == 0
@@ -122,7 +124,6 @@ module Sidekiq
         Sidekiq.logger.warn("Unable to flush stats: #{ex}")
       end
     end
-    at_exit(&method(:flush_stats))
 
     def ❤
       key = identity
@@ -179,6 +180,7 @@ module Sidekiq
 
         # first heartbeat or recovering from an outage and need to reestablish our heartbeat
         fire_event(:heartbeat) unless exists
+        fire_event(:beat, oneshot: false)
 
         return unless msg
 
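Unlike `:heartbeat`, which only fires when the process record is first (re)established, the new `:beat` event fires on every heartbeat pass. A small sketch of hooking periodic work into it; swap the logger call for a push to your own monitoring system:

Sidekiq.configure_server do |config|
  config.on(:beat) do
    # Runs on every heartbeat pass, so keep this cheap and non-blocking.
    Sidekiq.logger.debug { "beat from pid #{Process.pid}" }
  end
end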
@@ -0,0 +1,47 @@
+require "sidekiq"
+require "date"
+
+# This file is designed to be required within the user's
+# deployment script; it should need a bare minimum of dependencies.
+#
+#   require "sidekiq/metrics/deploy"
+#   gitdesc = `git log -1 --format="%h %s"`.strip
+#   d = Sidekiq::Metrics::Deploy.new
+#   d.mark(label: gitdesc)
+#
+# Note that you cannot mark more than once per minute. This is a feature, not a bug.
+module Sidekiq
+  module Metrics
+    class Deploy
+      MARK_TTL = 90 * 24 * 60 * 60 # 90 days
+
+      def initialize(pool = Sidekiq.redis_pool)
+        @pool = pool
+      end
+
+      def mark(at: Time.now, label: "")
+        # we need to round the timestamp so that we gracefully
+        # handle an expected, common error in marking deploys:
+        # having every process mark its deploy, leading
+        # to N marks for each deploy. Instead we round the time
+        # to the minute so that multiple marks within that minute
+        # will all naturally rollup into one mark per minute.
+        whence = at.utc
+        floor = Time.utc(whence.year, whence.month, whence.mday, whence.hour, whence.min, 0)
+        datecode = floor.strftime("%Y%m%d")
+        key = "#{datecode}-marks"
+        @pool.with do |c|
+          c.pipelined do |pipe|
+            pipe.hsetnx(key, floor.rfc3339, label)
+            pipe.expire(key, MARK_TTL)
+          end
+        end
+      end
+
+      def fetch(date = Time.now.utc.to_date)
+        datecode = date.strftime("%Y%m%d")
+        @pool.with { |c| c.hgetall("#{datecode}-marks") }
+      end
+    end
+  end
+end
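A short usage sketch building on the comment at the top of the file: mark a deploy from the deploy script, then read the marks back later. The label is illustrative; `fetch` returns a Hash of minute timestamp to label for the given day.

require "sidekiq/metrics/deploy"

# In the deploy script:
deploy = Sidekiq::Metrics::Deploy.new
deploy.mark(label: `git log -1 --format="%h %s"`.strip)

# Later, e.g. from a console:
marks = Sidekiq::Metrics::Deploy.new.fetch(Date.today)
marks.each { |minute, label| puts "#{minute} #{label}" }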
@@ -0,0 +1,124 @@
+require "sidekiq"
+require "date"
+require "set"
+
+require "sidekiq/metrics/shared"
+
+module Sidekiq
+  module Metrics
+    # Allows caller to query for Sidekiq execution metrics within Redis.
+    # Caller sets a set of attributes to act as filters. {#fetch} will call
+    # Redis and return a Hash of results.
+    #
+    # NB: all metrics and times/dates are UTC only. We specifically do not
+    # support timezones.
+    class Query
+      # :hour, :day, :month
+      attr_accessor :period
+
+      # a specific job class, e.g. "App::OrderJob"
+      attr_accessor :klass
+
+      # the date specific to the period
+      # for :day or :hour, something like Date.today or Date.new(2022, 7, 13)
+      # for :month, Date.new(2022, 7, 1)
+      attr_accessor :date
+
+      # for period = :hour, the specific hour, integer e.g. 1 or 18
+      # note that hours and minutes do not have a leading zero so minute-specific
+      # keys will look like "j|20220718|7:3" for data at 07:03.
+      attr_accessor :hour
+
+      def initialize(pool: Sidekiq.redis_pool, now: Time.now)
+        @time = now.utc
+        @pool = pool
+        @klass = nil
+      end
+
+      # Get metric data from the last hour and roll it up
+      # into top processed count and execution time based on class.
+      def top_jobs
+        resultset = {}
+        resultset[:date] = @time.to_date
+        resultset[:period] = :hour
+        resultset[:ends_at] = @time
+        time = @time
+
+        results = @pool.with do |conn|
+          conn.pipelined do |pipe|
+            resultset[:size] = 60
+            60.times do |idx|
+              key = "j|#{time.strftime("%Y%m%d")}|#{time.hour}:#{time.min}"
+              pipe.hgetall key
+              time -= 60
+            end
+            resultset[:starts_at] = time
+          end
+        end
+
+        t = Hash.new(0)
+        klsset = Set.new
+        # merge the per-minute data into a totals hash for the hour
+        results.each do |hash|
+          hash.each { |k, v| t[k] = t[k] + v.to_i }
+          klsset.merge(hash.keys.map { |k| k.split("|")[0] })
+        end
+        resultset[:job_classes] = klsset.delete_if { |item| item.size < 3 }
+        resultset[:totals] = t
+        top = t.each_with_object({}) do |(k, v), memo|
+          (kls, metric) = k.split("|")
+          memo[metric] ||= Hash.new(0)
+          memo[metric][kls] = v
+        end
+
+        sorted = {}
+        top.each_pair do |metric, hash|
+          sorted[metric] = hash.sort_by { |k, v| v }.reverse.to_h
+        end
+        resultset[:top_classes] = sorted
+        resultset
+      end
+
+      def for_job(klass)
+        resultset = {}
+        resultset[:date] = @time.to_date
+        resultset[:period] = :hour
+        resultset[:ends_at] = @time
+        marks = @pool.with { |c| c.hgetall("#{@time.strftime("%Y%m%d")}-marks") }
+
+        time = @time
+        initial = @pool.with do |conn|
+          conn.pipelined do |pipe|
+            resultset[:size] = 60
+            60.times do |idx|
+              key = "j|#{time.strftime("%Y%m%d|%-H:%-M")}"
+              pipe.hmget key, "#{klass}|ms", "#{klass}|p", "#{klass}|f"
+              time -= 60
+            end
+          end
+        end
+
+        time = @time
+        hist = Histogram.new(klass)
+        results = @pool.with do |conn|
+          initial.map do |(ms, p, f)|
+            tm = Time.utc(time.year, time.month, time.mday, time.hour, time.min, 0)
+            {
+              time: tm.iso8601,
+              epoch: tm.to_i,
+              ms: ms.to_i, p: p.to_i, f: f.to_i, hist: hist.fetch(conn, time)
+            }.tap { |x|
+              x[:mark] = marks[x[:time]] if marks[x[:time]]
+              time -= 60
+            }
+          end
+        end
+
+        resultset[:marks] = marks
+        resultset[:starts_at] = time
+        resultset[:data] = results
+        resultset
+      end
+    end
+  end
+end
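A hypothetical console session against the data this class reads; the job class name is a placeholder, and the keys follow the result hashes built above:

require "sidekiq/metrics/query"

q = Sidekiq::Metrics::Query.new
hourly = q.top_jobs
hourly[:top_classes]["p"]   # processed count per job class, highest first
hourly[:top_classes]["ms"]  # total execution milliseconds per job class

detail = q.for_job("App::OrderJob")
detail[:data].first         # newest minute: { time:, epoch:, ms:, p:, f:, hist: }
detail[:marks]              # deploy marks recorded that day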
@@ -0,0 +1,94 @@
+require "concurrent"
+
+module Sidekiq
+  module Metrics
+    # TODO Support apps without concurrent-ruby
+    Counter = ::Concurrent::AtomicFixnum
+
+    # Implements space-efficient but statistically useful histogram storage.
+    # A precise time histogram stores every time. Instead we break times into a set of
+    # known buckets and increment counts of the associated time bucket. Even if we call
+    # the histogram a million times, we'll still only store 26 buckets.
+    # NB: needs to be thread-safe or resilient to races.
+    #
+    # To store this data, we use Redis' BITFIELD command to store unsigned 16-bit counters
+    # per bucket per klass per minute. It's unlikely that most people will be executing more
+    # than 1000 jobs/sec for a full minute of a specific type.
+    class Histogram
+      include Enumerable
+
+      # This number represents the maximum milliseconds for this bucket.
+      # 20 means all job executions up to 20ms, e.g. if a job takes
+      # 280ms, it'll increment bucket[7]. Note we can track job executions
+      # up to about 5.5 minutes. After that, it's assumed you're probably
+      # not too concerned with its performance.
+      BUCKET_INTERVALS = [
+        20, 30, 45, 65, 100,
+        150, 225, 335, 500, 750,
+        1100, 1700, 2500, 3800, 5750,
+        8500, 13000, 20000, 30000, 45000,
+        65000, 100000, 150000, 225000, 335000,
+        Float::INFINITY # the "maybe your job is too long" bucket
+      ]
+      LABELS = [
+        "20ms", "30ms", "45ms", "65ms", "100ms",
+        "150ms", "225ms", "335ms", "500ms", "750ms",
+        "1.1s", "1.7s", "2.5s", "3.8s", "5.75s",
+        "8.5s", "13s", "20s", "30s", "45s",
+        "65s", "100s", "150s", "225s", "335s",
+        "Slow"
+      ]
+
+      FETCH = "GET u16 #0 GET u16 #1 GET u16 #2 GET u16 #3 \
+        GET u16 #4 GET u16 #5 GET u16 #6 GET u16 #7 \
+        GET u16 #8 GET u16 #9 GET u16 #10 GET u16 #11 \
+        GET u16 #12 GET u16 #13 GET u16 #14 GET u16 #15 \
+        GET u16 #16 GET u16 #17 GET u16 #18 GET u16 #19 \
+        GET u16 #20 GET u16 #21 GET u16 #22 GET u16 #23 \
+        GET u16 #24 GET u16 #25".split
+
+      def each
+        buckets.each { |counter| yield counter.value }
+      end
+
+      def label(idx)
+        LABELS[idx]
+      end
+
+      attr_reader :buckets
+      def initialize(klass)
+        @klass = klass
+        @buckets = Array.new(BUCKET_INTERVALS.size) { Counter.new }
+      end
+
+      def record_time(ms)
+        index_to_use = BUCKET_INTERVALS.each_index do |idx|
+          break idx if ms < BUCKET_INTERVALS[idx]
+        end
+
+        @buckets[index_to_use].increment
+      end
+
+      def fetch(conn, now = Time.now)
+        window = now.utc.strftime("%d-%H:%-M")
+        key = "#{@klass}-#{window}"
+        conn.bitfield(key, *FETCH)
+      end
+
+      def persist(conn, now = Time.now)
+        buckets, @buckets = @buckets, []
+        window = now.utc.strftime("%d-%H:%-M")
+        key = "#{@klass}-#{window}"
+        cmd = [key, "OVERFLOW", "SAT"]
+        buckets.each_with_index do |counter, idx|
+          val = counter.value
+          cmd << "INCRBY" << "u16" << "##{idx}" << val.to_s if val > 0
+        end
+
+        conn.bitfield(*cmd) if cmd.size > 3
+        conn.expire(key, 86400)
+        key
+      end
+    end
+  end
+end
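A small sketch of the histogram life cycle: bucket a few execution times in memory, then persist the counters to Redis and read them back. The job class name is a placeholder.

hist = Sidekiq::Metrics::Histogram.new("App::OrderJob")
hist.record_time(18)     # < 20ms, increments bucket[0]
hist.record_time(280)    # < 335ms, increments bucket[7]
hist.record_time(9_000)  # < 13s, increments bucket[16]

Sidekiq.redis do |conn|
  key = hist.persist(conn)  # BITFIELD INCRBY per non-zero bucket; key expires after a day
  hist.fetch(conn)          # => 26 integers, one count per bucket for this minute
end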
@@ -0,0 +1,134 @@
+require "time"
+require "sidekiq"
+require "sidekiq/metrics/shared"
+
+# This file contains the components which track execution metrics within Sidekiq.
+module Sidekiq
+  module Metrics
+    class ExecutionTracker
+      include Sidekiq::Component
+
+      def initialize(config)
+        @config = config
+        @jobs = Hash.new(0)
+        @totals = Hash.new(0)
+        @grams = Hash.new { |hash, key| hash[key] = Histogram.new(key) }
+        @lock = Mutex.new
+      end
+
+      def track(queue, klass)
+        start = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
+        time_ms = 0
+        begin
+          begin
+            yield
+          ensure
+            finish = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :millisecond)
+            time_ms = finish - start
+          end
+          # We don't track time for failed jobs as they can have very unpredictable
+          # execution times. More important to know average time for successful jobs so we
+          # can better recognize when a perf regression is introduced.
+          @lock.synchronize {
+            @grams[klass].record_time(time_ms)
+            @jobs["#{klass}|ms"] += time_ms
+            @totals["ms"] += time_ms
+          }
+        rescue Exception
+          @lock.synchronize {
+            @jobs["#{klass}|f"] += 1
+            @totals["f"] += 1
+          }
+          raise
+        ensure
+          @lock.synchronize {
+            @jobs["#{klass}|p"] += 1
+            @totals["p"] += 1
+          }
+        end
+      end
+
+      LONG_TERM = 90 * 24 * 60 * 60
+      MID_TERM = 7 * 24 * 60 * 60
+      SHORT_TERM = 8 * 60 * 60
+
+      def flush(time = Time.now)
+        totals, jobs, grams = reset
+        procd = totals["p"]
+        fails = totals["f"]
+        return if procd == 0 && fails == 0
+
+        now = time.utc
+        nowdate = now.strftime("%Y%m%d")
+        nowhour = now.strftime("%Y%m%d|%-H")
+        nowmin = now.strftime("%Y%m%d|%-H:%-M")
+        count = 0
+
+        redis do |conn|
+          if grams.size > 0
+            conn.pipelined do |pipe|
+              grams.each do |_, gram|
+                gram.persist(pipe, now)
+              end
+            end
+          end
+
+          [
+            ["j", jobs, nowdate, LONG_TERM],
+            ["j", jobs, nowhour, MID_TERM],
+            ["j", jobs, nowmin, SHORT_TERM]
+          ].each do |prefix, data, bucket, ttl|
+            # Quietly seed the new 7.0 stats format so migration is painless.
+            conn.pipelined do |xa|
+              stats = "#{prefix}|#{bucket}"
+              # logger.debug "Flushing metrics #{stats}"
+              data.each_pair do |key, value|
+                xa.hincrby stats, key, value
+                count += 1
+              end
+              xa.expire(stats, ttl)
+            end
+          end
+          logger.info "Flushed #{count} metrics"
+          count
+        end
+      end
+
+      private
+
+      def reset
+        @lock.synchronize {
+          array = [@totals, @jobs, @grams]
+          @totals = Hash.new(0)
+          @jobs = Hash.new(0)
+          @grams = Hash.new { |hash, key| hash[key] = Histogram.new(key) }
+          array
+        }
+      end
+    end
+
+    class Middleware
+      include Sidekiq::ServerMiddleware
+
+      def initialize(options)
+        @exec = options
+      end
+
+      def call(_instance, hash, queue, &block)
+        @exec.track(queue, hash["wrapped"] || hash["class"], &block)
+      end
+    end
+  end
+end
+
+if ENV["SIDEKIQ_METRICS_BETA"] == "1"
+  Sidekiq.configure_server do |config|
+    exec = Sidekiq::Metrics::ExecutionTracker.new(config)
+    config.server_middleware do |chain|
+      chain.add Sidekiq::Metrics::Middleware, exec
+    end
+    config.on(:beat) do
+      exec.flush
+    end
+  end
+end
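As the guard at the bottom of the file shows, the tracker middleware and the :beat flush are only registered when the file is loaded with the env var set. A minimal sketch of opting in; the initializer path is illustrative:

# config/initializers/sidekiq.rb (or wherever Sidekiq is configured)
require "sidekiq/metrics/tracking" # registers the middleware + :beat flush only if the flag is set

# then start the worker process with the flag:
#   SIDEKIQ_METRICS_BETA=1 bundle exec sidekiq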