sidekiq 4.2.4 → 6.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. checksums.yaml +5 -5
  2. data/Changes.md +445 -0
  3. data/LICENSE +1 -1
  4. data/README.md +21 -34
  5. data/bin/sidekiq +26 -2
  6. data/bin/sidekiqload +28 -38
  7. data/bin/sidekiqmon +8 -0
  8. data/lib/generators/sidekiq/templates/worker_spec.rb.erb +1 -1
  9. data/lib/generators/sidekiq/templates/worker_test.rb.erb +2 -2
  10. data/lib/generators/sidekiq/worker_generator.rb +21 -13
  11. data/lib/sidekiq/api.rb +347 -213
  12. data/lib/sidekiq/cli.rb +221 -212
  13. data/lib/sidekiq/client.rb +75 -52
  14. data/lib/sidekiq/delay.rb +41 -0
  15. data/lib/sidekiq/exception_handler.rb +12 -16
  16. data/lib/sidekiq/extensions/action_mailer.rb +13 -22
  17. data/lib/sidekiq/extensions/active_record.rb +13 -10
  18. data/lib/sidekiq/extensions/class_methods.rb +14 -11
  19. data/lib/sidekiq/extensions/generic_proxy.rb +10 -4
  20. data/lib/sidekiq/fetch.rb +38 -31
  21. data/lib/sidekiq/job_logger.rb +63 -0
  22. data/lib/sidekiq/job_retry.rb +263 -0
  23. data/lib/sidekiq/launcher.rb +169 -70
  24. data/lib/sidekiq/logger.rb +166 -0
  25. data/lib/sidekiq/manager.rb +17 -20
  26. data/lib/sidekiq/middleware/chain.rb +15 -5
  27. data/lib/sidekiq/middleware/i18n.rb +5 -7
  28. data/lib/sidekiq/monitor.rb +133 -0
  29. data/lib/sidekiq/paginator.rb +18 -14
  30. data/lib/sidekiq/processor.rb +161 -70
  31. data/lib/sidekiq/rails.rb +30 -73
  32. data/lib/sidekiq/redis_connection.rb +67 -20
  33. data/lib/sidekiq/scheduled.rb +61 -35
  34. data/lib/sidekiq/sd_notify.rb +149 -0
  35. data/lib/sidekiq/systemd.rb +24 -0
  36. data/lib/sidekiq/testing/inline.rb +2 -1
  37. data/lib/sidekiq/testing.rb +54 -26
  38. data/lib/sidekiq/util.rb +48 -15
  39. data/lib/sidekiq/version.rb +2 -1
  40. data/lib/sidekiq/web/action.rb +15 -15
  41. data/lib/sidekiq/web/application.rb +112 -89
  42. data/lib/sidekiq/web/csrf_protection.rb +180 -0
  43. data/lib/sidekiq/web/helpers.rb +153 -73
  44. data/lib/sidekiq/web/router.rb +27 -19
  45. data/lib/sidekiq/web.rb +64 -109
  46. data/lib/sidekiq/worker.rb +164 -41
  47. data/lib/sidekiq.rb +86 -60
  48. data/sidekiq.gemspec +24 -22
  49. data/web/assets/images/apple-touch-icon.png +0 -0
  50. data/web/assets/javascripts/application.js +25 -27
  51. data/web/assets/javascripts/dashboard.js +34 -38
  52. data/web/assets/stylesheets/application-dark.css +160 -0
  53. data/web/assets/stylesheets/application-rtl.css +246 -0
  54. data/web/assets/stylesheets/application.css +402 -12
  55. data/web/assets/stylesheets/bootstrap-rtl.min.css +9 -0
  56. data/web/assets/stylesheets/bootstrap.css +2 -2
  57. data/web/locales/ar.yml +81 -0
  58. data/web/locales/de.yml +14 -2
  59. data/web/locales/en.yml +4 -0
  60. data/web/locales/es.yml +4 -3
  61. data/web/locales/fa.yml +80 -0
  62. data/web/locales/fr.yml +3 -3
  63. data/web/locales/he.yml +79 -0
  64. data/web/locales/ja.yml +9 -4
  65. data/web/locales/lt.yml +83 -0
  66. data/web/locales/pl.yml +4 -4
  67. data/web/locales/ru.yml +4 -0
  68. data/web/locales/ur.yml +80 -0
  69. data/web/locales/vi.yml +83 -0
  70. data/web/views/_footer.erb +5 -2
  71. data/web/views/_job_info.erb +3 -2
  72. data/web/views/_nav.erb +4 -18
  73. data/web/views/_paging.erb +1 -1
  74. data/web/views/busy.erb +57 -19
  75. data/web/views/dashboard.erb +3 -3
  76. data/web/views/dead.erb +2 -2
  77. data/web/views/layout.erb +13 -2
  78. data/web/views/morgue.erb +19 -12
  79. data/web/views/queue.erb +22 -12
  80. data/web/views/queues.erb +13 -3
  81. data/web/views/retries.erb +22 -13
  82. data/web/views/retry.erb +3 -3
  83. data/web/views/scheduled.erb +7 -4
  84. metadata +42 -194
  85. data/.github/contributing.md +0 -32
  86. data/.github/issue_template.md +0 -4
  87. data/.gitignore +0 -12
  88. data/.travis.yml +0 -12
  89. data/3.0-Upgrade.md +0 -70
  90. data/4.0-Upgrade.md +0 -53
  91. data/COMM-LICENSE +0 -95
  92. data/Ent-Changes.md +0 -146
  93. data/Gemfile +0 -29
  94. data/Pro-2.0-Upgrade.md +0 -138
  95. data/Pro-3.0-Upgrade.md +0 -44
  96. data/Pro-Changes.md +0 -585
  97. data/Rakefile +0 -9
  98. data/bin/sidekiqctl +0 -99
  99. data/code_of_conduct.md +0 -50
  100. data/lib/sidekiq/core_ext.rb +0 -106
  101. data/lib/sidekiq/logging.rb +0 -106
  102. data/lib/sidekiq/middleware/server/active_record.rb +0 -13
  103. data/lib/sidekiq/middleware/server/logging.rb +0 -40
  104. data/lib/sidekiq/middleware/server/retry_jobs.rb +0 -205
  105. data/test/config.yml +0 -9
  106. data/test/env_based_config.yml +0 -11
  107. data/test/fake_env.rb +0 -1
  108. data/test/fixtures/en.yml +0 -2
  109. data/test/helper.rb +0 -75
  110. data/test/test_actors.rb +0 -138
  111. data/test/test_api.rb +0 -528
  112. data/test/test_cli.rb +0 -418
  113. data/test/test_client.rb +0 -266
  114. data/test/test_exception_handler.rb +0 -56
  115. data/test/test_extensions.rb +0 -127
  116. data/test/test_fetch.rb +0 -50
  117. data/test/test_launcher.rb +0 -95
  118. data/test/test_logging.rb +0 -35
  119. data/test/test_manager.rb +0 -50
  120. data/test/test_middleware.rb +0 -158
  121. data/test/test_processor.rb +0 -235
  122. data/test/test_rails.rb +0 -22
  123. data/test/test_redis_connection.rb +0 -132
  124. data/test/test_retry.rb +0 -326
  125. data/test/test_retry_exhausted.rb +0 -149
  126. data/test/test_scheduled.rb +0 -115
  127. data/test/test_scheduling.rb +0 -58
  128. data/test/test_sidekiq.rb +0 -107
  129. data/test/test_testing.rb +0 -143
  130. data/test/test_testing_fake.rb +0 -357
  131. data/test/test_testing_inline.rb +0 -94
  132. data/test/test_util.rb +0 -13
  133. data/test/test_web.rb +0 -726
  134. data/test/test_web_helpers.rb +0 -54
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Sidekiq
4
+ class JobLogger
5
+ def initialize(logger = Sidekiq.logger)
6
+ @logger = logger
7
+ end
8
+
9
+ def call(item, queue)
10
+ start = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
11
+ @logger.info("start")
12
+
13
+ yield
14
+
15
+ with_elapsed_time_context(start) do
16
+ @logger.info("done")
17
+ end
18
+ rescue Exception
19
+ with_elapsed_time_context(start) do
20
+ @logger.info("fail")
21
+ end
22
+
23
+ raise
24
+ end
25
+
26
+ def prepare(job_hash, &block)
27
+ level = job_hash["log_level"]
28
+ if level
29
+ @logger.log_at(level) do
30
+ Sidekiq::Context.with(job_hash_context(job_hash), &block)
31
+ end
32
+ else
33
+ Sidekiq::Context.with(job_hash_context(job_hash), &block)
34
+ end
35
+ end
36
+
37
+ def job_hash_context(job_hash)
38
+ # If we're using a wrapper class, like ActiveJob, use the "wrapped"
39
+ # attribute to expose the underlying thing.
40
+ h = {
41
+ class: job_hash["wrapped"] || job_hash["class"],
42
+ jid: job_hash["jid"]
43
+ }
44
+ h[:bid] = job_hash["bid"] if job_hash["bid"]
45
+ h[:tags] = job_hash["tags"] if job_hash["tags"]
46
+ h
47
+ end
48
+
49
+ def with_elapsed_time_context(start, &block)
50
+ Sidekiq::Context.with(elapsed_time_context(start), &block)
51
+ end
52
+
53
+ def elapsed_time_context(start)
54
+ {elapsed: elapsed(start).to_s}
55
+ end
56
+
57
+ private
58
+
59
+ def elapsed(start)
60
+ (::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - start).round(3)
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,263 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sidekiq/scheduled"
4
+ require "sidekiq/api"
5
+
6
+ require "zlib"
7
+ require "base64"
8
+
9
+ module Sidekiq
10
+ ##
11
+ # Automatically retry jobs that fail in Sidekiq.
12
+ # Sidekiq's retry support assumes a typical development lifecycle:
13
+ #
14
+ # 0. Push some code changes with a bug in it.
15
+ # 1. Bug causes job processing to fail, Sidekiq's middleware captures
16
+ # the job and pushes it onto a retry queue.
17
+ # 2. Sidekiq retries jobs in the retry queue multiple times with
18
+ # an exponential delay, the job continues to fail.
19
+ # 3. After a few days, a developer deploys a fix. The job is
20
+ # reprocessed successfully.
21
+ # 4. Once retries are exhausted, Sidekiq will give up and move the
22
+ # job to the Dead Job Queue (aka morgue) where it must be dealt with
23
+ # manually in the Web UI.
24
+ # 5. After 6 months on the DJQ, Sidekiq will discard the job.
25
+ #
26
+ # A job looks like:
27
+ #
28
+ # { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => true }
29
+ #
30
+ # The 'retry' option also accepts a number (in place of 'true'):
31
+ #
32
+ # { 'class' => 'HardWorker', 'args' => [1, 2, 'foo'], 'retry' => 5 }
33
+ #
34
+ # The job will be retried this number of times before giving up. (If simply
35
+ # 'true', Sidekiq retries 25 times)
36
+ #
37
+ # We'll add a bit more data to the job to support retries:
38
+ #
39
+ # * 'queue' - the queue to use
40
+ # * 'retry_count' - number of times we've retried so far.
41
+ # * 'error_message' - the message from the exception
42
+ # * 'error_class' - the exception class
43
+ # * 'failed_at' - the first time it failed
44
+ # * 'retried_at' - the last time it was retried
45
+ # * 'backtrace' - the number of lines of error backtrace to store
46
+ #
47
+ # We don't store the backtrace by default as that can add a lot of overhead
48
+ # to the job and everyone is using an error service, right?
49
+ #
50
+ # The default number of retries is 25 which works out to about 3 weeks
51
+ # You can change the default maximum number of retries in your initializer:
52
+ #
53
+ # Sidekiq.options[:max_retries] = 7
54
+ #
55
+ # or limit the number of retries for a particular worker with:
56
+ #
57
+ # class MyWorker
58
+ # include Sidekiq::Worker
59
+ # sidekiq_options :retry => 10
60
+ # end
61
+ #
62
+ class JobRetry
63
+ class Handled < ::RuntimeError; end
64
+
65
+ class Skip < Handled; end
66
+
67
+ include Sidekiq::Util
68
+
69
+ DEFAULT_MAX_RETRY_ATTEMPTS = 25
70
+
71
+ def initialize(options = {})
72
+ @max_retries = Sidekiq.options.merge(options).fetch(:max_retries, DEFAULT_MAX_RETRY_ATTEMPTS)
73
+ end
74
+
75
+ # The global retry handler requires only the barest of data.
76
+ # We want to be able to retry as much as possible so we don't
77
+ # require the worker to be instantiated.
78
+ def global(jobstr, queue)
79
+ yield
80
+ rescue Handled => ex
81
+ raise ex
82
+ rescue Sidekiq::Shutdown => ey
83
+ # ignore, will be pushed back onto queue during hard_shutdown
84
+ raise ey
85
+ rescue Exception => e
86
+ # ignore, will be pushed back onto queue during hard_shutdown
87
+ raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)
88
+
89
+ msg = Sidekiq.load_json(jobstr)
90
+ if msg["retry"]
91
+ attempt_retry(nil, msg, queue, e)
92
+ else
93
+ Sidekiq.death_handlers.each do |handler|
94
+ handler.call(msg, e)
95
+ rescue => handler_ex
96
+ handle_exception(handler_ex, {context: "Error calling death handler", job: msg})
97
+ end
98
+ end
99
+
100
+ raise Handled
101
+ end
102
+
103
+ # The local retry support means that any errors that occur within
104
+ # this block can be associated with the given worker instance.
105
+ # This is required to support the `sidekiq_retries_exhausted` block.
106
+ #
107
+ # Note that any exception from the block is wrapped in the Skip
108
+ # exception so the global block does not reprocess the error. The
109
+ # Skip exception is unwrapped within Sidekiq::Processor#process before
110
+ # calling the handle_exception handlers.
111
+ def local(worker, jobstr, queue)
112
+ yield
113
+ rescue Handled => ex
114
+ raise ex
115
+ rescue Sidekiq::Shutdown => ey
116
+ # ignore, will be pushed back onto queue during hard_shutdown
117
+ raise ey
118
+ rescue Exception => e
119
+ # ignore, will be pushed back onto queue during hard_shutdown
120
+ raise Sidekiq::Shutdown if exception_caused_by_shutdown?(e)
121
+
122
+ msg = Sidekiq.load_json(jobstr)
123
+ if msg["retry"].nil?
124
+ msg["retry"] = worker.class.get_sidekiq_options["retry"]
125
+ end
126
+
127
+ raise e unless msg["retry"]
128
+ attempt_retry(worker, msg, queue, e)
129
+ # We've handled this error associated with this job, don't
130
+ # need to handle it at the global level
131
+ raise Skip
132
+ end
133
+
134
+ private
135
+
136
+ # Note that +worker+ can be nil here if an error is raised before we can
137
+ # instantiate the worker instance. All access must be guarded and
138
+ # best effort.
139
+ def attempt_retry(worker, msg, queue, exception)
140
+ max_retry_attempts = retry_attempts_from(msg["retry"], @max_retries)
141
+
142
+ msg["queue"] = (msg["retry_queue"] || queue)
143
+
144
+ m = exception_message(exception)
145
+ if m.respond_to?(:scrub!)
146
+ m.force_encoding("utf-8")
147
+ m.scrub!
148
+ end
149
+
150
+ msg["error_message"] = m
151
+ msg["error_class"] = exception.class.name
152
+ count = if msg["retry_count"]
153
+ msg["retried_at"] = Time.now.to_f
154
+ msg["retry_count"] += 1
155
+ else
156
+ msg["failed_at"] = Time.now.to_f
157
+ msg["retry_count"] = 0
158
+ end
159
+
160
+ if msg["backtrace"]
161
+ lines = if msg["backtrace"] == true
162
+ exception.backtrace
163
+ else
164
+ exception.backtrace[0...msg["backtrace"].to_i]
165
+ end
166
+
167
+ msg["error_backtrace"] = compress_backtrace(lines)
168
+ end
169
+
170
+ if count < max_retry_attempts
171
+ delay = delay_for(worker, count, exception)
172
+ # Logging here can break retries if the logging device raises ENOSPC #3979
173
+ # logger.debug { "Failure! Retry #{count} in #{delay} seconds" }
174
+ retry_at = Time.now.to_f + delay
175
+ payload = Sidekiq.dump_json(msg)
176
+ Sidekiq.redis do |conn|
177
+ conn.zadd("retry", retry_at.to_s, payload)
178
+ end
179
+ else
180
+ # Goodbye dear message, you (re)tried your best I'm sure.
181
+ retries_exhausted(worker, msg, exception)
182
+ end
183
+ end
184
+
185
+ def retries_exhausted(worker, msg, exception)
186
+ begin
187
+ block = worker&.sidekiq_retries_exhausted_block
188
+ block&.call(msg, exception)
189
+ rescue => e
190
+ handle_exception(e, {context: "Error calling retries_exhausted", job: msg})
191
+ end
192
+
193
+ send_to_morgue(msg) unless msg["dead"] == false
194
+
195
+ Sidekiq.death_handlers.each do |handler|
196
+ handler.call(msg, exception)
197
+ rescue => e
198
+ handle_exception(e, {context: "Error calling death handler", job: msg})
199
+ end
200
+ end
201
+
202
+ def send_to_morgue(msg)
203
+ logger.info { "Adding dead #{msg["class"]} job #{msg["jid"]}" }
204
+ payload = Sidekiq.dump_json(msg)
205
+ DeadSet.new.kill(payload, notify_failure: false)
206
+ end
207
+
208
+ def retry_attempts_from(msg_retry, default)
209
+ if msg_retry.is_a?(Integer)
210
+ msg_retry
211
+ else
212
+ default
213
+ end
214
+ end
215
+
216
+ def delay_for(worker, count, exception)
217
+ if worker&.sidekiq_retry_in_block
218
+ custom_retry_in = retry_in(worker, count, exception).to_i
219
+ return custom_retry_in if custom_retry_in > 0
220
+ end
221
+ seconds_to_delay(count)
222
+ end
223
+
224
+ # delayed_job uses the same basic formula
225
+ def seconds_to_delay(count)
226
+ (count**4) + 15 + (rand(30) * (count + 1))
227
+ end
228
+
229
+ def retry_in(worker, count, exception)
230
+ worker.sidekiq_retry_in_block.call(count, exception)
231
+ rescue Exception => e
232
+ handle_exception(e, {context: "Failure scheduling retry using the defined `sidekiq_retry_in` in #{worker.class.name}, falling back to default"})
233
+ nil
234
+ end
235
+
236
+ def exception_caused_by_shutdown?(e, checked_causes = [])
237
+ return false unless e.cause
238
+
239
+ # Handle circular causes
240
+ checked_causes << e.object_id
241
+ return false if checked_causes.include?(e.cause.object_id)
242
+
243
+ e.cause.instance_of?(Sidekiq::Shutdown) ||
244
+ exception_caused_by_shutdown?(e.cause, checked_causes)
245
+ end
246
+
247
+ # Extract message from exception.
248
+ # Set a default if the message raises an error
249
+ def exception_message(exception)
250
+ # App code can stuff all sorts of crazy binary data into the error message
251
+ # that won't convert to JSON.
252
+ exception.message.to_s[0, 10_000]
253
+ rescue
254
+ +"!!! ERROR MESSAGE THREW AN ERROR !!!"
255
+ end
256
+
257
+ def compress_backtrace(backtrace)
258
+ serialized = Sidekiq.dump_json(backtrace)
259
+ compressed = Zlib::Deflate.deflate(serialized)
260
+ Base64.encode64(compressed)
261
+ end
262
+ end
263
+ end
@@ -1,20 +1,28 @@
1
1
  # frozen_string_literal: true
2
- # encoding: utf-8
3
- require 'sidekiq/manager'
4
- require 'sidekiq/fetch'
5
- require 'sidekiq/scheduled'
2
+
3
+ require "sidekiq/manager"
4
+ require "sidekiq/fetch"
5
+ require "sidekiq/scheduled"
6
6
 
7
7
  module Sidekiq
8
- # The Launcher is a very simple Actor whose job is to
9
- # start, monitor and stop the core Actors in Sidekiq.
10
- # If any of these actors die, the Sidekiq process exits
11
- # immediately.
8
+ # The Launcher starts the Manager and Poller threads and provides the process heartbeat.
12
9
  class Launcher
13
10
  include Util
14
11
 
12
+ STATS_TTL = 5 * 365 * 24 * 60 * 60 # 5 years
13
+
14
+ PROCTITLES = [
15
+ proc { "sidekiq" },
16
+ proc { Sidekiq::VERSION },
17
+ proc { |me, data| data["tag"] },
18
+ proc { |me, data| "[#{Processor::WORKER_STATE.size} of #{data["concurrency"]} busy]" },
19
+ proc { |me, data| "stopping" if me.stopping? }
20
+ ]
21
+
15
22
  attr_accessor :manager, :poller, :fetcher
16
23
 
17
24
  def initialize(options)
25
+ options[:fetch] ||= BasicFetch.new(options)
18
26
  @manager = Sidekiq::Manager.new(options)
19
27
  @poller = Sidekiq::Scheduled::Poller.new
20
28
  @done = false
@@ -39,7 +47,7 @@ module Sidekiq
39
47
  # return until all work is complete and cleaned up.
40
48
  # It can take up to the timeout to complete.
41
49
  def stop
42
- deadline = Time.now + @options[:timeout]
50
+ deadline = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC) + @options[:timeout]
43
51
 
44
52
  @done = true
45
53
  @manager.quiet
@@ -49,7 +57,7 @@ module Sidekiq
49
57
 
50
58
  # Requeue everything in case there was a worker who grabbed work while stopped
51
59
  # This call is a no-op in Sidekiq but necessary for Sidekiq Pro.
52
- strategy = (@options[:fetch] || Sidekiq::BasicFetch)
60
+ strategy = @options[:fetch]
53
61
  strategy.bulk_requeue([], @options)
54
62
 
55
63
  clear_heartbeat
@@ -61,104 +69,195 @@ module Sidekiq
61
69
 
62
70
  private unless $TESTING
63
71
 
64
- JVM_RESERVED_SIGNALS = ['USR1', 'USR2'] # Don't Process#kill if we get these signals via the API
72
+ def start_heartbeat
73
+ loop do
74
+ heartbeat
75
+ sleep 5
76
+ end
77
+ Sidekiq.logger.info("Heartbeat stopping...")
78
+ end
65
79
 
66
- def heartbeat(k, data, json)
67
- results = Sidekiq::CLI::PROCTITLES.map {|x| x.(self, data) }
68
- results.compact!
69
- $0 = results.join(' ')
80
+ def clear_heartbeat
81
+ # Remove record from Redis since we are shutting down.
82
+ # Note we don't stop the heartbeat thread; if the process
83
+ # doesn't actually exit, it'll reappear in the Web UI.
84
+ Sidekiq.redis do |conn|
85
+ conn.pipelined do
86
+ conn.srem("processes", identity)
87
+ conn.unlink("#{identity}:workers")
88
+ end
89
+ end
90
+ rescue
91
+ # best effort, ignore network errors
92
+ end
93
+
94
+ def heartbeat
95
+ $0 = PROCTITLES.map { |proc| proc.call(self, to_data) }.compact.join(" ")
70
96
 
71
- ❤(k, json)
97
+ ❤
98
+ end
99
+
100
+ def self.flush_stats
101
+ fails = Processor::FAILURE.reset
102
+ procd = Processor::PROCESSED.reset
103
+ return if fails + procd == 0
104
+
105
+ nowdate = Time.now.utc.strftime("%Y-%m-%d")
106
+ begin
107
+ Sidekiq.redis do |conn|
108
+ conn.pipelined do
109
+ conn.incrby("stat:processed", procd)
110
+ conn.incrby("stat:processed:#{nowdate}", procd)
111
+ conn.expire("stat:processed:#{nowdate}", STATS_TTL)
112
+
113
+ conn.incrby("stat:failed", fails)
114
+ conn.incrby("stat:failed:#{nowdate}", fails)
115
+ conn.expire("stat:failed:#{nowdate}", STATS_TTL)
116
+ end
117
+ end
118
+ rescue => ex
119
+ # we're exiting the process, things might be shut down so don't
120
+ # try to handle the exception
121
+ Sidekiq.logger.warn("Unable to flush stats: #{ex}")
122
+ end
72
123
  end
124
+ at_exit(&method(:flush_stats))
73
125
 
74
- def ❤(key, json)
126
+ def ❤
127
+ key = identity
75
128
  fails = procd = 0
129
+
76
130
  begin
77
- Processor::FAILURE.update {|curr| fails = curr; 0 }
78
- Processor::PROCESSED.update {|curr| procd = curr; 0 }
131
+ fails = Processor::FAILURE.reset
132
+ procd = Processor::PROCESSED.reset
133
+ curstate = Processor::WORKER_STATE.dup
134
+
135
+ workers_key = "#{key}:workers"
136
+ nowdate = Time.now.utc.strftime("%Y-%m-%d")
79
137
 
80
- workers_key = "#{key}:workers".freeze
81
- nowdate = Time.now.utc.strftime("%Y-%m-%d".freeze)
82
138
  Sidekiq.redis do |conn|
83
139
  conn.multi do
84
- conn.incrby("stat:processed".freeze, procd)
140
+ conn.incrby("stat:processed", procd)
85
141
  conn.incrby("stat:processed:#{nowdate}", procd)
86
- conn.incrby("stat:failed".freeze, fails)
142
+ conn.expire("stat:processed:#{nowdate}", STATS_TTL)
143
+
144
+ conn.incrby("stat:failed", fails)
87
145
  conn.incrby("stat:failed:#{nowdate}", fails)
88
- conn.del(workers_key)
89
- Processor::WORKER_STATE.each_pair do |tid, hash|
146
+ conn.expire("stat:failed:#{nowdate}", STATS_TTL)
147
+
148
+ conn.unlink(workers_key)
149
+ curstate.each_pair do |tid, hash|
90
150
  conn.hset(workers_key, tid, Sidekiq.dump_json(hash))
91
151
  end
92
152
  conn.expire(workers_key, 60)
93
153
  end
94
154
  end
155
+
156
+ rtt = check_rtt
157
+
95
158
  fails = procd = 0
159
+ kb = memory_usage(::Process.pid)
96
160
 
97
- _, exists, _, _, msg = Sidekiq.redis do |conn|
98
- conn.multi do
99
- conn.sadd('processes', key)
100
- conn.exists(key)
101
- conn.hmset(key, 'info', json, 'busy', Processor::WORKER_STATE.size, 'beat', Time.now.to_f, 'quiet', @done)
161
+ _, exists, _, _, msg = Sidekiq.redis { |conn|
162
+ conn.multi {
163
+ conn.sadd("processes", key)
164
+ conn.exists?(key)
165
+ conn.hmset(key, "info", to_json,
166
+ "busy", curstate.size,
167
+ "beat", Time.now.to_f,
168
+ "rtt_us", rtt,
169
+ "quiet", @done,
170
+ "rss", kb)
102
171
  conn.expire(key, 60)
103
172
  conn.rpop("#{key}-signals")
104
- end
105
- end
173
+ }
174
+ }
106
175
 
107
176
  # first heartbeat or recovering from an outage and need to reestablish our heartbeat
108
- fire_event(:heartbeat) if !exists
177
+ fire_event(:heartbeat) unless exists
109
178
 
110
179
  return unless msg
111
180
 
112
- if JVM_RESERVED_SIGNALS.include?(msg)
113
- Sidekiq::CLI.instance.handle_signal(msg)
114
- else
115
- ::Process.kill(msg, $$)
116
- end
181
+ ::Process.kill(msg, ::Process.pid)
117
182
  rescue => e
118
183
  # ignore all redis/network issues
119
- logger.error("heartbeat: #{e.message}")
184
+ logger.error("heartbeat: #{e}")
120
185
  # don't lose the counts if there was a network issue
121
- Processor::PROCESSED.increment(procd)
122
- Processor::FAILURE.increment(fails)
186
+ Processor::PROCESSED.incr(procd)
187
+ Processor::FAILURE.incr(fails)
123
188
  end
124
189
  end
125
190
 
126
- def start_heartbeat
127
- k = identity
128
- data = {
129
- 'hostname' => hostname,
130
- 'started_at' => Time.now.to_f,
131
- 'pid' => $$,
132
- 'tag' => @options[:tag] || '',
133
- 'concurrency' => @options[:concurrency],
134
- 'queues' => @options[:queues].uniq,
135
- 'labels' => @options[:labels],
136
- 'identity' => k,
137
- }
138
- # this data doesn't change so dump it to a string
139
- # now so we don't need to dump it every heartbeat.
140
- json = Sidekiq.dump_json(data)
191
+ # We run the heartbeat every five seconds.
192
+ # Capture five samples of RTT, log a warning if each sample
193
+ # is above our warning threshold.
194
+ RTT_READINGS = RingBuffer.new(5)
195
+ RTT_WARNING_LEVEL = 50_000
141
196
 
142
- while true
143
- heartbeat(k, data, json)
144
- sleep 5
197
+ def check_rtt
198
+ a = b = 0
199
+ Sidekiq.redis do |x|
200
+ a = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :microsecond)
201
+ x.ping
202
+ b = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC, :microsecond)
145
203
  end
146
- Sidekiq.logger.info("Heartbeat stopping...")
204
+ rtt = b - a
205
+ RTT_READINGS << rtt
206
+ # Ideal RTT for Redis is < 1000µs
207
+ # Workable is < 10,000µs
208
+ # Log a warning if it's a disaster.
209
+ if RTT_READINGS.all? { |x| x > RTT_WARNING_LEVEL }
210
+ Sidekiq.logger.warn <<~EOM
211
+ Your Redis network connection is performing extremely poorly.
212
+ Last RTT readings were #{RTT_READINGS.buffer.inspect}, ideally these should be < 1000.
213
+ Ensure Redis is running in the same AZ or datacenter as Sidekiq.
214
+ EOM
215
+ RTT_READINGS.reset
216
+ end
217
+ rtt
147
218
  end
148
219
 
149
- def clear_heartbeat
150
- # Remove record from Redis since we are shutting down.
151
- # Note we don't stop the heartbeat thread; if the process
152
- # doesn't actually exit, it'll reappear in the Web UI.
153
- Sidekiq.redis do |conn|
154
- conn.pipelined do
155
- conn.srem('processes', identity)
156
- conn.del("#{identity}:workers")
220
+ MEMORY_GRABBER = case RUBY_PLATFORM
221
+ when /linux/
222
+ ->(pid) {
223
+ IO.readlines("/proc/#{$$}/status").each do |line|
224
+ next unless line.start_with?("VmRSS:")
225
+ break line.split[1].to_i
157
226
  end
227
+ }
228
+ when /darwin|bsd/
229
+ ->(pid) {
230
+ `ps -o pid,rss -p #{pid}`.lines.last.split.last.to_i
231
+ }
232
+ else
233
+ ->(pid) { 0 }
234
+ end
235
+
236
+ def memory_usage(pid)
237
+ MEMORY_GRABBER.call(pid)
238
+ end
239
+
240
+ def to_data
241
+ @data ||= begin
242
+ {
243
+ "hostname" => hostname,
244
+ "started_at" => Time.now.to_f,
245
+ "pid" => ::Process.pid,
246
+ "tag" => @options[:tag] || "",
247
+ "concurrency" => @options[:concurrency],
248
+ "queues" => @options[:queues].uniq,
249
+ "labels" => @options[:labels],
250
+ "identity" => identity
251
+ }
158
252
  end
159
- rescue
160
- # best effort, ignore network errors
161
253
  end
162
254
 
255
+ def to_json
256
+ @json ||= begin
257
+ # this data changes infrequently so dump it to a string
258
+ # now so we don't need to dump it every heartbeat.
259
+ Sidekiq.dump_json(to_data)
260
+ end
261
+ end
163
262
  end
164
263
  end