ci-queue 0.42.0 → 0.43.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fb0b6d2c1537200e0a3540652972d25456cda0fe2115a65a884c8f5feeb17ff7
4
- data.tar.gz: 7ac240806fdf6f16edfa314c59dccb22b379dbc2795d23d0c4ad19f4a46956d1
3
+ metadata.gz: 9fafefb68f5e00faba6e1c19944c0289bd0e7c5ca4d67090605fea402f2b04f3
4
+ data.tar.gz: fe85dc8a004f45a54203eac3e4294e5348bca9d8920b70ad2427fed9f5f26776
5
5
  SHA512:
6
- metadata.gz: feaea129a7c672e57372e6475dc0fdc049c1e8b7815994b9ece3b4db0cbf6970ba19c222ed6bf9a652c7e68c7781e46edbd45e0eaa0da2b4e4d31b3856980aed
7
- data.tar.gz: e2b9112e41b223c85b0acc20a7129d2ec517a816e73a42bea381ce696af8821dbaee1c5151d5002784630b1c0104b6c342d91b6273dd12de01c063d5bef5b1fe
6
+ metadata.gz: a190ef07194b9cbb9c880de74abd9e54ede4d3e52480e26ba20c16a80720fafb66ce5933bc21df4213e7cefc9abb5f3893b75e46fb7e8327ee6e11b8a5e26de4
7
+ data.tar.gz: 5b3a127d41fde0f8094878e8cfcd088eb2d1a32fc86798b98adafbf06caa19e201631899fa4aec9c0afa8d8a9174215d529dd1dfdf637e4e9122dfb93f99d741
@@ -5,7 +5,7 @@ module CI
5
5
  attr_accessor :timeout, :worker_id, :max_requeues, :grind_count, :failure_file, :export_flaky_tests_file
6
6
  attr_accessor :requeue_tolerance, :namespace, :failing_test, :statsd_endpoint
7
7
  attr_accessor :max_test_duration, :max_test_duration_percentile, :track_test_duration
8
- attr_accessor :max_test_failed, :redis_ttl, :warnings_file, :debug_log
8
+ attr_accessor :max_test_failed, :redis_ttl, :warnings_file, :debug_log, :max_missed_heartbeat_seconds
9
9
  attr_reader :circuit_breakers
10
10
  attr_writer :seed, :build_id
11
11
  attr_writer :queue_init_timeout, :report_timeout, :inactive_workers_timeout
@@ -37,7 +37,7 @@ module CI
37
37
  grind_count: nil, max_duration: nil, failure_file: nil, max_test_duration: nil,
38
38
  max_test_duration_percentile: 0.5, track_test_duration: false, max_test_failed: nil,
39
39
  queue_init_timeout: nil, redis_ttl: 8 * 60 * 60, report_timeout: nil, inactive_workers_timeout: nil,
40
- export_flaky_tests_file: nil, warnings_file: nil, debug_log: nil)
40
+ export_flaky_tests_file: nil, warnings_file: nil, debug_log: nil, max_missed_heartbeat_seconds: nil)
41
41
  @build_id = build_id
42
42
  @circuit_breakers = [CircuitBreaker::Disabled]
43
43
  @failure_file = failure_file
@@ -63,6 +63,7 @@ module CI
63
63
  @export_flaky_tests_file = export_flaky_tests_file
64
64
  @warnings_file = warnings_file
65
65
  @debug_log = debug_log
66
+ @max_missed_heartbeat_seconds = max_missed_heartbeat_seconds
66
67
  end
67
68
 
68
69
  def queue_init_timeout
@@ -35,7 +35,7 @@ module CI
35
35
  url: redis_url,
36
36
  # Booting a CI worker is costly, so in case of a Redis blip,
37
37
  # it makes sense to retry for a while before giving up.
38
- reconnect_attempts: [0, 0, 0.1, 0.5, 1, 3, 5],
38
+ reconnect_attempts: reconnect_attempts,
39
39
  middlewares: custom_middlewares,
40
40
  custom: custom_config,
41
41
  )
@@ -44,6 +44,43 @@ module CI
44
44
  end
45
45
  end
46
46
 
47
# Retry schedule handed to the Redis client as its `reconnect_attempts` option.
#
# Booting a CI worker is costly, so by default a Redis blip is retried for a
# while before giving up. Defining the CI_QUEUE_DISABLE_RECONNECT_ATTEMPTS
# environment variable (with any value, even an empty one) disables retries.
#
# @return [Array<Numeric>] the retry delays; empty when retries are disabled
def reconnect_attempts
  retries_disabled = ENV["CI_QUEUE_DISABLE_RECONNECT_ATTEMPTS"]
  retries_disabled ? [] : [0, 0, 0.1, 0.5, 1, 3, 5]
end
52
+
53
# Runs the given block while the heartbeat thread ticks for test `id`.
#
# When heartbeats are enabled, the heartbeat thread is (re)started if needed
# and told to tick for `id`; once the block finishes -- normally or by
# raising -- the state is switched to :reset. When heartbeats are disabled
# the block is simply yielded to.
#
# @param id the identifier of the test being run
# @return whatever the block returns
def with_heartbeat(id)
  begin
    if heartbeat_enabled?
      ensure_heartbeat_thread_alive!
      heartbeat_state.set(:tick, id)
    end

    yield
  ensure
    if heartbeat_enabled?
      heartbeat_state.set(:reset)
    end
  end
end
63
+
64
# Starts the background heartbeat thread unless one is already running.
#
# Does nothing when heartbeats are disabled or when the previously started
# thread is still alive.
#
# @return [Thread, nil] the newly started thread, or nil when nothing was started
def ensure_heartbeat_thread_alive!
  if heartbeat_enabled? && !(@heartbeat_thread && @heartbeat_thread.alive?)
    @heartbeat_thread = Thread.start { heartbeat }
  end
end
70
+
71
# Boots the external heartbeat process, but only when heartbeats are enabled.
#
# @return the result of `heartbeat_process.boot!`, or nil when heartbeats are disabled
def boot_heartbeat_process!
  heartbeat_process.boot! if heartbeat_enabled?
end
76
+
77
# Stops heartbeating: tells the heartbeat thread to stop, then shuts the
# external heartbeat process down. No-op when heartbeats are disabled.
#
# @return the result of `heartbeat_process.shutdown!`, or nil when heartbeats are disabled
def stop_heartbeat!
  if heartbeat_enabled?
    heartbeat_state.set(:stop)
    heartbeat_process.shutdown!
  end
end
83
+
47
84
  def custom_config
48
85
  return unless config.debug_log
49
86
 
@@ -163,6 +200,131 @@ module CI
163
200
  rescue SystemCallError
164
201
  ::File.read(::File.join(CI::Queue::RELEASE_SCRIPTS_ROOT, "#{name}.lua"))
165
202
  end
203
+
204
# Parent-side handle on the external heartbeat monitor (`monitor.rb`).
#
# The monitor runs as a separate Ruby process. Commands are sent to it over a
# pipe connected to its stdin; each message is a native-endian 32-bit length
# header (pack format "L") followed by a JSON payload. A second pipe,
# connected to the child's stdout, is used once to learn that it has booted.
class HeartbeatProcess
  # @param redis_url [String] Redis URL forwarded to the monitor
  # @param zset_key [String] Redis key forwarded to the monitor
  # @param processed_key [String] Redis key forwarded to the monitor
  # @param owners_key [String] Redis key forwarded to the monitor
  # @param worker_queue_key [String] Redis key forwarded to the monitor
  def initialize(redis_url, zset_key, processed_key, owners_key, worker_queue_key)
    @redis_url = redis_url
    @zset_key = zset_key
    @processed_key = processed_key
    @owners_key = owners_key
    @worker_queue_key = worker_queue_key
  end

  # Spawns the monitor and waits up to 10 seconds for it to report readiness
  # by writing a line to its stdout.
  #
  # @return [IO] the write end of the command pipe
  # @raise [RuntimeError] if the child stays silent for 10 seconds, or exits
  #   (closing its stdout) before reporting readiness
  def boot!
    child_read, @pipe = IO.pipe
    ready_pipe, child_write = IO.pipe
    @pipe.binmode
    @pid = Process.spawn(
      RbConfig.ruby,
      ::File.join(__dir__, "monitor.rb"),
      @redis_url,
      @zset_key,
      @processed_key,
      @owners_key,
      @worker_queue_key,
      in: child_read,
      out: child_write,
    )
    # The child holds its own copies; ours must be closed so EOF can propagate.
    child_read.close
    child_write.close

    begin
      unless ready_pipe.wait_readable(10)
        abort_boot!
        raise "Monitor child wasn't ready after 10 seconds"
      end

      # The pipe also becomes readable on EOF, i.e. when the child died before
      # printing anything; `gets` returns nil in that case.
      unless ready_pipe.gets
        abort_boot!
        raise "Monitor child exited before signaling it was ready"
      end
    ensure
      ready_pipe.close
    end

    # Check the process is alive.
    Process.kill(0, @pid)
    @pipe
  end

  # Closes the command pipe and waits for the monitor to exit.
  #
  # @return [Process::Status, nil] the child's exit status, or nil if there
  #   was no child left to wait for
  def shutdown!
    @pipe.close
    begin
      _, status = Process.waitpid2(@pid)
      status
    rescue Errno::ECHILD
      nil
    end
  end

  # Asks the monitor to record a heartbeat for the given test.
  #
  # @param id the identifier of the test currently being run
  def tick!(id)
    send_message(:tick!, id: id)
  end

  private

  # Tears down a child that failed to boot: closes the command pipe, kills the
  # child so that reaping it cannot block on a hung process, then reaps it.
  def abort_boot!
    @pipe.close
    Process.kill(:KILL, @pid)
    Process.wait(@pid)
  rescue Errno::ESRCH, Errno::ECHILD
    nil
  end

  # Writes one length-prefixed JSON message to the command pipe.
  def send_message(*message)
    payload = message.to_json
    @pipe.write([payload.bytesize].pack("L"), payload)
  end
end
265
+
266
# Holds the latest command for the heartbeat thread and lets that thread sleep
# until a new command is published or a timeout elapses.
#
# Only the most recent command is kept; it stays readable until overwritten.
class State
  def initialize
    @mutex = Mutex.new
    @cond = ConditionVariable.new
    @state = nil
  end

  # Publishes a new command (stored as an array of the given arguments) and
  # wakes up every thread currently blocked in #wait.
  def set(*state)
    @state = state
    @mutex.synchronize { @cond.broadcast }
  end

  # Blocks until a command is published or `timeout` seconds have elapsed.
  #
  # @param timeout [Numeric] maximum number of seconds to wait
  # @return [Array, nil] the latest command, or nil if none was ever set
  def wait(timeout)
    @mutex.synchronize { @cond.wait(@mutex, timeout) }
    @state
  end
end
287
+
288
# Lazily built, memoized State shared with the heartbeat thread.
#
# @return [State]
def heartbeat_state
  return @heartbeat_state if @heartbeat_state

  @heartbeat_state = State.new
end
291
+
292
# Lazily built, memoized HeartbeatProcess configured with this queue's Redis
# URL and the 'running', 'processed', 'owners' and per-worker queue keys.
#
# @return [HeartbeatProcess]
def heartbeat_process
  return @heartbeat_process if @heartbeat_process

  redis_keys = [
    key('running'),
    key('processed'),
    key('owners'),
    key('worker', worker_id, 'queue'),
  ]
  @heartbeat_process = HeartbeatProcess.new(@redis_url, *redis_keys)
end
301
+
302
# Whether heartbeats are enabled, i.e. whether
# `config.max_missed_heartbeat_seconds` is set.
#
# Returns the configured value itself rather than a strict boolean; callers
# only rely on its truthiness.
def heartbeat_enabled?
  max_missed_seconds = config.max_missed_heartbeat_seconds
  max_missed_seconds
end
305
+
306
# Body of the heartbeat thread.
#
# Roughly once per second -- or immediately when a new command is published --
# it reads the latest command from `heartbeat_state` and acts on it:
#
# * [:tick, id] forwards a tick for `id` to the heartbeat process, at most
#   `config.timeout` times per test.
# * [:reset] restores the tick budget.
# * [:stop] leaves the loop.
#
# The state only keeps the latest command, so a :reset can be overwritten by
# the next :tick before this thread observes it. To keep a test from
# inheriting the previous test's exhausted budget, the budget is also
# restored whenever the ticked id changes.
def heartbeat
  Thread.current.name = "CI::Queue#heartbeat"
  Thread.current.abort_on_exception = true

  timeout = config.timeout.to_i
  current_id = nil
  loop do
    # waits for max 1 second but wakes up immediately if we receive a command
    command = heartbeat_state.wait(1)

    case command&.first
    when :tick
      id = command.last
      unless id == current_id
        # A new test started (its :reset may have been missed): fresh budget.
        current_id = id
        timeout = config.timeout.to_i
      end

      if timeout > 0
        heartbeat_process.tick!(id)
        timeout -= 1
      end
    when :reset
      current_id = nil
      timeout = config.timeout.to_i
    when :stop
      break
    end
  end
end
166
328
  end
167
329
  end
168
330
  end
@@ -0,0 +1,18 @@
1
-- AUTOGENERATED FILE DO NOT EDIT DIRECTLY
-- Heartbeat: bump the score of a test in the sorted set to the given time,
-- but only if the test is not processed yet and is still owned by the caller.
--
-- KEYS[1] sorted set to update      KEYS[2] set of processed tests
-- KEYS[3] hash of test -> owner     KEYS[4] the caller's worker queue key
-- ARGV[1] current time (new score)  ARGV[2] test id
local zset_key, processed_key, owners_key, worker_queue_key = KEYS[1], KEYS[2], KEYS[3], KEYS[4]
local current_time, test = ARGV[1], ARGV[2]

-- already processed, we do not need to bump the timestamp
local already_processed = redis.call('sismember', processed_key, test) == 1
if already_processed then
  return false
end

-- we're still the owner of the test, we can bump the timestamp
local owner = redis.call('hget', owners_key, test)
if owner == worker_queue_key then
  return redis.call('zadd', zset_key, current_time, test)
end
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env -S ruby --disable-gems
2
+ # typed: false
3
+ # frozen_string_literal: true
4
+
5
+ require 'logger'
6
+ require 'redis'
7
+ require 'json'
8
+
9
module CI
  module Queue
    module Redis
      # Heartbeat monitor, run as a standalone child process.
      #
      # It reads length-prefixed JSON messages from a pipe (a header packed
      # with HEADER giving the payload size, then the payload) and dispatches
      # each one to the matching `process_<type>` method. TERM and INT request
      # a shutdown; so does the pipe being closed by its writer.
      class Monitor
        DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../../redis', __FILE__)
        RELEASE_SCRIPTS_ROOT = ::File.expand_path('../../redis', __FILE__)

        # @param pipe [IO] stream the messages are read from
        # @param logger [Logger]
        # @param redis_url [String]
        # @param zset_key [String] first key passed to the heartbeat script
        # @param processed_key [String] second key passed to the heartbeat script
        # @param owners_key [String] third key passed to the heartbeat script
        # @param worker_queue_key [String] fourth key passed to the heartbeat script
        def initialize(pipe, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key)
          @zset_key = zset_key
          @processed_key = processed_key
          @owners_key = owners_key
          @worker_queue_key = worker_queue_key
          @logger = logger
          @redis = ::Redis.new(url: redis_url, reconnect_attempts: [0, 0, 0.1, 0.5, 1, 3, 5])
          @shutdown = false
          @pipe = pipe
          # Self-pipe: signal handlers write a byte to it to wake up IO.select.
          @self_pipe_reader, @self_pipe_writer = IO.pipe
          @self_pipe_writer.sync = true
          @queue = []
          @deadlines = {}
          %i[TERM INT USR1].each do |sig|
            Signal.trap(sig) { soft_signal(sig) }
          end
        end

        # Signal handler body: queue the signal for the main loop and wake it up.
        def soft_signal(sig)
          @queue << sig
          @self_pipe_writer << '.'
        end

        # Handles a "tick!" message by running the heartbeat script for test
        # `id` with the current time. Errors are logged, never raised.
        def process_tick!(id:)
          eval_script(
            :heartbeat,
            keys: [@zset_key, @processed_key, @owners_key, @worker_queue_key],
            argv: [Time.now.to_f, id]
          )
        rescue => error
          @logger.info(error)
        end

        # Runs the named Lua script via EVALSHA, loading it first if needed.
        def eval_script(script, *args)
          @redis.evalsha(load_script(script), *args)
        end

        # Loads the script into Redis once and memoizes the returned SHA.
        def load_script(script)
          @scripts_cache ||= {}
          @scripts_cache[script] ||= @redis.script(:load, read_script(script))
        end

        # Reads the script source, trying the development tree first and
        # falling back to the released location.
        def read_script(name)
          ::File.read(::File.join(DEV_SCRIPTS_ROOT, "#{name}.lua"))
        rescue SystemCallError
          ::File.read(::File.join(RELEASE_SCRIPTS_ROOT, "#{name}.lua"))
        end

        HEADER = 'L'
        HEADER_SIZE = [0].pack(HEADER).bytesize

        # Reads one framed message from `io` without blocking on the header.
        #
        # @return [Object] the parsed JSON message,
        #   nil when no message is currently available, or
        #   false when the stream ended (cleanly or in the middle of a frame),
        #   in which case a shutdown is requested
        def read_message(io)
          case header = io.read_nonblock(HEADER_SIZE, exception: false)
          when :wait_readable
            nil
          when nil
            request_shutdown('Broken pipe, exiting')
          else
            if header.bytesize < HEADER_SIZE
              # Partial header: the remainder is either in flight or never coming.
              remainder = io.read(HEADER_SIZE - header.bytesize)
              header << remainder if remainder
              return request_shutdown('Truncated message header, exiting') if header.bytesize < HEADER_SIZE
            end

            size = header.unpack1(HEADER)
            payload = io.read(size)
            # The writer died mid-frame: shut down instead of feeding a partial
            # payload to the JSON parser.
            if payload.nil? || payload.bytesize < size
              return request_shutdown('Truncated message, exiting')
            end

            JSON.parse(payload)
          end
        end

        # Drains every message currently available on `io`, dispatching each
        # `[type, kwargs]` pair to `process_<type>(**kwargs)`.
        def process_messages(io)
          while (message = read_message(io))
            type, kwargs = message
            kwargs.transform_keys!(&:to_sym)
            public_send("process_#{type}", **kwargs)
          end
        end

        # Waits up to 10 seconds for activity on `ios` and handles whatever
        # became readable. Returns immediately once a shutdown was requested.
        def wait_for_events(ios)
          return if @shutdown

          return unless (ready = IO.select(ios, nil, nil, 10))

          ready[0].each do |io|
            case io
            when @self_pipe_reader
              io.read_nonblock(512, exception: false) # Just flush the pipe, the information is in the @queue
            when @pipe
              process_messages(@pipe)
            else
              @logger.debug("Unknown reader: #{io.inspect}")
              raise "Unknown reader: #{io.inspect}"
            end
          end
        end

        # Main loop: handles queued signals, then waits for events, until a
        # shutdown is requested.
        #
        # @return [Integer] the value stored in @shutdown (0)
        # @raise [RuntimeError] on a queued signal other than INT or TERM
        def monitor
          @logger.debug("Starting monitor")
          ios = [@self_pipe_reader, @pipe]

          until @shutdown
            while (sig = @queue.shift)
              case sig
              when :INT, :TERM
                @logger.debug("Received #{sig}, exiting")
                @shutdown = 0
                break
              else
                raise "Unknown signal: #{sig.inspect}"
              end
            end

            wait_for_events(ios)
          end

          @logger.debug('Done')
          @shutdown
        end

        private

        # Logs `reason`, flags the monitor for shutdown and returns false so
        # that message loops stop.
        def request_shutdown(reason)
          @logger.debug(reason)
          @shutdown = 0
          false
        end
      end
    end
  end
end
131
+
132
# Script entry point: builds the logger, reads the connection details from the
# command line, tells the parent we are ready and runs the monitor loop, using
# its return value as the process exit status.
#
# Usage: monitor.rb REDIS_URL ZSET_KEY PROCESSED_KEY OWNERS_KEY WORKER_QUEUE_KEY [-v]
logger = Logger.new($stderr)
verbose = ARGV.include?('-v')
logger.level = verbose ? Logger::DEBUG : Logger::INFO
unless verbose
  logger.formatter = ->(_severity, _timestamp, _progname, msg) { "[CI Queue Monitor] #{msg}\n" }
end

redis_url, zset_key, processed_key, owners_key, worker_queue_key = ARGV

logger.debug("Starting monitor: #{redis_url} #{zset_key} #{processed_key}")
manager = CI::Queue::Redis::Monitor.new($stdin, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key)

# Notify the parent we're ready
$stdout.puts(".")
$stdout.close

exit(manager.monitor)
@@ -144,10 +144,6 @@ module CI
144
144
  config.worker_id
145
145
  end
146
146
 
147
- def timeout
148
- config.timeout
149
- end
150
-
151
147
  def raise_on_mismatching_test(test)
152
148
  if @reserved_test == test
153
149
  @reserved_test = nil
@@ -180,6 +176,8 @@ module CI
180
176
  end
181
177
 
182
178
  def try_to_reserve_lost_test
179
+ timeout = config.max_missed_heartbeat_seconds ? config.max_missed_heartbeat_seconds : config.timeout
180
+
183
181
  lost_test = eval_script(
184
182
  :reserve_lost,
185
183
  keys: [
@@ -192,7 +190,7 @@ module CI
192
190
  )
193
191
 
194
192
  if lost_test
195
- build.record_warning(Warnings::RESERVED_LOST_TEST, test: lost_test, timeout: timeout)
193
+ build.record_warning(Warnings::RESERVED_LOST_TEST, test: lost_test, timeout: config.timeout)
196
194
  end
197
195
 
198
196
  lost_test
@@ -48,6 +48,16 @@ module CI
48
48
  self
49
49
  end
50
50
 
51
# Heartbeat-less variant: this implementation sends no heartbeats, so the
# block is simply run and its result returned. `id` is accepted only so the
# signature matches the heartbeating implementation.
def with_heartbeat(id)
  result = yield
  result
end
54
+
55
# No-op: this implementation has no heartbeat thread to keep alive.
def ensure_heartbeat_thread_alive!
  nil
end
56
+
57
# No-op: this implementation has no heartbeat process to boot.
def boot_heartbeat_process!
  nil
end
58
+
59
# No-op: this implementation has no heartbeat to stop.
def stop_heartbeat!
  nil
end
60
+
51
61
  def created_at=(timestamp)
52
62
  @created_at ||= timestamp
53
63
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module CI
4
4
  module Queue
5
- VERSION = '0.42.0'
5
+ VERSION = '0.43.0'
6
6
  DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../redis', __FILE__)
7
7
  RELEASE_SCRIPTS_ROOT = ::File.expand_path('../redis', __FILE__)
8
8
  end
data/lib/ci/queue.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'uri'
4
4
  require 'cgi'
5
+ require 'json'
5
6
 
6
7
  require 'ci/queue/version'
7
8
  require 'ci/queue/output_helpers'
@@ -64,6 +64,7 @@ module Minitest
64
64
  end
65
65
 
66
66
  queue.rescue_connection_errors { queue.created_at = CI::Queue.time_now.to_f }
67
+ queue.boot_heartbeat_process!
67
68
 
68
69
  set_load_path
69
70
  Minitest.queue = queue
@@ -582,6 +583,16 @@ module Minitest
582
583
  queue.config.redis_ttl = time
583
584
  end
584
585
 
586
+ help = <<~EOS
587
+ If heartbeat is enabled, a background process will periodically signal it's still processing
588
+ the current test. If the heartbeat stops for the specified amount of seconds,
589
+ the test will be requeued to another worker.
590
+ EOS
591
+ opts.on("--heartbeat [SECONDS]", Integer, help) do |time|
592
+ queue_config.max_missed_heartbeat_seconds = time || 30
593
+ end
594
+
595
+
585
596
  opts.on("-v", "--verbose", "Verbose. Show progress processing files.") do
586
597
  self.verbose = true
587
598
  end
@@ -226,7 +226,10 @@ module Minitest
226
226
 
227
227
  def run_from_queue(reporter, *)
228
228
  queue.poll do |example|
229
- result = example.run
229
+ result = queue.with_heartbeat(example.id) do
230
+ example.run
231
+ end
232
+
230
233
  failed = !(result.passed? || result.skipped?)
231
234
 
232
235
  if example.flaky?
@@ -256,6 +259,7 @@ module Minitest
256
259
  reporter.record(result)
257
260
  end
258
261
  end
262
+ queue.stop_heartbeat!
259
263
  end
260
264
  end
261
265
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ci-queue
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.42.0
4
+ version: 0.43.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jean Boussier
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-01-15 00:00:00.000000000 Z
11
+ date: 2024-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -188,6 +188,8 @@ files:
188
188
  - lib/ci/queue/redis/grind.rb
189
189
  - lib/ci/queue/redis/grind_record.rb
190
190
  - lib/ci/queue/redis/grind_supervisor.rb
191
+ - lib/ci/queue/redis/heartbeat.lua
192
+ - lib/ci/queue/redis/monitor.rb
191
193
  - lib/ci/queue/redis/release.lua
192
194
  - lib/ci/queue/redis/requeue.lua
193
195
  - lib/ci/queue/redis/reserve.lua
@@ -239,7 +241,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
239
241
  - !ruby/object:Gem::Version
240
242
  version: '0'
241
243
  requirements: []
242
- rubygems_version: 3.5.4
244
+ rubygems_version: 3.5.5
243
245
  signing_key:
244
246
  specification_version: 4
245
247
  summary: Distribute tests over many workers using a queue