ci-queue 0.81.0 → 0.83.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +1 -1
  3. data/README.md +109 -0
  4. data/lib/ci/queue/build_record.rb +22 -10
  5. data/lib/ci/queue/class_resolver.rb +38 -0
  6. data/lib/ci/queue/configuration.rb +62 -1
  7. data/lib/ci/queue/file_loader.rb +101 -0
  8. data/lib/ci/queue/queue_entry.rb +56 -0
  9. data/lib/ci/queue/redis/_entry_helpers.lua +10 -0
  10. data/lib/ci/queue/redis/acknowledge.lua +10 -7
  11. data/lib/ci/queue/redis/base.rb +34 -8
  12. data/lib/ci/queue/redis/build_record.rb +89 -22
  13. data/lib/ci/queue/redis/grind_record.rb +17 -13
  14. data/lib/ci/queue/redis/heartbeat.lua +9 -4
  15. data/lib/ci/queue/redis/monitor.rb +19 -5
  16. data/lib/ci/queue/redis/requeue.lua +19 -11
  17. data/lib/ci/queue/redis/reserve.lua +47 -8
  18. data/lib/ci/queue/redis/reserve_lost.lua +5 -1
  19. data/lib/ci/queue/redis/supervisor.rb +3 -3
  20. data/lib/ci/queue/redis/worker.rb +216 -23
  21. data/lib/ci/queue/redis.rb +0 -1
  22. data/lib/ci/queue/version.rb +1 -1
  23. data/lib/ci/queue.rb +27 -0
  24. data/lib/minitest/queue/build_status_recorder.rb +32 -14
  25. data/lib/minitest/queue/grind_recorder.rb +3 -3
  26. data/lib/minitest/queue/junit_reporter.rb +2 -2
  27. data/lib/minitest/queue/lazy_entry_resolver.rb +55 -0
  28. data/lib/minitest/queue/lazy_test_discovery.rb +169 -0
  29. data/lib/minitest/queue/local_requeue_reporter.rb +11 -0
  30. data/lib/minitest/queue/order_reporter.rb +9 -2
  31. data/lib/minitest/queue/queue_population_strategy.rb +176 -0
  32. data/lib/minitest/queue/runner.rb +117 -27
  33. data/lib/minitest/queue/test_data.rb +14 -1
  34. data/lib/minitest/queue/worker_profile_reporter.rb +77 -0
  35. data/lib/minitest/queue.rb +271 -6
  36. metadata +10 -3
  37. data/lib/ci/queue/redis/key_shortener.rb +0 -53
@@ -56,30 +56,71 @@ module CI
56
56
  redis.rpush(key('warnings'), Marshal.dump([type, attributes]))
57
57
  end
58
58
 
59
- def record_error(id, payload, stats: nil)
60
- acknowledged, _ = redis.pipelined do |pipeline|
61
- @queue.acknowledge(id, error: payload, pipeline: pipeline)
62
- record_stats(stats, pipeline: pipeline)
59
+ def record_error(id, payload, stat_delta: nil)
60
+ # Run acknowledge first so we know whether we're the first to ack
61
+ acknowledged = @queue.acknowledge(id, error: payload)
62
+
63
+ if acknowledged
64
+ # We were the first to ack; another worker already ack'd would get falsy from SADD
65
+ @queue.increment_test_failed
66
+ # Only the acknowledging worker's stats include this failure (others skip increment when ack=false).
67
+ # Store so we can subtract it if another worker records success later.
68
+ store_error_report_delta(id, stat_delta) if stat_delta && stat_delta.any?
63
69
  end
64
-
65
- @queue.increment_test_failed if acknowledged == 1
66
- nil
70
+ # Return so caller can roll back local counter when not acknowledged
71
+ !!acknowledged
67
72
  end
68
73
 
69
- def record_success(id, stats: nil, skip_flaky_record: false)
70
- _, error_reports_deleted_count, requeued_count, _ = redis.multi do |transaction|
74
+ def record_success(id, skip_flaky_record: false)
75
+ acknowledged, error_reports_deleted_count, requeued_count, delta_json = redis.multi do |transaction|
71
76
  @queue.acknowledge(id, pipeline: transaction)
72
77
  transaction.hdel(key('error-reports'), id)
73
78
  transaction.hget(key('requeues-count'), id)
74
- record_stats(stats, pipeline: transaction)
79
+ transaction.hget(key('error-report-deltas'), id)
80
+ end
81
+ # When we're replacing a failure, subtract the (single) acknowledging worker's stat contribution
82
+ if error_reports_deleted_count.to_i > 0 && delta_json
83
+ apply_error_report_delta_correction(delta_json)
84
+ redis.hdel(key('error-report-deltas'), id)
75
85
  end
76
86
  record_flaky(id) if !skip_flaky_record && (error_reports_deleted_count.to_i > 0 || requeued_count.to_i > 0)
77
- nil
87
+ # Count this run when we ack'd or when we replaced a failure (so stats delta is applied)
88
+ !!(acknowledged || error_reports_deleted_count.to_i > 0)
78
89
  end
79
90
 
80
- def record_requeue(id, stats: nil)
81
- redis.pipelined do |pipeline|
82
- record_stats(stats, pipeline: pipeline)
91
+ def record_requeue(id)
92
+ true
93
+ end
94
+
95
+ def record_stats(stats = nil, pipeline: nil)
96
+ return unless stats
97
+ if pipeline
98
+ stats.each do |stat_name, stat_value|
99
+ pipeline.hset(key(stat_name), config.worker_id, stat_value)
100
+ pipeline.expire(key(stat_name), config.redis_ttl)
101
+ end
102
+ else
103
+ redis.pipelined do |p|
104
+ record_stats(stats, pipeline: p)
105
+ end
106
+ end
107
+ end
108
+
109
+ # Apply a delta to this worker's stats in Redis (HINCRBY). Use this instead of
110
+ # record_stats when recording per-test so we never overwrite and correction sticks.
111
+ def record_stats_delta(delta, pipeline: nil)
112
+ return if delta.nil? || delta.empty?
113
+ apply_delta = lambda do |p|
114
+ delta.each do |stat_name, value|
115
+ next unless value.is_a?(Numeric) || value.to_s.match?(/\A-?\d+\.?\d*\z/)
116
+ p.hincrbyfloat(key(stat_name), config.worker_id.to_s, value.to_f)
117
+ p.expire(key(stat_name), config.redis_ttl)
118
+ end
119
+ end
120
+ if pipeline
121
+ apply_delta.call(pipeline)
122
+ else
123
+ redis.pipelined { |p| apply_delta.call(p) }
83
124
  end
84
125
  end
85
126
 
@@ -108,6 +149,18 @@ module CI
108
149
  redis.smembers(key('flaky-reports'))
109
150
  end
110
151
 
152
+ def record_worker_profile(profile)
153
+ redis.pipelined do |pipeline|
154
+ pipeline.hset(key('worker-profiles'), config.worker_id, JSON.dump(profile))
155
+ pipeline.expire(key('worker-profiles'), config.redis_ttl)
156
+ end
157
+ end
158
+
159
+ def worker_profiles
160
+ raw = redis.hgetall(key('worker-profiles'))
161
+ raw.transform_values { |v| JSON.parse(v) }
162
+ end
163
+
111
164
  def fetch_stats(stat_names)
112
165
  counts = redis.pipelined do |pipeline|
113
166
  stat_names.each { |c| pipeline.hvals(key(c)) }
@@ -130,16 +183,30 @@ module CI
130
183
 
131
184
  attr_reader :config, :redis
132
185
 
133
- def record_stats(stats, pipeline: redis)
134
- return unless stats
135
- stats.each do |stat_name, stat_value|
136
- pipeline.hset(key(stat_name), config.worker_id, stat_value)
137
- pipeline.expire(key(stat_name), config.redis_ttl)
138
- end
186
+ def key(*args)
187
+ ['build', config.build_id, *args].join(':')
139
188
  end
140
189
 
141
- def key(*args)
142
- KeyShortener.key(config.build_id, *args)
190
+ def store_error_report_delta(test_id, stat_delta)
191
+ # Only the acknowledging worker's stats include this test; store their delta for correction on success
192
+ payload = { 'worker_id' => config.worker_id.to_s }.merge(stat_delta)
193
+ redis.hset(key('error-report-deltas'), test_id, JSON.generate(payload))
194
+ redis.expire(key('error-report-deltas'), config.redis_ttl)
195
+ end
196
+
197
+ def apply_error_report_delta_correction(delta_json)
198
+ delta = JSON.parse(delta_json)
199
+ worker_id = delta.delete('worker_id')&.to_s
200
+ return if worker_id.nil? || worker_id.empty? || delta.empty?
201
+
202
+ redis.pipelined do |pipeline|
203
+ delta.each do |stat_name, value|
204
+ next unless value.is_a?(Numeric) || value.to_s.match?(/\A-?\d+\.?\d*\z/)
205
+
206
+ pipeline.hincrbyfloat(key(stat_name), worker_id, -value.to_f)
207
+ pipeline.expire(key(stat_name), config.redis_ttl)
208
+ end
209
+ end
143
210
  end
144
211
  end
145
212
  end
@@ -10,20 +10,32 @@ module CI
10
10
  @config = config
11
11
  end
12
12
 
13
- def record_error(payload, stats: nil)
13
+ def record_error(payload)
14
14
  redis.pipelined do |pipeline|
15
15
  pipeline.lpush(
16
16
  key('error-reports'),
17
17
  payload,
18
18
  )
19
19
  pipeline.expire(key('error-reports'), config.redis_ttl)
20
- record_stats(stats, pipeline: pipeline)
21
20
  end
22
21
  nil
23
22
  end
24
23
 
25
- def record_success(stats: nil)
26
- record_stats(stats)
24
+ def record_success
25
+ end
26
+
27
+ def record_stats(stats, pipeline: nil)
28
+ return unless stats
29
+ if pipeline
30
+ stats.each do |stat_name, stat_value|
31
+ pipeline.hset(key(stat_name), config.worker_id, stat_value)
32
+ pipeline.expire(key(stat_name), config.redis_ttl)
33
+ end
34
+ else
35
+ redis.pipelined do |p|
36
+ record_stats(stats, pipeline: p)
37
+ end
38
+ end
27
39
  end
28
40
 
29
41
  def record_warning(_,_)
@@ -52,15 +64,7 @@ module CI
52
64
  attr_reader :redis, :config
53
65
 
54
66
  def key(*args)
55
- KeyShortener.key(config.build_id, *args)
56
- end
57
-
58
- def record_stats(stats, pipeline: redis)
59
- return unless stats
60
- stats.each do |stat_name, stat_value|
61
- pipeline.hset(key(stat_name), config.worker_id, stat_value)
62
- pipeline.expire(key(stat_name), config.redis_ttl)
63
- end
67
+ ['build', config.build_id, *args].join(':')
64
68
  end
65
69
  end
66
70
  end
@@ -1,18 +1,23 @@
1
1
  -- AUTOGENERATED FILE DO NOT EDIT DIRECTLY
2
+ -- @include _entry_helpers
3
+
2
4
  local zset_key = KEYS[1]
3
5
  local processed_key = KEYS[2]
4
6
  local owners_key = KEYS[3]
5
7
  local worker_queue_key = KEYS[4]
6
8
 
7
9
  local current_time = ARGV[1]
8
- local test = ARGV[2]
10
+ local entry = ARGV[2]
11
+ local entry_delimiter = ARGV[3]
12
+
13
+ local test_id = test_id_from_entry(entry, entry_delimiter)
9
14
 
10
15
  -- already processed, we do not need to bump the timestamp
11
- if redis.call('sismember', processed_key, test) == 1 then
16
+ if redis.call('sismember', processed_key, test_id) == 1 then
12
17
  return false
13
18
  end
14
19
 
15
20
  -- we're still the owner of the test, we can bump the timestamp
16
- if redis.call('hget', owners_key, test) == worker_queue_key then
17
- return redis.call('zadd', zset_key, current_time, test)
21
+ if redis.call('hget', owners_key, entry) == worker_queue_key then
22
+ return redis.call('zadd', zset_key, current_time, entry)
18
23
  end
@@ -13,11 +13,12 @@ module CI
13
13
  DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../../redis', __FILE__)
14
14
  RELEASE_SCRIPTS_ROOT = ::File.expand_path('../../redis', __FILE__)
15
15
 
16
- def initialize(pipe, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key)
16
+ def initialize(pipe, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key, entry_delimiter)
17
17
  @zset_key = zset_key
18
18
  @processed_key = processed_key
19
19
  @owners_key = owners_key
20
20
  @worker_queue_key = worker_queue_key
21
+ @entry_delimiter = entry_delimiter
21
22
  @logger = logger
22
23
  @redis = ::Redis.new(url: redis_url, reconnect_attempts: [0, 0, 0.1, 0.5, 1, 3, 5])
23
24
  @shutdown = false
@@ -40,7 +41,7 @@ module CI
40
41
  eval_script(
41
42
  :heartbeat,
42
43
  keys: [@zset_key, @processed_key, @owners_key, @worker_queue_key],
43
- argv: [Time.now.to_f, id]
44
+ argv: [Time.now.to_f, id, @entry_delimiter]
44
45
  )
45
46
  rescue => error
46
47
  @logger.info(error)
@@ -56,9 +57,21 @@ module CI
56
57
  end
57
58
 
58
59
  def read_script(name)
59
- ::File.read(::File.join(DEV_SCRIPTS_ROOT, "#{name}.lua"))
60
+ resolve_lua_includes(
61
+ ::File.read(::File.join(DEV_SCRIPTS_ROOT, "#{name}.lua")),
62
+ DEV_SCRIPTS_ROOT,
63
+ )
60
64
  rescue SystemCallError
61
- ::File.read(::File.join(RELEASE_SCRIPTS_ROOT, "#{name}.lua"))
65
+ resolve_lua_includes(
66
+ ::File.read(::File.join(RELEASE_SCRIPTS_ROOT, "#{name}.lua")),
67
+ RELEASE_SCRIPTS_ROOT,
68
+ )
69
+ end
70
+
71
+ def resolve_lua_includes(script, root)
72
+ script.gsub(/^-- @include (\S+)$/) do
73
+ ::File.read(::File.join(root, "#{$1}.lua"))
74
+ end
62
75
  end
63
76
 
64
77
  HEADER = 'L'
@@ -142,9 +155,10 @@ zset_key = ARGV[1]
142
155
  processed_key = ARGV[2]
143
156
  owners_key = ARGV[3]
144
157
  worker_queue_key = ARGV[4]
158
+ entry_delimiter = ARGV[5]
145
159
 
146
160
  logger.debug("Starting monitor: #{redis_url} #{zset_key} #{processed_key}")
147
- manager = CI::Queue::Redis::Monitor.new($stdin, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key)
161
+ manager = CI::Queue::Redis::Monitor.new($stdin, logger, redis_url, zset_key, processed_key, owners_key, worker_queue_key, entry_delimiter)
148
162
 
149
163
  # Notify the parent we're ready
150
164
  $stdout.puts(".")
@@ -6,17 +6,20 @@ local zset_key = KEYS[4]
6
6
  local worker_queue_key = KEYS[5]
7
7
  local owners_key = KEYS[6]
8
8
  local error_reports_key = KEYS[7]
9
+ local requeued_by_key = KEYS[8]
9
10
 
10
11
  local max_requeues = tonumber(ARGV[1])
11
12
  local global_max_requeues = tonumber(ARGV[2])
12
- local test = ARGV[3]
13
- local offset = ARGV[4]
13
+ local entry = ARGV[3]
14
+ local test_id = ARGV[4]
15
+ local offset = ARGV[5]
16
+ local ttl = tonumber(ARGV[6])
14
17
 
15
- if redis.call('hget', owners_key, test) == worker_queue_key then
16
- redis.call('hdel', owners_key, test)
18
+ if redis.call('hget', owners_key, entry) == worker_queue_key then
19
+ redis.call('hdel', owners_key, entry)
17
20
  end
18
21
 
19
- if redis.call('sismember', processed_key, test) == 1 then
22
+ if redis.call('sismember', processed_key, test_id) == 1 then
20
23
  return false
21
24
  end
22
25
 
@@ -25,23 +28,28 @@ if global_requeues and global_requeues >= tonumber(global_max_requeues) then
25
28
  return false
26
29
  end
27
30
 
28
- local requeues = tonumber(redis.call('hget', requeues_count_key, test))
31
+ local requeues = tonumber(redis.call('hget', requeues_count_key, test_id))
29
32
  if requeues and requeues >= max_requeues then
30
33
  return false
31
34
  end
32
35
 
33
36
  redis.call('hincrby', requeues_count_key, '___total___', 1)
34
- redis.call('hincrby', requeues_count_key, test, 1)
37
+ redis.call('hincrby', requeues_count_key, test_id, 1)
35
38
 
36
- redis.call('hdel', error_reports_key, test)
39
+ redis.call('hdel', error_reports_key, test_id)
37
40
 
38
41
  local pivot = redis.call('lrange', queue_key, -1 - offset, 0 - offset)[1]
39
42
  if pivot then
40
- redis.call('linsert', queue_key, 'BEFORE', pivot, test)
43
+ redis.call('linsert', queue_key, 'BEFORE', pivot, entry)
41
44
  else
42
- redis.call('lpush', queue_key, test)
45
+ redis.call('lpush', queue_key, entry)
43
46
  end
44
47
 
45
- redis.call('zrem', zset_key, test)
48
+ redis.call('hset', requeued_by_key, entry, worker_queue_key)
49
+ if ttl and ttl > 0 then
50
+ redis.call('expire', requeued_by_key, ttl)
51
+ end
52
+
53
+ redis.call('zrem', zset_key, entry)
46
54
 
47
55
  return true
@@ -4,15 +4,54 @@ local zset_key = KEYS[2]
4
4
  local processed_key = KEYS[3]
5
5
  local worker_queue_key = KEYS[4]
6
6
  local owners_key = KEYS[5]
7
+ local requeued_by_key = KEYS[6]
8
+ local workers_key = KEYS[7]
7
9
 
8
10
  local current_time = ARGV[1]
11
+ local defer_offset = tonumber(ARGV[2]) or 0
12
+ local max_skip_attempts = 4
9
13
 
10
- local test = redis.call('rpop', queue_key)
11
- if test then
12
- redis.call('zadd', zset_key, current_time, test)
13
- redis.call('lpush', worker_queue_key, test)
14
- redis.call('hset', owners_key, test, worker_queue_key)
15
- return test
16
- else
17
- return nil
14
+ local function insert_with_offset(test)
15
+ local pivot = redis.call('lrange', queue_key, -1 - defer_offset, 0 - defer_offset)[1]
16
+ if pivot then
17
+ redis.call('linsert', queue_key, 'BEFORE', pivot, test)
18
+ else
19
+ redis.call('lpush', queue_key, test)
20
+ end
18
21
  end
22
+
23
+ for attempt = 1, max_skip_attempts do
24
+ local test = redis.call('rpop', queue_key)
25
+ if not test then
26
+ return nil
27
+ end
28
+
29
+ local requeued_by = redis.call('hget', requeued_by_key, test)
30
+ if requeued_by == worker_queue_key then
31
+ -- If this build only has one worker, allow immediate self-pickup.
32
+ if redis.call('scard', workers_key) <= 1 then
33
+ redis.call('hdel', requeued_by_key, test)
34
+ redis.call('zadd', zset_key, current_time, test)
35
+ redis.call('lpush', worker_queue_key, test)
36
+ redis.call('hset', owners_key, test, worker_queue_key)
37
+ return test
38
+ end
39
+
40
+ insert_with_offset(test)
41
+
42
+ -- If this worker only finds its own requeued tests, defer once by returning nil,
43
+ -- then allow pickup on a subsequent reserve attempt.
44
+ if attempt == max_skip_attempts then
45
+ redis.call('hdel', requeued_by_key, test)
46
+ return nil
47
+ end
48
+ else
49
+ redis.call('hdel', requeued_by_key, test)
50
+ redis.call('zadd', zset_key, current_time, test)
51
+ redis.call('lpush', worker_queue_key, test)
52
+ redis.call('hset', owners_key, test, worker_queue_key)
53
+ return test
54
+ end
55
+ end
56
+
57
+ return nil
@@ -1,4 +1,6 @@
1
1
  -- AUTOGENERATED FILE DO NOT EDIT DIRECTLY
2
+ -- @include _entry_helpers
3
+
2
4
  local zset_key = KEYS[1]
3
5
  local processed_key = KEYS[2]
4
6
  local worker_queue_key = KEYS[3]
@@ -6,10 +8,12 @@ local owners_key = KEYS[4]
6
8
 
7
9
  local current_time = ARGV[1]
8
10
  local timeout = ARGV[2]
11
+ local entry_delimiter = ARGV[3]
9
12
 
10
13
  local lost_tests = redis.call('zrangebyscore', zset_key, 0, current_time - timeout)
11
14
  for _, test in ipairs(lost_tests) do
12
- if redis.call('sismember', processed_key, test) == 0 then
15
+ local test_id = test_id_from_entry(test, entry_delimiter)
16
+ if redis.call('sismember', processed_key, test_id) == 0 then
13
17
  redis.call('zadd', zset_key, current_time, test)
14
18
  redis.call('lpush', worker_queue_key, test)
15
19
  redis.call('hset', owners_key, test, worker_queue_key) -- Take ownership
@@ -9,7 +9,7 @@ module CI
9
9
  end
10
10
 
11
11
  def total
12
- wait_for_master(timeout: config.queue_init_timeout)
12
+ wait_for_master(timeout: config.queue_init_timeout, allow_streaming: true)
13
13
  redis.get(key('total')).to_i
14
14
  end
15
15
 
@@ -19,7 +19,7 @@ module CI
19
19
 
20
20
  def wait_for_workers
21
21
  duration = measure do
22
- wait_for_master(timeout: config.queue_init_timeout)
22
+ wait_for_master(timeout: config.queue_init_timeout, allow_streaming: true)
23
23
  end
24
24
 
25
25
  yield if block_given?
@@ -30,7 +30,7 @@ module CI
30
30
  @time_left -= 1
31
31
  sleep 1
32
32
 
33
- if active_workers?
33
+ if active_workers? || streaming?
34
34
  @time_left_with_no_workers = config.inactive_workers_timeout
35
35
  else
36
36
  @time_left_with_no_workers -= 1