ci-queue 0.82.0 → 0.84.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/Gemfile.lock +59 -47
  4. data/README.md +87 -0
  5. data/ci-queue.gemspec +3 -1
  6. data/lib/ci/queue/build_record.rb +5 -5
  7. data/lib/ci/queue/class_resolver.rb +38 -0
  8. data/lib/ci/queue/configuration.rb +62 -1
  9. data/lib/ci/queue/file_loader.rb +101 -0
  10. data/lib/ci/queue/queue_entry.rb +48 -0
  11. data/lib/ci/queue/redis/acknowledge.lua +7 -5
  12. data/lib/ci/queue/redis/base.rb +29 -6
  13. data/lib/ci/queue/redis/build_record.rb +29 -17
  14. data/lib/ci/queue/redis/heartbeat.lua +4 -4
  15. data/lib/ci/queue/redis/monitor.rb +14 -2
  16. data/lib/ci/queue/redis/requeue.lua +17 -10
  17. data/lib/ci/queue/redis/reserve.lua +47 -8
  18. data/lib/ci/queue/redis/supervisor.rb +3 -3
  19. data/lib/ci/queue/redis/worker.rb +210 -27
  20. data/lib/ci/queue/static.rb +5 -5
  21. data/lib/ci/queue/version.rb +1 -1
  22. data/lib/ci/queue.rb +27 -0
  23. data/lib/minitest/queue/build_status_recorder.rb +4 -4
  24. data/lib/minitest/queue/junit_reporter.rb +2 -2
  25. data/lib/minitest/queue/lazy_entry_resolver.rb +55 -0
  26. data/lib/minitest/queue/lazy_test_discovery.rb +169 -0
  27. data/lib/minitest/queue/local_requeue_reporter.rb +11 -0
  28. data/lib/minitest/queue/order_reporter.rb +9 -2
  29. data/lib/minitest/queue/queue_population_strategy.rb +176 -0
  30. data/lib/minitest/queue/runner.rb +97 -22
  31. data/lib/minitest/queue/test_data.rb +15 -2
  32. data/lib/minitest/queue/worker_profile_reporter.rb +77 -0
  33. data/lib/minitest/queue.rb +278 -10
  34. data/lib/rspec/queue/build_status_recorder.rb +4 -2
  35. data/lib/rspec/queue.rb +6 -2
  36. metadata +38 -3
@@ -144,19 +144,26 @@ module CI
144
144
  end
145
145
 
146
146
  def to_a
147
- test_ids.reverse.map { |k| index.fetch(k) }
147
+ test_ids.reverse.map do |entry|
148
+ index.fetch(entry) do
149
+ test_id = CI::Queue::QueueEntry.test_id(entry)
150
+ index.fetch(test_id)
151
+ end
152
+ end
148
153
  end
149
154
 
150
155
  def progress
151
- total - size
156
+ progress = total - size
157
+ progress < 0 ? 0 : progress
152
158
  end
153
159
 
154
- def wait_for_master(timeout: 30)
160
+ def wait_for_master(timeout: 30, allow_streaming: false)
155
161
  return true if master?
156
162
  return true if queue_initialized?
163
+ return true if allow_streaming && streaming?
157
164
 
158
165
  (timeout * 10 + 1).to_i.times do
159
- if queue_initialized?
166
+ if queue_initialized? || (allow_streaming && streaming?)
160
167
  return true
161
168
  else
162
169
  sleep 0.1
@@ -177,6 +184,10 @@ module CI
177
184
  end
178
185
  end
179
186
 
187
+ def streaming?
188
+ master_status == 'streaming'
189
+ end
190
+
180
191
  def queue_initializing?
181
192
  master_status == 'setup'
182
193
  end
@@ -235,9 +246,21 @@ module CI
235
246
  end
236
247
 
237
248
  def read_script(name)
238
- ::File.read(::File.join(CI::Queue::DEV_SCRIPTS_ROOT, "#{name}.lua"))
249
+ resolve_lua_includes(
250
+ ::File.read(::File.join(CI::Queue::DEV_SCRIPTS_ROOT, "#{name}.lua")),
251
+ CI::Queue::DEV_SCRIPTS_ROOT,
252
+ )
239
253
  rescue SystemCallError
240
- ::File.read(::File.join(CI::Queue::RELEASE_SCRIPTS_ROOT, "#{name}.lua"))
254
+ resolve_lua_includes(
255
+ ::File.read(::File.join(CI::Queue::RELEASE_SCRIPTS_ROOT, "#{name}.lua")),
256
+ CI::Queue::RELEASE_SCRIPTS_ROOT,
257
+ )
258
+ end
259
+
260
+ def resolve_lua_includes(script, root)
261
+ script.gsub(/^-- @include (\S+)$/) do
262
+ ::File.read(::File.join(root, "#{$1}.lua"))
263
+ end
241
264
  end
242
265
 
243
266
  class HeartbeatProcess
@@ -33,14 +33,14 @@ module CI
33
33
  end
34
34
 
35
35
  def failed_tests
36
- redis.hkeys(key('error-reports'))
36
+ redis.hkeys(key('error-reports')).map { |entry| CI::Queue::QueueEntry.test_id(entry) }
37
37
  end
38
38
 
39
39
  TOTAL_KEY = "___total___"
40
40
  def requeued_tests
41
41
  requeues = redis.hgetall(key('requeues-count'))
42
42
  requeues.delete(TOTAL_KEY)
43
- requeues
43
+ requeues.transform_keys { |entry| CI::Queue::QueueEntry.test_id(entry) }
44
44
  end
45
45
 
46
46
  def pop_warnings
@@ -56,39 +56,39 @@ module CI
56
56
  redis.rpush(key('warnings'), Marshal.dump([type, attributes]))
57
57
  end
58
58
 
59
- def record_error(id, payload, stat_delta: nil)
59
+ def record_error(entry, payload, stat_delta: nil)
60
60
  # Run acknowledge first so we know whether we're the first to ack
61
- acknowledged = @queue.acknowledge(id, error: payload)
61
+ acknowledged = @queue.acknowledge(entry, error: payload)
62
62
 
63
63
  if acknowledged
64
64
  # We were the first to ack; another worker already ack'd would get falsy from SADD
65
65
  @queue.increment_test_failed
66
66
  # Only the acknowledging worker's stats include this failure (others skip increment when ack=false).
67
67
  # Store so we can subtract it if another worker records success later.
68
- store_error_report_delta(id, stat_delta) if stat_delta && stat_delta.any?
68
+ store_error_report_delta(entry, stat_delta) if stat_delta && stat_delta.any?
69
69
  end
70
70
  # Return so caller can roll back local counter when not acknowledged
71
71
  !!acknowledged
72
72
  end
73
73
 
74
- def record_success(id, skip_flaky_record: false)
74
+ def record_success(entry, skip_flaky_record: false)
75
75
  acknowledged, error_reports_deleted_count, requeued_count, delta_json = redis.multi do |transaction|
76
- @queue.acknowledge(id, pipeline: transaction)
77
- transaction.hdel(key('error-reports'), id)
78
- transaction.hget(key('requeues-count'), id)
79
- transaction.hget(key('error-report-deltas'), id)
76
+ @queue.acknowledge(entry, pipeline: transaction)
77
+ transaction.hdel(key('error-reports'), entry)
78
+ transaction.hget(key('requeues-count'), entry)
79
+ transaction.hget(key('error-report-deltas'), entry)
80
80
  end
81
81
  # When we're replacing a failure, subtract the (single) acknowledging worker's stat contribution
82
82
  if error_reports_deleted_count.to_i > 0 && delta_json
83
83
  apply_error_report_delta_correction(delta_json)
84
- redis.hdel(key('error-report-deltas'), id)
84
+ redis.hdel(key('error-report-deltas'), entry)
85
85
  end
86
- record_flaky(id) if !skip_flaky_record && (error_reports_deleted_count.to_i > 0 || requeued_count.to_i > 0)
86
+ record_flaky(entry) if !skip_flaky_record && (error_reports_deleted_count.to_i > 0 || requeued_count.to_i > 0)
87
87
  # Count this run when we ack'd or when we replaced a failure (so stats delta is applied)
88
88
  !!(acknowledged || error_reports_deleted_count.to_i > 0)
89
89
  end
90
90
 
91
- def record_requeue(id)
91
+ def record_requeue(entry)
92
92
  true
93
93
  end
94
94
 
@@ -142,11 +142,23 @@ module CI
142
142
  end
143
143
 
144
144
  def error_reports
145
- redis.hgetall(key('error-reports'))
145
+ redis.hgetall(key('error-reports')).transform_keys { |entry| CI::Queue::QueueEntry.test_id(entry) }
146
146
  end
147
147
 
148
148
  def flaky_reports
149
- redis.smembers(key('flaky-reports'))
149
+ redis.smembers(key('flaky-reports')).map { |entry| CI::Queue::QueueEntry.test_id(entry) }
150
+ end
151
+
152
+ def record_worker_profile(profile)
153
+ redis.pipelined do |pipeline|
154
+ pipeline.hset(key('worker-profiles'), config.worker_id, JSON.dump(profile))
155
+ pipeline.expire(key('worker-profiles'), config.redis_ttl)
156
+ end
157
+ end
158
+
159
+ def worker_profiles
160
+ raw = redis.hgetall(key('worker-profiles'))
161
+ raw.transform_values { |v| JSON.parse(v) }
150
162
  end
151
163
 
152
164
  def fetch_stats(stat_names)
@@ -175,10 +187,10 @@ module CI
175
187
  ['build', config.build_id, *args].join(':')
176
188
  end
177
189
 
178
- def store_error_report_delta(test_id, stat_delta)
190
+ def store_error_report_delta(entry, stat_delta)
179
191
  # Only the acknowledging worker's stats include this test; store their delta for correction on success
180
192
  payload = { 'worker_id' => config.worker_id.to_s }.merge(stat_delta)
181
- redis.hset(key('error-report-deltas'), test_id, JSON.generate(payload))
193
+ redis.hset(key('error-report-deltas'), entry, JSON.generate(payload))
182
194
  redis.expire(key('error-report-deltas'), config.redis_ttl)
183
195
  end
184
196
 
@@ -5,14 +5,14 @@ local owners_key = KEYS[3]
5
5
  local worker_queue_key = KEYS[4]
6
6
 
7
7
  local current_time = ARGV[1]
8
- local test = ARGV[2]
8
+ local entry = ARGV[2]
9
9
 
10
10
  -- already processed, we do not need to bump the timestamp
11
- if redis.call('sismember', processed_key, test) == 1 then
11
+ if redis.call('sismember', processed_key, entry) == 1 then
12
12
  return false
13
13
  end
14
14
 
15
15
  -- we're still the owner of the test, we can bump the timestamp
16
- if redis.call('hget', owners_key, test) == worker_queue_key then
17
- return redis.call('zadd', zset_key, current_time, test)
16
+ if redis.call('hget', owners_key, entry) == worker_queue_key then
17
+ return redis.call('zadd', zset_key, current_time, entry)
18
18
  end
@@ -56,9 +56,21 @@ module CI
56
56
  end
57
57
 
58
58
  def read_script(name)
59
- ::File.read(::File.join(DEV_SCRIPTS_ROOT, "#{name}.lua"))
59
+ resolve_lua_includes(
60
+ ::File.read(::File.join(DEV_SCRIPTS_ROOT, "#{name}.lua")),
61
+ DEV_SCRIPTS_ROOT,
62
+ )
60
63
  rescue SystemCallError
61
- ::File.read(::File.join(RELEASE_SCRIPTS_ROOT, "#{name}.lua"))
64
+ resolve_lua_includes(
65
+ ::File.read(::File.join(RELEASE_SCRIPTS_ROOT, "#{name}.lua")),
66
+ RELEASE_SCRIPTS_ROOT,
67
+ )
68
+ end
69
+
70
+ def resolve_lua_includes(script, root)
71
+ script.gsub(/^-- @include (\S+)$/) do
72
+ ::File.read(::File.join(root, "#{$1}.lua"))
73
+ end
62
74
  end
63
75
 
64
76
  HEADER = 'L'
@@ -6,17 +6,19 @@ local zset_key = KEYS[4]
6
6
  local worker_queue_key = KEYS[5]
7
7
  local owners_key = KEYS[6]
8
8
  local error_reports_key = KEYS[7]
9
+ local requeued_by_key = KEYS[8]
9
10
 
10
11
  local max_requeues = tonumber(ARGV[1])
11
12
  local global_max_requeues = tonumber(ARGV[2])
12
- local test = ARGV[3]
13
+ local entry = ARGV[3]
13
14
  local offset = ARGV[4]
15
+ local ttl = tonumber(ARGV[5])
14
16
 
15
- if redis.call('hget', owners_key, test) == worker_queue_key then
16
- redis.call('hdel', owners_key, test)
17
+ if redis.call('hget', owners_key, entry) == worker_queue_key then
18
+ redis.call('hdel', owners_key, entry)
17
19
  end
18
20
 
19
- if redis.call('sismember', processed_key, test) == 1 then
21
+ if redis.call('sismember', processed_key, entry) == 1 then
20
22
  return false
21
23
  end
22
24
 
@@ -25,23 +27,28 @@ if global_requeues and global_requeues >= tonumber(global_max_requeues) then
25
27
  return false
26
28
  end
27
29
 
28
- local requeues = tonumber(redis.call('hget', requeues_count_key, test))
30
+ local requeues = tonumber(redis.call('hget', requeues_count_key, entry))
29
31
  if requeues and requeues >= max_requeues then
30
32
  return false
31
33
  end
32
34
 
33
35
  redis.call('hincrby', requeues_count_key, '___total___', 1)
34
- redis.call('hincrby', requeues_count_key, test, 1)
36
+ redis.call('hincrby', requeues_count_key, entry, 1)
35
37
 
36
- redis.call('hdel', error_reports_key, test)
38
+ redis.call('hdel', error_reports_key, entry)
37
39
 
38
40
  local pivot = redis.call('lrange', queue_key, -1 - offset, 0 - offset)[1]
39
41
  if pivot then
40
- redis.call('linsert', queue_key, 'BEFORE', pivot, test)
42
+ redis.call('linsert', queue_key, 'BEFORE', pivot, entry)
41
43
  else
42
- redis.call('lpush', queue_key, test)
44
+ redis.call('lpush', queue_key, entry)
43
45
  end
44
46
 
45
- redis.call('zrem', zset_key, test)
47
+ redis.call('hset', requeued_by_key, entry, worker_queue_key)
48
+ if ttl and ttl > 0 then
49
+ redis.call('expire', requeued_by_key, ttl)
50
+ end
51
+
52
+ redis.call('zrem', zset_key, entry)
46
53
 
47
54
  return true
@@ -4,15 +4,54 @@ local zset_key = KEYS[2]
4
4
  local processed_key = KEYS[3]
5
5
  local worker_queue_key = KEYS[4]
6
6
  local owners_key = KEYS[5]
7
+ local requeued_by_key = KEYS[6]
8
+ local workers_key = KEYS[7]
7
9
 
8
10
  local current_time = ARGV[1]
11
+ local defer_offset = tonumber(ARGV[2]) or 0
12
+ local max_skip_attempts = 4
9
13
 
10
- local test = redis.call('rpop', queue_key)
11
- if test then
12
- redis.call('zadd', zset_key, current_time, test)
13
- redis.call('lpush', worker_queue_key, test)
14
- redis.call('hset', owners_key, test, worker_queue_key)
15
- return test
16
- else
17
- return nil
14
+ local function insert_with_offset(test)
15
+ local pivot = redis.call('lrange', queue_key, -1 - defer_offset, 0 - defer_offset)[1]
16
+ if pivot then
17
+ redis.call('linsert', queue_key, 'BEFORE', pivot, test)
18
+ else
19
+ redis.call('lpush', queue_key, test)
20
+ end
18
21
  end
22
+
23
+ for attempt = 1, max_skip_attempts do
24
+ local test = redis.call('rpop', queue_key)
25
+ if not test then
26
+ return nil
27
+ end
28
+
29
+ local requeued_by = redis.call('hget', requeued_by_key, test)
30
+ if requeued_by == worker_queue_key then
31
+ -- If this build only has one worker, allow immediate self-pickup.
32
+ if redis.call('scard', workers_key) <= 1 then
33
+ redis.call('hdel', requeued_by_key, test)
34
+ redis.call('zadd', zset_key, current_time, test)
35
+ redis.call('lpush', worker_queue_key, test)
36
+ redis.call('hset', owners_key, test, worker_queue_key)
37
+ return test
38
+ end
39
+
40
+ insert_with_offset(test)
41
+
42
+ -- If this worker only finds its own requeued tests, defer once by returning nil,
43
+ -- then allow pickup on a subsequent reserve attempt.
44
+ if attempt == max_skip_attempts then
45
+ redis.call('hdel', requeued_by_key, test)
46
+ return nil
47
+ end
48
+ else
49
+ redis.call('hdel', requeued_by_key, test)
50
+ redis.call('zadd', zset_key, current_time, test)
51
+ redis.call('lpush', worker_queue_key, test)
52
+ redis.call('hset', owners_key, test, worker_queue_key)
53
+ return test
54
+ end
55
+ end
56
+
57
+ return nil
@@ -9,7 +9,7 @@ module CI
9
9
  end
10
10
 
11
11
  def total
12
- wait_for_master(timeout: config.queue_init_timeout)
12
+ wait_for_master(timeout: config.queue_init_timeout, allow_streaming: true)
13
13
  redis.get(key('total')).to_i
14
14
  end
15
15
 
@@ -19,7 +19,7 @@ module CI
19
19
 
20
20
  def wait_for_workers
21
21
  duration = measure do
22
- wait_for_master(timeout: config.queue_init_timeout)
22
+ wait_for_master(timeout: config.queue_init_timeout, allow_streaming: true)
23
23
  end
24
24
 
25
25
  yield if block_given?
@@ -30,7 +30,7 @@ module CI
30
30
  @time_left -= 1
31
31
  sleep 1
32
32
 
33
- if active_workers?
33
+ if active_workers? || streaming?
34
34
  @time_left_with_no_workers = config.inactive_workers_timeout
35
35
  else
36
36
  @time_left_with_no_workers -= 1