minitest-distributed 0.1.2 → 0.2.0
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/Gemfile +1 -1
- data/README.md +29 -13
- data/bin/setup +0 -2
- data/lib/minitest/distributed/configuration.rb +49 -4
- data/lib/minitest/distributed/coordinators/coordinator_interface.rb +3 -0
- data/lib/minitest/distributed/coordinators/memory_coordinator.rb +29 -9
- data/lib/minitest/distributed/coordinators/redis_coordinator.rb +258 -156
- data/lib/minitest/distributed/enqueued_runnable.rb +193 -41
- data/lib/minitest/distributed/filters/exclude_filter.rb +4 -4
- data/lib/minitest/distributed/filters/filter_interface.rb +3 -3
- data/lib/minitest/distributed/filters/include_filter.rb +4 -4
- data/lib/minitest/distributed/reporters/distributed_progress_reporter.rb +2 -2
- data/lib/minitest/distributed/reporters/distributed_summary_reporter.rb +49 -10
- data/lib/minitest/distributed/reporters/redis_coordinator_warnings_reporter.rb +11 -16
- data/lib/minitest/distributed/result_aggregate.rb +38 -9
- data/lib/minitest/distributed/result_type.rb +76 -2
- data/lib/minitest/distributed/test_selector.rb +4 -6
- data/lib/minitest/distributed/version.rb +1 -1
- data/lib/minitest/distributed_plugin.rb +1 -25
- data/sorbet/rbi/minitest.rbi +18 -3
- data/sorbet/rbi/redis.rbi +19 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 076d9467680eff44b28d42436648c5f4878fb5689bca7ddd9365bd05991a5352
+  data.tar.gz: '0068262916e339e61d4997eb478df915be5943d3a6b39e70fb02def9ba0d0dbb'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d71ead8d7f352d9d628ec6682d2367804c63362e1e02e658ae3899b55486e1ff1bedd1cc10e23796a3373a0a3d0580b874646c4f7e3ec472675b8b85d88b001
+  data.tar.gz: cafd308bdad9ee0332323e0e826810fee2be91cbb5bdafa74ec8037891e9a6031ffa6f4f1e4777527e0f21784a48502848893661120145749629240164c3c077
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -63,8 +63,8 @@ them to fail.
 
 ### Other optional command line arguments
 
-- `--test-timeout=SECONDS` or `ENV[
-  maximum amount a test is allowed to run before it times out. In a distributed
+- `--test-timeout=SECONDS` or `ENV[MINITEST_TEST_TIMEOUT_SECONDS]` (default: 30s):
+  the maximum amount a test is allowed to run before it times out. In a distributed
   system, it's impossible to differentiate between a worker being slow and a
   worker being broken. When the timeout passes, the other workers will assume
   that the worker running the test has crashed, and will attempt to claim this
@@ -92,24 +92,40 @@ other tests.
 
 ## Development
 
-
-run `rake test` to run the tests. You can also run `bin/console` for an
-interactive prompt that will allow you to experiment.
+To bootstrap a local development environment:
 
-
-
-
-
-
+- Run `bin/setup` to install dependencies.
+- Start a Redis server by running `redis-server`, assuming you have Redis
+  installed locally and the binary is on your `PATH`. Alternatively, you can
+  set the `REDIS_URL` environment variable to point to a Redis instance running
+  elsewhere.
+- Now, run `bin/rake test` to run the tests, and verify everything is working.
+- You can also run `bin/console` for an interactive prompt that will allow you
+  to experiment.
+
+### Releasing a new version
+
+- To install this gem onto your local machine, run `bin/rake install`.
+- Only people at Shopify can release a new version to
+  [rubygems.org](https://rubygems.org). To do so, update the `VERSION` constant
+  in `version.rb`, and merge to master. Shipit will take care of building the
+  `.gem` bundle, and pushing it to rubygems.org.
 
 ## Contributing
 
-Bug reports and pull requests are welcome on GitHub at
+Bug reports and pull requests are welcome on GitHub at
+https://github.com/Shopify/minitest-distributed. This project is intended to
+be a safe, welcoming space for collaboration, and contributors are expected to
+adhere to the [code of
+conduct](https://github.com/Shopify/minitest-distributed/blob/master/CODE_OF_CONDUCT.md).
 
 ## License
 
-The gem is available as open source under the terms of the [MIT
+The gem is available as open source under the terms of the [MIT
+License](https://opensource.org/licenses/MIT).
 
 ## Code of Conduct
 
-Everyone interacting in the
+Everyone interacting in the `minitest-distributed` project's codebases, issue
+trackers, chat rooms and mailing lists is expected to follow the [code of
+conduct](https://github.com/Shopify/minitest-distributed/blob/master/CODE_OF_CONDUCT.md).
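The configuration surface this release documents is easiest to see end to end. A minimal sketch of a CI invocation, assuming the environment variable names read by `Configuration.from_env` in the `configuration.rb` hunks below; the Redis URL and ID values are placeholders:

```ruby
# Hypothetical worker setup: every variable here corresponds to a key
# that Configuration.from_env reads in the 0.2.0 code shown below.
ENV['MINITEST_COORDINATOR']          = 'redis://localhost:6379/0'
ENV['MINITEST_RUN_ID']               = 'build-1234' # shared by all workers in one run
ENV['MINITEST_WORKER_ID']            = 'worker-1'   # unique per worker
ENV['MINITEST_TEST_TIMEOUT_SECONDS'] = '30'
ENV['MINITEST_MAX_ATTEMPTS']         = '3'
ENV['MINITEST_MAX_FAILURES']         = '50'

require 'minitest/autorun' # the minitest-distributed plugin picks these up
```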
data/bin/setup
CHANGED
data/lib/minitest/distributed/configuration.rb
CHANGED
@@ -8,8 +8,8 @@ module Minitest
   module Distributed
     class Configuration < T::Struct
       DEFAULT_BATCH_SIZE = 10
-      DEFAULT_MAX_ATTEMPTS =
-
+      DEFAULT_MAX_ATTEMPTS = 1
+      DEFAULT_TEST_TIMEOUT_SECONDS = 30.0 # seconds
 
       class << self
         extend T::Sig
@@ -20,11 +20,54 @@ module Minitest
             coordinator_uri: URI(env['MINITEST_COORDINATOR'] || 'memory:'),
             run_id: env['MINITEST_RUN_ID'] || SecureRandom.uuid,
             worker_id: env['MINITEST_WORKER_ID'] || SecureRandom.uuid,
-
+            test_timeout_seconds: Float(env['MINITEST_TEST_TIMEOUT_SECONDS'] || DEFAULT_TEST_TIMEOUT_SECONDS),
             test_batch_size: Integer(env['MINITEST_TEST_BATCH_SIZE'] || DEFAULT_BATCH_SIZE),
             max_attempts: Integer(env['MINITEST_MAX_ATTEMPTS'] || DEFAULT_MAX_ATTEMPTS),
+            max_failures: (max_failures_env = env['MINITEST_MAX_FAILURES']) ? Integer(max_failures_env) : nil,
           )
         end
+
+        sig { params(opts: OptionParser).returns(T.attached_class) }
+        def from_command_line_options(opts)
+          configuration = from_env
+
+          opts.on('--coordinator=URI', "The URI pointing to the coordinator") do |uri|
+            configuration.coordinator_uri = URI.parse(uri)
+          end
+
+          opts.on('--test-timeout=TIMEOUT', "The maximum run time for a single test in seconds") do |timeout|
+            configuration.test_timeout_seconds = Float(timeout)
+          end
+
+          opts.on('--max-attempts=ATTEMPTS', "The maximum number of attempts to run a test") do |attempts|
+            configuration.max_attempts = Integer(attempts)
+          end
+
+          opts.on('--test-batch-size=NUMBER', "The number of tests to process per batch") do |batch_size|
+            configuration.test_batch_size = Integer(batch_size)
+          end
+
+          opts.on('--max-failures=FAILURES', "The maximum allowed number of failures before aborting a run") do |failures|
+            configuration.max_failures = Integer(failures)
+          end
+
+          opts.on('--run-id=ID', "The ID for this run, shared between coordinated workers") do |id|
+            configuration.run_id = id
+          end
+
+          opts.on('--worker-id=ID', "The unique ID for this worker") do |id|
+            configuration.worker_id = id
+          end
+
+          opts.on(
+            '--[no-]retry-failures', "Retry failed and errored tests from a previous run attempt " \
+              "with the same run ID (default: enabled)"
+          ) do |enabled|
+            configuration.retry_failures = enabled
+          end
+
+          configuration
+        end
       end
 
       extend T::Sig
@@ -32,9 +75,11 @@ module Minitest
       prop :coordinator_uri, URI::Generic, default: URI('memory:')
       prop :run_id, String, factory: -> { SecureRandom.uuid }
       prop :worker_id, String, factory: -> { SecureRandom.uuid }
-      prop :
+      prop :test_timeout_seconds, Float, default: DEFAULT_TEST_TIMEOUT_SECONDS
       prop :test_batch_size, Integer, default: DEFAULT_BATCH_SIZE
       prop :max_attempts, Integer, default: DEFAULT_MAX_ATTEMPTS
+      prop :max_failures, T.nilable(Integer)
+      prop :retry_failures, T::Boolean, default: true
 
       sig { returns(Coordinators::CoordinatorInterface) }
       def coordinator
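Note that `from_command_line_options` only registers handlers on the parser it is given; nothing is assigned until the parser runs. A minimal sketch of that wiring (in practice minitest's own plugin hook supplies the OptionParser):

```ruby
require 'optparse'

opts = OptionParser.new
configuration = Minitest::Distributed::Configuration.from_command_line_options(opts)

# Values are applied when the parser actually consumes the arguments:
opts.parse(['--coordinator=redis://localhost:6379', '--max-failures=10', '--no-retry-failures'])

configuration.coordinator_uri # => #<URI::Generic redis://localhost:6379>
configuration.max_failures    # => 10
configuration.retry_failures  # => false
```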
data/lib/minitest/distributed/coordinators/memory_coordinator.rb
CHANGED
@@ -25,7 +25,8 @@ module Minitest
 
         @leader = T.let(Mutex.new, Mutex)
         @queue = T.let(Queue.new, Queue)
-        @local_results = T.let(ResultAggregate.new, ResultAggregate)
+        @local_results = T.let(ResultAggregate.new(max_failures: configuration.max_failures), ResultAggregate)
+        @aborted = T.let(false, T::Boolean)
       end
 
       sig { override.params(reporter: Minitest::CompositeReporter, options: T::Hash[Symbol, T.untyped]).void }
@@ -33,6 +34,11 @@ module Minitest
         # No need for any additional reporters
       end
 
+      sig { override.returns(T::Boolean) }
+      def aborted?
+        @aborted
+      end
+
       sig { override.params(test_selector: TestSelector).void }
       def produce(test_selector:)
         if @leader.try_lock
@@ -41,24 +47,38 @@ module Minitest
           if tests.empty?
             queue.close
           else
-            tests.each
+            tests.each do |runnable|
+              queue << EnqueuedRunnable.new(
+                class_name: T.must(runnable.class.name),
+                method_name: runnable.name,
+                test_timeout_seconds: configuration.test_timeout_seconds,
+                max_attempts: configuration.max_attempts,
+              )
+            end
           end
         end
       end
 
       sig { override.params(reporter: AbstractReporter).void }
       def consume(reporter:)
-        until queue.
-          enqueued_runnable = queue.pop
+        until queue.closed?
+          enqueued_runnable = T.let(queue.pop, EnqueuedRunnable)
+
           reporter.prerecord(enqueued_runnable.runnable_class, enqueued_runnable.method_name)
-          result = enqueued_runnable.run
 
-
-
+          enqueued_result = enqueued_runnable.run do |initial_result|
+            if ResultType.of(initial_result) == ResultType::Requeued
+              queue << enqueued_runnable.next_attempt
+            end
+            EnqueuedRunnable::Result::Commit.success
+          end
 
-          reporter.record(
+          reporter.record(enqueued_result.committed_result)
+          local_results.update_with_result(enqueued_result)
 
-
+          # We abort a run if we reach the maximum number of failures.
+          queue.close if combined_results.abort?
+          queue.close if combined_results.complete?
         end
       end
     end
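The memory coordinator's requeue path is a plain `Queue` round trip. A self-contained sketch of the same pattern, not the gem's API; `attempts_left` is an illustrative stand-in for `EnqueuedRunnable#next_attempt`:

```ruby
queue = Queue.new
3.times { |i| queue << { name: "test_#{i}", attempts_left: 2 } }

results = []
until queue.empty?
  item = queue.pop
  passed = rand > 0.3 # stand-in for actually running the test
  if !passed && (item[:attempts_left] -= 1) > 0
    queue << item # requeued: in the real coordinator another worker picks this up
  else
    results << [item[:name], passed]
  end
end
```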
data/lib/minitest/distributed/coordinators/redis_coordinator.rb
CHANGED
@@ -15,11 +15,19 @@ module Minitest
     # to the stream.
     #
     # After that, all workers will start consuming from the stream. They will first
-    # try to claim stale entries from other workers (determined by the `
-    # option), and process them
+    # try to claim stale entries from other workers (determined by the `test_timeout_seconds`
+    # option), and process them up to a maximum of `max_attempts` attempts. Then,
     # they will consume tests from the stream, run them, and ack them. This is done
     # in batches to reduce load on Redis.
     #
+    # Retrying failed tests (up to `max_attempts` times) uses the same mechanism.
+    # When a test fails, and we haven't exhausted the maximum number of attempts, we
+    # do not ACK the result with Redis. This means that another worker will eventually
+    # claim the test, and run it again. However, in this case we don't want to slow
+    # things down unnecessarily. When a test fails and we want to retry it, we add the
+    # test to the `retry_set` in Redis. When another worker sees that a test is in this
+    # set, it can immediately claim the test, rather than waiting for the timeout.
+    #
     # Finally, when we have acked the same number of tests as we populated into the
     # queue, the run is considered finished. The first worker to detect this will
     # remove the consumergroup and the associated stream from Redis.
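The comment block above compresses the whole lifecycle; the raw Redis commands behind it look roughly like this (redis-rb calls with placeholder key and group names, not the gem's internals):

```ruby
redis = Redis.new
stream, group = 'minitest/run-1/queue', 'minitest-distributed'

# Leader: create the consumer group and publish the suite to the stream.
redis.xgroup(:create, stream, group, '0', mkstream: true)
redis.xadd(stream, { class_name: 'MyTest', method_name: 'test_foo' })

# Worker: read a batch of fresh entries as this consumer...
entries = redis.xreadgroup(group, 'worker-1', stream, '>', count: 10, block: 10)

# ...run each test, then acknowledge the entry so nobody else claims it.
entries.fetch(stream, []).each do |entry_id, _fields|
  redis.xack(stream, group, entry_id)
end
```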
@@ -47,7 +55,10 @@ module Minitest
       attr_reader :local_results
 
       sig { returns(T::Set[EnqueuedRunnable]) }
-      attr_reader :
+      attr_reader :reclaimed_timeout_tests
+
+      sig { returns(T::Set[EnqueuedRunnable]) }
+      attr_reader :reclaimed_failed_tests
 
       sig { params(configuration: Configuration).void }
       def initialize(configuration:)
@@ -58,7 +69,9 @@ module Minitest
         @group_name = T.let('minitest-distributed', String)
         @local_results = T.let(ResultAggregate.new, ResultAggregate)
         @combined_results = T.let(nil, T.nilable(ResultAggregate))
-        @
+        @reclaimed_timeout_tests = T.let(Set.new, T::Set[EnqueuedRunnable])
+        @reclaimed_failed_tests = T.let(Set.new, T::Set[EnqueuedRunnable])
+        @aborted = T.let(false, T::Boolean)
       end
 
       sig { override.params(reporter: Minitest::CompositeReporter, options: T::Hash[Symbol, T.untyped]).void }
@@ -70,41 +83,51 @@ module Minitest
       def combined_results
         @combined_results ||= begin
           stats_as_string = redis.mget(key('runs'), key('assertions'), key('passes'),
-            key('failures'), key('errors'), key('skips'), key('
+            key('failures'), key('errors'), key('skips'), key('requeues'), key('discards'),
+            key('acks'), key('size'))
 
           ResultAggregate.new(
+            max_failures: configuration.max_failures,
+
             runs: Integer(stats_as_string.fetch(0) || 0),
             assertions: Integer(stats_as_string.fetch(1) || 0),
             passes: Integer(stats_as_string.fetch(2) || 0),
             failures: Integer(stats_as_string.fetch(3) || 0),
             errors: Integer(stats_as_string.fetch(4) || 0),
             skips: Integer(stats_as_string.fetch(5) || 0),
-
-
+            requeues: Integer(stats_as_string.fetch(6) || 0),
+            discards: Integer(stats_as_string.fetch(7) || 0),
+            acks: Integer(stats_as_string.fetch(8) || 0),
 
-            # In the case where we have no build
+            # In the case where we have no build size number published yet, we initialize
             # the size of the test suite to be arbitrarily large, to make sure it is
             # higher than the number of acks, so the run is not considered completed yet.
-            size: Integer(stats_as_string.fetch(
+            size: Integer(stats_as_string.fetch(9) || 2_147_483_647),
           )
         end
       end
 
+      sig { override.returns(T::Boolean) }
+      def aborted?
+        @aborted
+      end
+
       sig { override.params(test_selector: TestSelector).void }
       def produce(test_selector:)
         # Whoever ends up creating the consumer group will act as leader,
         # and publish the list of tests to the stream.
 
-        begin
+        initial_attempt = begin
          # When using `redis.multi`, the second DEL command gets executed even if the initial GROUP
          # fails. This is bad, because only the leader should be issuing the DEL command.
          # When using EVAL and a Lua script, the script aborts after the first XGROUP command
          # fails, and the DEL never gets executed for followers.
-          redis.evalsha(
+          keys_deleted = redis.evalsha(
            register_consumergroup_script,
            keys: [stream_key, key('size'), key('acks')],
            argv: [group_name],
          )
+          keys_deleted == 0
 
        rescue Redis::CommandError => ce
          if ce.message.include?('BUSYGROUP')
@@ -118,38 +141,67 @@ module Minitest
          end
        end
 
-
-
-        redis.lrange(key('failure_list'), 0, -1)
-        redis.lrange(key('error_list'), 0, -1)
-        redis.del(key('failure_list'), key('error_list'))
-        end
-
-        tests = if run_attempt == 1
+        tests = T.let([], T::Array[Minitest::Runnable])
+        tests = if initial_attempt
           # If this is the first attempt for this run ID, we will schedule the full
           # test suite as returned by the test selector to run.
-
-
-
-
-
-
-
+
+          tests_from_selector = test_selector.tests
+          adjust_combined_results(ResultAggregate.new(size: tests_from_selector.size))
+          tests_from_selector
+
+        elsif configuration.retry_failures
+          # Before starting a retry attempt, we first check if the previous attempt
+          # was aborted before it was completed. If this is the case, we cannot use
+          # retry mode, and should immediately fail the attempt.
+          if combined_results.abort?
+            # We mark this run as aborted, which causes this worker to not be successful.
+            @aborted = true
+
+            # We still publish an empty size run to Redis, so if there are any followers,
+            # they will wind down normally. Only the leader will exit
+            # with a non-zero exit status and fail the build; any follower will
+            # exit with status 0.
+            adjust_combined_results(ResultAggregate.new(size: 0))
+            T.let([], T::Array[Minitest::Runnable])
+          else
+            previous_failures, previous_errors, _deleted = redis.multi do
+              redis.lrange(list_key(ResultType::Failed.serialize), 0, -1)
+              redis.lrange(list_key(ResultType::Error.serialize), 0, -1)
+              redis.del(list_key(ResultType::Failed.serialize), list_key(ResultType::Error.serialize))
+            end
+
+            # We set the `size` key to the number of tests we are planning to schedule.
+            # We also adjust the number of failures and errors back to 0.
+            # We set the number of requeues to the number of tests that failed, so the
+            # run statistics will reflect that we retried some failed tests.
+            #
+            # However, normally requeues are not acked, as we expect the test to be acked
+            # by another worker later. This makes the test loop think it is already done.
+            # To prevent this, we initialize the number of acks negatively, so it evens out
+            # in the statistics.
+            total_failures = previous_failures.length + previous_errors.length
+            adjust_combined_results(ResultAggregate.new(
+              size: total_failures,
+              failures: -previous_failures.length,
+              errors: -previous_errors.length,
+              requeues: total_failures,
+            ))
+
+            # For subsequent attempts, we check the list of previous failures and
+            # errors, and only schedule to re-run those tests. This allows for faster
+            # retries of potentially flaky tests.
+            test_identifiers_to_retry = T.let(previous_failures + previous_errors, T::Array[String])
+            test_identifiers_to_retry.map { |identifier| DefinedRunnable.from_identifier(identifier) }
+          end
+        else
+          adjust_combined_results(ResultAggregate.new(size: 0))
+          T.let([], T::Array[Minitest::Runnable])
         end
 
-
-
-
-        adjust_combined_results(ResultAggregate.new(
-          size: tests.size,
-          failures: -previous_failures.length,
-          errors: -previous_errors.length,
-          reruns: previous_failures.length + previous_errors.length,
-        ))
-
-        # TODO: break this up in batches.
-        tests.each { |test| redis.xadd(stream_key, test.serialize) }
+        redis.pipelined do
+          tests.each { |test| redis.xadd(stream_key, class_name: T.must(test.class.name), method_name: test.name) }
+        end
       end
 
       sig { override.params(reporter: AbstractReporter).void }
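The bookkeeping in the retry branch is easier to check with concrete numbers. Suppose attempt 1 ended with 3 failures and 1 error; a hedged reading of what the leader publishes, given that `register_consumergroup_script` has just reset the `size` and `acks` keys:

```ruby
previous_failures = ['FailingTest#test_a', 'FailingTest#test_b', 'FailingTest#test_c']
previous_errors   = ['ErroringTest#test_d']
total_failures    = previous_failures.length + previous_errors.length # => 4

# Deltas published via adjust_combined_results for this attempt:
#   size:     +4  (only the 4 previously failed tests are rescheduled)
#   failures: -3  (rolls the shared failure count back to 0)
#   errors:   -1  (rolls the shared error count back to 0)
#   requeues: +4  (the summary shows these tests were retried)
# The attempt then completes once 4 new acks arrive (acks == size).
```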
@@ -158,26 +210,29 @@ module Minitest
         loop do
           # First, see if there are any pending tests from other workers to claim.
           stale_runnables = claim_stale_runnables
-
+          process_batch(stale_runnables, reporter)
 
-          #
+          # Then, try to process a regular batch of messages.
           fresh_runnables = claim_fresh_runnables(block: exponential_backoff)
-
+          process_batch(fresh_runnables, reporter)
 
           # If we have acked the same amount of tests as we were supposed to, the run
           # is complete and we can exit our loop. Generally, only one worker will detect
           # this condition. The other workers will quit their consumer loop because the
           # consumergroup will be deleted by the first worker, and their Redis commands
           # will start to fail - see the rescue block below.
-          break if combined_results.
+          break if combined_results.complete?
+
+          # We also abort a run if we reach the maximum number of failures.
+          break if combined_results.abort?
 
           # To make sure we don't end up in a busy loop overwhelming Redis with commands
           # when there is no work to do, we increase the blocking time exponentially,
-          # and reset it to the initial value if we processed any
-          if
-            exponential_backoff = INITIAL_BACKOFF
-          else
+          # and reset it to the initial value if we processed any tests.
+          if stale_runnables.empty? && fresh_runnables.empty?
             exponential_backoff <<= 1
+          else
+            exponential_backoff = INITIAL_BACKOFF
           end
         end
 
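The backoff policy above doubles the XREADGROUP block time on idle iterations and snaps back on any work. A standalone sketch of the same policy (`INITIAL_BACKOFF` is 10 ms in this file; the batch sizes are made up for illustration):

```ruby
INITIAL_BACKOFF = 10 # milliseconds
backoff = INITIAL_BACKOFF

work_per_tick = [2, 0, 0, 0, 1] # pretend number of tests processed per loop iteration
work_per_tick.each do |processed|
  backoff = processed.zero? ? backoff << 1 : INITIAL_BACKOFF
  puts "block: #{backoff}ms"
end
# => block: 10ms, 20ms, 40ms, 80ms, 10ms
```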
@@ -203,28 +258,20 @@ module Minitest
         @redis ||= Redis.new(url: configuration.coordinator_uri)
       end
 
-      sig { returns(String) }
-      def ack_batch_script
-        @ack_batch_script = T.let(@ack_batch_script, T.nilable(String))
-        @ack_batch_script ||= redis.script(:load, <<~LUA)
-          local acked_ids, acked, i = {}, 0, 2
-          while ARGV[i] do
-            if redis.call('XACK', KEYS[1], ARGV[1], ARGV[i]) > 0 then
-              acked = acked + 1
-              acked_ids[acked] = ARGV[i]
-            end
-            i = i + 1
-          end
-          return acked_ids
-        LUA
-      end
-
       sig { returns(String) }
       def register_consumergroup_script
         @register_consumergroup_script = T.let(@register_consumergroup_script, T.nilable(String))
         @register_consumergroup_script ||= redis.script(:load, <<~LUA)
+          -- Try to create the consumergroup. This will raise an error if the
+          -- consumergroup has already been registered by somebody else, which
+          -- means another worker will be acting as leader.
+          -- In that case, the next Redis DEL call will not be executed.
           redis.call('XGROUP', 'CREATE', KEYS[1], ARGV[1], '0', 'MKSTREAM')
-
+
+          -- The leader should reset the size and acks key for this run attempt.
+          -- We return the number of keys that were deleted, which can be used to
+          -- determine whether this was the first attempt for this run or not.
+          return redis.call('DEL', KEYS[2], KEYS[3])
         LUA
       end
 
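A reduced sketch of the leader election this script implements, using redis-rb's `eval` directly instead of `SCRIPT LOAD`/`EVALSHA` (key and group names are placeholders):

```ruby
redis = Redis.new
begin
  keys_deleted = redis.eval(<<~LUA, keys: ['stream-key', 'size-key', 'acks-key'], argv: ['group'])
    redis.call('XGROUP', 'CREATE', KEYS[1], ARGV[1], '0', 'MKSTREAM')
    return redis.call('DEL', KEYS[2], KEYS[3])
  LUA
  # This worker created the group, so it leads this attempt; leftover
  # size/acks keys reveal whether a previous attempt already ran.
  initial_attempt = keys_deleted == 0
rescue Redis::CommandError => e
  raise unless e.message.include?('BUSYGROUP')
  # Another worker won the race and acts as leader; this one only consumes.
end
```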
@@ -232,51 +279,119 @@ module Minitest
       def claim_fresh_runnables(block:)
         result = redis.xreadgroup(group_name, configuration.worker_id, stream_key, '>',
           block: block, count: configuration.test_batch_size)
-        EnqueuedRunnable.from_redis_stream_claim(result.fetch(stream_key, []))
+        EnqueuedRunnable.from_redis_stream_claim(result.fetch(stream_key, []), configuration: configuration)
+      end
+
+      sig do
+        params(
+          pending_messages: T::Hash[String, PendingExecution],
+          max_idle_time_ms: Integer,
+        ).returns(T::Array[EnqueuedRunnable])
+      end
+      def xclaim_messages(pending_messages, max_idle_time_ms:)
+        return [] if pending_messages.empty?
+        claimed = redis.xclaim(stream_key, group_name, configuration.worker_id,
+          max_idle_time_ms, pending_messages.keys)
+
+        EnqueuedRunnable.from_redis_stream_claim(claimed, pending_messages, configuration: configuration)
       end
 
       sig { returns(T::Array[EnqueuedRunnable]) }
       def claim_stale_runnables
-        #
-        #
-        # to prevent the exact same batch from being too slow on repeated attempts,
-        # which would cause us to mark all the tests in that batch as failed.
-        #
-        # This has the side effect that for a retried test, the test timeout
-        # will be TEST_TIMEOUT * BATCH_SIZE in practice. This gives us a higher
-        # likelihood that the test will pass if the batch size > 1.
-        pending = redis.xpending(stream_key, group_name, '-', '+', 1)
-
-        # Every test is allowed to take test_timeout milliseconds. Because we process tests in
-        # batches, they should never be pending for TEST_TIMEOUT * BATCH_SIZE milliseconds.
+        # Every test is allowed to take test_timeout_seconds. Because we process tests in
+        # batches, they should never be pending for TEST_TIMEOUT_SECONDS * BATCH_SIZE seconds.
         # So, only try to claim messages older than that, with a bit of jitter.
-
-
-
-
-
+        max_idle_time_ms = Integer(configuration.test_timeout_seconds * configuration.test_batch_size * 1000)
+        max_idle_time_ms_with_jitter = max_idle_time_ms * rand(1.0...1.2)
+
+        # Find all the pending messages to see if we want to attempt to claim some.
+        pending = redis.xpending(stream_key, group_name, '-', '+', configuration.test_batch_size)
+        return [] if pending.empty?
+
+        active_consumers = Set[configuration.worker_id]
+
+        stale_messages = {}
+        active_messages = {}
+        pending.each do |msg|
+          message = PendingExecution.from_xpending(msg)
+          if message.elapsed_time_ms < max_idle_time_ms_with_jitter
+            active_consumers << message.worker_id
+            active_messages[message.entry_id] = message
+          else
+            stale_messages[message.entry_id] = message
           end
         end
 
-
-
-
-
-
-
-
-
-
-        if attempt > configuration.max_attempts
-          # If we exhaust our attempts, we will mark the test to immediately fail when it will be run next.
-          mark_runnable_to_fail_immediately(er)
-        else
-          reclaimed_tests << er
+        # If we only have evidence of one active consumer based on the pending messages,
+        # we will query Redis for all consumers to make sure we have full data.
+        # We can skip this if we already know that there is more than one active one.
+        if active_consumers.size == 1
+          begin
+            redis.xinfo('consumers', stream_key, group_name).each do |consumer|
+              if consumer.fetch('idle') < max_idle_time_ms
+                active_consumers << consumer.fetch('name')
+              end
+            end
+          rescue Redis::CommandError
+            # This command can fail, specifically during the cleanup phase at the end
+            # of a build, when another worker has removed the stream key already.
+          end
+        end
+
+        # Now, see if we want to claim any stale messages. If we are the only active
+        # consumer, we want to claim our own messages as well as messages from other
+        # (stale) consumers. If there are multiple active consumers, we are going to
+        # let another consumer claim our own messages.
+        if active_consumers.size > 1
+          stale_messages.reject! { |_key, message| message.worker_id == configuration.worker_id }
+        end
+
+        unless stale_messages.empty?
+          # When we have to reclaim stale tests, those tests are potentially too slow
+          # to run inside the test timeout. We only claim one timed out test at a time in order
+          # to prevent the exact same batch from being too slow on repeated attempts,
+          # which would cause us to mark all the tests in that batch as failed.
+          #
+          # This has the side effect that for a retried test, the test timeout
+          # will be TEST_TIMEOUT_SECONDS * BATCH_SIZE in practice. This gives us a higher
+          # likelihood that the test will pass if the batch size > 1.
+          stale_messages = stale_messages.slice(stale_messages.keys.first)
+
+          enqueued_runnables = xclaim_messages(stale_messages, max_idle_time_ms: max_idle_time_ms)
+          reclaimed_timeout_tests.merge(enqueued_runnables)
+          return enqueued_runnables
+        end
+
+        # Now, see if we want to claim any failed tests to retry. Again, if we are the only
+        # active consumer, we want to claim our own messages as well as messages from other
+        # (stale) consumers. If there are multiple active consumers, we are going to let
+        # another consumer claim our own messages.
+        if active_consumers.size > 1
+          active_messages.reject! { |_key, message| message.worker_id == configuration.worker_id }
         end
 
-
+        # For all the active messages, we can check whether they are marked for a retry by
+        # trying to remove the test from the retry set in Redis. Only one worker will be
+        # able to remove the entry from the set, so only one worker will end up trying to
+        # claim the test for the next attempt.
+        #
+        # We use `redis.multi` so we only need one round-trip for the entire list. Note that
+        # this is not an atomic operation with the XCLAIM call. This is OK, because the retry
+        # set is only there to speed things up and prevent us from having to wait for the test
+        # timeout. If the worker crashes between removing an item from the retry set and
+        # claiming it, the test will eventually be picked up by another worker.
+        messages_in_retry_set = {}
+        redis.multi do
+          active_messages.each do |key, message|
+            messages_in_retry_set[key] = redis.srem(key('retry_set'), message.attempt_id)
+          end
         end
+
+        # Now, we only select the messages that were on the retry set, and try to claim them.
+        active_messages.keep_if { |key, _value| messages_in_retry_set.fetch(key).value }
+        enqueued_runnables = xclaim_messages(active_messages, max_idle_time_ms: 0)
+        reclaimed_failed_tests.merge(enqueued_runnables)
+        enqueued_runnables
       end
 
       sig { void }
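The claim logic above rests on two commands. A reduced redis-rb sketch of the XPENDING/XCLAIM round trip (names and the 60s threshold are placeholders; the real code derives the threshold from the timeout and batch size):

```ruby
redis = Redis.new
stream, group, me = 'minitest/run-1/queue', 'minitest-distributed', 'worker-2'

# List up to 10 delivered-but-unacked entries, oldest first.
pending = redis.xpending(stream, group, '-', '+', 10)
# => [{"entry_id"=>"1-0", "consumer"=>"worker-1", "elapsed"=>65000, "count"=>1}, ...]

# Steal entries that have been idle past the threshold; XCLAIM only returns
# the ones whose idle time still exceeds it at claim time.
stale_ids = pending.select { |p| p['elapsed'] > 60_000 }.map { |p| p['entry_id'] }
claimed = redis.xclaim(stream, group, me, 60_000, stale_ids) unless stale_ids.empty?
```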
@@ -288,18 +403,6 @@ module Minitest
         # so we can assume that all the Redis cleanup was completed.
       end
 
-      sig { params(er: EnqueuedRunnable).void }
-      def mark_runnable_to_fail_immediately(er)
-        assertion = Minitest::Assertion.new(<<~EOM.chomp)
-          This test takes too long to run (> #{configuration.test_timeout}s).
-
-          We have tried running this test #{configuration.max_attempts} on different workers, but every time the worker has not reported back a result within #{configuration.test_timeout}ms.
-          Try to make the test faster, or increase the test timeout.
-        EOM
-        assertion.set_backtrace(caller)
-        er.canned_failure = assertion
-      end
-
       sig { params(results: ResultAggregate).void }
       def adjust_combined_results(results)
         updated = redis.multi do
@@ -309,14 +412,16 @@ module Minitest
           redis.incrby(key('failures'), results.failures)
           redis.incrby(key('errors'), results.errors)
           redis.incrby(key('skips'), results.skips)
-          redis.incrby(key('
+          redis.incrby(key('requeues'), results.requeues)
+          redis.incrby(key('discards'), results.discards)
           redis.incrby(key('acks'), results.acks)
           redis.incrby(key('size'), results.size)
         end
 
-        @combined_results = ResultAggregate.new(
-
-
+        @combined_results = ResultAggregate.new(max_failures: configuration.max_failures,
+          runs: updated[0], assertions: updated[1], passes: updated[2],
+          failures: updated[3], errors: updated[4], skips: updated[5], requeues: updated[6], discards: updated[7],
+          acks: updated[8], size: updated[9])
       end
 
       sig { params(name: String).returns(String) }
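`adjust_combined_results` relies on INCRBY returning the post-increment value for every counter in one transaction. A minimal sketch of that round trip, using the same `redis.multi` block style as the code above (key names are placeholders):

```ruby
redis = Redis.new

updated = redis.multi do
  redis.incrby('minitest/run-1/acks', 1)
  redis.incrby('minitest/run-1/size', 0) # incrby 0 acts as an atomic read
end

acks, size = updated
finished = acks == size # the completeness check the consume loop performs
```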
@@ -324,59 +429,56 @@ module Minitest
         "minitest/#{configuration.run_id}/#{name}"
       end
 
-      sig { params(
+      sig { params(name: String).returns(String) }
+      def list_key(name)
+        key("#{name}_list")
+      end
+
+      sig { params(batch: T::Array[EnqueuedRunnable], reporter: AbstractReporter).void }
       def process_batch(batch, reporter)
-
+        return 0 if batch.empty?
+
+        local_results.size += batch.size
+
+        runnable_results = T.let([], T::Array[EnqueuedRunnable::Result])
+        redis.multi do
+          batch.each do |enqueued_runnable|
+            # Fulfill the reporter contract by calling `prerecord` before we run the test.
+            reporter.prerecord(enqueued_runnable.runnable_class, enqueued_runnable.method_name)
+
+            # Actually run the test!
+            runnable_results << enqueued_runnable.run do |initial_result|
+              if ResultType.of(initial_result) == ResultType::Requeued
+                sadd_future = redis.sadd(key('retry_set'), enqueued_runnable.attempt_id)
+                EnqueuedRunnable::Result::Commit.new { sadd_future.value }
+              else
+                xack_future = redis.xack(stream_key, group_name, enqueued_runnable.entry_id)
+                EnqueuedRunnable::Result::Commit.new { xack_future.value == 1 }
+              end
+            end
+          end
+        end
+
+        batch_result_aggregate = ResultAggregate.new
+        runnable_results.each do |runnable_result|
+          # Complete the reporter contract by calling `record` with the result.
+          reporter.record(runnable_result.committed_result)
 
-
-
-
-          result = enqueued_runnable.run
+          # Update statistics.
+          batch_result_aggregate.update_with_result(runnable_result)
+          local_results.update_with_result(runnable_result)
 
-          case (result_type = ResultType.of(
-          when ResultType::
+          case (result_type = ResultType.of(runnable_result.committed_result))
+          when ResultType::Skipped, ResultType::Failed, ResultType::Error
+            redis.lpush(list_key(result_type.serialize), runnable_result.enqueued_runnable.identifier)
+          when ResultType::Passed, ResultType::Requeued, ResultType::Discarded
             # noop
-          when ResultType::Skipped
-            redis.lpush(key('skip_list'), Marshal.dump(enqueued_runnable.serialize))
-          when ResultType::Failed
-            redis.lpush(key('failure_list'), Marshal.dump(enqueued_runnable.serialize))
-          when ResultType::Error
-            redis.lpush(key('error_list'), Marshal.dump(enqueued_runnable.serialize))
           else
             T.absurd(result_type)
           end
-
-          local_results.update_with_result(result)
-          to_be_acked[enqueued_runnable.execution_id] = result
-        end
-
-        return 0 if to_be_acked.empty?
-
-        acked = redis.evalsha(
-          ack_batch_script,
-          keys: [stream_key],
-          argv: [group_name] + to_be_acked.keys
-        )
-
-        batch_results = ResultAggregate.new(acks: acked.length)
-        acked.each do |execution_id|
-          acked_result = to_be_acked.delete(execution_id)
-          reporter.record(acked_result)
-          batch_results.update_with_result(acked_result)
-        end
-
-        to_be_acked.each do |_execution_id, unacked_result|
-          # TODO: use custom assertion class.
-          discard_assertion = Minitest::Skip.new("The test result was discarded, " \
-            "because the test has been claimed by another worker.")
-          discard_assertion.set_backtrace(caller)
-          unacked_result.failures = [discard_assertion]
-          reporter.record(unacked_result)
         end
 
-        adjust_combined_results(
-        local_results.acks += acked.length
-        acked.length
+        adjust_combined_results(batch_result_aggregate)
       end
 
       INITIAL_BACKOFF = 10 # milliseconds
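The commit objects in `process_batch` work because redis-rb returns a Future for every command queued inside MULTI; the value only becomes readable after EXEC. A reduced sketch of that pattern (placeholder names):

```ruby
redis = Redis.new

xack_future = nil
redis.multi do
  xack_future = redis.xack('minitest/run-1/queue', 'minitest-distributed', '1-0')
end

# After the MULTI block returns, the future is resolved:
acked = xack_future.value == 1 # the same check Commit.new { ... } performs above
```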