minitest-distributed 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/Gemfile +1 -1
- data/README.md +29 -13
- data/bin/setup +0 -2
- data/lib/minitest/distributed/configuration.rb +49 -4
- data/lib/minitest/distributed/coordinators/coordinator_interface.rb +3 -0
- data/lib/minitest/distributed/coordinators/memory_coordinator.rb +29 -9
- data/lib/minitest/distributed/coordinators/redis_coordinator.rb +258 -156
- data/lib/minitest/distributed/enqueued_runnable.rb +193 -41
- data/lib/minitest/distributed/filters/exclude_filter.rb +4 -4
- data/lib/minitest/distributed/filters/filter_interface.rb +3 -3
- data/lib/minitest/distributed/filters/include_filter.rb +4 -4
- data/lib/minitest/distributed/reporters/distributed_progress_reporter.rb +2 -2
- data/lib/minitest/distributed/reporters/distributed_summary_reporter.rb +49 -10
- data/lib/minitest/distributed/reporters/redis_coordinator_warnings_reporter.rb +11 -16
- data/lib/minitest/distributed/result_aggregate.rb +38 -9
- data/lib/minitest/distributed/result_type.rb +76 -2
- data/lib/minitest/distributed/test_selector.rb +4 -6
- data/lib/minitest/distributed/version.rb +1 -1
- data/lib/minitest/distributed_plugin.rb +1 -25
- data/sorbet/rbi/minitest.rbi +18 -3
- data/sorbet/rbi/redis.rbi +19 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 076d9467680eff44b28d42436648c5f4878fb5689bca7ddd9365bd05991a5352
+  data.tar.gz: '0068262916e339e61d4997eb478df915be5943d3a6b39e70fb02def9ba0d0dbb'
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5d71ead8d7f352d9d628ec6682d2367804c63362e1e02e658ae3899b55486e1ff1bedd1cc10e23796a3373a0a3d0580b874646c4f7e3ec472675b8b85d88b001
+  data.tar.gz: cafd308bdad9ee0332323e0e826810fee2be91cbb5bdafa74ec8037891e9a6031ffa6f4f1e4777527e0f21784a48502848893661120145749629240164c3c077
data/.rubocop.yml
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -63,8 +63,8 @@ them to fail.
 
 ### Other optional command line arguments
 
-- `--test-timeout=SECONDS` or `ENV[
-  maximum amount a test is allowed to run before it times out. In a distributed
+- `--test-timeout=SECONDS` or `ENV[MINITEST_TEST_TIMEOUT_SECONDS]` (default: 30s):
+  the maximum amount a test is allowed to run before it times out. In a distributed
   system, it's impossible to differentiate between a worker being slow and a
   worker being broken. When the timeout passes, the other workers will assume
   that the worker running the test has crashed, and will attempt to claim this
@@ -92,24 +92,40 @@ other tests.
 
 ## Development
 
-
-run `rake test` to run the tests. You can also run `bin/console` for an
-interactive prompt that will allow you to experiment.
+To bootstrap a local development environment:
 
-
-
-
-
-
+- Run `bin/setup` to install dependencies.
+- Start a Redis server by running `redis-server`, assuming you have Redis
+  installed locally and the binary is on your `PATH`. Alternatively, you can
+  set the `REDIS_URL` environment variable to point to a Redis instance running
+  elsewhere.
+- Now, run `bin/rake test` to run the tests, and verify everything is working.
+- You can also run `bin/console` for an interactive prompt that will allow you
+  to experiment.
+
+### Releasing a new version
+
+- To install this gem onto your local machine, run `bin/rake install`.
+- Only people at Shopify can release a new version to
+  [rubygems.org](https://rubygems.org). To do so, update the `VERSION` constant
+  in `version.rb`, and merge to master. Shipit will take care of building the
+  `.gem` bundle, and pushing it to rubygems.org.
 
 ## Contributing
 
-Bug reports and pull requests are welcome on GitHub at
+Bug reports and pull requests are welcome on GitHub at
+https://github.com/Shopify/minitest-distributed. This project is intended to
+be a safe, welcoming space for collaboration, and contributors are expected to
+adhere to the [code of
+conduct](https://github.com/Shopify/minitest-distributed/blob/master/CODE_OF_CONDUCT.md).
 
 ## License
 
-The gem is available as open source under the terms of the [MIT
+The gem is available as open source under the terms of the [MIT
+License](https://opensource.org/licenses/MIT).
 
 ## Code of Conduct
 
-Everyone interacting in the
+Everyone interacting in the `minitest-distributed` project's codebases, issue
+trackers, chat rooms and mailing lists is expected to follow the [code of
+conduct](https://github.com/Shopify/minitest-distributed/blob/master/CODE_OF_CONDUCT.md).
data/bin/setup
CHANGED
data/lib/minitest/distributed/configuration.rb
CHANGED
@@ -8,8 +8,8 @@ module Minitest
   module Distributed
     class Configuration < T::Struct
       DEFAULT_BATCH_SIZE = 10
-      DEFAULT_MAX_ATTEMPTS =
-
+      DEFAULT_MAX_ATTEMPTS = 1
+      DEFAULT_TEST_TIMEOUT_SECONDS = 30.0 # seconds
 
       class << self
         extend T::Sig
@@ -20,11 +20,54 @@ module Minitest
            coordinator_uri: URI(env['MINITEST_COORDINATOR'] || 'memory:'),
            run_id: env['MINITEST_RUN_ID'] || SecureRandom.uuid,
            worker_id: env['MINITEST_WORKER_ID'] || SecureRandom.uuid,
-
+            test_timeout_seconds: Float(env['MINITEST_TEST_TIMEOUT_SECONDS'] || DEFAULT_TEST_TIMEOUT_SECONDS),
            test_batch_size: Integer(env['MINITEST_TEST_BATCH_SIZE'] || DEFAULT_BATCH_SIZE),
            max_attempts: Integer(env['MINITEST_MAX_ATTEMPTS'] || DEFAULT_MAX_ATTEMPTS),
+            max_failures: (max_failures_env = env['MINITEST_MAX_FAILURES']) ? Integer(max_failures_env) : nil,
          )
        end
+
+        sig { params(opts: OptionParser).returns(T.attached_class) }
+        def from_command_line_options(opts)
+          configuration = from_env
+
+          opts.on('--coordinator=URI', "The URI pointing to the coordinator") do |uri|
+            configuration.coordinator_uri = URI.parse(uri)
+          end
+
+          opts.on('--test-timeout=TIMEOUT', "The maximum run time for a single test in seconds") do |timeout|
+            configuration.test_timeout_seconds = Float(timeout)
+          end
+
+          opts.on('--max-attempts=ATTEMPTS', "The maximum number of attempts to run a test") do |attempts|
+            configuration.max_attempts = Integer(attempts)
+          end
+
+          opts.on('--test-batch-size=NUMBER', "The number of tests to process per batch") do |batch_size|
+            configuration.test_batch_size = Integer(batch_size)
+          end
+
+          opts.on('--max-failures=FAILURES', "The maximum allowed failure before aborting a run") do |failures|
+            configuration.max_failures = Integer(failures)
+          end
+
+          opts.on('--run-id=ID', "The ID for this run shared between coordinated workers") do |id|
+            configuration.run_id = id
+          end
+
+          opts.on('--worker-id=ID', "The unique ID for this worker") do |id|
+            configuration.worker_id = id
+          end
+
+          opts.on(
+            '--[no-]retry-failures', "Retry failed and errored tests from a previous run attempt " \
+              "with the same run ID (default: enabled)"
+          ) do |enabled|
+            configuration.retry_failures = enabled
+          end
+
+          configuration
+        end
      end
 
      extend T::Sig
@@ -32,9 +75,11 @@ module Minitest
       prop :coordinator_uri, URI::Generic, default: URI('memory:')
       prop :run_id, String, factory: -> { SecureRandom.uuid }
       prop :worker_id, String, factory: -> { SecureRandom.uuid }
-      prop :
+      prop :test_timeout_seconds, Float, default: DEFAULT_TEST_TIMEOUT_SECONDS
       prop :test_batch_size, Integer, default: DEFAULT_BATCH_SIZE
       prop :max_attempts, Integer, default: DEFAULT_MAX_ATTEMPTS
+      prop :max_failures, T.nilable(Integer)
+      prop :retry_failures, T::Boolean, default: true
 
       sig { returns(Coordinators::CoordinatorInterface) }
       def coordinator
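The flags registered in `from_command_line_options` mutate the configuration when the parser runs, so the same object can be built from the environment and then overridden on the command line. A small usage sketch of that behaviour (not part of the diff; it assumes the 0.2.0 gem and its sorbet-runtime dependency are installed, and the flag values are made up):

```ruby
require 'optparse'
require 'minitest/distributed'

opts = OptionParser.new
configuration = Minitest::Distributed::Configuration.from_command_line_options(opts)

# Parsing triggers the opts.on blocks registered above, which write back into
# the configuration struct.
opts.parse!(%w[--test-timeout=60 --max-failures=10 --no-retry-failures])

configuration.test_timeout_seconds # => 60.0
configuration.max_failures         # => 10
configuration.retry_failures       # => false
configuration.max_attempts         # => 1, the DEFAULT_MAX_ATTEMPTS
```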
data/lib/minitest/distributed/coordinators/memory_coordinator.rb
CHANGED
@@ -25,7 +25,8 @@ module Minitest
 
          @leader = T.let(Mutex.new, Mutex)
          @queue = T.let(Queue.new, Queue)
-          @local_results = T.let(ResultAggregate.new, ResultAggregate)
+          @local_results = T.let(ResultAggregate.new(max_failures: configuration.max_failures), ResultAggregate)
+          @aborted = T.let(false, T::Boolean)
        end
 
        sig { override.params(reporter: Minitest::CompositeReporter, options: T::Hash[Symbol, T.untyped]).void }
@@ -33,6 +34,11 @@ module Minitest
          # No need for any additional reporters
        end
 
+        sig { override.returns(T::Boolean) }
+        def aborted?
+          @aborted
+        end
+
        sig { override.params(test_selector: TestSelector).void }
        def produce(test_selector:)
          if @leader.try_lock
@@ -41,24 +47,38 @@ module Minitest
            if tests.empty?
              queue.close
            else
-              tests.each
+              tests.each do |runnable|
+                queue << EnqueuedRunnable.new(
+                  class_name: T.must(runnable.class.name),
+                  method_name: runnable.name,
+                  test_timeout_seconds: configuration.test_timeout_seconds,
+                  max_attempts: configuration.max_attempts,
+                )
+              end
            end
          end
        end
 
        sig { override.params(reporter: AbstractReporter).void }
        def consume(reporter:)
-          until queue.
-            enqueued_runnable = queue.pop
+          until queue.closed?
+            enqueued_runnable = T.let(queue.pop, EnqueuedRunnable)
+
            reporter.prerecord(enqueued_runnable.runnable_class, enqueued_runnable.method_name)
-            result = enqueued_runnable.run
 
-
-
+            enqueued_result = enqueued_runnable.run do |initial_result|
+              if ResultType.of(initial_result) == ResultType::Requeued
+                queue << enqueued_runnable.next_attempt
+              end
+              EnqueuedRunnable::Result::Commit.success
+            end
 
-            reporter.record(
+            reporter.record(enqueued_result.committed_result)
+            local_results.update_with_result(enqueued_result)
 
-
+            # We abort a run if we reach the maximum number of failures
+            queue.close if combined_results.abort?
+            queue.close if combined_results.complete?
          end
        end
      end
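The requeue branch above is the whole retry story for the in-memory coordinator: a failed attempt pushes `next_attempt` back onto the queue and commits the requeued result. The same control flow, stripped down to plain Ruby objects (the job hash and the pass/fail rule below are hypothetical stand-ins for the gem's `EnqueuedRunnable`):

```ruby
queue = Queue.new
queue << { name: 'MyTest#test_flaky', attempt: 1, max_attempts: 3 }

results = []
until queue.empty?
  job = queue.pop
  passed = job[:attempt] > 1 # pretend the test only passes on a retry
  if !passed && job[:attempt] < job[:max_attempts]
    queue << job.merge(attempt: job[:attempt] + 1) # requeue the next attempt
  else
    results << [job[:name], passed]
  end
end

results # => [["MyTest#test_flaky", true]]
```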
data/lib/minitest/distributed/coordinators/redis_coordinator.rb
CHANGED
@@ -15,11 +15,19 @@ module Minitest
      # to the stream.
      #
      # AFter that, all workers will start consuming from the stream. They will first
-      # try to claim stale entries from other workers (determined by the `
-      # option), and process them
+      # try to claim stale entries from other workers (determined by the `test_timeout_seconds`
+      # option), and process them up to a maximum of `max_attempts` attempts. Then,
      # they will consume tests from the stream, run them, and ack them. This is done
      # in batches to reduce load on Redis.
      #
+      # Retrying failed tests (up to `max_attempts` times) uses the same mechanism.
+      # When a test fails, and we haven't exhausted the maximum number of attempts, we
+      # do not ACK the result with Redis. The means that another worker will eventually
+      # claim the test, and run it again. However, in this case we don't want to slow
+      # things down unnecessarily. When a test fails and we want to retry it, we add the
+      # test to the `retry_set` in Redis. When other worker sees that a test is in this
+      # set, it can immediately claim the test, rather than waiting for the timeout.
+      #
      # Finally, when we have acked the same number of tests as we populated into the
      # queue, the run is considered finished. The first worker to detect this will
      # remove the consumergroup and the associated stream from Redis.
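The `retry_set` handshake described in the new comments boils down to two Redis set operations: the worker that requeues a failed test adds the attempt to the set instead of acking it, and whichever worker removes it first claims the next attempt without waiting for the timeout. A bare redis-rb sketch of that handshake (assumes a Redis server on localhost; the key and member names are illustrative, not the gem's exact format):

```ruby
require 'redis'

redis = Redis.new # defaults to redis://localhost:6379
retry_set = 'minitest/example-run/retry_set'

# Worker A: the test failed and should be retried, so it is NOT acked; its
# attempt id goes into the retry set instead.
redis.sadd(retry_set, 'MyTest#test_flaky:attempt-1')

# Worker B: only the worker that wins the SREM race claims the next attempt.
removed = redis.srem(retry_set, 'MyTest#test_flaky:attempt-1')
won_race = [true, 1].include?(removed) # redis-rb 4.x returns a boolean, 5.x a count
puts 'claiming the retry immediately' if won_race
```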
@@ -47,7 +55,10 @@ module Minitest
        attr_reader :local_results
 
        sig { returns(T::Set[EnqueuedRunnable]) }
-        attr_reader :
+        attr_reader :reclaimed_timeout_tests
+
+        sig { returns(T::Set[EnqueuedRunnable]) }
+        attr_reader :reclaimed_failed_tests
 
        sig { params(configuration: Configuration).void }
        def initialize(configuration:)
@@ -58,7 +69,9 @@ module Minitest
          @group_name = T.let('minitest-distributed', String)
          @local_results = T.let(ResultAggregate.new, ResultAggregate)
          @combined_results = T.let(nil, T.nilable(ResultAggregate))
-          @
+          @reclaimed_timeout_tests = T.let(Set.new, T::Set[EnqueuedRunnable])
+          @reclaimed_failed_tests = T.let(Set.new, T::Set[EnqueuedRunnable])
+          @aborted = T.let(false, T::Boolean)
        end
 
        sig { override.params(reporter: Minitest::CompositeReporter, options: T::Hash[Symbol, T.untyped]).void }
@@ -70,41 +83,51 @@ module Minitest
        def combined_results
          @combined_results ||= begin
            stats_as_string = redis.mget(key('runs'), key('assertions'), key('passes'),
-              key('failures'), key('errors'), key('skips'), key('
+              key('failures'), key('errors'), key('skips'), key('requeues'), key('discards'),
+              key('acks'), key('size'))
 
            ResultAggregate.new(
+              max_failures: configuration.max_failures,
+
              runs: Integer(stats_as_string.fetch(0) || 0),
              assertions: Integer(stats_as_string.fetch(1) || 0),
              passes: Integer(stats_as_string.fetch(2) || 0),
              failures: Integer(stats_as_string.fetch(3) || 0),
              errors: Integer(stats_as_string.fetch(4) || 0),
              skips: Integer(stats_as_string.fetch(5) || 0),
-
-
+              requeues: Integer(stats_as_string.fetch(6) || 0),
+              discards: Integer(stats_as_string.fetch(7) || 0),
+              acks: Integer(stats_as_string.fetch(8) || 0),
 
-              # In the case where we have no build
+              # In the case where we have no build size number published yet, we initialize
              # thesize of the test suite to be arbitrarity large, to make sure it is
              # higher than the number of acks, so the run is not consider completed yet.
-              size: Integer(stats_as_string.fetch(
+              size: Integer(stats_as_string.fetch(9) || 2_147_483_647),
            )
          end
        end
 
+        sig { override.returns(T::Boolean) }
+        def aborted?
+          @aborted
+        end
+
        sig { override.params(test_selector: TestSelector).void }
        def produce(test_selector:)
          # Whoever ends up creating the consumer group will act as leader,
          # and publish the list of tests to the stream.
 
-          begin
+          initial_attempt = begin
            # When using `redis.multi`, the second DEL command gets executed even if the initial GROUP
            # fails. This is bad, because only the leader should be issuing the DEL command.
            # When using EVAL and a Lua script, the script aborts after the first XGROUP command
            # fails, and the DEL never gets executed for followers.
-            redis.evalsha(
+            keys_deleted = redis.evalsha(
              register_consumergroup_script,
              keys: [stream_key, key('size'), key('acks')],
              argv: [group_name],
            )
+            keys_deleted == 0
 
          rescue Redis::CommandError => ce
            if ce.message.include?('BUSYGROUP')
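Leadership in `produce` hinges on the `register_consumergroup_script` call above: the first worker to create the consumer group also deletes the stale `size`/`acks` keys, and the number of deleted keys tells it whether this is the first attempt for the run ID. A non-atomic sketch of the same idea with plain redis-rb calls (the gem uses a Lua script precisely so the DEL only ever runs on the leader; the names below are illustrative and a local Redis is assumed):

```ruby
require 'redis'

redis = Redis.new
stream_key = 'minitest/example-run/queue'

begin
  redis.xgroup(:create, stream_key, 'minitest-distributed', '0', mkstream: true)
  # Only the worker that created the group gets here and acts as leader.
  keys_deleted = redis.del('minitest/example-run/size', 'minitest/example-run/acks')
  initial_attempt = keys_deleted == 0 # nothing to delete means a fresh run ID
rescue Redis::CommandError => e
  raise unless e.message.include?('BUSYGROUP')
  # Another worker won the race and is the leader; this one just consumes.
end
```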
@@ -118,38 +141,67 @@ module Minitest
            end
          end
 
-
-
-            redis.lrange(key('failure_list'), 0, -1)
-            redis.lrange(key('error_list'), 0, -1)
-            redis.del(key('failure_list'), key('error_list'))
-          end
-
-          tests = if run_attempt == 1
+          tests = T.let([], T::Array[Minitest::Runnable])
+          tests = if initial_attempt
            # If this is the first attempt for this run ID, we will schedule the full
            # test suite as returned by the test selector to run.
-
-
-
-
-
-
-
+
+            tests_from_selector = test_selector.tests
+            adjust_combined_results(ResultAggregate.new(size: tests_from_selector.size))
+            tests_from_selector
+
+          elsif configuration.retry_failures
+            # Before starting a retry attempt, we first check if the previous attempt
+            # was aborted before it was completed. If this is the case, we cannot use
+            # retry mode, and should immediately fail the attempt.
+            if combined_results.abort?
+              # We mark this run as aborted, which causes this worker to not be successful.
+              @aborted = true
+
+              # We still publish an empty size run to Redis, so if there are any followers,
+              # they will wind down normally. Only the leader will exit
+              # with a non-zero exit status and fail the build; any follower will
+              # exit with status 0.
+              adjust_combined_results(ResultAggregate.new(size: 0))
+              T.let([], T::Array[Minitest::Runnable])
+            else
+              previous_failures, previous_errors, _deleted = redis.multi do
+                redis.lrange(list_key(ResultType::Failed.serialize), 0, -1)
+                redis.lrange(list_key(ResultType::Error.serialize), 0, -1)
+                redis.del(list_key(ResultType::Failed.serialize), list_key(ResultType::Error.serialize))
+              end
+
+              # We set the `size` key to the number of tests we are planning to schedule.
+              # We also adjust the number of failures and errors back to 0.
+              # We set the number of requeues to the number of tests that failed, so the
+              # run statistics will reflect that we retried some failed test.
+              #
+              # However, normally requeues are not acked, as we expect the test to be acked
+              # by another worker later. This makes the test loop think iot is already done.
+              # To prevent this, we initialize the number of acks negatively, so it evens out
+              # in the statistics.
+              total_failures = previous_failures.length + previous_errors.length
+              adjust_combined_results(ResultAggregate.new(
+                size: total_failures,
+                failures: -previous_failures.length,
+                errors: -previous_errors.length,
+                requeues: total_failures,
+              ))
+
+              # For subsequent attempts, we check the list of previous failures and
+              # errors, and only schedule to re-run those tests. This allows for faster
+              # retries of potentially flaky tests.
+              test_identifiers_to_retry = T.let(previous_failures + previous_errors, T::Array[String])
+              test_identifiers_to_retry.map { |identifier| DefinedRunnable.from_identifier(identifier) }
            end
+          else
+            adjust_combined_results(ResultAggregate.new(size: 0))
+            T.let([], T::Array[Minitest::Runnable])
          end
 
-
-
-
-          adjust_combined_results(ResultAggregate.new(
-            size: tests.size,
-            failures: -previous_failures.length,
-            errors: -previous_errors.length,
-            reruns: previous_failures.length + previous_errors.length,
-          ))
-
-          # TODO: break this up in batches.
-          tests.each { |test| redis.xadd(stream_key, test.serialize) }
+          redis.pipelined do
+            tests.each { |test| redis.xadd(stream_key, class_name: T.must(test.class.name), method_name: test.name) }
+          end
        end
 
        sig { override.params(reporter: AbstractReporter).void }
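Publishing the selected tests now happens inside `redis.pipelined`, so a large suite is enqueued without one round trip per XADD. The same pattern with bare redis-rb calls (a local Redis is assumed; the stream name and field layout mirror the diff but are illustrative):

```ruby
require 'redis'

redis = Redis.new
stream_key = 'minitest/example-run/queue'

tests = [%w[MyTest test_a], %w[MyTest test_b], %w[OtherTest test_c]]
redis.pipelined do |pipe|
  tests.each do |class_name, method_name|
    pipe.xadd(stream_key, { class_name: class_name, method_name: method_name })
  end
end
```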
@@ -158,26 +210,29 @@ module Minitest
          loop do
            # First, see if there are any pending tests from other workers to claim.
            stale_runnables = claim_stale_runnables
-
+            process_batch(stale_runnables, reporter)
 
-            #
+            # Then, try to process a regular batch of messages
            fresh_runnables = claim_fresh_runnables(block: exponential_backoff)
-
+            process_batch(fresh_runnables, reporter)
 
            # If we have acked the same amount of tests as we were supposed to, the run
            # is complete and we can exit our loop. Generally, only one worker will detect
            # this condition. The pther workers will quit their consumer loop because the
            # consumergroup will be deleted by the first worker, and their Redis commands
            # will start to fail - see the rescue block below.
-            break if combined_results.
+            break if combined_results.complete?
+
+            # We also abort a run if we reach the maximum number of failures
+            break if combined_results.abort?
 
            # To make sure we don't end up in a busy loop overwhelming Redis with commands
            # when there is no work to do, we increase the blocking time exponentially,
-            # and reset it to the initial value if we processed any
-            if
-              exponential_backoff = INITIAL_BACKOFF
-            else
+            # and reset it to the initial value if we processed any tests.
+            if stale_runnables.empty? && fresh_runnables.empty?
              exponential_backoff <<= 1
+            else
+              exponential_backoff = INITIAL_BACKOFF
            end
          end
 
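The consumer loop backs off exponentially while Redis has no work and snaps back to the initial blocking time as soon as a batch is processed. The arithmetic in isolation, mirroring `INITIAL_BACKOFF` and the `<<= 1` doubling above:

```ruby
backoff = 10 # milliseconds, the INITIAL_BACKOFF value in the diff

5.times do
  work_done = false # pretend neither stale nor fresh runnables were claimed
  if work_done
    backoff = 10    # reset as soon as a batch was processed
  else
    backoff <<= 1   # double the blocking time to avoid hammering Redis
  end
end

backoff # => 320
```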
@@ -203,28 +258,20 @@ module Minitest
          @redis ||= Redis.new(url: configuration.coordinator_uri)
        end
 
-        sig { returns(String) }
-        def ack_batch_script
-          @ack_batch_script = T.let(@ack_batch_script, T.nilable(String))
-          @ack_batch_script ||= redis.script(:load, <<~LUA)
-            local acked_ids, acked, i = {}, 0, 2
-            while ARGV[i] do
-              if redis.call('XACK', KEYS[1], ARGV[1], ARGV[i]) > 0 then
-                acked = acked + 1
-                acked_ids[acked] = ARGV[i]
-              end
-              i = i + 1
-            end
-            return acked_ids
-          LUA
-        end
-
        sig { returns(String) }
        def register_consumergroup_script
          @register_consumergroup_script = T.let(@register_consumergroup_script, T.nilable(String))
          @register_consumergroup_script ||= redis.script(:load, <<~LUA)
+            -- Try to create the consumergroup. This will raise an error if the
+            -- consumergroup has already been registered by somebody else, which
+            -- means another worker will be acting as leader.
+            -- In that case, the next Redis DEL call will not be executed.
            redis.call('XGROUP', 'CREATE', KEYS[1], ARGV[1], '0', 'MKSTREAM')
-
+
+            -- The leader should reset the size and acks key for this run attempt.
+            -- We return the number of keys that were deleted, which can be used to
+            -- determine whether this was the first attempt for this run or not.
+            return redis.call('DEL', KEYS[2], KEYS[3])
          LUA
        end
 
@@ -232,51 +279,119 @@ module Minitest
        def claim_fresh_runnables(block:)
          result = redis.xreadgroup(group_name, configuration.worker_id, stream_key, '>',
            block: block, count: configuration.test_batch_size)
-          EnqueuedRunnable.from_redis_stream_claim(result.fetch(stream_key, []))
+          EnqueuedRunnable.from_redis_stream_claim(result.fetch(stream_key, []), configuration: configuration)
+        end
+
+        sig do
+          params(
+            pending_messages: T::Hash[String, PendingExecution],
+            max_idle_time_ms: Integer,
+          ).returns(T::Array[EnqueuedRunnable])
+        end
+        def xclaim_messages(pending_messages, max_idle_time_ms:)
+          return [] if pending_messages.empty?
+          claimed = redis.xclaim(stream_key, group_name, configuration.worker_id,
+            max_idle_time_ms, pending_messages.keys)
+
+          EnqueuedRunnable.from_redis_stream_claim(claimed, pending_messages, configuration: configuration)
        end
 
        sig { returns(T::Array[EnqueuedRunnable]) }
        def claim_stale_runnables
-          #
-          #
-          # to prevent the exact same batch from being too slow on repeated attempts,
-          # which would cause us to mark all the tests in that batch as failed.
-          #
-          # This has the side effect that for a retried test, the test timeout
-          # will be TEST_TIMEOUT * BATCH_SIZE in practice. This gives us a higher
-          # likelihood that the test will pass if the batch size > 1.
-          pending = redis.xpending(stream_key, group_name, '-', '+', 1)
-
-          # Every test is allowed to take test_timeout milliseconds. Because we process tests in
-          # batches, they should never be pending for TEST_TIMEOUT * BATCH_SIZE milliseconds.
+          # Every test is allowed to take test_timeout_seconds. Because we process tests in
+          # batches, they should never be pending for TEST_TIMEOUT_SECONDS * BATCH_SIZE seconds.
          # So, only try to claim messages older than that, with a bit of jitter.
-
-
-
-
-
+          max_idle_time_ms = Integer(configuration.test_timeout_seconds * configuration.test_batch_size * 1000)
+          max_idle_time_ms_with_jitter = max_idle_time_ms * rand(1.0...1.2)
+
+          # Find all the pending messages to see if we want to attenpt to claim some.
+          pending = redis.xpending(stream_key, group_name, '-', '+', configuration.test_batch_size)
+          return [] if pending.empty?
+
+          active_consumers = Set[configuration.worker_id]
+
+          stale_messages = {}
+          active_messages = {}
+          pending.each do |msg|
+            message = PendingExecution.from_xpending(msg)
+            if message.elapsed_time_ms < max_idle_time_ms_with_jitter
+              active_consumers << message.worker_id
+              active_messages[message.entry_id] = message
+            else
+              stale_messages[message.entry_id] = message
            end
          end
 
-
-
-
-
-
-
-
-
-
-          if attempt > configuration.max_attempts
-            # If we exhaust our attempts, we will mark the test to immediately fail when it will be run next.
-            mark_runnable_to_fail_immediately(er)
-          else
-            reclaimed_tests << er
+          # If we only have evidence of one active consumer based on the pending message,
+          # we will query Redis for all consumers to make sure we have full data.
+          # We can skip this if we already know that there is more than one active one.
+          if active_consumers.size == 1
+            begin
+              redis.xinfo('consumers', stream_key, group_name).each do |consumer|
+                if consumer.fetch('idle') < max_idle_time_ms
+                  active_consumers << consumer.fetch('name')
+                end
              end
+            rescue Redis::CommandError
+              # This command can fail, specifically during the cleanup phase at the end
+              # of a build, when another worker has removed the stream key already.
            end
+          end
+
+          # Now, see if we want to claim any stale messages. If we are the only active
+          # consumer, we want to claim our own messages as well as messgaes from other
+          # (stale) consumers. If there are multiple active consumers, we are going to
+          # let another consumer claim our own messages.
+          if active_consumers.size > 1
+            stale_messages.reject! { |_key, message| message.worker_id == configuration.worker_id }
+          end
+
+          unless stale_messages.empty?
+            # When we have to reclaim stale tests, those test are potentially too slow
+            # to run inside the test timeout. We only claim one timed out test at a time in order
+            # to prevent the exact same batch from being too slow on repeated attempts,
+            # which would cause us to mark all the tests in that batch as failed.
+            #
+            # This has the side effect that for a retried test, the test timeout
+            # will be TEST_TIMEOUT_SECONDS * BATCH_SIZE in practice. This gives us a higher
+            # likelihood that the test will pass if the batch size > 1.
+            stale_messages = stale_messages.slice(stale_messages.keys.first)
+
+            enqueued_runnables = xclaim_messages(stale_messages, max_idle_time_ms: max_idle_time_ms)
+            reclaimed_timeout_tests.merge(enqueued_runnables)
+            return enqueued_runnables
+          end
+
+          # Now, see if we want to claim any failed tests to retry. Again, if we are the only
+          # active consumer, we want to claim our own messages as well as messgaes from other
+          # (stale) consumers. If there are multiple active consumers, we are going to let
+          # another consumer claim our own messages.
+          if active_consumers.size > 1
+            active_messages.reject! { |_key, message| message.worker_id == configuration.worker_id }
+          end
 
-
+          # For all the active messages, we can check whether they are marked for a retry by
+          # trying to remove the test from the retry set set in Redis. Only one worker will be
+          # able to remove the entry from the set, so only one worker will end up trying to
+          # claim the test for the next attempt.
+          #
+          # We use `redis.multi` so we only need one round-trip for the entire list. Note that
+          # this is not an atomic operation with the XCLAIM call. This is OK, because the retry
+          # set is only there to speed things up and prevent us from having to wait for the test
+          # timeout. If the worker crashes between removing an item from the retry setm the test
+          # will eventually be picked up by another worker.
+          messages_in_retry_set = {}
+          redis.multi do
+            active_messages.each do |key, message|
+              messages_in_retry_set[key] = redis.srem(key('retry_set'), message.attempt_id)
+            end
          end
+
+          # Now, we only select the messages that were on the retry set, and try to claim them.
+          active_messages.keep_if { |key, _value| messages_in_retry_set.fetch(key).value }
+          enqueued_runnables = xclaim_messages(active_messages, max_idle_time_ms: 0)
+          reclaimed_failed_tests.merge(enqueued_runnables)
+          enqueued_runnables
        end
 
        sig { void }
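The reclaim logic above is built from two stream commands: XPENDING to inspect which entries other consumers still hold and how long they have been idle, and XCLAIM to take them over. Reduced to bare redis-rb calls (a local Redis is assumed; stream, group, and threshold are illustrative):

```ruby
require 'redis'

redis = Redis.new
stream, group, consumer = 'minitest/example-run/queue', 'minitest-distributed', 'worker-2'
max_idle_ms = 30_000

pending = redis.xpending(stream, group, '-', '+', 10)
stale_ids = pending
  .select { |entry| entry['elapsed'] >= max_idle_ms }
  .map { |entry| entry['entry_id'] }

claimed = stale_ids.empty? ? {} : redis.xclaim(stream, group, consumer, max_idle_ms, *stale_ids)
claimed # => entry id => fields hash for every message this worker took over
```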
@@ -288,18 +403,6 @@ module Minitest
          # so we can assume that all the Redis cleanup was completed.
        end
 
-        sig { params(er: EnqueuedRunnable).void }
-        def mark_runnable_to_fail_immediately(er)
-          assertion = Minitest::Assertion.new(<<~EOM.chomp)
-            This test takes too long to run (> #{configuration.test_timeout}s).
-
-            We have tried running this test #{configuration.max_attempts} on different workers, but every time the worker has not reported back a result within #{configuration.test_timeout}ms.
-            Try to make the test faster, or increase the test timeout.
-          EOM
-          assertion.set_backtrace(caller)
-          er.canned_failure = assertion
-        end
-
        sig { params(results: ResultAggregate).void }
        def adjust_combined_results(results)
          updated = redis.multi do
@@ -309,14 +412,16 @@ module Minitest
            redis.incrby(key('failures'), results.failures)
            redis.incrby(key('errors'), results.errors)
            redis.incrby(key('skips'), results.skips)
-            redis.incrby(key('
+            redis.incrby(key('requeues'), results.requeues)
+            redis.incrby(key('discards'), results.discards)
            redis.incrby(key('acks'), results.acks)
            redis.incrby(key('size'), results.size)
          end
 
-          @combined_results = ResultAggregate.new(
-
-
+          @combined_results = ResultAggregate.new(max_failures: configuration.max_failures,
+            runs: updated[0], assertions: updated[1], passes: updated[2],
+            failures: updated[3], errors: updated[4], skips: updated[5], requeues: updated[6], discards: updated[7],
+            acks: updated[8], size: updated[9])
        end
 
        sig { params(name: String).returns(String) }
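`adjust_combined_results` relies on INCRBY returning the post-increment value and on MULTI returning all replies in command order, so every worker can bump the shared counters and read back the new totals in one atomic round trip. The pattern in isolation (recent redis-rb, local Redis assumed, illustrative key names):

```ruby
require 'redis'

redis = Redis.new

updated = redis.multi do |tx|
  tx.incrby('minitest/example-run/runs', 5)
  tx.incrby('minitest/example-run/failures', 1)
  tx.incrby('minitest/example-run/acks', 5)
end

updated # => the three post-increment totals, in command order, e.g. [5, 1, 5]
```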
@@ -324,59 +429,56 @@ module Minitest
          "minitest/#{configuration.run_id}/#{name}"
        end
 
-        sig { params(
+        sig { params(name: String).returns(String) }
+        def list_key(name)
+          key("#{name}_list")
+        end
+
+        sig { params(batch: T::Array[EnqueuedRunnable], reporter: AbstractReporter).void }
        def process_batch(batch, reporter)
-
+          return 0 if batch.empty?
+
+          local_results.size += batch.size
+
+          runnable_results = T.let([], T::Array[EnqueuedRunnable::Result])
+          redis.multi do
+            batch.each do |enqueued_runnable|
+              # Fulfill the reporter contract by calling `prerecord` before we run the test.
+              reporter.prerecord(enqueued_runnable.runnable_class, enqueued_runnable.method_name)
+
+              # Actually run the test!
+              runnable_results << enqueued_runnable.run do |initial_result|
+                if ResultType.of(initial_result) == ResultType::Requeued
+                  sadd_future = redis.sadd(key('retry_set'), enqueued_runnable.attempt_id)
+                  EnqueuedRunnable::Result::Commit.new { sadd_future.value }
+                else
+                  xack_future = redis.xack(stream_key, group_name, enqueued_runnable.entry_id)
+                  EnqueuedRunnable::Result::Commit.new { xack_future.value == 1 }
+                end
+              end
+            end
+          end
+
+          batch_result_aggregate = ResultAggregate.new
+          runnable_results.each do |runnable_result|
+            # Complete the reporter contract by calling `record` with the result.
+            reporter.record(runnable_result.committed_result)
 
-
-
-
-            result = enqueued_runnable.run
+            # Update statistics.
+            batch_result_aggregate.update_with_result(runnable_result)
+            local_results.update_with_result(runnable_result)
 
-            case (result_type = ResultType.of(
-            when ResultType::
+            case (result_type = ResultType.of(runnable_result.committed_result))
+            when ResultType::Skipped, ResultType::Failed, ResultType::Error
+              redis.lpush(list_key(result_type.serialize), runnable_result.enqueued_runnable.identifier)
+            when ResultType::Passed, ResultType::Requeued, ResultType::Discarded
              # noop
-            when ResultType::Skipped
-              redis.lpush(key('skip_list'), Marshal.dump(enqueued_runnable.serialize))
-            when ResultType::Failed
-              redis.lpush(key('failure_list'), Marshal.dump(enqueued_runnable.serialize))
-            when ResultType::Error
-              redis.lpush(key('error_list'), Marshal.dump(enqueued_runnable.serialize))
            else
              T.absurd(result_type)
            end
-
-            local_results.update_with_result(result)
-            to_be_acked[enqueued_runnable.execution_id] = result
-          end
-
-          return 0 if to_be_acked.empty?
-
-          acked = redis.evalsha(
-            ack_batch_script,
-            keys: [stream_key],
-            argv: [group_name] + to_be_acked.keys
-          )
-
-          batch_results = ResultAggregate.new(acks: acked.length)
-          acked.each do |execution_id|
-            acked_result = to_be_acked.delete(execution_id)
-            reporter.record(acked_result)
-            batch_results.update_with_result(acked_result)
-          end
-
-          to_be_acked.each do |_execution_id, unacked_result|
-            # TODO: use custom assertion class.
-            discard_assertion = Minitest::Skip.new("The test result was discarded, " \
-              "because the test has been claimed another worker.")
-            discard_assertion.set_backtrace(caller)
-            unacked_result.failures = [discard_assertion]
-            reporter.record(unacked_result)
          end
 
-          adjust_combined_results(
-          local_results.acks += acked.length
-          acked.length
+          adjust_combined_results(batch_result_aggregate)
        end
 
        INITIAL_BACKOFF = 10 # milliseconds