atex 0.10__py3-none-any.whl → 0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atex/aggregator/__init__.py +8 -6
- atex/aggregator/json.py +234 -51
- atex/cli/__init__.py +3 -0
- atex/cli/fmf.py +7 -7
- atex/cli/testingfarm.py +95 -45
- atex/executor/__init__.py +23 -2
- atex/executor/executor.py +26 -21
- atex/executor/reporter.py +3 -4
- atex/executor/scripts.py +14 -14
- atex/executor/testcontrol.py +32 -27
- atex/orchestrator/adhoc.py +116 -83
- atex/orchestrator/contest.py +116 -0
- atex/provisioner/__init__.py +0 -16
- atex/provisioner/libvirt/libvirt.py +13 -1
- atex/provisioner/testingfarm/api.py +57 -10
- atex/provisioner/testingfarm/testingfarm.py +25 -21
- atex/util/log.py +1 -1
- atex/util/subprocess.py +6 -6
- {atex-0.10.dist-info → atex-0.12.dist-info}/METADATA +1 -1
- {atex-0.10.dist-info → atex-0.12.dist-info}/RECORD +23 -22
- {atex-0.10.dist-info → atex-0.12.dist-info}/WHEEL +1 -1
- {atex-0.10.dist-info → atex-0.12.dist-info}/entry_points.txt +0 -0
- {atex-0.10.dist-info → atex-0.12.dist-info}/licenses/COPYING.txt +0 -0
atex/executor/executor.py
CHANGED
|
@@ -8,18 +8,11 @@ import subprocess
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
10
|
from .. import util, fmf
|
|
11
|
-
from . import testcontrol, scripts
|
|
11
|
+
from . import TestSetupError, TestAbortedError, testcontrol, scripts
|
|
12
12
|
from .duration import Duration
|
|
13
13
|
from .reporter import Reporter
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
class TestAbortedError(Exception):
|
|
17
|
-
"""
|
|
18
|
-
Raised when an infrastructure-related issue happened while running a test.
|
|
19
|
-
"""
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
|
|
23
16
|
class Executor:
|
|
24
17
|
"""
|
|
25
18
|
Logic for running tests on a remote system and processing results
|
|
@@ -153,11 +146,11 @@ class Executor:
|
|
|
153
146
|
**self.env,
|
|
154
147
|
"TMT_PLAN_ENVIRONMENT_FILE": self.plan_env_file,
|
|
155
148
|
}
|
|
156
|
-
env_args = (f"{k}={v}" for k, v in env.items())
|
|
149
|
+
env_args = tuple(f"{k}={v}" for k, v in env.items())
|
|
157
150
|
# run the scripts
|
|
158
151
|
for script in scripts:
|
|
159
152
|
self.conn.cmd(
|
|
160
|
-
("env", *env_args, "bash"),
|
|
153
|
+
("env", "-C", self.tests_dir, *env_args, "bash"),
|
|
161
154
|
func=util.subprocess_log,
|
|
162
155
|
stderr=subprocess.STDOUT,
|
|
163
156
|
input=script,
|
|
@@ -222,16 +215,6 @@ class Executor:
|
|
|
222
215
|
output_dir = Path(output_dir)
|
|
223
216
|
test_data = self.fmf_tests.tests[test_name]
|
|
224
217
|
|
|
225
|
-
# run a setup script, preparing wrapper + test scripts
|
|
226
|
-
setup_script = scripts.test_setup(
|
|
227
|
-
test=scripts.Test(test_name, test_data, self.fmf_tests.test_dirs[test_name]),
|
|
228
|
-
tests_dir=self.tests_dir,
|
|
229
|
-
wrapper_exec=f"{self.work_dir}/wrapper.sh",
|
|
230
|
-
test_exec=f"{self.work_dir}/test.sh",
|
|
231
|
-
test_yaml=f"{self.work_dir}/metadata.yaml",
|
|
232
|
-
)
|
|
233
|
-
self.conn.cmd(("bash",), input=setup_script, text=True, check=True)
|
|
234
|
-
|
|
235
218
|
# start with fmf-plan-defined environment
|
|
236
219
|
env_vars = {
|
|
237
220
|
**self.fmf_tests.plan_env,
|
|
@@ -253,6 +236,28 @@ class Executor:
|
|
|
253
236
|
duration = Duration(test_data.get("duration", "5m"))
|
|
254
237
|
control = testcontrol.TestControl(reporter=reporter, duration=duration)
|
|
255
238
|
|
|
239
|
+
# run a setup script, preparing wrapper + test scripts
|
|
240
|
+
setup_script = scripts.test_setup(
|
|
241
|
+
test=scripts.Test(test_name, test_data, self.fmf_tests.test_dirs[test_name]),
|
|
242
|
+
tests_dir=self.tests_dir,
|
|
243
|
+
wrapper_exec=f"{self.work_dir}/wrapper.sh",
|
|
244
|
+
test_exec=f"{self.work_dir}/test.sh",
|
|
245
|
+
test_yaml=f"{self.work_dir}/metadata.yaml",
|
|
246
|
+
)
|
|
247
|
+
setup_proc = self.conn.cmd(
|
|
248
|
+
("bash",),
|
|
249
|
+
input=setup_script,
|
|
250
|
+
stdout=subprocess.PIPE,
|
|
251
|
+
stderr=subprocess.STDOUT,
|
|
252
|
+
text=True,
|
|
253
|
+
)
|
|
254
|
+
if setup_proc.returncode != 0:
|
|
255
|
+
reporter.report({
|
|
256
|
+
"status": "infra",
|
|
257
|
+
"note": f"TestSetupError({setup_proc.stdout})",
|
|
258
|
+
})
|
|
259
|
+
raise TestSetupError(setup_proc.stdout)
|
|
260
|
+
|
|
256
261
|
test_proc = None
|
|
257
262
|
control_fd = None
|
|
258
263
|
stack.callback(lambda: os.close(control_fd) if control_fd else None)
|
|
@@ -387,7 +392,7 @@ class Executor:
|
|
|
387
392
|
pass
|
|
388
393
|
reporter.report({
|
|
389
394
|
"status": "infra",
|
|
390
|
-
"note":
|
|
395
|
+
"note": f"{type(exception).__name__}({exception})",
|
|
391
396
|
"testout": "output.txt",
|
|
392
397
|
})
|
|
393
398
|
|
atex/executor/reporter.py
CHANGED
|
@@ -85,16 +85,15 @@ class Reporter:
|
|
|
85
85
|
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
86
86
|
return file_path
|
|
87
87
|
|
|
88
|
-
def
|
|
88
|
+
def open_fd(self, file_name, mode, result_name=None):
|
|
89
89
|
"""
|
|
90
90
|
Open a file named 'file_name' in a directory relevant to 'result_name'.
|
|
91
|
-
Returns an opened file
|
|
92
|
-
just like with regular open().
|
|
91
|
+
Returns an opened file descriptor that can be closed with os.close().
|
|
93
92
|
|
|
94
93
|
If 'result_name' (typically a subtest) is not given, open the file
|
|
95
94
|
for the test (name) itself.
|
|
96
95
|
"""
|
|
97
|
-
return open(self._dest_path(file_name, result_name), mode)
|
|
96
|
+
return os.open(self._dest_path(file_name, result_name), mode)
|
|
98
97
|
|
|
99
98
|
def link_testout(self, file_name, result_name=None):
|
|
100
99
|
# TODO: docstring
|
atex/executor/scripts.py
CHANGED
|
@@ -86,7 +86,8 @@ def test_wrapper(*, test, tests_dir, test_exec):
|
|
|
86
86
|
out += ")\n"
|
|
87
87
|
|
|
88
88
|
# write test exitcode to test control stream
|
|
89
|
-
|
|
89
|
+
if os.environ.get("ATEX_DEBUG_NO_EXITCODE") != "1":
|
|
90
|
+
out += "echo exitcode $? >&$orig_stdout\n"
|
|
90
91
|
|
|
91
92
|
# always exit the wrapper with 0 if test execution was normal
|
|
92
93
|
out += "exit 0\n"
|
|
@@ -94,17 +95,6 @@ def test_wrapper(*, test, tests_dir, test_exec):
|
|
|
94
95
|
return out
|
|
95
96
|
|
|
96
97
|
|
|
97
|
-
def _install_packages(pkgs, extra_opts=None):
|
|
98
|
-
pkgs_str = " ".join(pkgs)
|
|
99
|
-
extra_opts = extra_opts or ()
|
|
100
|
-
dnf = ["dnf", "-y", "--setopt=install_weak_deps=False", "install", *extra_opts]
|
|
101
|
-
dnf_str = " ".join(dnf)
|
|
102
|
-
return util.dedent(fr"""
|
|
103
|
-
not_installed=$(rpm -q --qf '' {pkgs_str} | sed -nr 's/^package ([^ ]+) is not installed$/\1/p')
|
|
104
|
-
[[ $not_installed ]] && {dnf_str} $not_installed
|
|
105
|
-
""") # noqa: E501
|
|
106
|
-
|
|
107
|
-
|
|
108
98
|
def test_setup(*, test, wrapper_exec, test_exec, test_yaml, **kwargs):
|
|
109
99
|
"""
|
|
110
100
|
Generate a bash script that should prepare the remote end for test
|
|
@@ -133,9 +123,19 @@ def test_setup(*, test, wrapper_exec, test_exec, test_yaml, **kwargs):
|
|
|
133
123
|
# install test dependencies
|
|
134
124
|
# - only strings (package names) in require/recommend are supported
|
|
135
125
|
if require := list(fmf.test_pkg_requires(test.data, "require")):
|
|
136
|
-
|
|
126
|
+
pkgs_str = " ".join(require)
|
|
127
|
+
out += util.dedent(fr"""
|
|
128
|
+
not_installed=$(rpm -q --qf '' {pkgs_str} | sed -nr 's/^package ([^ ]+) is not installed$/\1/p')
|
|
129
|
+
[[ $not_installed ]] && dnf -y --setopt=install_weak_deps=False install $not_installed
|
|
130
|
+
""") + "\n" # noqa: E501
|
|
137
131
|
if recommend := list(fmf.test_pkg_requires(test.data, "recommend")):
|
|
138
|
-
|
|
132
|
+
pkgs_str = " ".join(recommend)
|
|
133
|
+
out += util.dedent(fr"""
|
|
134
|
+
have_dnf5=$(command -v dnf5) || true
|
|
135
|
+
skip_bad="--skip-broken${{have_dnf5:+ --skip-unavailable}}"
|
|
136
|
+
not_installed=$(rpm -q --qf '' {pkgs_str} | sed -nr 's/^package ([^ ]+) is not installed$/\1/p')
|
|
137
|
+
[[ $not_installed ]] && dnf -y --setopt=install_weak_deps=False install $skip_bad $not_installed
|
|
138
|
+
""") + "\n" # noqa: E501
|
|
139
139
|
|
|
140
140
|
# write out test data
|
|
141
141
|
out += f"cat > '{test_yaml}' <<'ATEX_SETUP_EOF'\n"
|
atex/executor/testcontrol.py
CHANGED
|
@@ -267,40 +267,45 @@ class TestControl:
|
|
|
267
267
|
except ValueError as e:
|
|
268
268
|
raise BadReportJSONError(f"file entry {file_name} length: {str(e)}") from None
|
|
269
269
|
|
|
270
|
+
fd = self.reporter.open_fd(file_name, os.O_WRONLY | os.O_CREAT, name)
|
|
270
271
|
try:
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
272
|
+
# Linux can't do splice(2) on O_APPEND fds, so we open it above
|
|
273
|
+
# as O_WRONLY and just seek to the end, simulating append
|
|
274
|
+
os.lseek(fd, 0, os.SEEK_END)
|
|
275
|
+
|
|
276
|
+
while file_length > 0:
|
|
277
|
+
try:
|
|
278
|
+
# try a more universal sendfile first, fall back to splice
|
|
274
279
|
try:
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
raise
|
|
283
|
-
except BlockingIOError:
|
|
284
|
-
yield
|
|
285
|
-
continue
|
|
286
|
-
if written == 0:
|
|
287
|
-
raise BadControlError("EOF when reading data")
|
|
288
|
-
file_length -= written
|
|
280
|
+
written = os.sendfile(fd, self.control_fd, None, file_length)
|
|
281
|
+
except OSError as e:
|
|
282
|
+
if e.errno == 22: # EINVAL
|
|
283
|
+
written = os.splice(self.control_fd, fd, file_length)
|
|
284
|
+
else:
|
|
285
|
+
raise
|
|
286
|
+
except BlockingIOError:
|
|
289
287
|
yield
|
|
290
|
-
|
|
291
|
-
|
|
288
|
+
continue
|
|
289
|
+
if written == 0:
|
|
290
|
+
raise BadControlError("EOF when reading data")
|
|
291
|
+
file_length -= written
|
|
292
|
+
yield
|
|
293
|
+
finally:
|
|
294
|
+
os.close(fd)
|
|
292
295
|
|
|
293
296
|
# either store partial result + return,
|
|
294
297
|
# or load previous partial result and merge into it
|
|
295
|
-
partial = result.get("partial"
|
|
296
|
-
if partial:
|
|
297
|
-
# do not store the 'partial' key in the result
|
|
298
|
+
partial = result.get("partial")
|
|
299
|
+
if partial is not None:
|
|
300
|
+
# do not store the 'partial' key in the result, even if False
|
|
298
301
|
del result["partial"]
|
|
299
|
-
#
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
302
|
+
# if it exists and is True
|
|
303
|
+
if partial:
|
|
304
|
+
# note that nameless result will get None as dict key,
|
|
305
|
+
# which is perfectly fine
|
|
306
|
+
self._merge(self.partial_results[name], result)
|
|
307
|
+
# partial = do nothing
|
|
308
|
+
return
|
|
304
309
|
|
|
305
310
|
# if previously-stored partial result exist, merge the current one
|
|
306
311
|
# into it, but then use the merged result
|
atex/orchestrator/adhoc.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import tempfile
|
|
2
|
-
import concurrent
|
|
3
|
-
import collections
|
|
2
|
+
import concurrent.futures
|
|
4
3
|
from pathlib import Path
|
|
5
4
|
|
|
6
5
|
from .. import util, executor
|
|
@@ -60,7 +59,7 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
60
59
|
|
|
61
60
|
def __init__(
|
|
62
61
|
self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *,
|
|
63
|
-
max_remotes=1, max_spares=0,
|
|
62
|
+
max_remotes=1, max_spares=0, max_failed_setups=10, env=None,
|
|
64
63
|
):
|
|
65
64
|
"""
|
|
66
65
|
'platform' is a string with platform name.
|
|
@@ -84,15 +83,15 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
84
83
|
speed up test reruns as Remote reservation happens asynchronously
|
|
85
84
|
to test execution. Spares are reserved on top of 'max_remotes'.
|
|
86
85
|
|
|
87
|
-
'max_reruns' is an integer of how many times to re-try running a failed
|
|
88
|
-
test (which exited with non-0 or caused an Executor exception).
|
|
89
|
-
|
|
90
86
|
'max_failed_setups' is an integer of how many times an Executor's
|
|
91
87
|
plan setup (uploading tests, running prepare scripts, etc.) may fail
|
|
92
88
|
before FailedSetupError is raised.
|
|
93
89
|
|
|
94
90
|
'env' is a dict of extra environment variables to pass to Executor.
|
|
95
91
|
"""
|
|
92
|
+
if not fmf_tests.tests:
|
|
93
|
+
raise ValueError("'fmf_tests' has no tests (bad discover params?)")
|
|
94
|
+
|
|
96
95
|
self.platform = platform
|
|
97
96
|
self.fmf_tests = fmf_tests
|
|
98
97
|
self.provisioners = tuple(provisioners)
|
|
@@ -101,11 +100,11 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
101
100
|
self.failed_setups_left = max_failed_setups
|
|
102
101
|
self.max_remotes = max_remotes
|
|
103
102
|
self.max_spares = max_spares
|
|
104
|
-
# indexed by test name, value being integer of how many times
|
|
105
|
-
self.reruns = collections.defaultdict(lambda: max_reruns)
|
|
106
103
|
self.env = env
|
|
107
104
|
# tests still waiting to be run
|
|
108
105
|
self.to_run = set(fmf_tests.tests)
|
|
106
|
+
# number of Remotes being provisioned + set up (not running tests)
|
|
107
|
+
self.remotes_requested = 0
|
|
109
108
|
# running tests as a dict, indexed by test name, with RunningInfo values
|
|
110
109
|
self.running_tests = {}
|
|
111
110
|
# thread queue for actively running tests
|
|
@@ -114,6 +113,8 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
114
113
|
self.setup_queue = util.ThreadQueue(daemon=True)
|
|
115
114
|
# thread queue for remotes being released
|
|
116
115
|
self.release_queue = util.ThreadQueue(daemon=True)
|
|
116
|
+
# thread queue for results being ingested
|
|
117
|
+
self.ingest_queue = util.ThreadQueue(daemon=False)
|
|
117
118
|
|
|
118
119
|
def _run_new_test(self, info):
|
|
119
120
|
"""
|
|
@@ -125,7 +126,7 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
125
126
|
next_test_name = self.next_test(self.to_run, self.fmf_tests.tests, info)
|
|
126
127
|
assert next_test_name in self.to_run, "next_test() returned valid test name"
|
|
127
128
|
|
|
128
|
-
util.info(f"starting '{next_test_name}'
|
|
129
|
+
util.info(f"{info.remote}: starting '{next_test_name}'")
|
|
129
130
|
|
|
130
131
|
self.to_run.remove(next_test_name)
|
|
131
132
|
|
|
@@ -140,6 +141,7 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
140
141
|
)
|
|
141
142
|
|
|
142
143
|
tmp_dir_path = Path(rinfo.tmp_dir.name)
|
|
144
|
+
tmp_dir_path.chmod(0o755)
|
|
143
145
|
self.test_queue.start_thread(
|
|
144
146
|
target=info.executor.run_test,
|
|
145
147
|
target_args=(
|
|
@@ -156,26 +158,11 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
156
158
|
'finfo' is a FinishedInfo instance.
|
|
157
159
|
"""
|
|
158
160
|
test_data = self.fmf_tests.tests[finfo.test_name]
|
|
159
|
-
|
|
160
|
-
# TODO: somehow move logging from was_successful and should_be_rerun here,
|
|
161
|
-
# probably print just some generic info from those functions that doesn't
|
|
162
|
-
# imply any outcome, ie.
|
|
163
|
-
# {remote_with_test} threw {exception}
|
|
164
|
-
# {remote_with_test} exited with {code}
|
|
165
|
-
# {remote_with_test} has {N} reruns left
|
|
166
|
-
# {remote_with_test} has 0 reruns left
|
|
167
|
-
# and then log the decision separately, here below, such as
|
|
168
|
-
# {remote_with_test} failed, re-running
|
|
169
|
-
# {remote_with_test} completed, ingesting result
|
|
170
|
-
# {remote_with_test} was destructive, releasing remote
|
|
171
|
-
# {remote_with_test} ...., running next test
|
|
172
|
-
# That allows the user to override the functions, while keeping critical
|
|
173
|
-
# flow reliably logged here.
|
|
174
|
-
|
|
175
161
|
remote_with_test = f"{finfo.remote}: '{finfo.test_name}'"
|
|
176
162
|
|
|
177
163
|
if not self.was_successful(finfo, test_data) and self.should_be_rerun(finfo, test_data):
|
|
178
164
|
# re-run the test
|
|
165
|
+
util.info(f"{remote_with_test} failed, re-running")
|
|
179
166
|
self.to_run.add(finfo.test_name)
|
|
180
167
|
else:
|
|
181
168
|
# ingest the result
|
|
@@ -183,15 +170,27 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
183
170
|
# a condition just in case Executor code itself threw an exception
|
|
184
171
|
# and didn't even report the fallback 'infra' result
|
|
185
172
|
if finfo.results is not None and finfo.files is not None:
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
173
|
+
util.info(f"{remote_with_test} completed, ingesting result")
|
|
174
|
+
|
|
175
|
+
def ingest_and_cleanup(ingest, args, cleanup):
|
|
176
|
+
ingest(*args)
|
|
177
|
+
# also delete the tmpdir housing these
|
|
178
|
+
cleanup()
|
|
179
|
+
|
|
180
|
+
self.ingest_queue.start_thread(
|
|
181
|
+
ingest_and_cleanup,
|
|
182
|
+
target_args=(
|
|
183
|
+
# ingest func itself
|
|
184
|
+
self.aggregator.ingest,
|
|
185
|
+
# args for ingest
|
|
186
|
+
(self.platform, finfo.test_name, finfo.results, finfo.files),
|
|
187
|
+
# cleanup func itself
|
|
188
|
+
finfo.tmp_dir.cleanup,
|
|
189
|
+
),
|
|
190
|
+
test_name=finfo.test_name,
|
|
191
191
|
)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
# ingesting destroyed these
|
|
192
|
+
|
|
193
|
+
# ingesting destroys these
|
|
195
194
|
finfo = self.FinishedInfo._from(
|
|
196
195
|
finfo,
|
|
197
196
|
results=None,
|
|
@@ -199,22 +198,34 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
199
198
|
tmp_dir=None,
|
|
200
199
|
)
|
|
201
200
|
|
|
202
|
-
# if
|
|
203
|
-
#
|
|
204
|
-
if finfo.exception or self.destructive(finfo, test_data):
|
|
205
|
-
util.debug(f"{remote_with_test} was destructive,
|
|
201
|
+
# if there are still tests to be run and the last test was not
|
|
202
|
+
# destructive, just run a new test on it
|
|
203
|
+
if self.to_run and not (finfo.exception or self.destructive(finfo, test_data)):
|
|
204
|
+
util.debug(f"{remote_with_test} was non-destructive, running next test")
|
|
205
|
+
self._run_new_test(finfo)
|
|
206
|
+
return
|
|
207
|
+
|
|
208
|
+
# we are not running a new test right now, serve_once() might run it
|
|
209
|
+
# some time later, just decide what to do with the current remote
|
|
210
|
+
|
|
211
|
+
if self.remotes_requested >= len(self.to_run):
|
|
212
|
+
# we have enough remotes in the pipe to run every test,
|
|
213
|
+
# we don't need a new one - just release the current one
|
|
214
|
+
util.debug(f"{finfo.remote} no longer useful, releasing it")
|
|
215
|
+
self.release_queue.start_thread(
|
|
216
|
+
finfo.remote.release,
|
|
217
|
+
remote=finfo.remote,
|
|
218
|
+
)
|
|
219
|
+
else:
|
|
220
|
+
# we need more remotes and the last test was destructive,
|
|
221
|
+
# get a new one and let serve_once() run a test later
|
|
222
|
+
util.debug(f"{remote_with_test} was destructive, getting a new Remote")
|
|
206
223
|
self.release_queue.start_thread(
|
|
207
224
|
finfo.remote.release,
|
|
208
225
|
remote=finfo.remote,
|
|
209
226
|
)
|
|
210
227
|
finfo.provisioner.provision(1)
|
|
211
228
|
|
|
212
|
-
# if still not destroyed, run another test on it
|
|
213
|
-
# (without running plan setup, re-using already set up remote)
|
|
214
|
-
elif self.to_run:
|
|
215
|
-
util.debug(f"{remote_with_test} was non-destructive, running next test")
|
|
216
|
-
self._run_new_test(finfo)
|
|
217
|
-
|
|
218
229
|
def serve_once(self):
|
|
219
230
|
"""
|
|
220
231
|
Run the orchestration logic, processing any outstanding requests
|
|
@@ -225,7 +236,7 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
225
236
|
(more work to be done), False once all testing is concluded.
|
|
226
237
|
"""
|
|
227
238
|
# all done
|
|
228
|
-
if not self.to_run and not self.running_tests
|
|
239
|
+
if not self.to_run and not self.running_tests:
|
|
229
240
|
return False
|
|
230
241
|
|
|
231
242
|
# process all finished tests, potentially reusing remotes for executing
|
|
@@ -260,18 +271,21 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
260
271
|
except util.ThreadQueue.Empty:
|
|
261
272
|
break
|
|
262
273
|
|
|
274
|
+
self.remotes_requested -= 1
|
|
263
275
|
sinfo = treturn.sinfo
|
|
264
276
|
|
|
265
277
|
if treturn.exception:
|
|
266
|
-
|
|
278
|
+
exc_str = f"{type(treturn.exception).__name__}({treturn.exception})"
|
|
279
|
+
msg = f"{sinfo.remote}: setup failed with {exc_str}"
|
|
267
280
|
self.release_queue.start_thread(
|
|
268
281
|
sinfo.remote.release,
|
|
269
282
|
remote=sinfo.remote,
|
|
270
283
|
)
|
|
271
|
-
if (
|
|
272
|
-
util.warning(f"{msg}, re-trying ({
|
|
284
|
+
if (retries_left := self.failed_setups_left) > 0:
|
|
285
|
+
util.warning(f"{msg}, re-trying ({retries_left} setup retries left)")
|
|
273
286
|
self.failed_setups_left -= 1
|
|
274
287
|
sinfo.provisioner.provision(1)
|
|
288
|
+
self.remotes_requested += 1
|
|
275
289
|
else:
|
|
276
290
|
util.warning(f"{msg}, setup retries exceeded, giving up")
|
|
277
291
|
raise FailedSetupError("setup retries limit exceeded, broken infra?")
|
|
@@ -286,12 +300,14 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
286
300
|
treturn = self.setup_queue.get_raw(block=False)
|
|
287
301
|
except util.ThreadQueue.Empty:
|
|
288
302
|
break
|
|
303
|
+
util.debug(f"releasing extraneous set-up {treturn.sinfo.remote}")
|
|
289
304
|
self.release_queue.start_thread(
|
|
290
305
|
treturn.sinfo.remote.release,
|
|
291
306
|
remote=treturn.sinfo.remote,
|
|
292
307
|
)
|
|
308
|
+
self.remotes_requested -= 1
|
|
293
309
|
|
|
294
|
-
# try to get new
|
|
310
|
+
# try to get new Remotes from Provisioners - if we get some, start
|
|
295
311
|
# running setup on them
|
|
296
312
|
for provisioner in self.provisioners:
|
|
297
313
|
while (remote := provisioner.get_remote(block=False)) is not None:
|
|
@@ -311,15 +327,30 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
311
327
|
# gather returns from Remote.release() functions - check for exceptions
|
|
312
328
|
# thrown, re-report them as warnings as they are not typically critical
|
|
313
329
|
# for operation
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
if treturn.exception:
|
|
320
|
-
util.warning(f"{treturn.remote} release failed: {repr(treturn.exception)}")
|
|
330
|
+
while True:
|
|
331
|
+
try:
|
|
332
|
+
treturn = self.release_queue.get_raw(block=False)
|
|
333
|
+
except util.ThreadQueue.Empty:
|
|
334
|
+
break
|
|
321
335
|
else:
|
|
322
|
-
|
|
336
|
+
if treturn.exception:
|
|
337
|
+
exc_str = f"{type(treturn.exception).__name__}({treturn.exception})"
|
|
338
|
+
util.warning(f"{treturn.remote} release failed: {exc_str}")
|
|
339
|
+
else:
|
|
340
|
+
util.debug(f"{treturn.remote} release completed")
|
|
341
|
+
|
|
342
|
+
# gather returns from Aggregator.ingest() calls - check for exceptions
|
|
343
|
+
while True:
|
|
344
|
+
try:
|
|
345
|
+
treturn = self.ingest_queue.get_raw(block=False)
|
|
346
|
+
except util.ThreadQueue.Empty:
|
|
347
|
+
break
|
|
348
|
+
else:
|
|
349
|
+
if treturn.exception:
|
|
350
|
+
exc_str = f"{type(treturn.exception).__name__}({treturn.exception})"
|
|
351
|
+
util.warning(f"'{treturn.test_name}' ingesting failed: {exc_str}")
|
|
352
|
+
else:
|
|
353
|
+
util.debug(f"'{treturn.test_name}' ingesting completed")
|
|
323
354
|
|
|
324
355
|
return True
|
|
325
356
|
|
|
@@ -328,9 +359,12 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
328
359
|
for prov in self.provisioners:
|
|
329
360
|
prov.start()
|
|
330
361
|
|
|
362
|
+
# just the base remotes, no spares
|
|
363
|
+
self.remotes_requested = min(self.max_remotes, len(self.fmf_tests.tests))
|
|
364
|
+
|
|
331
365
|
# start up initial reservations, balanced evenly across all available
|
|
332
366
|
# provisioner instances
|
|
333
|
-
count =
|
|
367
|
+
count = self.remotes_requested + self.max_spares
|
|
334
368
|
provisioners = self.provisioners[:count]
|
|
335
369
|
for idx, prov in enumerate(provisioners):
|
|
336
370
|
if count % len(provisioners) > idx:
|
|
@@ -342,16 +376,30 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
342
376
|
# cancel all running tests and wait for them to clean up (up to 0.1sec)
|
|
343
377
|
for rinfo in self.running_tests.values():
|
|
344
378
|
rinfo.executor.cancel()
|
|
345
|
-
self.test_queue.join()
|
|
379
|
+
self.test_queue.join() # also ignore any exceptions raised
|
|
380
|
+
|
|
381
|
+
# wait for all running ingestions to finish, print exceptions
|
|
382
|
+
# (we would rather stop provisioners further below than raise here)
|
|
383
|
+
while True:
|
|
384
|
+
try:
|
|
385
|
+
treturn = self.ingest_queue.get_raw(block=False)
|
|
386
|
+
except util.ThreadQueue.Empty:
|
|
387
|
+
break
|
|
388
|
+
else:
|
|
389
|
+
if treturn.exception:
|
|
390
|
+
exc_str = f"{type(treturn.exception).__name__}({treturn.exception})"
|
|
391
|
+
util.warning(f"'{treturn.test_name}' ingesting failed: {exc_str}")
|
|
392
|
+
else:
|
|
393
|
+
util.debug(f"'{treturn.test_name}' ingesting completed")
|
|
394
|
+
self.ingest_queue.join()
|
|
346
395
|
|
|
347
396
|
# stop all provisioners, also releasing all remotes
|
|
348
|
-
#
|
|
397
|
+
# - parallelize up to 10 provisioners at a time
|
|
349
398
|
if self.provisioners:
|
|
350
|
-
workers = min(len(self.provisioners),
|
|
399
|
+
workers = min(len(self.provisioners), 10)
|
|
351
400
|
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
|
|
352
401
|
for provisioner in self.provisioners:
|
|
353
|
-
|
|
354
|
-
ex.submit(func)
|
|
402
|
+
ex.submit(provisioner.stop)
|
|
355
403
|
|
|
356
404
|
@staticmethod
|
|
357
405
|
def run_setup(sinfo):
|
|
@@ -423,23 +471,20 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
423
471
|
'test_data' is a dict of fully resolved fmf test metadata of that test.
|
|
424
472
|
"""
|
|
425
473
|
remote_with_test = f"{info.remote}: '{info.test_name}'"
|
|
426
|
-
|
|
427
474
|
# executor (or test) threw exception
|
|
428
475
|
if info.exception:
|
|
429
|
-
|
|
476
|
+
exc_str = f"{type(info.exception).__name__}({info.exception})"
|
|
477
|
+
util.info(f"{remote_with_test} threw {exc_str} during test runtime")
|
|
430
478
|
return False
|
|
431
|
-
|
|
432
479
|
# the test exited as non-0
|
|
433
480
|
if info.exit_code != 0:
|
|
434
481
|
util.info(f"{remote_with_test} exited with non-zero: {info.exit_code}")
|
|
435
482
|
return False
|
|
436
|
-
|
|
437
483
|
# otherwise we good
|
|
438
484
|
return True
|
|
439
485
|
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
def should_be_rerun(self, info, test_data): # noqa: ARG004, ARG002
|
|
486
|
+
@staticmethod
|
|
487
|
+
def should_be_rerun(info, test_data): # noqa: ARG004
|
|
443
488
|
"""
|
|
444
489
|
Return a boolean result whether a finished test failed in a way
|
|
445
490
|
that another execution attempt might succeed, due to race conditions
|
|
@@ -449,17 +494,5 @@ class AdHocOrchestrator(Orchestrator):
|
|
|
449
494
|
|
|
450
495
|
'test_data' is a dict of fully resolved fmf test metadata of that test.
|
|
451
496
|
"""
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
# TODO: remove self.reruns and the whole X-reruns logic from AdHocOrchestrator,
|
|
455
|
-
# leave it up to the user to wrap should_be_rerun() with an external dict
|
|
456
|
-
# of tests, counting reruns for each
|
|
457
|
-
# - allows the user to adjust counts per-test (ie. test_data metadata)
|
|
458
|
-
# - allows this template to be @staticmethod
|
|
459
|
-
if (reruns_left := self.reruns[info.test_name]) > 0:
|
|
460
|
-
util.info(f"{remote_with_test}: re-running ({reruns_left} reruns left)")
|
|
461
|
-
self.reruns[info.test_name] -= 1
|
|
462
|
-
return True
|
|
463
|
-
else:
|
|
464
|
-
util.info(f"{remote_with_test}: reruns exceeded, giving up")
|
|
465
|
-
return False
|
|
497
|
+
# never rerun by default
|
|
498
|
+
return False
|