atex 0.8__py3-none-any.whl → 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,14 +8,22 @@ from pathlib import Path
8
8
  from .. import util, executor
9
9
 
10
10
 
11
+ class OrchestratorError(Exception):
12
+ pass
13
+
14
+
15
+ class FailedSetupError(OrchestratorError):
16
+ pass
17
+
18
+
11
19
  class Orchestrator:
12
20
  """
13
21
  A scheduler for parallel execution on multiple resources (machines/systems).
14
22
  """
15
23
 
16
- SetupInfo = collections.namedtuple(
17
- "SetupInfo",
18
- (
24
+ class SetupInfo(
25
+ util.NamedMapping,
26
+ required=(
19
27
  # class Provisioner instance this machine is provided by
20
28
  # (for logging purposes)
21
29
  "provisioner",
@@ -24,23 +32,23 @@ class Orchestrator:
24
32
  # class Executor instance uploading tests / running setup or tests
25
33
  "executor",
26
34
  ),
27
- )
28
- RunningInfo = collections.namedtuple(
29
- "RunningInfo",
30
- (
31
- # "inherit" from SetupInfo
32
- *SetupInfo._fields,
35
+ ):
36
+ pass
37
+
38
+ class RunningInfo(
39
+ SetupInfo,
40
+ required=(
33
41
  # string with /test/name
34
42
  "test_name",
35
- # class tempfile.TemporaryDirectory instance with 'json_file' and 'files_dir'
43
+ # class tempfile.TemporaryDirectory instance passed to Executor
36
44
  "tmp_dir",
37
45
  ),
38
- )
39
- FinishedInfo = collections.namedtuple(
40
- "FinishedInfo",
41
- (
42
- # "inherit" from RunningInfo
43
- *RunningInfo._fields,
46
+ ):
47
+ pass
48
+
49
+ class FinishedInfo(
50
+ RunningInfo,
51
+ required=(
44
52
  # integer with exit code of the test
45
53
  # (None if exception happened)
46
54
  "exit_code",
@@ -48,9 +56,13 @@ class Orchestrator:
48
56
  # (None if no exception happened (exit_code is defined))
49
57
  "exception",
50
58
  ),
51
- )
59
+ ):
60
+ pass
52
61
 
53
- def __init__(self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *, max_reruns=2):
62
+ def __init__(
63
+ self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *,
64
+ max_reruns=2, max_failed_setups=10, env=None,
65
+ ):
54
66
  """
55
67
  'platform' is a string with platform name.
56
68
 
@@ -63,20 +75,31 @@ class Orchestrator:
63
75
  'tmp_dir' is a string/Path to a temporary directory, to be used for
64
76
  storing per-test results and uploaded files before being ingested
65
77
  by the aggregator. Can be safely shared by Orchestrator instances.
78
+
79
+ 'max_reruns' is an integer of how many times to re-try running a failed
80
+ test (which exited with non-0 or caused an Executor exception).
81
+
82
+ 'max_failed_setups' is an integer of how many times an Executor's
83
+ plan setup (uploading tests, running prepare scripts, etc.) may fail
84
+ before FailedSetupError is raised.
85
+
86
+ 'env' is a dict of extra environment variables to pass to Executor.
66
87
  """
67
88
  self.platform = platform
68
89
  self.fmf_tests = fmf_tests
69
90
  self.provisioners = tuple(provisioners)
70
91
  self.aggregator = aggregator
71
92
  self.tmp_dir = tmp_dir
93
+ self.failed_setups_left = max_failed_setups
94
+ # indexed by test name, value being an integer of how many reruns are left
95
+ self.reruns = collections.defaultdict(lambda: max_reruns)
96
+ self.env = env
72
97
  # tests still waiting to be run
73
98
  self.to_run = set(fmf_tests.tests)
74
99
  # running setup functions, as a list of SetupInfo items
75
100
  self.running_setups = []
76
101
  # running tests as a dict, indexed by test name, with RunningInfo values
77
102
  self.running_tests = {}
78
- # indexed by test name, value being integer of how many times
79
- self.reruns = collections.defaultdict(lambda: max_reruns)
80
103
  # thread queue for actively running tests
81
104
  self.test_queue = util.ThreadQueue(daemon=False)
82
105
  # thread queue for remotes being set up (uploading tests, etc.)
@@ -85,37 +108,36 @@ class Orchestrator:
85
108
  # cancellation, the execution flow itself uses ThreadQueues
86
109
 
87
110
  @staticmethod
88
- def _run_setup(sinfo):
89
- sinfo.executor.setup()
90
- sinfo.executor.upload_tests()
91
- sinfo.executor.setup_plan()
92
- # NOTE: we never run executor.cleanup() anywhere - instead, we assume
93
- # the remote (and its connection) was invalidated by the test,
94
- # so we just rely on remote.release() destroying the system
95
- return sinfo
96
-
97
- @classmethod
98
- def _wrap_test(cls, rinfo, func, *args, **kwargs):
111
+ def run_setup(sinfo):
99
112
  """
100
- Wrap 'func' (test execution function) to preserve extra metadata
101
- ('rinfo') and return it with the function return value.
113
+ Set up a newly acquired class Remote instance for test execution.
114
+
115
+ 'sinfo' is a SetupInfo instance with the (fully connected) remote.
102
116
  """
103
- try:
104
- return cls.FinishedInfo(*rinfo, func(*args, **kwargs), None)
105
- except Exception as e:
106
- return cls.FinishedInfo(*rinfo, None, e)
117
+ sinfo.executor.setup()
118
+ sinfo.executor.upload_tests()
119
+ sinfo.executor.plan_prepare()
120
+ # NOTE: we never run executor.plan_finish() or even executor.cleanup()
121
+ # anywhere - instead, we assume the remote (and its connection)
122
+ # was invalidated by the test, so we just rely on remote.release()
123
+ # destroying the system
107
124
 
108
- def _run_new_test(self, sinfo):
125
+ def _run_new_test(self, info):
109
126
  """
110
- 'sinfo' is a SetupInfo instance.
127
+ 'info' can be either
128
+ - SetupInfo instance with Remote/Executor to run the new test.
129
+ - FinishedInfo instance of a previously executed test
130
+ (reusing Remote/Executor for a new test).
111
131
  """
112
- next_test_name = self.next_test(self.to_run, self.fmf_tests)
132
+ next_test_name = self.next_test(self.to_run, self.fmf_tests.tests, info)
113
133
  assert next_test_name in self.to_run, "next_test() returned valid test name"
114
134
 
135
+ util.info(f"starting '{next_test_name}' on {info.remote}")
136
+
115
137
  self.to_run.remove(next_test_name)
116
138
 
117
- rinfo = self.RunningInfo(
118
- *sinfo,
139
+ rinfo = self.RunningInfo._from(
140
+ info,
119
141
  test_name=next_test_name,
120
142
  tmp_dir=tempfile.TemporaryDirectory(
121
143
  prefix=next_test_name.strip("/").replace("/","-") + "-",
@@ -126,14 +148,12 @@ class Orchestrator:
126
148
 
127
149
  tmp_dir_path = Path(rinfo.tmp_dir.name)
128
150
  self.test_queue.start_thread(
129
- target=self._wrap_test,
130
- args=(
131
- rinfo,
132
- sinfo.executor.run_test,
151
+ target=info.executor.run_test,
152
+ target_args=(
133
153
  next_test_name,
134
- tmp_dir_path / "json_file",
135
- tmp_dir_path / "files_dir",
154
+ tmp_dir_path,
136
155
  ),
156
+ rinfo=rinfo,
137
157
  )
138
158
 
139
159
  self.running_tests[next_test_name] = rinfo
@@ -142,85 +162,63 @@ class Orchestrator:
142
162
  """
143
163
  'finfo' is a FinishedInfo instance.
144
164
  """
145
- test_id = f"'{finfo.test_name}' on '{finfo.remote}'"
146
- tmp_dir_path = Path(finfo.tmp_dir.name)
147
-
148
- # NOTE: document that we intentionally don't .cleanup() executioner below,
149
- # we rely on remote .release() destroying the OS, because we don't
150
- # want to risk .cleanup() blocking on dead ssh into the remote after
151
- # executing a destructive test
152
-
153
- destructive = False
165
+ remote_with_test = f"{finfo.remote}: '{finfo.test_name}'"
166
+
167
+ def ingest_result():
168
+ tmp_dir_path = Path(finfo.tmp_dir.name)
169
+ results_file = tmp_dir_path / "results"
170
+ files_dir = tmp_dir_path / "files"
171
+ # in case Executor code itself threw an unrecoverable exception
172
+ # and didn't even report the fallback 'infra' result
173
+ if results_file.exists() and files_dir.exists():
174
+ self.aggregator.ingest(self.platform, finfo.test_name, results_file, files_dir)
175
+ finfo.tmp_dir.cleanup()
154
176
 
155
177
  # if executor (or test) threw exception, schedule a re-run
156
178
  if finfo.exception:
157
- destructive = True
158
- exc_str = "".join(traceback.format_exception(finfo.exception)).rstrip("\n")
159
- util.info(f"unexpected exception happened while running {test_id}:\n{exc_str}")
160
- finfo.remote.release()
161
- if self.reruns[finfo.test_name] > 0:
179
+ exc_name = type(finfo.exception).__name__
180
+ exc_tb = "".join(traceback.format_exception(finfo.exception)).rstrip("\n")
181
+ msg = f"{remote_with_test} threw {exc_name} during test runtime"
182
+ #finfo.remote.release()
183
+ if (reruns_left := self.reruns[finfo.test_name]) > 0:
184
+ util.info(f"{msg}, re-running ({reruns_left} reruns left):\n{exc_tb}")
162
185
  self.reruns[finfo.test_name] -= 1
163
186
  self.to_run.add(finfo.test_name)
164
187
  else:
165
- util.info(f"reruns for {test_id} exceeded, ignoring it")
188
+ util.info(f"{msg}, reruns exceeded, giving up:\n{exc_tb}")
189
+ # record the final result anyway
190
+ ingest_result()
166
191
 
167
192
  # if the test exited as non-0, try a re-run
168
193
  elif finfo.exit_code != 0:
169
- destructive = True
170
- finfo.remote.release()
171
- if self.reruns[finfo.test_name] > 0:
172
- util.info(
173
- f"{test_id} exited with non-zero: {finfo.exit_code}, re-running "
174
- f"({self.reruns[finfo.test_name]} reruns left)",
175
- )
194
+ msg = f"{remote_with_test} exited with non-zero: {finfo.exit_code}"
195
+ #finfo.remote.release()
196
+ if (reruns_left := self.reruns[finfo.test_name]) > 0:
197
+ util.info(f"{msg}, re-running ({reruns_left} reruns left)")
176
198
  self.reruns[finfo.test_name] -= 1
177
199
  self.to_run.add(finfo.test_name)
178
200
  else:
179
- util.info(
180
- f"{test_id} exited with non-zero: {finfo.exit_code}, "
181
- "all reruns exceeded, giving up",
182
- )
201
+ util.info(f"{msg}, reruns exceeded, giving up")
183
202
  # record the final result anyway
184
- self.aggregator.ingest(
185
- self.platform,
186
- finfo.test_name,
187
- tmp_dir_path / "json_file",
188
- tmp_dir_path / "files_dir",
189
- )
190
- finfo.tmp_dir.cleanup()
203
+ ingest_result()
191
204
 
192
205
  # test finished successfully - ingest its results
193
206
  else:
194
- util.info(f"{test_id} finished successfully")
195
- self.aggregator.ingest(
196
- self.platform,
197
- finfo.test_name,
198
- tmp_dir_path / "json_file",
199
- tmp_dir_path / "files_dir",
200
- )
201
- finfo.tmp_dir.cleanup()
202
-
203
- # if the remote was not destroyed by traceback / failing test,
204
- # check if the test always destroys it (even on success)
205
- if not destructive:
206
- test_data = self.fmf_tests.tests[finfo.test_name]
207
- destructive = test_data.get("extra-atex", {}).get("destructive", False)
207
+ util.info(f"{remote_with_test} finished successfully")
208
+ ingest_result()
208
209
 
209
210
  # if destroyed, release the remote
210
- if destructive:
211
- util.debug(f"{test_id} was destructive, releasing remote")
211
+ # (Executor exception is always considered destructive)
212
+ test_data = self.fmf_tests.tests[finfo.test_name]
213
+ if finfo.exception or self.destructive(finfo, test_data):
214
+ util.debug(f"{remote_with_test} was destructive, releasing remote")
212
215
  finfo.remote.release()
213
216
 
214
217
  # if still not destroyed, run another test on it
215
218
  # (without running plan setup, re-using already set up remote)
216
219
  elif self.to_run:
217
- sinfo = self.SetupInfo(
218
- provisioner=finfo.provisioner,
219
- remote=finfo.remote,
220
- executor=finfo.executor,
221
- )
222
- util.debug(f"{test_id} was non-destructive, running next test")
223
- self._run_new_test(sinfo)
220
+ util.debug(f"{remote_with_test} was non-destructive, running next test")
221
+ self._run_new_test(finfo)
224
222
 
225
223
  def serve_once(self):
226
224
  """
@@ -243,39 +241,62 @@ class Orchestrator:
243
241
  # further tests
244
242
  while True:
245
243
  try:
246
- finfo = self.test_queue.get(block=False)
244
+ treturn = self.test_queue.get_raw(block=False)
247
245
  except util.ThreadQueue.Empty:
248
246
  break
249
- del self.running_tests[finfo.test_name]
247
+
248
+ rinfo = treturn.rinfo
249
+ del self.running_tests[rinfo.test_name]
250
+
251
+ finfo = self.FinishedInfo(
252
+ **rinfo,
253
+ exit_code=treturn.returned,
254
+ exception=treturn.exception,
255
+ )
250
256
  self._process_finished_test(finfo)
251
257
 
252
258
  # process any remotes with finished plan setup (uploaded tests,
253
259
  # plan-defined pkgs / prepare scripts), start executing tests on them
254
- while True:
260
+ while self.to_run:
255
261
  try:
256
- sinfo = self.setup_queue.get(block=False)
262
+ treturn = self.setup_queue.get_raw(block=False)
257
263
  except util.ThreadQueue.Empty:
258
264
  break
259
- util.debug(f"setup finished for '{sinfo.remote}', running first test")
265
+
266
+ sinfo = treturn.sinfo
260
267
  self.running_setups.remove(sinfo)
261
- self._run_new_test(sinfo)
268
+
269
+ if treturn.exception:
270
+ exc_name = type(treturn.exception).__name__
271
+ exc_tb = "".join(traceback.format_exception(treturn.exception)).rstrip("\n")
272
+ msg = f"{sinfo.remote}: setup failed with {exc_name}"
273
+ sinfo.remote.release()
274
+ if (reruns_left := self.failed_setups_left) > 0:
275
+ util.warning(f"{msg}, re-trying ({reruns_left} setup retries left):\n{exc_tb}")
276
+ self.failed_setups_left -= 1
277
+ else:
278
+ util.warning(f"{msg}, setup retries exceeded, giving up:\n{exc_tb}")
279
+ raise FailedSetupError("setup retries limit exceeded, broken infra?")
280
+ else:
281
+ self._run_new_test(sinfo)
262
282
 
263
283
  # try to get new remotes from Provisioners - if we get some, start
264
284
  # running setup on them
265
285
  for provisioner in self.provisioners:
266
286
  while (remote := provisioner.get_remote(block=False)) is not None:
267
- ex = executor.Executor(self.fmf_tests, remote)
287
+ ex = executor.Executor(self.fmf_tests, remote, env=self.env)
268
288
  sinfo = self.SetupInfo(
269
289
  provisioner=provisioner,
270
290
  remote=remote,
271
291
  executor=ex,
272
292
  )
273
293
  self.setup_queue.start_thread(
274
- target=self._run_setup,
275
- args=(sinfo,),
294
+ target=self.run_setup,
295
+ target_args=(sinfo,),
296
+ sinfo=sinfo,
276
297
  )
277
298
  self.running_setups.append(sinfo)
278
- util.debug(f"got remote '{remote}' from '{provisioner}', running setup")
299
+ util.info(f"{provisioner}: running setup on new {remote}")
279
300
 
280
301
  return True
281
302
 
@@ -286,39 +307,79 @@ class Orchestrator:
286
307
  while self.serve_once():
287
308
  time.sleep(1)
288
309
 
289
- def __enter__(self):
310
+ def start(self):
290
311
  # start all provisioners
291
312
  for prov in self.provisioners:
292
313
  prov.start()
293
314
  return self
294
315
 
295
- def __exit__(self, exc_type, exc_value, traceback):
316
+ def stop(self):
296
317
  # cancel all running tests and wait for them to clean up (up to 0.1sec)
297
318
  for rinfo in self.running_tests.values():
298
319
  rinfo.executor.cancel()
299
320
  self.test_queue.join() # also ignore any exceptions raised
300
321
 
301
322
  # stop all provisioners, also releasing all remotes
302
- with concurrent.futures.ThreadPoolExecutor(max_workers=20) as ex:
303
- for provisioner in self.provisioners:
304
- for func in provisioner.stop_defer():
305
- ex.submit(func)
323
+ if self.provisioners:
324
+ workers = min(len(self.provisioners), 20)
325
+ with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
326
+ for provisioner in self.provisioners:
327
+ for func in provisioner.stop_defer():
328
+ ex.submit(func)
306
329
 
307
- def next_test(self, tests, fmf_tests): # noqa: ARG002, PLR6301
330
+ def __enter__(self):
331
+ try:
332
+ self.start()
333
+ return self
334
+ except Exception:
335
+ self.stop()
336
+ raise
337
+
338
+ def __exit__(self, exc_type, exc_value, traceback):
339
+ self.stop()
340
+
341
+ @staticmethod
342
+ def next_test(to_run, all_tests, previous): # noqa: ARG004
308
343
  """
309
- Return a test name (string) from a set of 'tests' (set of test name
310
- strings) to be run next.
344
+ Return a test name (string) to be executed next.
345
+
346
+ 'to_run' is a set of test names to pick from. The returned test name
347
+ must be chosen from this set.
311
348
 
312
- 'fmf_tests' is a class FMFTests instance with additional test metadata.
349
+ 'tests' is a dict indexed by test name (string), with values being
350
+ fully resolved fmf test metadata (dicts) of all possible tests.
351
+
352
+ 'previous' can be either
353
+ - Orchestrator.SetupInfo instance (first test to be run)
354
+ - Orchestrator.FinishedInfo instance (previous executed test)
355
+
356
+ This method must not modify any of its arguments, it must treat them
357
+ as read-only, eg. don't remove the returned test name from 'to_run'.
358
+ """
359
+ # default to simply picking any available test
360
+ return next(iter(to_run))
361
+
362
+ @staticmethod
363
+ def destructive(info, test_data): # noqa: ARG004
364
+ """
365
+ Return a boolean result whether a finished test was destructive
366
+ to a class Remote instance, indicating that the Remote instance
367
+ should not be used for further test execution.
313
368
 
314
- This method is user-overridable, ie. by subclassing Orchestrator:
369
+ 'info' is an Orchestrator.FinishedInfo instance of the test.
315
370
 
316
- class CustomOrchestrator(Orchestrator):
317
- @staticmethod
318
- def next_test(tests):
319
- ...
371
+ 'test_data' is a dict of fully resolved fmf test metadata of that test.
320
372
  """
321
- # TODO: more advanced algorithm
322
- #
323
- # simple:
324
- return next(iter(tests))
373
+ # if Executor ended with an exception (ie. duration exceeded),
374
+ # consider the test destructive
375
+ if info.exception:
376
+ return True
377
+ # if the test returned non-0 exit code, it could have thrown
378
+ # a python exception of its own, or (if bash) aborted abruptly
379
+ # due to 'set -e', don't trust the remote, consider it destroyed
380
+ if info.exit_code != 0:
381
+ return True
382
+ # otherwise we good
383
+ return False
384
+ # TODO: override with additional 'extra-contest: destructive: True' fmf metadata
385
+ # destructive = test_data.get("extra-contest", {}).get("destructive", False)
@@ -1,6 +1,5 @@
1
1
  import importlib as _importlib
2
2
  import pkgutil as _pkgutil
3
- import threading as _threading
4
3
 
5
4
  from .. import connection as _connection
6
5
 
@@ -26,17 +25,14 @@ class Provisioner:
26
25
  ...
27
26
  remote.release()
28
27
 
28
+ TODO: mention how a Provisioner always needs to take care of releasing all Remotes
29
+ when .stop()ped or when context terminates; even the ones handed over to
30
+ the user
31
+
29
32
  Note that .stop() or .stop_defer() may be called from a different
30
33
  thread, asynchronously to any other functions.
31
34
  """
32
35
 
33
- def __init__(self):
34
- """
35
- Initialize the provisioner instance.
36
- If extending __init__, always call 'super().__init__()' at the top.
37
- """
38
- self.lock = _threading.RLock()
39
-
40
36
  def get_remote(self, block=True):
41
37
  """
42
38
  Get a connected class Remote instance.
@@ -70,11 +66,15 @@ class Provisioner:
70
66
  Ie. a list of 200 .release() functions, to be called in a thread pool
71
67
  by the user, speeding up cleanup.
72
68
  """
73
- return self.stop
69
+ return (self.stop,)
74
70
 
75
71
  def __enter__(self):
76
- self.start()
77
- return self
72
+ try:
73
+ self.start()
74
+ return self
75
+ except Exception:
76
+ self.stop()
77
+ raise
78
78
 
79
79
  def __exit__(self, exc_type, exc_value, traceback):
80
80
  self.stop()
@@ -1,24 +1,2 @@
1
- from .. import base
2
- from ... import util, ssh
3
-
4
-
5
- class LibvirtProvisioner(base.Provisioner):
6
- number = 123
7
-
8
- def reserve(self):
9
- util.debug(f"reserving {self.number}")
10
-
11
- # TODO: as simple attribute, to be guaranteed set when reserve() returns,
12
- # can be overriden by a getter function if you need to keep track
13
- # how many times it was accessed
14
- def connection(self):
15
- #return {"Hostname": "1.2.3.4", "User": "root", "IdentityFile": ...}
16
- util.debug(f"returning ssh for {self.number}")
17
- return ssh.SSHConn({"Hostname": "1.2.3.4", "User": "root"})
18
-
19
- def release(self):
20
- util.debug(f"releasing {self.number}")
21
-
22
- def alive(self):
23
- util.debug(f"always alive: {self.number}")
24
- return True
1
+ from . import locking # noqa: F401
2
+ from .libvirt import LibvirtCloningProvisioner, LibvirtCloningRemote # noqa: F401