PyPI - atex - Versions diffs - 0.8__py3-none-any.whl → 0.9__py3-none-any.whl - Mend

atex 0.8__py3-none-any.whl → 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

atex/cli/fmf.py +73 -23
atex/cli/libvirt.py +127 -0
atex/cli/testingfarm.py +12 -0
atex/connection/__init__.py +13 -11
atex/connection/podman.py +63 -0
atex/connection/ssh.py +31 -33
atex/executor/executor.py +131 -107
atex/executor/reporter.py +66 -71
atex/executor/scripts.py +9 -3
atex/executor/testcontrol.py +43 -30
atex/fmf.py +94 -74
atex/orchestrator/__init__.py +3 -2
atex/orchestrator/aggregator.py +63 -58
atex/orchestrator/orchestrator.py +194 -133
atex/provision/__init__.py +11 -11
atex/provision/libvirt/__init__.py +2 -24
atex/provision/libvirt/libvirt.py +465 -0
atex/provision/libvirt/locking.py +168 -0
atex/provision/libvirt/setup-libvirt.sh +21 -1
atex/provision/podman/__init__.py +1 -0
atex/provision/podman/podman.py +274 -0
atex/provision/testingfarm/api.py +69 -26
atex/provision/testingfarm/testingfarm.py +29 -31
atex/util/libvirt.py +18 -0
atex/util/log.py +23 -8
atex/util/named_mapping.py +158 -0
atex/util/threads.py +64 -20
{atex-0.8.dist-info → atex-0.9.dist-info}/METADATA +27 -46
atex-0.9.dist-info/RECORD +43 -0
atex/provision/podman/README +0 -59
atex/provision/podman/host_container.sh +0 -74
atex-0.8.dist-info/RECORD +0 -37
{atex-0.8.dist-info → atex-0.9.dist-info}/WHEEL +0 -0
{atex-0.8.dist-info → atex-0.9.dist-info}/entry_points.txt +0 -0
{atex-0.8.dist-info → atex-0.9.dist-info}/licenses/COPYING.txt +0 -0

atex/orchestrator/orchestrator.py CHANGED Viewed

@@ -8,14 +8,22 @@ from pathlib import Path
 from .. import util, executor
+class OrchestratorError(Exception):
+    pass
+class FailedSetupError(OrchestratorError):
+    pass
 class Orchestrator:
     """
     A scheduler for parallel execution on multiple resources (machines/systems).
     """
-    SetupInfo = collections.namedtuple(
-        "SetupInfo",
-        (
+    class SetupInfo(
+        util.NamedMapping,
+        required=(
             # class Provisioner instance this machine is provided by
             # (for logging purposes)
             "provisioner",
@@ -24,23 +32,23 @@ class Orchestrator:
             # class Executor instance uploading tests / running setup or tests
             "executor",
         ),
-    )
-    RunningInfo = collections.namedtuple(
-        "RunningInfo",
-        (
-            # "inherit" from SetupInfo
-            *SetupInfo._fields,
+    ):
+        pass
+    class RunningInfo(
+        SetupInfo,
+        required=(
             # string with /test/name
             "test_name",
-            # class tempfile.TemporaryDirectory instance with 'json_file' and 'files_dir'
+            # class tempfile.TemporaryDirectory instance passed to Executor
             "tmp_dir",
         ),
-    )
-    FinishedInfo = collections.namedtuple(
-        "FinishedInfo",
-        (
-            # "inherit" from RunningInfo
-            *RunningInfo._fields,
+    ):
+        pass
+    class FinishedInfo(
+        RunningInfo,
+        required=(
             # integer with exit code of the test
             # (None if exception happened)
             "exit_code",
@@ -48,9 +56,13 @@ class Orchestrator:
             # (None if no exception happened (exit_code is defined))
             "exception",
         ),
-    )
+    ):
+        pass
-    def __init__(self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *, max_reruns=2):
+    def __init__(
+        self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *,
+        max_reruns=2, max_failed_setups=10, env=None,
+    ):
         """
         'platform' is a string with platform name.
@@ -63,20 +75,31 @@ class Orchestrator:
         'tmp_dir' is a string/Path to a temporary directory, to be used for
         storing per-test results and uploaded files before being ingested
         by the aggregator. Can be safely shared by Orchestrator instances.
+        'max_reruns' is an integer of how many times to re-try running a failed
+        test (which exited with non-0 or caused an Executor exception).
+        'max_failed_setups' is an integer of how many times an Executor's
+        plan setup (uploading tests, running prepare scripts, etc.) may fail
+        before FailedSetupError is raised.
+        'env' is a dict of extra environment variables to pass to Executor.
         """
         self.platform = platform
         self.fmf_tests = fmf_tests
         self.provisioners = tuple(provisioners)
         self.aggregator = aggregator
         self.tmp_dir = tmp_dir
+        self.failed_setups_left = max_failed_setups
+        # indexed by test name, value being integer of how many times
+        self.reruns = collections.defaultdict(lambda: max_reruns)
+        self.env = env
         # tests still waiting to be run
         self.to_run = set(fmf_tests.tests)
         # running setup functions, as a list of SetupInfo items
         self.running_setups = []
         # running tests as a dict, indexed by test name, with RunningInfo values
         self.running_tests = {}
-        # indexed by test name, value being integer of how many times
-        self.reruns = collections.defaultdict(lambda: max_reruns)
         # thread queue for actively running tests
         self.test_queue = util.ThreadQueue(daemon=False)
         # thread queue for remotes being set up (uploading tests, etc.)
@@ -85,37 +108,36 @@ class Orchestrator:
         #       cancellation, the execution flow itself uses ThreadQueues
     @staticmethod
-    def _run_setup(sinfo):
-        sinfo.executor.setup()
-        sinfo.executor.upload_tests()
-        sinfo.executor.setup_plan()
-        # NOTE: we never run executor.cleanup() anywhere - instead, we assume
-        #       the remote (and its connection) was invalidated by the test,
-        #       so we just rely on remote.release() destroying the system
-        return sinfo
-    @classmethod
-    def _wrap_test(cls, rinfo, func, *args, **kwargs):
+    def run_setup(sinfo):
         """
-        Wrap 'func' (test execution function) to preserve extra metadata
-        ('rinfo') and return it with the function return value.
+        Set up a newly acquired class Remote instance for test execution.
+        'sinfo' is a SetupInfo instance with the (fully connected) remote.
         """
-        try:
-            return cls.FinishedInfo(*rinfo, func(*args, **kwargs), None)
-        except Exception as e:
-            return cls.FinishedInfo(*rinfo, None, e)
+        sinfo.executor.setup()
+        sinfo.executor.upload_tests()
+        sinfo.executor.plan_prepare()
+        # NOTE: we never run executor.plan_finish() or even executor.cleanup()
+        #       anywhere - instead, we assume the remote (and its connection)
+        #       was invalidated by the test, so we just rely on remote.release()
+        #       destroying the system
-    def _run_new_test(self, sinfo):
+    def _run_new_test(self, info):
         """
-        'sinfo' is a SetupInfo instance.
+        'info' can be either
+          - SetupInfo instance with Remote/Executor to run the new test.
+          - FinishedInfo instance of a previously executed test
+            (reusing Remote/Executor for a new test).
         """
-        next_test_name = self.next_test(self.to_run, self.fmf_tests)
+        next_test_name = self.next_test(self.to_run, self.fmf_tests.tests, info)
         assert next_test_name in self.to_run, "next_test() returned valid test name"
+        util.info(f"starting '{next_test_name}' on {info.remote}")
         self.to_run.remove(next_test_name)
-        rinfo = self.RunningInfo(
-            *sinfo,
+        rinfo = self.RunningInfo._from(
+            info,
             test_name=next_test_name,
             tmp_dir=tempfile.TemporaryDirectory(
                 prefix=next_test_name.strip("/").replace("/","-") + "-",
@@ -126,14 +148,12 @@ class Orchestrator:
         tmp_dir_path = Path(rinfo.tmp_dir.name)
         self.test_queue.start_thread(
-            target=self._wrap_test,
-            args=(
-                rinfo,
-                sinfo.executor.run_test,
+            target=info.executor.run_test,
+            target_args=(
                 next_test_name,
-                tmp_dir_path / "json_file",
-                tmp_dir_path / "files_dir",
+                tmp_dir_path,
             ),
+            rinfo=rinfo,
         )
         self.running_tests[next_test_name] = rinfo
@@ -142,85 +162,63 @@ class Orchestrator:
         """
         'finfo' is a FinishedInfo instance.
         """
-        test_id = f"'{finfo.test_name}' on '{finfo.remote}'"
-        tmp_dir_path = Path(finfo.tmp_dir.name)
-        # NOTE: document that we intentionally don't .cleanup() executioner below,
-        #       we rely on remote .release() destroying the OS, because we don't
-        #       want to risk .cleanup() blocking on dead ssh into the remote after
-        #       executing a destructive test
-        destructive = False
+        remote_with_test = f"{finfo.remote}: '{finfo.test_name}'"
+        def ingest_result():
+            tmp_dir_path = Path(finfo.tmp_dir.name)
+            results_file = tmp_dir_path / "results"
+            files_dir = tmp_dir_path / "files"
+            # in case Executor code itself threw an unrecoverable exception
+            # and didn't even report the fallback 'infra' result
+            if results_file.exists() and files_dir.exists():
+                self.aggregator.ingest(self.platform, finfo.test_name, results_file, files_dir)
+                finfo.tmp_dir.cleanup()
         # if executor (or test) threw exception, schedule a re-run
         if finfo.exception:
-            destructive = True
-            exc_str = "".join(traceback.format_exception(finfo.exception)).rstrip("\n")
-            util.info(f"unexpected exception happened while running {test_id}:\n{exc_str}")
-            finfo.remote.release()
-            if self.reruns[finfo.test_name] > 0:
+            exc_name = type(finfo.exception).__name__
+            exc_tb = "".join(traceback.format_exception(finfo.exception)).rstrip("\n")
+            msg = f"{remote_with_test} threw {exc_name} during test runtime"
+            #finfo.remote.release()
+            if (reruns_left := self.reruns[finfo.test_name]) > 0:
+                util.info(f"{msg}, re-running ({reruns_left} reruns left):\n{exc_tb}")
                 self.reruns[finfo.test_name] -= 1
                 self.to_run.add(finfo.test_name)
             else:
-                util.info(f"reruns for {test_id} exceeded, ignoring it")
+                util.info(f"{msg}, reruns exceeded, giving up:\n{exc_tb}")
+                # record the final result anyway
+                ingest_result()
         # if the test exited as non-0, try a re-run
         elif finfo.exit_code != 0:
-            destructive = True
-            finfo.remote.release()
-            if self.reruns[finfo.test_name] > 0:
-                util.info(
-                    f"{test_id} exited with non-zero: {finfo.exit_code}, re-running "
-                    f"({self.reruns[finfo.test_name]} reruns left)",
-                )
+            msg = f"{remote_with_test} exited with non-zero: {finfo.exit_code}"
+            #finfo.remote.release()
+            if (reruns_left := self.reruns[finfo.test_name]) > 0:
+                util.info(f"{msg}, re-running ({reruns_left} reruns left)")
                 self.reruns[finfo.test_name] -= 1
                 self.to_run.add(finfo.test_name)
             else:
-                util.info(
-                    f"{test_id} exited with non-zero: {finfo.exit_code}, "
-                    "all reruns exceeded, giving up",
-                )
+                util.info(f"{msg}, reruns exceeded, giving up")
                 # record the final result anyway
-                self.aggregator.ingest(
-                    self.platform,
-                    finfo.test_name,
-                    tmp_dir_path / "json_file",
-                    tmp_dir_path / "files_dir",
-                )
-                finfo.tmp_dir.cleanup()
+                ingest_result()
         # test finished successfully - ingest its results
         else:
-            util.info(f"{test_id} finished successfully")
-            self.aggregator.ingest(
-                self.platform,
-                finfo.test_name,
-                tmp_dir_path / "json_file",
-                tmp_dir_path / "files_dir",
-            )
-            finfo.tmp_dir.cleanup()
-        # if the remote was not destroyed by traceback / failing test,
-        # check if the test always destroys it (even on success)
-        if not destructive:
-            test_data = self.fmf_tests.tests[finfo.test_name]
-            destructive = test_data.get("extra-atex", {}).get("destructive", False)
+            util.info(f"{remote_with_test} finished successfully")
+            ingest_result()
         # if destroyed, release the remote
-        if destructive:
-            util.debug(f"{test_id} was destructive, releasing remote")
+        # (Executor exception is always considered destructive)
+        test_data = self.fmf_tests.tests[finfo.test_name]
+        if finfo.exception or self.destructive(finfo, test_data):
+            util.debug(f"{remote_with_test} was destructive, releasing remote")
             finfo.remote.release()
         # if still not destroyed, run another test on it
         # (without running plan setup, re-using already set up remote)
         elif self.to_run:
-            sinfo = self.SetupInfo(
-                provisioner=finfo.provisioner,
-                remote=finfo.remote,
-                executor=finfo.executor,
-            )
-            util.debug(f"{test_id} was non-destructive, running next test")
-            self._run_new_test(sinfo)
+            util.debug(f"{remote_with_test} was non-destructive, running next test")
+            self._run_new_test(finfo)
     def serve_once(self):
         """
@@ -243,39 +241,62 @@ class Orchestrator:
         # further tests
         while True:
             try:
-                finfo = self.test_queue.get(block=False)
+                treturn = self.test_queue.get_raw(block=False)
             except util.ThreadQueue.Empty:
                 break
-            del self.running_tests[finfo.test_name]
+            rinfo = treturn.rinfo
+            del self.running_tests[rinfo.test_name]
+            finfo = self.FinishedInfo(
+                **rinfo,
+                exit_code=treturn.returned,
+                exception=treturn.exception,
+            )
             self._process_finished_test(finfo)
         # process any remotes with finished plan setup (uploaded tests,
         # plan-defined pkgs / prepare scripts), start executing tests on them
-        while True:
+        while self.to_run:
             try:
-                sinfo = self.setup_queue.get(block=False)
+                treturn = self.setup_queue.get_raw(block=False)
             except util.ThreadQueue.Empty:
                 break
-            util.debug(f"setup finished for '{sinfo.remote}', running first test")
+            sinfo = treturn.sinfo
             self.running_setups.remove(sinfo)
-            self._run_new_test(sinfo)
+            if treturn.exception:
+                exc_name = type(treturn.exception).__name__
+                exc_tb = "".join(traceback.format_exception(treturn.exception)).rstrip("\n")
+                msg = f"{sinfo.remote}: setup failed with {exc_name}"
+                sinfo.remote.release()
+                if (reruns_left := self.failed_setups_left) > 0:
+                    util.warning(f"{msg}, re-trying ({reruns_left} setup retries left):\n{exc_tb}")
+                    self.failed_setups_left -= 1
+                else:
+                    util.warning(f"{msg}, setup retries exceeded, giving up:\n{exc_tb}")
+                    raise FailedSetupError("setup retries limit exceeded, broken infra?")
+            else:
+                self._run_new_test(sinfo)
         # try to get new remotes from Provisioners - if we get some, start
         # running setup on them
         for provisioner in self.provisioners:
             while (remote := provisioner.get_remote(block=False)) is not None:
-                ex = executor.Executor(self.fmf_tests, remote)
+                ex = executor.Executor(self.fmf_tests, remote, env=self.env)
                 sinfo = self.SetupInfo(
                     provisioner=provisioner,
                     remote=remote,
                     executor=ex,
                 )
                 self.setup_queue.start_thread(
-                    target=self._run_setup,
-                    args=(sinfo,),
+                    target=self.run_setup,
+                    target_args=(sinfo,),
+                    sinfo=sinfo,
                 )
                 self.running_setups.append(sinfo)
-                util.debug(f"got remote '{remote}' from '{provisioner}', running setup")
+                util.info(f"{provisioner}: running setup on new {remote}")
         return True
@@ -286,39 +307,79 @@ class Orchestrator:
         while self.serve_once():
             time.sleep(1)
-    def __enter__(self):
+    def start(self):
         # start all provisioners
         for prov in self.provisioners:
             prov.start()
         return self
-    def __exit__(self, exc_type, exc_value, traceback):
+    def stop(self):
         # cancel all running tests and wait for them to clean up (up to 0.1sec)
         for rinfo in self.running_tests.values():
             rinfo.executor.cancel()
         self.test_queue.join()  # also ignore any exceptions raised
         # stop all provisioners, also releasing all remotes
-        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as ex:
-            for provisioner in self.provisioners:
-                for func in provisioner.stop_defer():
-                    ex.submit(func)
+        if self.provisioners:
+            workers = min(len(self.provisioners), 20)
+            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
+                for provisioner in self.provisioners:
+                    for func in provisioner.stop_defer():
+                        ex.submit(func)
-    def next_test(self, tests, fmf_tests):  # noqa: ARG002, PLR6301
+    def __enter__(self):
+        try:
+            self.start()
+            return self
+        except Exception:
+            self.stop()
+            raise
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.stop()
+    @staticmethod
+    def next_test(to_run, all_tests, previous):  # noqa: ARG004
         """
-        Return a test name (string) from a set of 'tests' (set of test name
-        strings) to be run next.
+        Return a test name (string) to be executed next.
+        'to_run' is a set of test names to pick from. The returned test name
+        must be chosen from this set.
-        'fmf_tests' is a class FMFTests instance with additional test metadata.
+        'tests' is a dict indexed by test name (string), with values being
+        fully resolved fmf test metadata (dicts) of all possible tests.
+        'previous' can be either
+          - Orchestrator.SetupInfo instance (first test to be run)
+          - Orchestrator.FinishedInfo instance (previous executed test)
+        This method must not modify any of its arguments, it must treat them
+        as read-only, eg. don't remove the returned test name from 'to_run'.
+        """
+        # default to simply picking any available test
+        return next(iter(to_run))
+    @staticmethod
+    def destructive(info, test_data):  # noqa: ARG004
+        """
+        Return a boolean result whether a finished test was destructive
+        to a class Remote instance, indicating that the Remote instance
+        should not be used for further test execution.
-        This method is user-overridable, ie. by subclassing Orchestrator:
+        'info' is Orchestrator.FinishedInfo namedtuple of the test.
-            class CustomOrchestrator(Orchestrator):
-                @staticmethod
-                def next_test(tests):
-                    ...
+        'test_data' is a dict of fully resolved fmf test metadata of that test.
         """
-        # TODO: more advanced algorithm
-        #
-        # simple:
-        return next(iter(tests))
+        # if Executor ended with an exception (ie. duration exceeded),
+        # consider the test destructive
+        if info.exception:
+            return True
+        # if the test returned non-0 exit code, it could have thrown
+        # a python exception of its own, or (if bash) aborted abruptly
+        # due to 'set -e', don't trust the remote, consider it destroyed
+        if info.exit_code != 0:
+            return True
+        # otherwise we good
+        return False
+        # TODO: override with additional 'extra-contest: destructive: True' fmf metadata
+        # destructive = test_data.get("extra-contest", {}).get("destructive", False)

atex/provision/__init__.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import importlib as _importlib
 import pkgutil as _pkgutil
-import threading as _threading
 from .. import connection as _connection
@@ -26,17 +25,14 @@ class Provisioner:
             ...
             remote.release()
+    TODO: mention how a Provisioner always needs to take care of release all Remotes
+          when .stop()ped or when context terminates; even the ones handed over to
+          the user
     Note that .stop() or .defer_stop() may be called from a different
     thread, asynchronously to any other functions.
     """
-    def __init__(self):
-        """
-        Initialize the provisioner instance.
-        If extending __init__, always call 'super().__init__()' at the top.
-        """
-        self.lock = _threading.RLock()
     def get_remote(self, block=True):
         """
         Get a connected class Remote instance.
@@ -70,11 +66,15 @@ class Provisioner:
         Ie. a list of 200 .release() functions, to be called in a thread pool
         by the user, speeding up cleanup.
         """
-        return self.stop
+        return (self.stop,)
     def __enter__(self):
-        self.start()
-        return self
+        try:
+            self.start()
+            return self
+        except Exception:
+            self.stop()
+            raise
     def __exit__(self, exc_type, exc_value, traceback):
         self.stop()

atex/provision/libvirt/__init__.py CHANGED Viewed

@@ -1,24 +1,2 @@
-from .. import base
-from ... import util, ssh
-class LibvirtProvisioner(base.Provisioner):
-    number = 123
-    def reserve(self):
-        util.debug(f"reserving {self.number}")
-    # TODO: as simple attribute, to be guaranteed set when reserve() returns,
-    #       can be overriden by a getter function if you need to keep track
-    #       how many times it was accessed
-    def connection(self):
-        #return {"Hostname": "1.2.3.4", "User": "root", "IdentityFile": ...}
-        util.debug(f"returning ssh for {self.number}")
-        return ssh.SSHConn({"Hostname": "1.2.3.4", "User": "root"})
-    def release(self):
-        util.debug(f"releasing {self.number}")
-    def alive(self):
-        util.debug(f"always alive: {self.number}")
-        return True
+from . import locking  # noqa: F401
+from .libvirt import LibvirtCloningProvisioner, LibvirtCloningRemote  # noqa: F401

atex 0.8__py3-none-any.whl → 0.9__py3-none-any.whl

atex 0.8__py3-none-any.whl → 0.9__py3-none-any.whl