atex 0.9-py3-none-any.whl → 0.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atex/aggregator/__init__.py +60 -0
- atex/{orchestrator/aggregator.py → aggregator/json.py} +6 -21
- atex/cli/__init__.py +11 -1
- atex/cli/libvirt.py +3 -2
- atex/cli/testingfarm.py +48 -3
- atex/connection/podman.py +2 -4
- atex/connection/ssh.py +7 -14
- atex/executor/executor.py +18 -17
- atex/executor/scripts.py +5 -3
- atex/executor/testcontrol.py +1 -1
- atex/orchestrator/__init__.py +76 -3
- atex/orchestrator/{orchestrator.py → adhoc.py} +183 -103
- atex/{provision → provisioner}/__init__.py +49 -37
- atex/{provision → provisioner}/libvirt/libvirt.py +21 -14
- atex/{provision → provisioner}/libvirt/locking.py +3 -1
- atex/provisioner/podman/__init__.py +2 -0
- atex/provisioner/podman/podman.py +169 -0
- atex/{provision → provisioner}/testingfarm/api.py +53 -44
- atex/{provision → provisioner}/testingfarm/testingfarm.py +17 -23
- atex/util/log.py +62 -67
- atex/util/subprocess.py +46 -12
- atex/util/threads.py +7 -0
- atex-0.10.dist-info/METADATA +86 -0
- atex-0.10.dist-info/RECORD +44 -0
- atex/provision/podman/__init__.py +0 -1
- atex/provision/podman/podman.py +0 -274
- atex-0.9.dist-info/METADATA +0 -178
- atex-0.9.dist-info/RECORD +0 -43
- /atex/{provision → provisioner}/libvirt/VM_PROVISION +0 -0
- /atex/{provision → provisioner}/libvirt/__init__.py +0 -0
- /atex/{provision → provisioner}/libvirt/setup-libvirt.sh +0 -0
- /atex/{provision → provisioner}/testingfarm/__init__.py +0 -0
- {atex-0.9.dist-info → atex-0.10.dist-info}/WHEEL +0 -0
- {atex-0.9.dist-info → atex-0.10.dist-info}/entry_points.txt +0 -0
- {atex-0.9.dist-info → atex-0.10.dist-info}/licenses/COPYING.txt +0 -0
@@ -1,24 +1,19 @@
-import time
 import tempfile
-import traceback
 import concurrent
 import collections
 from pathlib import Path
 
 from .. import util, executor
-
-
-class OrchestratorError(Exception):
-    pass
+from . import Orchestrator, OrchestratorError
 
 
 class FailedSetupError(OrchestratorError):
     pass
 
 
-class Orchestrator:
+class AdHocOrchestrator(Orchestrator):
     """
-
+    TODO: document function specific to this reference, ie. run_setup(), etc.
     """
 
     class SetupInfo(
@@ -55,13 +50,17 @@ class Orchestrator:
             # exception class instance if running the test failed
             # (None if no exception happened (exit_code is defined))
             "exception",
+            # Path of a 'results' JSON file with test-reported results
+            "results",
+            # Path of a 'files' directory with test-uploaded files
+            "files",
         ),
     ):
         pass
 
     def __init__(
         self, platform, fmf_tests, provisioners, aggregator, tmp_dir, *,
-        max_reruns=2, max_failed_setups=10, env=None,
+        max_remotes=1, max_spares=0, max_reruns=2, max_failed_setups=10, env=None,
     ):
         """
         'platform' is a string with platform name.
@@ -76,6 +75,15 @@ class Orchestrator:
         storing per-test results and uploaded files before being ingested
         by the aggregator. Can be safely shared by Orchestrator instances.
 
+        'max_remotes' is how many Remotes to hold reserved at any given time,
+        eg. how many tests to run in parallel. Clamped to the number of
+        to-be-run tests given as 'fmf_tests'.
+
+        'max_spares' is how many set-up Remotes to hold reserved and unused,
+        ready to replace a Remote destroyed by test. Values above 0 greatly
+        speed up test reruns as Remote reservation happens asynchronously
+        to test execution. Spares are reserved on top of 'max_remotes'.
+
         'max_reruns' is an integer of how many times to re-try running a failed
         test (which exited with non-0 or caused an Executor exception).
 
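
For orientation, a minimal driver sketch showing where the new 'max_remotes' and 'max_spares' knobs fit; the platform, fmf_tests, provisioners, aggregator and tmp_dir objects are placeholders assumed to come from the surrounding atex APIs, and only the keyword arguments and the start()/serve_once()/stop() calls are taken from this diff:

    import time

    orch = AdHocOrchestrator(
        platform, fmf_tests, provisioners, aggregator, tmp_dir,
        max_remotes=4,   # run up to 4 tests in parallel
        max_spares=1,    # keep one extra set-up Remote ready to replace a destroyed one
    )
    orch.start()
    try:
        # serve_once() returns False once all testing is concluded
        while orch.serve_once():
            time.sleep(1)
    finally:
        orch.stop()
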
@@ -91,36 +99,21 @@ class Orchestrator:
         self.aggregator = aggregator
         self.tmp_dir = tmp_dir
         self.failed_setups_left = max_failed_setups
+        self.max_remotes = max_remotes
+        self.max_spares = max_spares
         # indexed by test name, value being integer of how many times
         self.reruns = collections.defaultdict(lambda: max_reruns)
         self.env = env
         # tests still waiting to be run
         self.to_run = set(fmf_tests.tests)
-        # running setup functions, as a list of SetupInfo items
-        self.running_setups = []
         # running tests as a dict, indexed by test name, with RunningInfo values
         self.running_tests = {}
         # thread queue for actively running tests
         self.test_queue = util.ThreadQueue(daemon=False)
         # thread queue for remotes being set up (uploading tests, etc.)
         self.setup_queue = util.ThreadQueue(daemon=True)
-        #
-
-
-    @staticmethod
-    def run_setup(sinfo):
-        """
-        Set up a newly acquired class Remote instance for test execution.
-
-        'sinfo' is a SetupInfo instance with the (fully connected) remote.
-        """
-        sinfo.executor.setup()
-        sinfo.executor.upload_tests()
-        sinfo.executor.plan_prepare()
-        # NOTE: we never run executor.plan_finish() or even executor.cleanup()
-        # anywhere - instead, we assume the remote (and its connection)
-        # was invalidated by the test, so we just rely on remote.release()
-        # destroying the system
+        # thread queue for remotes being released
+        self.release_queue = util.ThreadQueue(daemon=True)
 
     def _run_new_test(self, info):
         """
@@ -162,57 +155,59 @@ class Orchestrator:
         """
         'finfo' is a FinishedInfo instance.
         """
-
+        test_data = self.fmf_tests.tests[finfo.test_name]
 
-
-
-
-
-
-
-
-
-
+        # TODO: somehow move logging from was_successful and should_be_rerun here,
+        # probably print just some generic info from those functions that doesn't
+        # imply any outcome, ie.
+        # {remote_with_test} threw {exception}
+        # {remote_with_test} exited with {code}
+        # {remote_with_test} has {N} reruns left
+        # {remote_with_test} has 0 reruns left
+        # and then log the decision separately, here below, such as
+        # {remote_with_test} failed, re-running
+        # {remote_with_test} completed, ingesting result
+        # {remote_with_test} was destructive, releasing remote
+        # {remote_with_test} ...., running next test
+        # That allows the user to override the functions, while keeping critical
+        # flow reliably logged here.
 
-
-        if finfo.exception:
-            exc_name = type(finfo.exception).__name__
-            exc_tb = "".join(traceback.format_exception(finfo.exception)).rstrip("\n")
-            msg = f"{remote_with_test} threw {exc_name} during test runtime"
-            #finfo.remote.release()
-            if (reruns_left := self.reruns[finfo.test_name]) > 0:
-                util.info(f"{msg}, re-running ({reruns_left} reruns left):\n{exc_tb}")
-                self.reruns[finfo.test_name] -= 1
-                self.to_run.add(finfo.test_name)
-            else:
-                util.info(f"{msg}, reruns exceeded, giving up:\n{exc_tb}")
-                # record the final result anyway
-                ingest_result()
-
-        # if the test exited as non-0, try a re-run
-        elif finfo.exit_code != 0:
-            msg = f"{remote_with_test} exited with non-zero: {finfo.exit_code}"
-            #finfo.remote.release()
-            if (reruns_left := self.reruns[finfo.test_name]) > 0:
-                util.info(f"{msg}, re-running ({reruns_left} reruns left)")
-                self.reruns[finfo.test_name] -= 1
-                self.to_run.add(finfo.test_name)
-            else:
-                util.info(f"{msg}, reruns exceeded, giving up")
-                # record the final result anyway
-                ingest_result()
+        remote_with_test = f"{finfo.remote}: '{finfo.test_name}'"
 
-
+        if not self.was_successful(finfo, test_data) and self.should_be_rerun(finfo, test_data):
+            # re-run the test
+            self.to_run.add(finfo.test_name)
         else:
-
-
+            # ingest the result
+            #
+            # a condition just in case Executor code itself threw an exception
+            # and didn't even report the fallback 'infra' result
+            if finfo.results is not None and finfo.files is not None:
+                self.aggregator.ingest(
+                    self.platform,
+                    finfo.test_name,
+                    finfo.results,
+                    finfo.files,
+                )
+                # also delete the tmpdir housing these
+                finfo.tmp_dir.cleanup()
+            # ingesting destroyed these
+            finfo = self.FinishedInfo._from(
+                finfo,
+                results=None,
+                files=None,
+                tmp_dir=None,
+            )
 
-        # if destroyed, release the remote
+        # if destroyed, release the remote and request a replacement
         # (Executor exception is always considered destructive)
-        test_data = self.fmf_tests.tests[finfo.test_name]
         if finfo.exception or self.destructive(finfo, test_data):
             util.debug(f"{remote_with_test} was destructive, releasing remote")
-
+            self.release_queue.start_thread(
+                finfo.remote.release,
+                remote=finfo.remote,
+            )
+            finfo.provisioner.provision(1)
 
         # if still not destroyed, run another test on it
         # (without running plan setup, re-using already set up remote)
@@ -229,12 +224,8 @@ class Orchestrator:
         Returns True to indicate that it should be called again by the user
         (more work to be done), False once all testing is concluded.
         """
-        util.debug(
-            f"to_run: {len(self.to_run)} tests / "
-            f"running: {len(self.running_tests)} tests, {len(self.running_setups)} setups",
-        )
         # all done
-        if not self.to_run and not self.running_tests:
+        if not self.to_run and not self.running_tests and self.release_queue.qsize() == 0:
             return False
 
         # process all finished tests, potentially reusing remotes for executing
@@ -248,10 +239,16 @@ class Orchestrator:
             rinfo = treturn.rinfo
             del self.running_tests[rinfo.test_name]
 
+            tmp_dir_path = Path(rinfo.tmp_dir.name)
+            results_path = tmp_dir_path / "results"
+            files_path = tmp_dir_path / "files"
+
             finfo = self.FinishedInfo(
                 **rinfo,
                 exit_code=treturn.returned,
                 exception=treturn.exception,
+                results=results_path if results_path.exists() else None,
+                files=files_path if files_path.exists() else None,
             )
             self._process_finished_test(finfo)
 
@@ -264,22 +261,36 @@ class Orchestrator:
                     break
 
                 sinfo = treturn.sinfo
-                self.running_setups.remove(sinfo)
 
                 if treturn.exception:
-
-
-
-
+                    msg = f"{sinfo.remote}: setup failed with {repr(treturn.exception)}"
+                    self.release_queue.start_thread(
+                        sinfo.remote.release,
+                        remote=sinfo.remote,
+                    )
                     if (reruns_left := self.failed_setups_left) > 0:
-                        util.warning(f"{msg}, re-trying ({reruns_left} setup retries left)
+                        util.warning(f"{msg}, re-trying ({reruns_left} setup retries left)")
                         self.failed_setups_left -= 1
+                        sinfo.provisioner.provision(1)
                     else:
-                        util.warning(f"{msg}, setup retries exceeded, giving up
+                        util.warning(f"{msg}, setup retries exceeded, giving up")
                         raise FailedSetupError("setup retries limit exceeded, broken infra?")
                 else:
                     self._run_new_test(sinfo)
 
+        # release any extra Remotes being held as set-up when we know we won't
+        # use them for any tests (because to_run is empty)
+        else:
+            while self.setup_queue.qsize() > self.max_spares:
+                try:
+                    treturn = self.setup_queue.get_raw(block=False)
+                except util.ThreadQueue.Empty:
+                    break
+                self.release_queue.start_thread(
+                    treturn.sinfo.remote.release,
+                    remote=treturn.sinfo.remote,
+                )
+
         # try to get new remotes from Provisioners - if we get some, start
         # running setup on them
         for provisioner in self.provisioners:
@@ -295,23 +306,37 @@ class Orchestrator:
                 target_args=(sinfo,),
                 sinfo=sinfo,
             )
-            self.running_setups.append(sinfo)
             util.info(f"{provisioner}: running setup on new {remote}")
 
-
+        # gather returns from Remote.release() functions - check for exceptions
+        # thrown, re-report them as warnings as they are not typically critical
+        # for operation
+        try:
+            treturn = self.release_queue.get_raw(block=False)
+        except util.ThreadQueue.Empty:
+            pass
+        else:
+            if treturn.exception:
+                util.warning(f"{treturn.remote} release failed: {repr(treturn.exception)}")
+            else:
+                util.debug(f"{treturn.remote}: completed .release()")
 
-
-        """
-        Run the orchestration logic, blocking until all testing is concluded.
-        """
-        while self.serve_once():
-            time.sleep(1)
+        return True
 
     def start(self):
         # start all provisioners
         for prov in self.provisioners:
             prov.start()
-
+
+        # start up initial reservations, balanced evenly across all available
+        # provisioner instances
+        count = min(self.max_remotes, len(self.fmf_tests.tests)) + self.max_spares
+        provisioners = self.provisioners[:count]
+        for idx, prov in enumerate(provisioners):
+            if count % len(provisioners) > idx:
+                prov.provision((count // len(provisioners)) + 1)
+            else:
+                prov.provision(count // len(provisioners))
 
     def stop(self):
         # cancel all running tests and wait for them to clean up (up to 0.1sec)
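
As a worked example of the balancing above: with max_remotes=4, max_spares=1 and at least four tests, count is 5; across three provisioners the first two each receive 2 initial reservations ((5 // 3) + 1, because 5 % 3 is greater than their index) and the third receives 1 (5 // 3), for 5 reservations in total.
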
@@ -320,6 +345,7 @@ class Orchestrator:
         self.test_queue.join()  # also ignore any exceptions raised
 
         # stop all provisioners, also releasing all remotes
+        # TODO: don't parallelize here, remove .stop_defer() and parallelize in provisioners
         if self.provisioners:
             workers = min(len(self.provisioners), 20)
             with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
@@ -327,16 +353,20 @@ class Orchestrator:
                     for func in provisioner.stop_defer():
                         ex.submit(func)
 
-
-
-
-
-        except Exception:
-            self.stop()
-            raise
+    @staticmethod
+    def run_setup(sinfo):
+        """
+        Set up a newly acquired class Remote instance for test execution.
 
-
-
+        'sinfo' is a SetupInfo instance with the (fully connected) remote.
+        """
+        sinfo.executor.start()
+        sinfo.executor.upload_tests()
+        sinfo.executor.plan_prepare()
+        # NOTE: we never run executor.plan_finish() or even executor.stop()
+        # anywhere - instead, we assume the remote (and its connection)
+        # was invalidated by the test, so we just rely on remote.release()
+        # destroying the system
 
     @staticmethod
     def next_test(to_run, all_tests, previous):  # noqa: ARG004
@@ -381,5 +411,55 @@ class Orchestrator:
             return True
         # otherwise we good
         return False
-
-
+
+    @staticmethod
+    def was_successful(info, test_data):  # noqa: ARG004
+        """
+        Return a boolean result whether a finished test was successful.
+        Returning False might cause it to be re-run (per should_be_rerun()).
+
+        'info' is Orchestrator.FinishedInfo namedtuple of the test.
+
+        'test_data' is a dict of fully resolved fmf test metadata of that test.
+        """
+        remote_with_test = f"{info.remote}: '{info.test_name}'"
+
+        # executor (or test) threw exception
+        if info.exception:
+            util.info(f"{remote_with_test} threw {repr(info.exception)} during test runtime")
+            return False
+
+        # the test exited as non-0
+        if info.exit_code != 0:
+            util.info(f"{remote_with_test} exited with non-zero: {info.exit_code}")
+            return False
+
+        # otherwise we good
+        return True
+
+    # TODO: @staticmethod and remove ARG002
+    #@staticmethod
+    def should_be_rerun(self, info, test_data):  # noqa: ARG004, ARG002
+        """
+        Return a boolean result whether a finished test failed in a way
+        that another execution attempt might succeed, due to race conditions
+        in the test or other non-deterministic factors.
+
+        'info' is Orchestrator.FinishedInfo namedtuple of the test.
+
+        'test_data' is a dict of fully resolved fmf test metadata of that test.
+        """
+        remote_with_test = f"{info.remote}: '{info.test_name}'"
+
+        # TODO: remove self.reruns and the whole X-reruns logic from AdHocOrchestrator,
+        # leave it up to the user to wrap should_be_rerun() with an external dict
+        # of tests, counting reruns for each
+        # - allows the user to adjust counts per-test (ie. test_data metadata)
+        # - allows this template to be @staticmethod
+        if (reruns_left := self.reruns[info.test_name]) > 0:
+            util.info(f"{remote_with_test}: re-running ({reruns_left} reruns left)")
+            self.reruns[info.test_name] -= 1
+            return True
+        else:
+            util.info(f"{remote_with_test}: reruns exceeded, giving up")
+            return False
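
One way these hooks could be used (an illustrative sketch, not part of the package): subclass AdHocOrchestrator and override should_be_rerun() with an external rerun budget, roughly as the TODO comment above suggests. The 'no-rerun' metadata key is a hypothetical example, not a field defined by atex:

    import collections

    class MyOrchestrator(AdHocOrchestrator):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # external per-test rerun counter, allowing one rerun per test
            self.rerun_budget = collections.defaultdict(lambda: 1)

        def should_be_rerun(self, info, test_data):
            if test_data.get("no-rerun"):   # hypothetical custom fmf key
                return False
            if self.rerun_budget[info.test_name] > 0:
                self.rerun_budget[info.test_name] -= 1
                return True
            return False
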
@@ -4,41 +4,78 @@ import pkgutil as _pkgutil
 from .. import connection as _connection
 
 
+class Remote(_connection.Connection):
+    """
+    Representation of a provisioned (reserved) remote system, providing
+    a Connection-like API in addition to system management helpers.
+
+    An instance of Remote is typically prepared by a Provisioner and returned
+    to the caller for use and an eventual .release().
+
+    Also note that Remote can be used via Context Manager, but does not
+    do automatic .release(), the manager only handles the built-in Connection.
+    The intention is for a Provisioner to run via its own Contest Manager and
+    release all Remotes upon exit.
+    If you need automatic release of one Remote, use a try/finally block.
+    """
+
+    def release(self):
+        """
+        Release (de-provision) the remote resource.
+        """
+        raise NotImplementedError(f"'release' not implemented for {self.__class__.__name__}")
+
+
 class Provisioner:
     """
     A remote resource (machine/system) provider.
 
-    The
-
-
-
+    The idea is to request machines (a.k.a. Remotes, or class Remote instances)
+    to be reserved via a non-blocking .provision() and for them to be retrieved
+    through blocking / non-blocking .get_remote() when they become available.
+
+    Each Remote has its own .release() for freeing (de-provisioning) it once
+    the user doesn't need it anymore. The Provisioner does this automatically
+    to all Remotes during .stop() or context manager exit.
 
         p = Provisioner()
         p.start()
+        p.provision(count=1)
         remote = p.get_remote()
         remote.cmd(["ls", "/"])
         remote.release()
         p.stop()
 
         with Provisioner() as p:
-
+            p.provision(count=2)
+            remote1 = p.get_remote()
+            remote2 = p.get_remote()
             ...
-            remote.release()
 
-
-
-
+    Note that .provision() is a hint expressed by the caller, not a guarantee
+    that .get_remote() will ever return a Remote. Ie. the caller can call
+    .provision(count=math.inf) to receive as many remotes as the Provisioner
+    can possibly supply.
 
+    TODO: remove .defer_stop() (or stop_defer) and mention this below:
     Note that .stop() or .defer_stop() may be called from a different
     thread, asynchronously to any other functions.
     """
 
+    def provision(self, count=1):
+        """
+        Request that 'count' machines be provisioned (reserved) for use,
+        to be returned at a later point by .get_remote().
+        """
+        raise NotImplementedError(f"'provision' not implemented for {self.__class__.__name__}")
+
     def get_remote(self, block=True):
         """
-
+        Return a connected class Remote instance of a previously .provision()ed
+        remote system.
 
-        If 'block' is True, wait for the
-        otherwise return None if there is
+        If 'block' is True, wait for the Remote to be available and connected,
+        otherwise return None if there is none available yet.
         """
         raise NotImplementedError(f"'get_remote' not implemented for {self.__class__.__name__}")
 
@@ -80,31 +117,6 @@ class Provisioner:
         self.stop()
 
 
-class Remote(_connection.Connection):
-    """
-    Representation of a provisioned (reserved) remote system, providing
-    a Connection-like API in addition to system management helpers.
-
-    An instance of Remote is typically prepared by a Provisioner and lent out
-    for further use, to be .release()d by the user (if destroyed).
-    It is not meant for repeated reserve/release cycles, hence the lack
-    of .reserve().
-
-    Also note that Remote can be used via Context Manager, but does not
-    do automatic .release(), the manager only handles the built-in Connection.
-    The intention is for a Provisioner to run via its own Contest Manager and
-    release all Remotes upon exit.
-    If you need automatic release of one Remote, use a contextlib.ExitStack
-    with a callback, or a try/finally block.
-    """
-
-    def release(self):
-        """
-        Release (de-provision) the remote resource.
-        """
-        raise NotImplementedError(f"'release' not implemented for {self.__class__.__name__}")
-
-
 _submodules = [
     info.name for info in _pkgutil.iter_modules(__spec__.submodule_search_locations)
 ]