PyPI - atex - Versions diffs - 0.7__py3-none-any.whl → 0.8__py3-none-any.whl - Mend

atex 0.7__py3-none-any.whl → 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

atex/cli/fmf.py +93 -0
atex/cli/testingfarm.py +23 -13
atex/connection/__init__.py +0 -8
atex/connection/ssh.py +3 -19
atex/executor/__init__.py +2 -0
atex/executor/duration.py +60 -0
atex/executor/executor.py +378 -0
atex/executor/reporter.py +106 -0
atex/{minitmt → executor}/scripts.py +30 -24
atex/{minitmt → executor}/testcontrol.py +16 -17
atex/{minitmt/fmf.py → fmf.py} +49 -34
atex/orchestrator/__init__.py +2 -59
atex/orchestrator/aggregator.py +66 -123
atex/orchestrator/orchestrator.py +324 -0
atex/provision/__init__.py +68 -99
atex/provision/testingfarm/__init__.py +2 -29
atex/provision/testingfarm/api.py +55 -40
atex/provision/testingfarm/testingfarm.py +236 -0
atex/util/__init__.py +1 -6
atex/util/log.py +8 -0
atex/util/path.py +16 -0
atex/util/ssh_keygen.py +14 -0
atex/util/threads.py +55 -0
{atex-0.7.dist-info → atex-0.8.dist-info}/METADATA +97 -2
atex-0.8.dist-info/RECORD +37 -0
atex/cli/minitmt.py +0 -175
atex/minitmt/__init__.py +0 -23
atex/minitmt/executor.py +0 -348
atex/provision/nspawn/README +0 -74
atex/provision/testingfarm/foo.py +0 -1
atex-0.7.dist-info/RECORD +0 -32
{atex-0.7.dist-info → atex-0.8.dist-info}/WHEEL +0 -0
{atex-0.7.dist-info → atex-0.8.dist-info}/entry_points.txt +0 -0
{atex-0.7.dist-info → atex-0.8.dist-info}/licenses/COPYING.txt +0 -0

atex/provision/__init__.py CHANGED Viewed

@@ -7,60 +7,27 @@ from .. import connection as _connection
 class Provisioner:
     """
-    A resource (machine/system) provider.
-    Any class derived from Provisioner serves as a mechanisms for requesting
-    a resource (machine/system), waiting for it to be reserved, providing ssh
-    details on how to connect to it, and releasing it when no longer useful.
-    The 4 main API points for this are reserve(), connection(), release() and
-    alive().
-    If necessary, these methods can share data via class instance attributes,
-    which are transparently guarded by a thread-aware mutex. For any complex
-    reads/writes, use 'self.lock' via a context manager.
-    Note that reserve() always runs in a separate thread (and thus may block),
-    and other functions (incl. release()) may be called at any time from
-    a different thread, even while reserve() is still running.
-    It is thus recommended for reserve() to store metadata in self.* as soon
-    as the metadata becomes available (some job ID, request UUID, Popen proc
-    object with PID, etc.) so that release() can free the resource at any time.
-    Once release()'d, the instance is never reused for reserve() again.
-    However connection(), release() and alive() may be called several times at
-    any time and need to handle it safely.
-    Ie. once released(), an instance must never return alive() == True.
-        # explicit method calls
-        res = Provisioner(...)
-        res.reserve()
-        conn = res.connection()
-        conn.connect()
-        conn.ssh('ls /')
-        conn.disconnect()
-        res.release()
-        # via a context manager
-        with Provisioner(...) as res:
-            with res.connection() as conn:
-                conn.ssh('ls /')
-    If a Provisioner class needs additional configuration, it should do so via
-    class (not instance) attributes, allowing it to be instantiated many times.
-        class ConfiguredProvisioner(Provisioner):
-            resource_hub = 'https://...'
-            login = 'joe'
-        # or dynamically
-        name = 'joe'
-        cls = type(
-            f'Provisioner_for_{name}',
-            (Provisioner,),
-            {'resource_hub': 'https://...', 'login': name},
-        )
-    These attributes can then be accessed from __init__ or any other function.
+    A remote resource (machine/system) provider.
+    The main interface is .get_remote() that returns a connected class Remote
+    instance for use by the user, to be .release()d when not needed anymore,
+    with Provisioner automatically getting a replacement for it, to be returned
+    via .get_remote() later.
+        p = Provisioner()
+        p.start()
+        remote = p.get_remote()
+        remote.cmd(["ls", "/"])
+        remote.release()
+        p.stop()
+        with Provisioner() as p:
+            remote = p.get_remote()
+            ...
+            remote.release()
+    Note that .stop() or .stop_defer() may be called from a different
+    thread, asynchronously to any other functions.
     """
     def __init__(self):
@@ -70,41 +37,58 @@ class Provisioner:
         """
         self.lock = _threading.RLock()
-#    def reserve(self):
-#        """
-#        Send a reservation request for a resource and wait for it to be
-#        reserved.
-#        """
-#        raise NotImplementedError(f"'reserve' not implemented for {self.__class__.__name__}")
-#
-#    def connection(self):
-#        """
-#        Return an atex.ssh.SSHConn instance configured for connection to
-#        the reserved resource, but not yet connected.
-#        """
-#        raise NotImplementedError(f"'connection' not implemented for {self.__class__.__name__}")
-#
-#    def release(self):
-#        """
-#        Release a reserved resource, or cancel a reservation-in-progress.
-#        """
-#        raise NotImplementedError(f"'release' not implemented for {self.__class__.__name__}")
-#
-#    def alive(self):
-#        """
-#        Return True if the resource is still reserved, False otherwise.
-#        """
-#        raise NotImplementedError(f"'alive' not implemented for {self.__class__.__name__}")
+    def get_remote(self, block=True):
+        """
+        Hand out a connected class Remote instance.
+        With 'block' set to True, wait until a Remote is available and
+        connected. With 'block' set to False, return None when no Remote
+        is available yet.
+        This base implementation always raises NotImplementedError.
+        """
+        cls_name = self.__class__.__name__
+        raise NotImplementedError(f"'get_remote' not implemented for {cls_name}")
+    def start(self):
+        """
+        Bring the Provisioner instance up, kicking off any
+        provisioning-related processes that lead to systems being reserved.
+        This base implementation always raises NotImplementedError.
+        """
+        cls_name = self.__class__.__name__
+        raise NotImplementedError(f"'start' not implemented for {cls_name}")
+    def stop(self):
+        """
+        Shut the Provisioner instance down, freeing all reserved resources
+        and calling .release() on all Remote instances that were created.
+        This base implementation always raises NotImplementedError.
+        """
+        cls_name = self.__class__.__name__
+        raise NotImplementedError(f"'stop' not implemented for {cls_name}")
+    def stop_defer(self):
+        """
+        Enable an external caller to stop the Provisioner instance,
+        deferring resource deallocation to the caller.
+        Return an iterable of argument-free thread-safe callables that can be
+        called, possibly in parallel, to free up resources.
+        Ie. a list of 200 .release() functions, to be called in a thread pool
+        by the user, speeding up cleanup.
+        This default implementation defers nothing in parallel: it returns
+        a one-element list holding .stop(), which frees everything when
+        called.
+        """
+        # wrap in a list - the documented contract is an iterable of
+        # callables, not a single bare callable
+        return [self.stop]
+    def __enter__(self):
+        """
+        Context manager entry: call .start() and return this instance.
+        """
+        self.start()
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Context manager exit: call .stop(). The exception details are not
+        inspected and nothing is returned, so an exception raised inside
+        the 'with' body is not suppressed.
+        """
+        self.stop()
 class Remote(_connection.Connection):
     """
     Representation of a provisioned (reserved) remote system, providing
-    a Connection-like API in addition system management helpers.
+    a Connection-like API in addition to system management helpers.
-    An instance of Remote is typically prepared by a Provisioner and given
-    away for further use, to be .release()d by the user. It is not meant
-    for repeated reserve/release cycles, hence the lack of .reserve().
+    An instance of Remote is typically prepared by a Provisioner and lent out
+    for further use, to be .release()d by the user (if destroyed).
+    It is not meant for repeated reserve/release cycles, hence the lack
+    of .reserve().
     Also note that Remote can be used via Context Manager, but does not
     do automatic .release(), the manager only handles the built-in Connection.
@@ -114,27 +98,12 @@ class Remote(_connection.Connection):
     with a callback, or a try/finally block.
     """
-    # TODO: pass platform as arg ?
-    #def __init__(self, platform, *args, **kwargs):
-    #    """
-    #    Initialize a new Remote instance based on a Connection instance.
-    #    If extending __init__, always call 'super().__init__(conn)' at the top.
-    #    """
-    #    self.lock = _threading.RLock()
-    #    self.platform = platform
     def release(self):
         """
-        Release (de-provision) the remote resource, freeing resources.
+        Release (de-provision) the remote resource.
         """
         raise NotImplementedError(f"'release' not implemented for {self.__class__.__name__}")
-    def alive(self):
-        """
-        Return True if the remote resource is still valid and reserved.
-        """
-        raise NotImplementedError(f"'alive' not implemented for {self.__class__.__name__}")
 _submodules = [
     info.name for info in _pkgutil.iter_modules(__spec__.submodule_search_locations)

atex/provision/testingfarm/__init__.py CHANGED Viewed

@@ -1,29 +1,2 @@
-#from ... import connection
-from .. import Provisioner, Remote
-#from . import api
-class TestingFarmRemote(Remote):
-    def __init__(self, connection, request):
-        """
-        'connection' is a class Connection instance.
-        'request' is a testing farm Request class instance.
-        """
-        super().__init__(connection)
-        self.request = request
-        self.valid = True
-    def release(self):
-        self.disconnect()
-        self.request.cancel()
-        self.valid = False
-    def alive(self):
-        return self.valid
-class TestingFarmProvisioner(Provisioner):
-    pass
+from . import api  # noqa: F401
+from .testingfarm import TestingFarmProvisioner, TestingFarmRemote  # noqa: F401

atex/provision/testingfarm/api.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import os
-import sys
 import re
 import time
 import tempfile
 import textwrap
+import threading
 import subprocess
 import collections
@@ -17,7 +17,7 @@ import urllib3
 DEFAULT_API_URL = "https://api.testing-farm.io/v0.1"
 # how many seconds to sleep for during API polling
-API_QUERY_DELAY = 10
+API_QUERY_DELAY = 30
 RESERVE_TASK = {
     "fmf": {
@@ -32,10 +32,10 @@ RESERVE_TASK = {
 # https://gitlab.com/testing-farm/nucleus/-/blob/main/api/src/tft/nucleus/api/core/schemes/test_request.py
 END_STATES = ("error", "complete", "canceled")
-# always have at most 3 outstanding HTTP requests to every given API host,
+# always have at most 10 outstanding HTTP requests to every given API host,
 # shared by all instances of all classes here, to avoid flooding the host
 # by multi-threaded users
-_http = urllib3.PoolManager(maxsize=3, block=True)
+_http = urllib3.PoolManager(maxsize=10, block=True)
 class TestingFarmError(Exception):
@@ -132,15 +132,20 @@ class TestingFarmAPI:
         return self._query("GET", f"/composes/{ranch}")
     def search_requests(
-        self, state, mine=True, ranch=None, created_before=None, created_after=None,
+        self, *, state, ranch=None,
+        mine=True, user_id=None, token_id=None,
+        created_before=None, created_after=None,
     ):
         """
         'state' is one of 'running', 'queued', etc., and is required by the API.
+        'ranch' is 'public' or 'redhat', or (probably?) all if left empty.
         If 'mine' is True and a token was given, return only requests for that
         token (user), otherwise return *all* requests (use extra filters pls).
-        'ranch' is 'public' or 'redhat', or (probably?) all if left empty.
+        'user_id' and 'token_id' are search API parameters - if not given and
+        'mine' is True, these are extracted from a user-provided token.
         'created_*' take ISO 8601 formatted strings, as returned by the API
         elsewhere, ie. 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SS' (or with '.MS'),
@@ -154,7 +159,12 @@ class TestingFarmAPI:
         if created_after:
             fields["created_after"] = created_after
-        if mine:
+        if user_id or token_id:
+            if user_id:
+                fields["user_id"] = user_id
+            if token_id:
+                fields["token_id"] = token_id
+        elif mine:
             if not self.api_token:
                 raise ValueError("search_requests(mine=True) requires an auth token")
             fields["token_id"] = self.whoami()["token"]["id"]
@@ -289,9 +299,12 @@ class PipelineLogStreamer:
                 log = f"{artifacts}/pipeline.log"
                 reply = _http.request("HEAD", log)
-                # TF has a race condition of adding the .log entry without it being created
-                if reply.status == 404:
-                    util.debug(f"got 404 for {log}, retrying")
+                # 404: TF has a race condition of adding the .log entry without
+                #      it being created
+                # 403: happens on internal OSCI artifacts server, probably
+                #      due to similar reasons (folder exists without log)
+                if reply.status in (404,403):
+                    util.debug(f"got {reply.status} for {log}, retrying")
                     continue
                 elif reply.status != 200:
                     raise APIError(f"got HTTP {reply.status} on HEAD {log}", reply)
@@ -431,6 +444,7 @@ class Reserve:
         self._source_host = source_host
         self.api = api or TestingFarmAPI()
+        self.lock = threading.RLock()
         self.request = None
         self._tmpdir = None
@@ -445,17 +459,11 @@ class Reserve:
             r = _http.request("GET", "https://ifconfig.co", headers=curl_agent)
         return r.data.decode().strip()
-    @staticmethod
-    def _gen_ssh_keypair(tmpdir):
-        tmpdir = Path(tmpdir)
-        subprocess.run(
-            ("ssh-keygen", "-t", "rsa", "-N", "", "-f", tmpdir / "key_rsa"),
-            stdout=subprocess.DEVNULL,
-            check=True,
-        )
-        return (tmpdir / "key_rsa", tmpdir / "key_rsa.pub")
+    def reserve(self):
+        with self.lock:
+            if self.request:
+                raise RuntimeError("reservation already in progress")
-    def __enter__(self):
         spec = self._spec.copy()
         try:
@@ -478,21 +486,25 @@ class Reserve:
                     raise FileNotFoundError(f"{ssh_key} specified, but does not exist")
                 ssh_pubkey = Path(f"{ssh_key}.pub")
             else:
-                self._tmpdir = tempfile.TemporaryDirectory()
-                ssh_key, ssh_pubkey = self._gen_ssh_keypair(self._tmpdir.name)
+                with self.lock:
+                    self._tmpdir = tempfile.TemporaryDirectory()
+                    ssh_key, ssh_pubkey = util.ssh_keygen(self._tmpdir.name)
             pubkey_contents = ssh_pubkey.read_text().strip()
             secrets = spec["environments"][0]["secrets"]
             secrets["RESERVE_SSH_PUBKEY"] = pubkey_contents
-            self.request = Request(api=self.api)
-            self.request.submit(spec)
+            with self.lock:
+                self.request = Request(api=self.api)
+                self.request.submit(spec)
             util.debug(f"submitted request:\n{textwrap.indent(str(self.request), '    ')}")
             # wait for user/host to ssh to
             ssh_user = ssh_host = None
             for line in PipelineLogStreamer(self.request):
-                util.debug(f"pipeline: {line}")
+                # the '\033[0m' is to reset colors sometimes left in a bad
+                # state by pipeline.log
+                util.debug(f"pipeline: {line}\033[0m")
                 # find hidden login details
                 m = re.search(r"\] Guest is ready: ArtemisGuest\([^,]+, (\w+)@([0-9\.]+), ", line)
                 if m:
@@ -534,22 +546,25 @@ class Reserve:
             )
         except:
-            self.__exit__(*sys.exc_info())
+            self.release()
             raise
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.request:
-            try:
-                self.request.cancel()
-            except APIError:
-                pass
-            finally:
-                self.request = None
+    def release(self):
+        with self.lock:
+            if self.request:
+                try:
+                    self.request.cancel()
+                except APIError:
+                    pass
+                finally:
+                    self.request = None
-        if self._tmpdir:
-            self._tmpdir.cleanup()
-            self._tmpdir = None
+            if self._tmpdir:
+                self._tmpdir.cleanup()
+                self._tmpdir = None
-        # cancel request
-        # clear out stored self.request
-        pass
+    def __enter__(self):
+        """
+        Context manager entry: perform the reservation and return
+        whatever .reserve() returns.
+        """
+        return self.reserve()
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Context manager exit: call .release(). The exception details are
+        not inspected and nothing is returned, so an exception raised inside
+        the 'with' body is not suppressed.
+        """
+        self.release()

atex/provision/testingfarm/testingfarm.py ADDED Viewed

@@ -0,0 +1,236 @@
+import time
+import tempfile
+import threading
+from ... import connection, util
+from .. import Provisioner, Remote
+from . import api
+class TestingFarmRemote(Remote, connection.ssh.ManagedSSHConn):
+    """
+    Built on the official Remote API, pulling in the Connection API
+    as implemented by ManagedSSHConn.
+    """
+    def __init__(self, ssh_options, *, release_hook, provisioner):
+        """
+        'ssh_options' are a dict, passed to ManagedSSHConn __init__().
+        'release_hook' is a callable called on .release() in addition
+        to disconnecting the connection, it receives this instance as its
+        only argument.
+        'provisioner' is the Provisioner instance that created this Remote,
+        its 'compose' and 'arch' attributes are used by __repr__().
+        """
+        super().__init__(options=ssh_options)
+        self.release_hook = release_hook
+        self.provisioner = provisioner
+        # guards 'release_called' so that concurrent .release() calls
+        # run the hook only once
+        self.lock = threading.RLock()
+        self.release_called = False
+    def release(self):
+        """
+        Release (de-provision) the remote resource: run the release hook and
+        disconnect. Only the first call does anything, later calls return
+        immediately.
+        """
+        with self.lock:
+            if self.release_called:
+                return
+            self.release_called = True
+        try:
+            self.release_hook(self)
+        finally:
+            # always tear down the connection, even if the hook raised,
+            # as .release() will never get past the guard above again
+            self.disconnect()
+    # not /technically/ a valid repr(), but meh
+    def __repr__(self):
+        class_name = self.__class__.__name__
+        compose = self.provisioner.compose
+        arch = self.provisioner.arch
+        return f"{class_name}({compose} @ {arch}, {hex(id(self))})"
+    # TODO: def __str__(self):  as root@1.2.3.4 and arch, ranch, etc.
+class TestingFarmProvisioner(Provisioner):
+    # TODO: have max_systems as (min,default,max) tuple; have an algorithm that
+    #       starts at default and scales up/down as needed
+    def __init__(self, compose, arch="x86_64", *, max_systems=1, timeout=60, max_retries=10):
+        """
+        Prepare a (not yet started) pool of Testing Farm systems.
+        'compose' is a Testing Farm compose to prepare.
+        'arch' is an architecture associated with the compose.
+        'max_systems' is an int of how many systems to reserve (and keep
+        reserved) in an internal pool.
+        'timeout' is the maximum Testing Farm pipeline timeout (waiting for
+        a system + OS installation + reservation time).
+        'max_retries' is a maximum number of provisioning (Testing Farm) errors
+        that will be reprovisioned before giving up.
+        """
+        super().__init__()
+        # caller-provided configuration
+        self.compose = compose  # TODO: translate "centos9" to "CentOS-Stream-9"
+        self.arch = arch
+        self.max_systems = max_systems
+        self.timeout = timeout
+        # remaining retry budget, counted down by .get_remote()
+        self.retries = max_retries
+        # temporary directory and ssh keypair, filled in by .start()
+        self._tmpdir = None
+        self.ssh_key = None
+        self.ssh_pubkey = None
+        # background threads waiting for reservations deliver through here
+        self.queue = util.ThreadQueue(daemon=True)
+        self.tf_api = api.TestingFarmAPI()
+        # TF Reserve instances (not Remotes) that are still being
+        # provisioned, kept so that an abort can call their .release()
+        self.reserving = []
+        # live TestingFarmRemote instances, either waiting to be handed
+        # over to the user, or already in use by the user
+        self.remotes = []
+    def _wait_for_reservation(self, tf_reserve, initial_delay):
+        """
+        Wait for one Testing Farm reservation to finish and return
+        a connected TestingFarmRemote for it.
+        'tf_reserve' is an api.Reserve instance already present
+        in self.reserving.
+        'initial_delay' is a number of seconds to sleep before starting,
+        or None / 0 for no delay.
+        If reserving or connecting fails, 'tf_reserve' is removed from
+        self.reserving and released before the exception is re-raised,
+        so that failed reservations do not pile up in the pool accounting.
+        """
+        # assuming this function will be called many times, attempt to
+        # distribute load on TF servers
+        # (we can sleep here as this code is running in a separate thread)
+        if initial_delay:
+            util.debug(f"delaying for {initial_delay}s to distribute load")
+            time.sleep(initial_delay)
+        try:
+            # 'machine' is api.Reserve.ReservedMachine namedtuple
+            machine = tf_reserve.reserve()
+            # connect our Remote to the machine via its class Connection API
+            ssh_options = {
+                "Hostname": machine.host,
+                "User": machine.user,
+                "Port": machine.port,
+                "IdentityFile": machine.ssh_key,
+            }
+            def release_hook(remote):
+                # remove from the list of remotes inside this Provisioner
+                with self.lock:
+                    try:
+                        self.remotes.remove(remote)
+                    except ValueError:
+                        pass
+                # call TF API, cancel the request, etc.
+                tf_reserve.release()
+            remote = TestingFarmRemote(
+                ssh_options,
+                release_hook=release_hook,
+                provisioner=self,
+            )
+            remote.connect()
+        except Exception:
+            # stop tracking the failed reservation, otherwise it would be
+            # counted as "still reserving" forever
+            with self.lock:
+                try:
+                    self.reserving.remove(tf_reserve)
+                except ValueError:
+                    pass
+            # free the (possibly already reserved) system, without letting
+            # a cleanup failure hide the original error
+            try:
+                tf_reserve.release()
+            except Exception as release_error:
+                util.warning(f"failed to release a TF reservation: {repr(release_error)}")
+            raise
+        # since the system is fully ready, stop tracking its reservation
+        # and return the finished Remote instance
+        with self.lock:
+            self.remotes.append(remote)
+            self.reserving.remove(tf_reserve)
+        return remote
+    def _schedule_one_reservation(self, initial_delay=None):
+        """
+        Create one Testing Farm reservation and start a background thread
+        that waits for it, passing 'initial_delay' on to that thread.
+        """
+        # class Reserve from the Testing Farm api module typically serves
+        # as a context manager, here its .reserve() and .release() functions
+        # are used directly instead
+        reservation = api.Reserve(
+            compose=self.compose,
+            arch=self.arch,
+            timeout=self.timeout,
+            ssh_key=self.ssh_key,
+            api=self.tf_api,
+        )
+        # track the reservation before any provisioning is scheduled,
+        # to avoid races on sudden abort
+        with self.lock:
+            self.reserving.append(reservation)
+        # hand the actual waiting over to a background thread
+        thread_args = (reservation, initial_delay)
+        self.queue.start_thread(target=self._wait_for_reservation, args=thread_args)
+    def start(self):
+        """
+        Generate an ssh keypair in a fresh temporary directory and schedule
+        'max_systems' initial reservations, their start delays spread
+        evenly across one api.API_QUERY_DELAY interval.
+        """
+        with self.lock:
+            self._tmpdir = tempfile.TemporaryDirectory()
+            keypair = util.ssh_keygen(self._tmpdir.name)
+            self.ssh_key, self.ssh_pubkey = keypair
+            # the n-th reservation starts n slices of the interval later
+            for index in range(self.max_systems):
+                self._schedule_one_reservation(
+                    (api.API_QUERY_DELAY / self.max_systems) * index,
+                )
+    def stop(self):
+        """
+        Abort all reservations in progress, release every Remote created
+        by this Provisioner and remove the temporary directory.
+        Calling it repeatedly, before .start(), or after .stop_defer()
+        is harmless.
+        """
+        with self.lock:
+            # detach both lists before releasing anything - the release
+            # hook of every Remote removes it from self.remotes, which
+            # would otherwise mutate the list while it is being iterated
+            # and skip every other Remote
+            reserving, self.reserving = self.reserving, []
+            remotes, self.remotes = self.remotes, []
+            # abort reservations in progress
+            for tf_reserve in reserving:
+                tf_reserve.release()
+            # cancel/release all Remotes ever created by us
+            for remote in remotes:
+                remote.release()
+            # explicitly remove the tmpdir rather than relying on destructor,
+            # it is None if never started or already handed off / cleaned up
+            if self._tmpdir is not None:
+                self._tmpdir.cleanup()
+                self._tmpdir = None
+    def stop_defer(self):
+        """
+        Stop tracking all reservations and Remotes and return a list of
+        argument-free callables that free them (and the temporary directory,
+        if any) when called by the user.
+        Returns an empty list if there is nothing to free.
+        """
+        with self.lock:
+            callables = [tf_reserve.release for tf_reserve in self.reserving]
+            callables += [remote.release for remote in self.remotes]
+            self.reserving = []
+            self.remotes = []
+            # None if never started or already stopped
+            if self._tmpdir is not None:
+                callables.append(self._tmpdir.cleanup)
+                self._tmpdir = None
+        return callables
+    def get_remote(self, block=True):
+        """
+        Return a TestingFarmRemote taken from the internal queue.
+        Before waiting, schedule as many new reservations as are missing
+        to reach 'max_systems' (counting both finished Remotes and
+        reservations in progress).
+        If 'block' is False, return None when the queue is empty, and also
+        after a failed reservation has been rescheduled.
+        Raises api.TestingFarmError or connection.ssh.SSHError once
+        self.retries has run out.
+        """
+        # fill .release()d remotes back up with reservations
+        with self.lock:
+            deficit = self.max_systems - len(self.remotes) - len(self.reserving)
+            # a zero or negative deficit makes range() empty, so the division
+            # below is never reached in that case
+            for i in range(deficit):
+                delay = (api.API_QUERY_DELAY / deficit) * i
+                self._schedule_one_reservation(delay)
+        while True:
+            # otherwise wait on a queue of Remotes being provisioned
+            # NOTE(review): presumably ThreadQueue.get() re-raises exceptions
+            # raised inside the background threads - confirm in util.threads
+            try:
+                return self.queue.get(block=block)  # thread-safe
+            except util.ThreadQueue.Empty:
+                # always non-blocking
+                return None
+            except (api.TestingFarmError, connection.ssh.SSHError) as e:
+                with self.lock:
+                    # self.retries is one budget for this whole instance,
+                    # it is only ever decremented here
+                    if self.retries > 0:
+                        util.warning(
+                            f"caught while reserving a TF system: {repr(e)}, "
+                            f"retrying ({self.retries} left)",
+                        )
+                        self.retries -= 1
+                        # replace the failed reservation, without any delay
+                        self._schedule_one_reservation()
+                        if block:
+                            continue
+                        else:
+                            return None
+                    else:
+                        util.warning(
+                            f"caught while reserving a TF system: {repr(e)}, "
+                            "exhausted all retries, giving up",
+                        )
+                        raise
+    # a human-readable summary rather than a /technically/ valid repr()
+    def __repr__(self):
+        name = self.__class__.__name__
+        in_progress = len(self.reserving)
+        ready = len(self.remotes)
+        target = f"{self.compose} @ {self.arch}"
+        counts = f"{in_progress} reserving, {ready} remotes"
+        return f"{name}({target}, {counts}, {hex(id(self))})"

atex/util/__init__.py CHANGED Viewed

@@ -1,7 +1,3 @@
-"""
-TODO some description about utilities
-"""
 import importlib as _importlib
 import pkgutil as _pkgutil
 import inspect as _inspect
@@ -39,8 +35,7 @@ def _import_submodules():
             if _inspect.ismodule(attr):
                 continue
             # do not override already processed objects (avoid duplicates)
-            if key in __all__:
-                raise AssertionError(f"tried to override already-imported '{key}'")
+            assert key not in __all__, f"tried to override already-imported '{key}'"
             globals()[key] = attr
             __all__.append(key)

atex 0.7__py3-none-any.whl → 0.8__py3-none-any.whl

atex 0.7__py3-none-any.whl → 0.8__py3-none-any.whl