PyPI - atex - Versions diffs - 0.7__py3-none-any.whl → 0.9__py3-none-any.whl - Mend

atex 0.7__py3-none-any.whl → 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

atex/cli/fmf.py +143 -0
atex/cli/libvirt.py +127 -0
atex/cli/testingfarm.py +35 -13
atex/connection/__init__.py +13 -19
atex/connection/podman.py +63 -0
atex/connection/ssh.py +34 -52
atex/executor/__init__.py +2 -0
atex/executor/duration.py +60 -0
atex/executor/executor.py +402 -0
atex/executor/reporter.py +101 -0
atex/{minitmt → executor}/scripts.py +37 -25
atex/{minitmt → executor}/testcontrol.py +54 -42
atex/fmf.py +237 -0
atex/orchestrator/__init__.py +3 -59
atex/orchestrator/aggregator.py +82 -134
atex/orchestrator/orchestrator.py +385 -0
atex/provision/__init__.py +74 -105
atex/provision/libvirt/__init__.py +2 -24
atex/provision/libvirt/libvirt.py +465 -0
atex/provision/libvirt/locking.py +168 -0
atex/provision/libvirt/setup-libvirt.sh +21 -1
atex/provision/podman/__init__.py +1 -0
atex/provision/podman/podman.py +274 -0
atex/provision/testingfarm/__init__.py +2 -29
atex/provision/testingfarm/api.py +123 -65
atex/provision/testingfarm/testingfarm.py +234 -0
atex/util/__init__.py +1 -6
atex/util/libvirt.py +18 -0
atex/util/log.py +31 -8
atex/util/named_mapping.py +158 -0
atex/util/path.py +16 -0
atex/util/ssh_keygen.py +14 -0
atex/util/threads.py +99 -0
atex-0.9.dist-info/METADATA +178 -0
atex-0.9.dist-info/RECORD +43 -0
atex/cli/minitmt.py +0 -175
atex/minitmt/__init__.py +0 -23
atex/minitmt/executor.py +0 -348
atex/minitmt/fmf.py +0 -202
atex/provision/nspawn/README +0 -74
atex/provision/podman/README +0 -59
atex/provision/podman/host_container.sh +0 -74
atex/provision/testingfarm/foo.py +0 -1
atex-0.7.dist-info/METADATA +0 -102
atex-0.7.dist-info/RECORD +0 -32
{atex-0.7.dist-info → atex-0.9.dist-info}/WHEEL +0 -0
{atex-0.7.dist-info → atex-0.9.dist-info}/entry_points.txt +0 -0
{atex-0.7.dist-info → atex-0.9.dist-info}/licenses/COPYING.txt +0 -0

atex/provision/podman/podman.py ADDED Viewed

@@ -0,0 +1,274 @@
+import os
+import time
+import enum
+import threading
+import subprocess
+from ... import connection, util
+from .. import Provisioner, Remote
+class PodmanRemote(Remote, connection.podman.PodmanConn):
+    """
+    Built on the official Remote API, pulling in the Connection API
+    as implemented by PodmanConn.
+    """
+    def __init__(self, image, container, *, release_hook):
+        """
+        'image' is an image tag (used for repr()).
+        'container' is a podman container id / name.
+        'release_hook' is a callable called on .release() in addition
+        to disconnecting the connection. It is passed this instance as its
+        only argument.
+        """
+        super().__init__(container=container)
+        self.lock = threading.RLock()
+        self.image = image
+        self.container = container
+        self.release_called = False
+        self.release_hook = release_hook
+    def release(self):
+        """
+        Run the release hook, disconnect and forcibly remove the container.
+        Only the first call does the work, any repeated (or concurrent) call
+        returns immediately.
+        """
+        with self.lock:
+            if self.release_called:
+                return
+            self.release_called = True
+        # always attempt the container removal, even if the hook or the
+        # disconnect raised, so a failed release does not leave the container
+        # running - the original exception still propagates to the caller
+        try:
+            self.release_hook(self)
+        finally:
+            try:
+                self.disconnect()
+            finally:
+                util.subprocess_run(
+                    ("podman", "container", "rm", "-f", "-t", "0", self.container),
+                    check=False,  # ignore if it fails
+                    stdout=subprocess.DEVNULL,
+                )
+    @staticmethod
+    def _shorten(text, limit=20):
+        """Cut 'text' down to 'limit' characters (incl. a trailing '...')."""
+        return f"{text[:limit-3]}..." if len(text) > limit else text
+    # not /technically/ a valid repr(), but meh
+    def __repr__(self):
+        class_name = self.__class__.__name__
+        # keep only the part after the last '/' of an image URL, and shorten
+        # it the same way as any other long image / container name
+        image = self._shorten(self.image.rsplit("/", 1)[-1])
+        name = self._shorten(self.container)
+        return f"{class_name}({image}, {name})"
+class PodmanProvisioner(Provisioner):
+    """
+    Creates podman containers from a single image and hands them out
+    as PodmanRemote instances, at most 'max_systems' at any one time.
+    """
+    class State(enum.Enum):
+        # 'podman image pull' is running
+        WAITING_FOR_PULL = enum.auto()
+        # image is known, a new container may be started
+        CREATING_CONTAINER = enum.auto()
+        # 'podman container run' is running
+        WAITING_FOR_CREATION = enum.auto()
+        # container exists, its setup command may be started
+        SETTING_UP_CONTAINER = enum.auto()
+        # the setup command is running inside the container
+        WAITING_FOR_SETUP = enum.auto()
+    # NOTE: this uses a single Popen process to run podman commands,
+    #       to avoid double downloads/pulls, but also to avoid SQLite errors
+    #       when creating multiple containers in parallel
+    def __init__(self, image, run_options=None, *, pull=True, max_systems=1):
+        """
+        'image' is a string of image tag/id to create containers from.
+        It can be a local identifier or an URL.
+        'run_options' is an iterable with additional CLI options passed
+        to 'podman container run'.
+        'pull' specifies whether to attempt 'podman image pull' on the specified
+        image tag/id before any container creation.
+        'max_systems' is a maximum number of containers running at any one time.
+        """
+        self.lock = threading.RLock()
+        self.image = image
+        self.run_options = run_options or ()
+        self.pull = pull
+        self.max_systems = max_systems
+        self.image_id = None
+        self.container_id = None
+        # currently running podman command (Popen) and its collected output
+        self.worker = None
+        self.worker_output = bytearray()
+        # None until .start() (and again after .stop() or a failed command)
+        self.state = None
+        # created PodmanRemote instances, ready to be handed over to the user,
+        # or already in use by the user
+        self.remotes = []
+    @staticmethod
+    def _spawn_proc(cmd):
+        """
+        Start 'cmd' with stderr merged into stdout and return the Popen
+        instance, its stdout pipe is switched to non-blocking mode.
+        """
+        proc = util.subprocess_Popen(
+            cmd,
+            stdin=subprocess.DEVNULL,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+        )
+        os.set_blocking(proc.stdout.fileno(), False)
+        return proc
+    @staticmethod
+    def _remove_container(container):
+        """Forcibly remove 'container', ignoring any failure to do so."""
+        util.subprocess_run(
+            ("podman", "container", "rm", "-f", "-t", "0", container),
+            check=False,  # ignore if it fails
+            stdout=subprocess.DEVNULL,
+        )
+    def _make_remote(self, container):
+        """
+        Wrap 'container' in a PodmanRemote, track it in self.remotes
+        and return it.
+        """
+        def release_hook(remote):
+            # remove from the list of remotes inside this Provisioner
+            with self.lock:
+                try:
+                    self.remotes.remove(remote)
+                except ValueError:
+                    # already popped by .stop()
+                    pass
+        remote = PodmanRemote(
+            self.image,
+            container,
+            release_hook=release_hook,
+        )
+        with self.lock:
+            self.remotes.append(remote)
+        return remote
+    def start(self):
+        """
+        Begin provisioning: start pulling the image, or (with pull=False)
+        use it as-is. Raises ValueError on an empty image.
+        """
+        if not self.image:
+            raise ValueError("image cannot be empty")
+        with self.lock:
+            if not self.pull:
+                self.image_id = self.image
+                self.state = self.State.CREATING_CONTAINER
+            else:
+                self.worker = self._spawn_proc(
+                    ("podman", "image", "pull", "--quiet", self.image),
+                )
+                self.state = self.State.WAITING_FOR_PULL
+    def stop(self):
+        """
+        Release all Remotes created by this Provisioner, kill a podman
+        command that may still be running and remove a container that was
+        created, but not handed out yet. Further .get_remote() calls raise
+        RuntimeError until .start() is called again.
+        """
+        with self.lock:
+            while self.remotes:
+                self.remotes.pop().release()
+            worker = self.worker
+            self.worker = None
+            self.worker_output.clear()
+            # during setup, the container exists without any Remote owning it
+            setup_states = (
+                self.State.SETTING_UP_CONTAINER,
+                self.State.WAITING_FOR_SETUP,
+            )
+            orphan = self.container_id if self.state in setup_states else None
+            # make a later .get_remote() fail cleanly instead of touching
+            # the (now gone) worker or creating new containers
+            self.state = None
+        if worker:
+            worker.kill()
+            # don't zombie forever, return EPIPE on any attempts to write to us
+            worker.stdout.close()
+            worker.wait()
+        if orphan:
+            self._remove_container(orphan)
+    def stop_defer(self):
+        """Return a callable that performs the stop."""
+        # avoid SQLite errors by removing containers sequentially
+        return self.stop
+    @staticmethod
+    def _nonblock_read(fobj):
+        """Return b'' if there was nothing to read, instead of None."""
+        data = fobj.read()
+        return b"" if data is None else data
+    def _reap_worker(self):
+        """
+        Collect output of the current worker and check whether it finished.
+        Returns None while it is still running, otherwise forgets the worker
+        and returns a (returncode, output) tuple, with the output decoded
+        and stripped of trailing newlines.
+        """
+        worker = self.worker
+        # poll() *before* reading - if the process is reported as exited,
+        # everything it wrote is already in the pipe; reading first could
+        # miss output written between the read and the poll
+        rc = worker.poll()
+        self.worker_output += self._nonblock_read(worker.stdout)
+        if rc is None:
+            return None
+        out = self.worker_output.decode(errors="replace").rstrip("\n")
+        self.worker_output.clear()
+        worker.stdout.close()
+        self.worker = None
+        return (rc, out)
+    def _get_remote_nonblock(self):
+        """
+        Advance the provisioning as far as possible without blocking.
+        Returns a new PodmanRemote once a container is fully set up, or None
+        if a podman command is still running or 'max_systems' was reached.
+        Raises RuntimeError if the Provisioner is not started or if a podman
+        command failed (leaving the Provisioner in an invalid state).
+        """
+        if self.state is None:
+            raise RuntimeError("the provisioner is in an invalid state")
+        # NOTE: these are not 'elif' statements explicitly to allow a next block
+        #       to follow a previous one (if the condition is met)
+        if self.state is self.State.WAITING_FOR_PULL:
+            if (finished := self._reap_worker()) is None:
+                return None  # still running
+            rc, out = finished
+            if rc != 0:
+                self.state = None
+                raise RuntimeError(f"podman image pull failed with {rc}:\n{out}")
+            self.image_id = out
+            self.state = self.State.CREATING_CONTAINER
+        if self.state is self.State.CREATING_CONTAINER:
+            if len(self.remotes) >= self.max_systems:
+                # too many remotes requested
+                return None
+            self.worker = self._spawn_proc(
+                (
+                    "podman", "container", "run", "--quiet", "--detach", "--pull", "never",
+                    *self.run_options, self.image_id, "sleep", "inf",
+                ),
+            )
+            self.state = self.State.WAITING_FOR_CREATION
+        if self.state is self.State.WAITING_FOR_CREATION:
+            if (finished := self._reap_worker()) is None:
+                return None  # still running
+            rc, out = finished
+            if rc != 0:
+                self.state = None
+                raise RuntimeError(f"podman run failed with {rc}:\n{out}")
+            self.container_id = out
+            self.state = self.State.SETTING_UP_CONTAINER
+        if self.state is self.State.SETTING_UP_CONTAINER:
+            cmd = ("dnf", "install", "-y", "-q", "--setopt=install_weak_deps=False", "rsync")
+            self.worker = self._spawn_proc(
+                ("podman", "container", "exec", self.container_id, *cmd),
+            )
+            self.state = self.State.WAITING_FOR_SETUP
+        if self.state is self.State.WAITING_FOR_SETUP:
+            if (finished := self._reap_worker()) is None:
+                return None  # still running
+            rc, out = finished
+            if rc != 0:
+                self.state = None
+                # nobody else knows about this container, don't leave it behind
+                self._remove_container(self.container_id)
+                raise RuntimeError(f"setting up failed with {rc}:\n{out}")
+            # everything ready, give the Remote to the caller and reset
+            remote = self._make_remote(self.container_id)
+            self.state = self.State.CREATING_CONTAINER
+            return remote
+        raise AssertionError(f"reached end (invalid state {self.state}?)")
+    def get_remote(self, block=True):
+        """
+        Return a ready-to-use PodmanRemote.
+        If 'block' is True, keep polling (every 0.1 sec) until one is ready,
+        otherwise return None if no Remote is ready right now.
+        """
+        if not block:
+            with self.lock:
+                return self._get_remote_nonblock()
+        else:
+            while True:
+                # hold the lock only while polling, not while sleeping
+                with self.lock:
+                    if remote := self._get_remote_nonblock():
+                        return remote
+                time.sleep(0.1)
+    # not /technically/ a valid repr(), but meh
+    def __repr__(self):
+        class_name = self.__class__.__name__
+        return (
+            f"{class_name}({self.image}, {len(self.remotes)}/{self.max_systems} remotes, "
+            f"{hex(id(self))})"
+        )

atex/provision/testingfarm/__init__.py CHANGED Viewed

@@ -1,29 +1,2 @@
-#from ... import connection
-from .. import Provisioner, Remote
-#from . import api
-class TestingFarmRemote(Remote):
-    def __init__(self, connection, request):
-        """
-        'connection' is a class Connection instance.
-        'request' is a testing farm Request class instance.
-        """
-        super().__init__(connection)
-        self.request = request
-        self.valid = True
-    def release(self):
-        self.disconnect()
-        self.request.cancel()
-        self.valid = False
-    def alive(self):
-        return self.valid
-class TestingFarmProvisioner(Provisioner):
-    pass
+from . import api  # noqa: F401
+from .testingfarm import TestingFarmProvisioner, TestingFarmRemote  # noqa: F401

atex/provision/testingfarm/api.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import os
-import sys
 import re
 import time
 import tempfile
 import textwrap
+import threading
 import subprocess
 import collections
@@ -17,25 +17,35 @@ import urllib3
 DEFAULT_API_URL = "https://api.testing-farm.io/v0.1"
 # how many seconds to sleep for during API polling
-API_QUERY_DELAY = 10
-RESERVE_TASK = {
-    "fmf": {
-        "url": "https://github.com/RHSecurityCompliance/atex",
-        "ref": "main",
-        "path": "tmt_tests",
-        "name": "/plans/reserve",
-    },
+API_QUERY_DELAY = 30
+DEFAULT_RESERVE_TEST = {
+    "url": "https://github.com/RHSecurityCompliance/atex-reserve",
+    "ref": "v0.9",
+    "path": ".",
+    "name": "/plans/reserve",
 }
 # final states of a request,
 # https://gitlab.com/testing-farm/nucleus/-/blob/main/api/src/tft/nucleus/api/core/schemes/test_request.py
 END_STATES = ("error", "complete", "canceled")
-# always have at most 3 outstanding HTTP requests to every given API host,
+# always have at most 10 outstanding HTTP requests to every given API host,
 # shared by all instances of all classes here, to avoid flooding the host
 # by multi-threaded users
-_http = urllib3.PoolManager(maxsize=3, block=True)
+_http = urllib3.PoolManager(
+    maxsize=10,
+    block=True,
+    retries=urllib3.Retry(
+        total=10,
+        # account for API restarts / short outages
+        backoff_factor=60,
+        backoff_max=600,
+        # retry on API server errors too, not just connection issues
+        status=10,
+        status_forcelist={403,404,408,429,500,502,503,504},
+    ),
+)
 class TestingFarmError(Exception):
@@ -132,15 +142,20 @@ class TestingFarmAPI:
         return self._query("GET", f"/composes/{ranch}")
     def search_requests(
-        self, state, mine=True, ranch=None, created_before=None, created_after=None,
+        self, *, state, ranch=None,
+        mine=True, user_id=None, token_id=None,
+        created_before=None, created_after=None,
     ):
         """
         'state' is one of 'running', 'queued', etc., and is required by the API.
+        'ranch' is 'public' or 'redhat', or (probably?) all if left empty.
         If 'mine' is True and a token was given, return only requests for that
         token (user), otherwise return *all* requests (use extra filters pls).
-        'ranch' is 'public' or 'redhat', or (probably?) all if left empty.
+        'user_id' and 'token_id' are search API parameters - if not given and
+        'mine' is True, these are extracted from a user-provided token.
         'created_*' take ISO 8601 formatted strings, as returned by the API
         elsewhere, ie. 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM:SS' (or with '.MS'),
@@ -154,7 +169,12 @@ class TestingFarmAPI:
         if created_after:
             fields["created_after"] = created_after
-        if mine:
+        if user_id or token_id:
+            if user_id:
+                fields["user_id"] = user_id
+            if token_id:
+                fields["token_id"] = token_id
+        elif mine:
             if not self.api_token:
                 raise ValueError("search_requests(mine=True) requires an auth token")
             fields["token_id"] = self.whoami()["token"]["id"]
@@ -289,9 +309,12 @@ class PipelineLogStreamer:
                 log = f"{artifacts}/pipeline.log"
                 reply = _http.request("HEAD", log)
-                # TF has a race condition of adding the .log entry without it being created
-                if reply.status == 404:
-                    util.debug(f"got 404 for {log}, retrying")
+                # 404: TF has a race condition of adding the .log entry without
+                #      it being created
+                # 403: happens on internal OSCI artifacts server, probably
+                #      due to similar reasons (folder exists without log)
+                if reply.status in (404,403):
+                    util.debug(f"got {reply.status} for {log}, retrying")
                     continue
                 elif reply.status != 200:
                     raise APIError(f"got HTTP {reply.status} on HEAD {log}", reply)
@@ -357,7 +380,9 @@ class Reserve:
     def __init__(
         self, *, compose, arch="x86_64", pool=None, hardware=None, kickstart=None,
-        timeout=60, ssh_key=None, source_host=None, api=None,
+        timeout=60, ssh_key=None, source_host=None,
+        reserve_test=None, variables=None, secrets=None,
+        api=None,
     ):
         """
         'compose' (str) is the OS to install, chosen from the composes supported
@@ -390,18 +415,31 @@ class Reserve:
         facing address of the current system.
         Ignored on the 'redhat' ranch.
+        'reserve_test' is a dict with a fmf test specification to be run on the
+        target system to reserve it, ie.:
+            {
+                "url": "https://some-host/path/to/repo",
+                "ref": "main",
+                "name": "/plans/reserve",
+            }
+        'variables' and 'secrets' are dicts with environment variable key/values
+        exported for the reserve test - variables are visible via TF API,
+        secrets are not (but can still be extracted from pipeline log).
         'api' is a TestingFarmAPI instance - if unspecified, a sensible default
         will be used.
         """
-        util.info(f"Will reserve compose:{compose} on arch:{arch} for {timeout}min")
+        util.info(f"will reserve compose:{compose} on arch:{arch} for {timeout}min")
         spec = {
-            "test": RESERVE_TASK,
+            "test": {
+                "fmf": reserve_test or DEFAULT_RESERVE_TEST,
+            },
             "environments": [{
                 "arch": arch,
                 "os": {
                     "compose": compose,
                 },
-                "pool": pool,
                 "settings": {
                     "pipeline": {
                         "skip_guest_setup": True,
@@ -410,10 +448,8 @@ class Reserve:
                         "tags": {
                             "ArtemisUseSpot": "false",
                         },
-                        "security_group_rules_ingress": [],
                     },
                 },
-                "secrets": {},
             }],
             "settings": {
                 "pipeline": {
@@ -421,16 +457,23 @@ class Reserve:
                 },
             },
         }
+        spec_env = spec["environments"][0]
+        if pool:
+            spec_env["pool"] = pool
         if hardware:
-            spec["environments"][0]["hardware"] = hardware
+            spec_env["hardware"] = hardware
         if kickstart:
-            spec["environments"][0]["kickstart"] = kickstart
+            spec_env["kickstart"] = kickstart
+        if variables:
+            spec_env["variables"] = variables
+        spec_env["secrets"] = secrets.copy() if secrets else {}  # we need it for ssh pubkey
         self._spec = spec
         self._ssh_key = Path(ssh_key) if ssh_key else None
         self._source_host = source_host
         self.api = api or TestingFarmAPI()
+        self.lock = threading.RLock()
         self.request = None
         self._tmpdir = None
@@ -445,32 +488,31 @@ class Reserve:
             r = _http.request("GET", "https://ifconfig.co", headers=curl_agent)
         return r.data.decode().strip()
-    @staticmethod
-    def _gen_ssh_keypair(tmpdir):
-        tmpdir = Path(tmpdir)
-        subprocess.run(
-            ("ssh-keygen", "-t", "rsa", "-N", "", "-f", tmpdir / "key_rsa"),
-            stdout=subprocess.DEVNULL,
-            check=True,
-        )
-        return (tmpdir / "key_rsa", tmpdir / "key_rsa.pub")
+    def reserve(self):
+        with self.lock:
+            if self.request:
+                raise RuntimeError("reservation already in progress")
-    def __enter__(self):
         spec = self._spec.copy()
+        spec_env = spec["environments"][0]
-        try:
-            # add source_host firewall filter
+        # add source_host firewall filter on the public ranch
+        if self.api.whoami()["token"]["ranch"] == "public":
             source_host = self._source_host or f"{self._guess_host_ipv4()}/32"
-            ingress = \
-                spec["environments"][0]["settings"]["provisioning"]["security_group_rules_ingress"]
-            ingress.append({
+            ingress_rule = {
                 "type": "ingress",
                 "protocol": "-1",
                 "cidr": source_host,
                 "port_min": 0,
                 "port_max": 65535,
-            })
+            }
+            provisioning = spec_env["settings"]["provisioning"]
+            if "security_group_rules_ingress" in provisioning:
+                provisioning["security_group_rules_ingress"].append(ingress_rule)
+            else:
+                provisioning["security_group_rules_ingress"] = [ingress_rule]
+        try:
             # read user-provided ssh key, or generate one
             ssh_key = self._ssh_key
             if ssh_key:
@@ -478,23 +520,32 @@ class Reserve:
                     raise FileNotFoundError(f"{ssh_key} specified, but does not exist")
                 ssh_pubkey = Path(f"{ssh_key}.pub")
             else:
-                self._tmpdir = tempfile.TemporaryDirectory()
-                ssh_key, ssh_pubkey = self._gen_ssh_keypair(self._tmpdir.name)
+                with self.lock:
+                    self._tmpdir = tempfile.TemporaryDirectory()
+                    ssh_key, ssh_pubkey = util.ssh_keygen(self._tmpdir.name)
             pubkey_contents = ssh_pubkey.read_text().strip()
-            secrets = spec["environments"][0]["secrets"]
-            secrets["RESERVE_SSH_PUBKEY"] = pubkey_contents
+            # TODO: split ^^^ into 3 parts (key type, hash, comment), assert it,
+            #       and anonymize comment in case it contains a secret user/hostname
+            spec_env["secrets"]["RESERVE_SSH_PUBKEY"] = pubkey_contents
-            self.request = Request(api=self.api)
-            self.request.submit(spec)
+            with self.lock:
+                self.request = Request(api=self.api)
+                self.request.submit(spec)
             util.debug(f"submitted request:\n{textwrap.indent(str(self.request), '    ')}")
             # wait for user/host to ssh to
             ssh_user = ssh_host = None
             for line in PipelineLogStreamer(self.request):
-                util.debug(f"pipeline: {line}")
+                # the '\033[0m' is to reset colors sometimes left in a bad
+                # state by pipeline.log
+                util.debug(f"pipeline: {line}\033[0m")
                 # find hidden login details
-                m = re.search(r"\] Guest is ready: ArtemisGuest\([^,]+, (\w+)@([0-9\.]+), ", line)
+                m = re.search(
+                    # host address can be an IP address or a hostname
+                    r"\] Guest is ready: ArtemisGuest\([^,]+, (\w+)@([^,]+), arch=",
+                    line,
+                )
                 if m:
                     ssh_user, ssh_host = m.groups()
                     continue
@@ -534,22 +585,29 @@ class Reserve:
             )
         except:
-            self.__exit__(*sys.exc_info())
+            self.release()
             raise
-    def __exit__(self, exc_type, exc_value, traceback):
-        if self.request:
-            try:
-                self.request.cancel()
-            except APIError:
-                pass
-            finally:
-                self.request = None
+    def release(self):
+        with self.lock:
+            if self.request:
+                try:
+                    self.request.cancel()
+                except APIError:
+                    pass
+                finally:
+                    self.request = None
-        if self._tmpdir:
-            self._tmpdir.cleanup()
-            self._tmpdir = None
+            if self._tmpdir:
+                self._tmpdir.cleanup()
+                self._tmpdir = None
-        # cancel request
-        # clear out stored self.request
-        pass
+    def __enter__(self):
+        try:
+            return self.reserve()
+        except Exception:
+            self.release()
+            raise
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.release()

atex 0.7__py3-none-any.whl → 0.9__py3-none-any.whl

atex 0.7__py3-none-any.whl → 0.9__py3-none-any.whl