teuthology-1.0.0-py3-none-any.whl → teuthology-1.2.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- scripts/describe.py +1 -0
- scripts/dispatcher.py +62 -0
- scripts/exporter.py +18 -0
- scripts/lock.py +1 -1
- scripts/node_cleanup.py +58 -0
- scripts/openstack.py +9 -9
- scripts/results.py +12 -11
- scripts/run.py +4 -0
- scripts/schedule.py +4 -0
- scripts/suite.py +61 -16
- scripts/supervisor.py +44 -0
- scripts/update_inventory.py +10 -4
- scripts/wait.py +31 -0
- teuthology/__init__.py +24 -21
- teuthology/beanstalk.py +4 -3
- teuthology/config.py +17 -6
- teuthology/contextutil.py +18 -14
- teuthology/describe_tests.py +25 -18
- teuthology/dispatcher/__init__.py +365 -0
- teuthology/dispatcher/supervisor.py +374 -0
- teuthology/exceptions.py +54 -0
- teuthology/exporter.py +347 -0
- teuthology/kill.py +76 -75
- teuthology/lock/cli.py +16 -7
- teuthology/lock/ops.py +276 -70
- teuthology/lock/query.py +61 -44
- teuthology/ls.py +9 -18
- teuthology/misc.py +152 -137
- teuthology/nuke/__init__.py +12 -351
- teuthology/openstack/__init__.py +4 -3
- teuthology/openstack/openstack-centos-7.0-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.1-user-data.txt +1 -1
- teuthology/openstack/openstack-centos-7.2-user-data.txt +1 -1
- teuthology/openstack/openstack-debian-8.0-user-data.txt +1 -1
- teuthology/openstack/openstack-opensuse-42.1-user-data.txt +1 -1
- teuthology/openstack/openstack-teuthology.cron +0 -1
- teuthology/orchestra/cluster.py +51 -9
- teuthology/orchestra/connection.py +23 -16
- teuthology/orchestra/console.py +111 -50
- teuthology/orchestra/daemon/cephadmunit.py +23 -5
- teuthology/orchestra/daemon/state.py +10 -3
- teuthology/orchestra/daemon/systemd.py +10 -8
- teuthology/orchestra/opsys.py +32 -11
- teuthology/orchestra/remote.py +369 -152
- teuthology/orchestra/run.py +21 -12
- teuthology/packaging.py +54 -15
- teuthology/provision/__init__.py +30 -10
- teuthology/provision/cloud/openstack.py +12 -6
- teuthology/provision/cloud/util.py +1 -2
- teuthology/provision/downburst.py +83 -29
- teuthology/provision/fog.py +68 -20
- teuthology/provision/openstack.py +5 -4
- teuthology/provision/pelagos.py +13 -5
- teuthology/repo_utils.py +91 -44
- teuthology/report.py +57 -35
- teuthology/results.py +5 -3
- teuthology/run.py +21 -15
- teuthology/run_tasks.py +114 -40
- teuthology/schedule.py +4 -3
- teuthology/scrape.py +28 -22
- teuthology/suite/__init__.py +75 -46
- teuthology/suite/build_matrix.py +34 -24
- teuthology/suite/fragment-merge.lua +105 -0
- teuthology/suite/matrix.py +31 -2
- teuthology/suite/merge.py +175 -0
- teuthology/suite/placeholder.py +8 -8
- teuthology/suite/run.py +204 -102
- teuthology/suite/util.py +67 -211
- teuthology/task/__init__.py +1 -1
- teuthology/task/ansible.py +101 -31
- teuthology/task/buildpackages.py +2 -2
- teuthology/task/ceph_ansible.py +13 -6
- teuthology/task/cephmetrics.py +2 -1
- teuthology/task/clock.py +33 -14
- teuthology/task/exec.py +18 -0
- teuthology/task/hadoop.py +2 -2
- teuthology/task/install/__init__.py +51 -22
- teuthology/task/install/bin/adjust-ulimits +16 -0
- teuthology/task/install/bin/daemon-helper +114 -0
- teuthology/task/install/bin/stdin-killer +263 -0
- teuthology/task/install/deb.py +24 -4
- teuthology/task/install/redhat.py +36 -32
- teuthology/task/install/rpm.py +41 -14
- teuthology/task/install/util.py +48 -22
- teuthology/task/internal/__init__.py +69 -11
- teuthology/task/internal/edit_sudoers.sh +10 -0
- teuthology/task/internal/lock_machines.py +3 -133
- teuthology/task/internal/redhat.py +48 -28
- teuthology/task/internal/syslog.py +31 -8
- teuthology/task/kernel.py +155 -147
- teuthology/task/lockfile.py +1 -1
- teuthology/task/mpi.py +10 -10
- teuthology/task/pcp.py +1 -1
- teuthology/task/selinux.py +17 -8
- teuthology/task/ssh_keys.py +6 -6
- teuthology/task/tests/__init__.py +137 -77
- teuthology/task/tests/test_fetch_coredumps.py +116 -0
- teuthology/task/tests/test_run.py +4 -4
- teuthology/timer.py +3 -3
- teuthology/util/loggerfile.py +19 -0
- teuthology/util/scanner.py +159 -0
- teuthology/util/sentry.py +52 -0
- teuthology/util/time.py +52 -0
- teuthology-1.2.0.data/scripts/adjust-ulimits +16 -0
- teuthology-1.2.0.data/scripts/daemon-helper +114 -0
- teuthology-1.2.0.data/scripts/stdin-killer +263 -0
- teuthology-1.2.0.dist-info/METADATA +89 -0
- teuthology-1.2.0.dist-info/RECORD +174 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/WHEEL +1 -1
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/entry_points.txt +5 -2
- scripts/nuke.py +0 -45
- scripts/worker.py +0 -37
- teuthology/nuke/actions.py +0 -456
- teuthology/openstack/test/__init__.py +0 -0
- teuthology/openstack/test/openstack-integration.py +0 -286
- teuthology/openstack/test/test_config.py +0 -35
- teuthology/openstack/test/test_openstack.py +0 -1695
- teuthology/orchestra/test/__init__.py +0 -0
- teuthology/orchestra/test/integration/__init__.py +0 -0
- teuthology/orchestra/test/integration/test_integration.py +0 -94
- teuthology/orchestra/test/test_cluster.py +0 -240
- teuthology/orchestra/test/test_connection.py +0 -106
- teuthology/orchestra/test/test_console.py +0 -217
- teuthology/orchestra/test/test_opsys.py +0 -404
- teuthology/orchestra/test/test_remote.py +0 -185
- teuthology/orchestra/test/test_run.py +0 -286
- teuthology/orchestra/test/test_systemd.py +0 -54
- teuthology/orchestra/test/util.py +0 -12
- teuthology/sentry.py +0 -18
- teuthology/test/__init__.py +0 -0
- teuthology/test/fake_archive.py +0 -107
- teuthology/test/fake_fs.py +0 -92
- teuthology/test/integration/__init__.py +0 -0
- teuthology/test/integration/test_suite.py +0 -86
- teuthology/test/task/__init__.py +0 -205
- teuthology/test/task/test_ansible.py +0 -624
- teuthology/test/task/test_ceph_ansible.py +0 -176
- teuthology/test/task/test_console_log.py +0 -88
- teuthology/test/task/test_install.py +0 -337
- teuthology/test/task/test_internal.py +0 -57
- teuthology/test/task/test_kernel.py +0 -243
- teuthology/test/task/test_pcp.py +0 -379
- teuthology/test/task/test_selinux.py +0 -35
- teuthology/test/test_config.py +0 -189
- teuthology/test/test_contextutil.py +0 -68
- teuthology/test/test_describe_tests.py +0 -316
- teuthology/test/test_email_sleep_before_teardown.py +0 -81
- teuthology/test/test_exit.py +0 -97
- teuthology/test/test_get_distro.py +0 -47
- teuthology/test/test_get_distro_version.py +0 -47
- teuthology/test/test_get_multi_machine_types.py +0 -27
- teuthology/test/test_job_status.py +0 -60
- teuthology/test/test_ls.py +0 -48
- teuthology/test/test_misc.py +0 -368
- teuthology/test/test_nuke.py +0 -232
- teuthology/test/test_packaging.py +0 -763
- teuthology/test/test_parallel.py +0 -28
- teuthology/test/test_repo_utils.py +0 -204
- teuthology/test/test_report.py +0 -77
- teuthology/test/test_results.py +0 -155
- teuthology/test/test_run.py +0 -238
- teuthology/test/test_safepath.py +0 -55
- teuthology/test/test_schedule.py +0 -45
- teuthology/test/test_scrape.py +0 -167
- teuthology/test/test_timer.py +0 -80
- teuthology/test/test_vps_os_vers_parameter_checking.py +0 -84
- teuthology/test/test_worker.py +0 -303
- teuthology/worker.py +0 -339
- teuthology-1.0.0.dist-info/METADATA +0 -76
- teuthology-1.0.0.dist-info/RECORD +0 -210
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/LICENSE +0 -0
- {teuthology-1.0.0.dist-info → teuthology-1.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,374 @@
+import datetime
+import logging
+import os
+import subprocess
+import time
+import yaml
+import requests
+
+from urllib.parse import urljoin
+
+from teuthology import exporter, dispatcher, kill, report, safepath
+from teuthology.config import config as teuth_config
+from teuthology.exceptions import SkipJob, MaxWhileTries
+from teuthology import setup_log_file, install_except_hook
+from teuthology.misc import get_user, archive_logs, compress_logs
+from teuthology.config import FakeNamespace
+from teuthology.lock import ops as lock_ops
+from teuthology.task import internal
+from teuthology.misc import decanonicalize_hostname as shortname
+from teuthology.lock import query
+from teuthology.util import sentry
+
+log = logging.getLogger(__name__)
+
+
+def main(args):
+    with open(args.job_config, 'r') as config_file:
+        job_config = yaml.safe_load(config_file)
+
+    loglevel = logging.INFO
+    if args.verbose:
+        loglevel = logging.DEBUG
+    logging.getLogger().setLevel(loglevel)
+    log.setLevel(loglevel)
+
+    log_file_path = os.path.join(job_config['archive_path'],
+                                 f"supervisor.{job_config['job_id']}.log")
+    setup_log_file(log_file_path)
+    install_except_hook()
+    try:
+        dispatcher.check_job_expiration(job_config)
+    except SkipJob:
+        return 0
+
+    # reimage target machines before running the job
+    if 'targets' in job_config:
+        node_count = len(job_config["targets"])
+        # If a job (e.g. from the nop suite) doesn't need nodes, avoid
+        # submitting a zero here.
+        if node_count:
+            with exporter.NodeReimagingTime().time(
+                machine_type=job_config["machine_type"],
+                node_count=node_count,
+            ):
+                reimage(job_config)
+        else:
+            reimage(job_config)
+        with open(args.job_config, 'w') as f:
+            yaml.safe_dump(job_config, f, default_flow_style=False)
+
+    suite = job_config.get("suite")
+    if suite:
+        with exporter.JobTime().time(suite=suite):
+            return run_job(
+                job_config,
+                args.bin_path,
+                args.archive_dir,
+                args.verbose
+            )
+    else:
+        return run_job(
+            job_config,
+            args.bin_path,
+            args.archive_dir,
+            args.verbose
+        )
+
+
+def run_job(job_config, teuth_bin_path, archive_dir, verbose):
+    safe_archive = safepath.munge(job_config['name'])
+    if job_config.get('first_in_suite') or job_config.get('last_in_suite'):
+        job_archive = os.path.join(archive_dir, safe_archive)
+        args = [
+            os.path.join(teuth_bin_path, 'teuthology-results'),
+            '--archive-dir', job_archive,
+            '--name', job_config['name'],
+        ]
+        if job_config.get('first_in_suite'):
+            log.info('Generating memo for %s', job_config['name'])
+            if job_config.get('seed'):
+                args.extend(['--seed', job_config['seed']])
+            if job_config.get('subset'):
+                args.extend(['--subset', job_config['subset']])
+            if job_config.get('no_nested_subset'):
+                args.extend(['--no-nested-subset'])
+        else:
+            log.info('Generating results for %s', job_config['name'])
+            timeout = job_config.get('results_timeout',
+                                     teuth_config.results_timeout)
+            args.extend(['--timeout', str(timeout)])
+            if job_config.get('email'):
+                args.extend(['--email', job_config['email']])
+        # Execute teuthology-results, passing 'preexec_fn=os.setpgrp' to
+        # make sure that it will continue to run if this worker process
+        # dies (e.g. because of a restart)
+        result_proc = subprocess.Popen(args=args, preexec_fn=os.setpgrp)
+        log.info("teuthology-results PID: %s", result_proc.pid)
+        # Remove unnecessary logs for first and last jobs in run
+        log.info('Deleting job\'s archive dir %s', job_config['archive_path'])
+        for f in os.listdir(job_config['archive_path']):
+            os.remove(os.path.join(job_config['archive_path'], f))
+        os.rmdir(job_config['archive_path'])
+        return
+
+    log.info('Running job %s', job_config['job_id'])
+
+    arg = [
+        os.path.join(teuth_bin_path, 'teuthology'),
+    ]
+    # The following is for compatibility with older schedulers, from before we
+    # started merging the contents of job_config['config'] into job_config
+    # itself.
+    if 'config' in job_config:
+        inner_config = job_config.pop('config')
+        if not isinstance(inner_config, dict):
+            log.warning("run_job: job_config['config'] isn't a dict, it's a %s",
+                        str(type(inner_config)))
+        else:
+            job_config.update(inner_config)
+
+    if verbose or job_config['verbose']:
+        arg.append('-v')
+
+    arg.extend([
+        '--owner', job_config['owner'],
+        '--archive', job_config['archive_path'],
+        '--name', job_config['name'],
+    ])
+    if job_config['description'] is not None:
+        arg.extend(['--description', job_config['description']])
+    job_archive = os.path.join(job_config['archive_path'], 'orig.config.yaml')
+    arg.extend(['--', job_archive])
+
+    log.debug("Running: %s" % ' '.join(arg))
+    p = subprocess.Popen(
+        args=arg,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    log.info("Job archive: %s", job_config['archive_path'])
+    log.info("Job PID: %s", str(p.pid))
+
+    if teuth_config.results_server:
+        log.info("Running with watchdog")
+        try:
+            run_with_watchdog(p, job_config)
+        except Exception:
+            log.exception("run_with_watchdog had an unhandled exception")
+            raise
+    else:
+        log.info("Running without watchdog")
+        # This sleep() is to give the child time to start up and create the
+        # archive dir.
+        time.sleep(5)
+        p.wait()
+
+    if p.returncode != 0:
+        log.error('Child exited with code %d', p.returncode)
+    else:
+        log.info('Success!')
+    if 'targets' in job_config:
+        unlock_targets(job_config)
+    return p.returncode
+
+def failure_is_reimage(failure_reason):
+    if not failure_reason:
+        return False
+    reimage_failure = "Error reimaging machines:"
+    if reimage_failure in failure_reason:
+        return True
+    else:
+        return False
+
+
+def check_for_reimage_failures_and_mark_down(targets, count=10):
+    # Grab paddles history of jobs in the machine
+    # and count the number of reimaging errors
+    # if it fails N times then mark the machine down
+    base_url = teuth_config.results_server
+    for k, _ in targets.items():
+        machine = k.split('@')[-1]
+        url = urljoin(
+            base_url,
+            '/nodes/{0}/jobs/?count={1}'.format(machine, count)
+        )
+        resp = requests.get(url)
+        jobs = resp.json()
+        if len(jobs) < count:
+            continue
+        reimage_failures = list(filter(
+            lambda j: failure_is_reimage(j['failure_reason']),
+            jobs
+        ))
+        if len(reimage_failures) < count:
+            continue
+        # Mark machine down
+        machine_name = shortname(k)
+        lock_ops.update_lock(
+            machine_name,
+            description='reimage failed {0} times'.format(count),
+            status='down',
+        )
+        log.error(
+            'Reimage failed {0} times ... marking machine down'.format(count)
+        )
+
+
+def reimage(job_config):
+    # Reimage the targets specified in job config
+    # and update their keys in config after reimaging
+    ctx = create_fake_context(job_config)
+    # change the status during the reimaging process
+    report.try_push_job_info(ctx.config, dict(status='waiting'))
+    targets = job_config['targets']
+    try:
+        reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type'])
+    except Exception as e:
+        log.exception('Reimaging error. Nuking machines...')
+        # Reimage failures should map to the 'dead' status instead of 'fail'
+        report.try_push_job_info(
+            ctx.config,
+            dict(status='dead', failure_reason='Error reimaging machines: ' + str(e))
+        )
+        # There isn't an actual task called "reimage", but it doesn't seem
+        # necessary to create a whole new Sentry tag for this.
+        ctx.summary = {
+            'sentry_event': sentry.report_error(job_config, e, task_name="reimage")
+        }
+        # Machine that fails to reimage after 10 times will be marked down
+        check_for_reimage_failures_and_mark_down(targets)
+        raise
+    ctx.config['targets'] = reimaged
+    # change the status to running after the reimaging process
+    report.try_push_job_info(ctx.config, dict(status='running'))
+
+
+def unlock_targets(job_config):
+    serializer = report.ResultsSerializer(teuth_config.archive_base)
+    job_info = serializer.job_info(job_config['name'], job_config['job_id'])
+    machine_statuses = query.get_statuses(job_info['targets'].keys())
+    # only unlock targets if locked and description matches
+    locked = []
+    for status in machine_statuses:
+        name = shortname(status['name'])
+        description = status['description']
+        if not status['locked']:
+            continue
+        if description != job_info['archive_path']:
+            log.warning(
+                "Was going to unlock %s but it was locked by another job: %s",
+                name, description
+            )
+            continue
+        locked.append(name)
+    if not locked:
+        return
+    if job_config.get("unlock_on_failure", True):
+        log.info('Unlocking machines...')
+        lock_ops.unlock_safe(locked, job_info["owner"], job_info["name"], job_info["job_id"])
+
+
+def run_with_watchdog(process, job_config):
+    job_start_time = datetime.datetime.now(datetime.timezone.utc)
+
+    # Only push the information that's relevant to the watchdog, to save db
+    # load
+    job_info = dict(
+        name=job_config['name'],
+        job_id=job_config['job_id'],
+    )
+
+    # Sleep once outside of the loop to avoid double-posting jobs
+    time.sleep(teuth_config.watchdog_interval)
+    hit_max_timeout = False
+    while process.poll() is None:
+        # Kill jobs that have been running longer than the global max
+        run_time = datetime.datetime.now(datetime.timezone.utc) - job_start_time
+        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
+        if total_seconds > teuth_config.max_job_time:
+            hit_max_timeout = True
+            log.warning("Job ran longer than {max}s. Killing...".format(
+                max=teuth_config.max_job_time))
+            try:
+                # kill processes but do not unlock yet so we can save
+                # the logs, coredumps, etc.
+                kill.kill_job(
+                    job_info['name'], job_info['job_id'],
+                    teuth_config.archive_base, job_config['owner'],
+                    skip_unlock=True
+                )
+            except Exception:
+                log.exception('Failed to kill job')
+
+            try:
+                transfer_archives(job_info['name'], job_info['job_id'],
+                                  teuth_config.archive_base, job_config)
+            except Exception:
+                log.exception('Could not save logs')
+
+            try:
+                # this time remove everything and unlock the machines
+                kill.kill_job(
+                    job_info['name'], job_info['job_id'],
+                    teuth_config.archive_base, job_config['owner']
+                )
+            except Exception:
+                log.exception('Failed to kill job and unlock machines')
+
+        # calling this without a status just updates the jobs updated time
+        try:
+            report.try_push_job_info(job_info)
+        except MaxWhileTries:
+            log.exception("Failed to report job status; ignoring")
+        time.sleep(teuth_config.watchdog_interval)
+
+    # we no longer support testing theses old branches
+    assert(job_config.get('teuthology_branch') not in ('argonaut', 'bobtail',
+                                                       'cuttlefish', 'dumpling'))
+
+    # Let's make sure that paddles knows the job is finished. We don't know
+    # the status, but if it was a pass or fail it will have already been
+    # reported to paddles. In that case paddles ignores the 'dead' status.
+    # If the job was killed, paddles will use the 'dead' status.
+    extra_info = dict(status='dead')
+    if hit_max_timeout:
+        extra_info['failure_reason'] = 'hit max job timeout'
+    if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')):
+        report.try_push_job_info(job_info, extra_info)
+
+
+def create_fake_context(job_config, block=False):
+    owner = job_config.get('owner', get_user())
+    os_version = job_config.get('os_version', None)
+
+    ctx_args = {
+        'config': job_config,
+        'block': block,
+        'owner': owner,
+        'archive': job_config['archive_path'],
+        'machine_type': job_config['machine_type'],
+        'os_type': job_config.get('os_type', 'ubuntu'),
+        'os_version': os_version,
+        'name': job_config['name'],
+        'job_id': job_config['job_id'],
+    }
+
+    return FakeNamespace(ctx_args)
+
+
+def transfer_archives(run_name, job_id, archive_base, job_config):
+    serializer = report.ResultsSerializer(archive_base)
+    job_info = serializer.job_info(run_name, job_id, simple=True)
+
+    if 'archive' in job_info:
+        ctx = create_fake_context(job_config)
+        internal.add_remotes(ctx, job_config)
+
+        for log_type, log_path in job_info['archive'].items():
+            if log_type == 'init':
+                log_type = ''
+            compress_logs(ctx, log_path)
+            archive_logs(ctx, log_path, log_type)
+    else:
+        log.info('No archives to transfer.')
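Note (illustrative, not part of the published diff): the hunk above is the new supervisor module, which per the file list corresponds to teuthology/dispatcher/supervisor.py (+374 lines). Below is a minimal sketch of driving its main() entry point directly. The attribute names on the args object (job_config, bin_path, archive_dir, verbose) are exactly the ones main() reads above; the import path and the file-system paths are assumptions, and in the released package the real argument parsing lives in scripts/supervisor.py.

# Sketch only: invoke the supervisor programmatically with a hand-built
# args object instead of going through the scripts/supervisor.py CLI wrapper.
from types import SimpleNamespace

from teuthology.dispatcher import supervisor  # assumed module path for the hunk above

args = SimpleNamespace(
    job_config="/archive/example-run/12345/orig.config.yaml",  # placeholder job YAML path
    bin_path="/path/to/teuthology/virtualenv/bin",             # placeholder bin directory
    archive_dir="/archive",                                    # placeholder archive base
    verbose=False,
)
# main() returns 0 if the job is skipped, otherwise run_job()'s result
# (the returncode of the child `teuthology` process).
exit_code = supervisor.main(args)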
teuthology/exceptions.py
CHANGED
@@ -12,6 +12,18 @@ class BranchNotFoundError(ValueError):
             branch=self.branch, repo_str=repo_str)
 
 
+class BranchMismatchError(ValueError):
+    def __init__(self, branch, repo, reason=None):
+        self.branch = branch
+        self.repo = repo
+        self.reason = reason
+
+    def __str__(self):
+        msg = f"Cannot use branch {self.branch} with repo {self.repo}"
+        if self.reason:
+            msg = f"{msg} because {self.reason}"
+        return msg
+
 class CommitNotFoundError(ValueError):
     def __init__(self, commit, repo=None):
         self.commit = commit
@@ -68,6 +80,17 @@ class CommandFailedError(Exception):
             prefix=prefix,
         )
 
+    def fingerprint(self):
+        """
+        Returns a list of strings to group failures with.
+        Used by sentry instead of grouping by backtrace.
+        """
+        return [
+            self.label or self.command,
+            'exit status {}'.format(self.exitstatus),
+            '{{ type }}',
+        ]
+
 
 class AnsibleFailedError(Exception):
 
@@ -82,6 +105,13 @@ class AnsibleFailedError(Exception):
             failures=self.failures,
         )
 
+    def fingerprint(self):
+        """
+        Sentry will use this to group events by their failure reasons, rather
+        than lumping all AnsibleFailedErrors together
+        """
+        return self.failures
+
 
 class CommandCrashedError(Exception):
 
@@ -182,3 +212,27 @@ class NoRemoteError(Exception):
 
     def __str__(self):
         return self.message
+
+
+class UnitTestError(Exception):
+    """
+    Exception thrown on unit test failure
+    """
+    def __init__(self, exitstatus=None, node=None, label=None, message=None):
+        self.exitstatus = exitstatus
+        self.node = node
+        self.label = label
+        self.message = message
+
+    def __str__(self):
+        prefix = "Unit test failed"
+        if self.label:
+            prefix += " ({label})".format(label=self.label)
+        if self.node:
+            prefix += " on {node}".format(node=self.node)
+        if self.exitstatus:
+            prefix += " with status {status}".format(status=self.exitstatus)
+        return "{prefix}: '{message}'".format(
+            prefix=prefix,
+            message=self.message,
+        )
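Note (illustrative, not part of the published diff): the fingerprint() methods added above are presumably consumed by the new teuthology/util/sentry.py to group Sentry events by failure reason rather than by backtrace. For the other addition, UnitTestError, the snippet below shows the message its __str__ builds; the node, label, and message values are made up.

from teuthology.exceptions import UnitTestError

# Hypothetical values; only the formatting logic comes from the hunk above.
err = UnitTestError(
    exitstatus=1,
    node="smithi001",
    label="unit-tests",
    message="3 tests failed",
)
print(err)
# -> Unit test failed (unit-tests) on smithi001 with status 1: '3 tests failed'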