thds.mops 3.6.20250219172032__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of thds.mops might be problematic.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/k8s/__init__.py
ADDED
@@ -0,0 +1,27 @@
+"""Trilliant Health abstraction around launching K8S Jobs."""
+
+try:
+    from kubernetes import client as _  # noqa
+except ModuleNotFoundError as mnf:
+    raise ModuleNotFoundError(
+        "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
+    ) from mnf
+
+from .container_registry import autocr  # noqa: F401
+from .launch import K8sJobFailedError, launch, shim  # noqa
+from .node_selection import (  # noqa
+    NodeNarrowing,
+    ResourceDefinition,
+    require_gpu,
+    tolerates_64cpu,
+    tolerates_gpu,
+    tolerates_spot,
+)
+
+try:
+    from . import thds_std  # noqa: F401
+except ModuleNotFoundError:
+    pass
+
+
+mops_shell = shim  # deprecated alias
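The guarded import above means that `import thds.mops.k8s` fails fast with an actionable message when the kubernetes SDK is not installed. A minimal sketch of the intended entry points, assuming the extra is named `k8s` as the error message states (the install command itself is an assumption, not taken from the package metadata):

# pip install "thds.mops[k8s]"   # assumed install spec for the k8s extra
from thds.mops import k8s

# The re-exports above put the common entry points at the package root:
# k8s.launch(...), k8s.shim(...), k8s.autocr(...), k8s.NodeNarrowing, k8s.require_gpu, ...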
thds/mops/k8s/_shared.py
ADDED
thds/mops/k8s/apply_yaml.py
ADDED
@@ -0,0 +1,22 @@
+import tempfile
+
+from kubernetes import client, utils
+
+
+def format_yaml(yaml_template_str: str, **template_values: str) -> str:
+    return yaml_template_str.format(**template_values)
+
+
+def create_yaml_template(yaml_str: str, **template_values: str) -> None:
+    """Format a YAML template with the given keyword arguments, then apply it to the Kubernetes cluster.
+
+    You must already have set up your SDK config.
+
+    NOTE: This function doesn't actually apply, and can't until the next release of the K8S SDK:
+    https://github.com/kubernetes-client/python/pull/2252
+    """
+    formatted_yaml = format_yaml(yaml_str, **template_values)
+    with tempfile.NamedTemporaryFile("w", prefix="kubectl-yaml") as f:
+        f.write(formatted_yaml)
+        f.flush()
+        utils.create_from_yaml(client.ApiClient(), f.name)
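As a rough illustration of the two helpers above (a sketch only: the ConfigMap template and placeholder names are hypothetical, and the docstring's NOTE about SDK support still applies):

_CONFIGMAP_TEMPLATE = """\
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  namespace: {namespace}
data:
  greeting: hello
"""

rendered = format_yaml(_CONFIGMAP_TEMPLATE, name="demo-config", namespace="default")  # str.format substitution only
create_yaml_template(_CONFIGMAP_TEMPLATE, name="demo-config", namespace="default")    # also writes a temp file and hands it to the SDK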
thds/mops/k8s/auth.py
ADDED
@@ -0,0 +1,49 @@
+import typing as ty
+from threading import RLock
+
+from cachetools import TTLCache
+from kubernetes import client, config
+
+from thds.core import fretry, log, scope
+
+from .._utils.locked_cache import locked_cached
+
+logger = log.getLogger(__name__)
+
+
+def _retry_config(exc: Exception) -> bool:
+    if isinstance(exc, config.ConfigException):
+        logger.debug("Retrying config load...")
+        return True
+    return False
+
+
+empty_config_retry = fretry.retry_sleep(_retry_config, fretry.expo(retries=3, delay=0.2))
+
+_AUTH_RLOCK = RLock()
+
+
+# load_config gets called all over the place and way too often.
+@locked_cached(TTLCache(1, ttl=120), lock=_AUTH_RLOCK)
+def load_config() -> None:
+    logger.debug("Loading Kubernetes config...")
+    try:
+        empty_config_retry(config.load_config)()
+    except config.ConfigException:
+        logger.error("Failed to load kube-config")
+
+
+@scope.bound
+def upsert_namespace(namespace: str, created_cache: ty.Set[str] = set()) -> None:  # noqa: B006
+    scope.enter(_AUTH_RLOCK)
+    if namespace in created_cache:
+        return
+    logger.debug("Creating namespace if not exists: %s" % namespace)
+    load_config()
+    kubeapi = client.CoreV1Api()
+    ns_obj = client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
+    namespaces = set([item.metadata.name for item in kubeapi.list_namespace().items])
+    if namespace not in namespaces:
+        logger.info(f"Creating namespace {namespace}")
+        kubeapi.create_namespace(ns_obj)
+    created_cache.add(namespace)
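From a caller's perspective, `load_config` is memoized behind `_AUTH_RLOCK` with a 120-second TTL, and `upsert_namespace` remembers namespaces it has already created in its default-argument set. A small sketch (the namespace name is hypothetical):

load_config()                # loads kube config, retrying on ConfigException
load_config()                # served from TTLCache(1, ttl=120); no second SDK call within the TTL

upsert_namespace("my-team")  # creates the namespace only if it does not already exist
upsert_namespace("my-team")  # short-circuits via the created_cache set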
thds/mops/k8s/config.py
ADDED
@@ -0,0 +1,37 @@
+from datetime import timedelta
+
+from thds.core import config
+
+from .namespace import parse_namespace, user_namespace
+
+k8s_namespace = config.item("mops.k8s.namespace", user_namespace(), parse=parse_namespace)
+k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_K8S_NAMESPACE")
+# the above is used to embed the current namespace _inside_ the container as an
+# environment variable. it will not affect how your namespace is selected in the first
+# place.
+
+k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 30 * 60, parse=int)
+k8s_acr_url = config.item("mops.k8s.acr.url", "")
+k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
+k8s_job_cleanup_ttl_seconds_after_completion = config.item(
+    "mops.k8s.job.cleanup_ttl_seconds", int(timedelta(minutes=60).total_seconds()), parse=int
+)
+k8s_job_timeout_seconds = config.item(
+    "mops.k8s.job.timeout_seconds", int(timedelta(minutes=3).total_seconds()), parse=int
+)
+k8s_monitor_delay = config.item("mops.k8s.monitor.delay_seconds", 5, parse=int)
+k8s_monitor_max_attempts = config.item("mops.k8s.monitor.max_attempts", 100, parse=int)
+
+# In the East, we use the newer pod managed identity by default,
+# which provides access to a metadata endpoint that Azure clients know
+# how to access automatically.
+# https://docs.microsoft.com/en-us/azure/aks/use-azure-ad-pod-identity
+aad_pod_managed_identity = config.item("mops.k8s.azure.aad_pod_managed_identity", "")
+
+# but there's an even newer, better type of auth called Workload
+# Identity, which unfortunately requires specific infrastructure
+# configuration that lives outside this library.
+# https://azure.github.io/azure-workload-identity/docs/introduction.html
+namespaces_supporting_workload_identity = config.item(
+    "mops.k8s.azure.namespaces_supporting_workload_identity", ["default"]
+)
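Each `config.item(...)` above returns a zero-argument callable, and the rest of the package reads a setting by calling it, as in this small sketch:

from thds.mops.k8s import config

namespace = config.k8s_namespace()       # used for the Job's V1ObjectMeta and the embedded env var
backoff = config.k8s_job_retry_count()   # becomes the Job's backoff_limit in launch.py
acr_url = config.k8s_acr_url()           # consumed by autocr() in container_registry.py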
thds/mops/k8s/container_registry.py
ADDED
@@ -0,0 +1,14 @@
+from . import config
+
+
+def autocr(container_image_name: str, cr_url: str = "") -> str:
+    """Prefix the container with the configured container registry URL.
+
+    Idempotent, so it will not apply if called a second time.
+    """
+    cr_url = cr_url or config.k8s_acr_url()
+    assert cr_url, "No container registry URL configured."
+    prefix = cr_url + "/" if cr_url and not cr_url.endswith("/") else cr_url
+    if not container_image_name.startswith(prefix):
+        return prefix + container_image_name
+    return container_image_name
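For example (the registry URL and image tag are hypothetical), `autocr` prepends the registry exactly once:

autocr("myapp:1.2.3", cr_url="myregistry.azurecr.io")
# -> "myregistry.azurecr.io/myapp:1.2.3"

autocr("myregistry.azurecr.io/myapp:1.2.3", cr_url="myregistry.azurecr.io")
# -> "myregistry.azurecr.io/myapp:1.2.3" (unchanged, since the prefix is already present)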
thds/mops/k8s/jobs.py
ADDED
@@ -0,0 +1,57 @@
+import typing as ty
+
+from kubernetes import client
+
+from ._shared import logger
+from .retry import k8s_sdk_retry
+from .watch import WatchingObjectSource
+
+
+@k8s_sdk_retry()
+def _get_job(namespace: str, job_name: str) -> ty.Optional[client.models.V1Job]:
+    logger.debug(f"Reading job {job_name}")
+    return client.BatchV1Api().read_namespaced_job(
+        namespace=namespace,
+        name=job_name,
+    )
+
+
+_JOB_SOURCE = WatchingObjectSource(
+    lambda _, __: client.BatchV1Api().list_namespaced_job,
+    lambda job: job.metadata.name,  # type: ignore
+    _get_job,
+    typename="Job",
+)
+
+
+def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
+    return _JOB_SOURCE.get(job_name, namespace=namespace)
+
+
+# https://github.com/kubernetes/kubernetes/issues/68712#issuecomment-514008330
+# https://kubernetes.io/docs/concepts/workloads/controllers/job/#terminal-job-conditions
+
+
+def is_job_succeeded(job: client.models.V1Job) -> bool:
+    if not job.status:
+        return False
+
+    if not job.status.completion_time:
+        return False
+
+    for condition in job.status.conditions or tuple():
+        if condition.type == "Complete" and condition.status == "True":
+            return True
+
+    return False
+
+
+def is_job_failed(job: client.models.V1Job) -> bool:
+    if not job.status:
+        return False
+
+    for condition in job.status.conditions or tuple():
+        if condition.type == "Failed" and condition.status == "True":
+            return True
+
+    return False
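A brief sketch of how these predicates combine with `get_job` (the job name is hypothetical; the actual polling loop lives in wait_job.py, which launch.py uses via wait_for_job):

job = get_job("mops-job-abc123")  # namespace defaults to "" and is resolved by the WatchingObjectSource
if job is None:
    pass                          # not (yet) visible through the watch-backed cache
elif is_job_succeeded(job):
    pass                          # completion_time set and a Complete condition with status "True"
elif is_job_failed(job):
    pass                          # a Failed condition with status "True"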
thds/mops/k8s/launch.py
ADDED
@@ -0,0 +1,234 @@
+"""Provides an abstraction for launching Docker images on Kubernetes and waiting until they finish."""
+
+import os
+import threading
+import typing as ty
+import uuid
+
+from kubernetes import client
+
+from thds.core import scope
+from thds.core.log import logger_context
+from thds.mops.pure.runner.simple_shims import samethread_shim
+
+from .._utils.colorize import colorized
+from . import config
+from ._shared import logger
+from .auth import load_config, upsert_namespace
+from .logging import JobLogWatcher
+from .node_selection import NodeNarrowing, ResourceDefinition
+from .retry import k8s_sdk_retry
+from .thds_std import embed_thds_auth
+from .wait_job import wait_for_job
+
+LAUNCHED = colorized(fg="white", bg="green")
+COMPLETE = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+class Counter:
+    def __init__(self) -> None:
+        self.value = 0
+        self._lock = threading.Lock()
+
+    def inc(self) -> int:
+        with self._lock:
+            self.value += 1
+            return self.value
+
+
+_LAUNCH_COUNT = Counter()
+_FINISH_COUNT = Counter()
+_SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
+
+
+@scope.bound
+def launch(
+    container_image: str,
+    args: ty.Sequence[str],
+    *,
+    node_narrowing: ty.Optional[NodeNarrowing] = None,
+    container_name: str = "jobcontainer",
+    env_vars: ty.Optional[ty.Mapping[str, str]] = None,
+    # arguments below are for launching; arguments above are for
+    # building. these should get separated in a future change.
+    name_prefix: str = "",
+    dry_run: bool = False,
+    fire_and_forget: bool = False,
+    suppress_logs: bool = False,
+    transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
+    # this is a default for now. later if we share this code we'll need to have a wrapper interface
+    service_account_name: str = "",
+) -> None:
+    """Launch a Kubernetes job.
+
+    Required parameters are the container_image and the arguments to
+    that image, just as if you were running this directly with Docker.
+
+    Unless fire_and_forget=True, will poll until Job completes and
+    will raise K8sJobFailedError if the Job fails. None is returned
+    if the Job succeeds.
+
+    `name_prefix` is an optional parameter for debugging/developer
+    convenience. A generated suffix will be added to it.
+
+    """
+    if not container_image:
+        raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
+    job_num = f"{_LAUNCH_COUNT.inc():0>3}"
+    name = "-".join([name_prefix, str(os.getpid()), job_num, str(uuid.uuid4())[:8]]).lstrip("-")
+    scope.enter(logger_context(job=name))
+    node_narrowing = node_narrowing or dict()
+
+    # TODO move this entire function out to be separately callable
+    @k8s_sdk_retry()
+    def assemble_base_job() -> client.models.V1Job:
+        logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
+        logger.debug("Fire and forget: %s", fire_and_forget)
+        logger.debug("Loading kube configs ...")
+        load_config()
+        logger.debug("Populating job object ...")
+        v1_job_body = client.V1Job(api_version="batch/v1", kind="Job")
+        logger.debug("Setting object meta ...")
+        v1_job_body.metadata = client.V1ObjectMeta(namespace=config.k8s_namespace(), name=name)
+
+        v1_job_body.status = client.V1JobStatus()
+        logger.debug("Creating pod template ...")
+        pod_template = client.V1PodTemplate()
+
+        pod_template.template = client.V1PodTemplateSpec(metadata=client.V1ObjectMeta(labels=dict()))
+        # we make empty labels just in case a later transformer wants to add some.
+
+        logger.debug("Applying environment variables ...")
+        env_list = [
+            client.V1EnvVar(name="MOPS_IMAGE_FULL_TAG", value=container_image),
+            # by setting these, things will be 'reentrant' if it is necessary to launch jobs within this job.
+        ]
+        if env_vars is not None:
+            for env_name, env_value in env_vars.items():
+                env_list.append(client.V1EnvVar(name=env_name, value=env_value))
+        env_list.append(
+            client.V1EnvVar(name=config.k8s_namespace_env_var_key(), value=config.k8s_namespace())
+        )
+
+        logger.debug("Creating container definition ...")
+        logger.debug("Setting container CPU/RAM requirements ...")
+        v1_container_args = dict(
+            args=args,
+            name=container_name,
+            image=container_image,
+            env=env_list,
+            image_pull_policy="Always",  # default is IfNotPresent, which leads to staleness when reusing a tag.
+            # https://kubernetes.io/docs/concepts/containers/images/#updating-images
+        )
+
+        assert node_narrowing is not None
+        resource_requests: ResourceDefinition = node_narrowing.get("resource_requests", dict())
+        resource_limits: ResourceDefinition = node_narrowing.get("resource_limits", dict())
+        if resource_requests or resource_limits:
+            v1_container_args["resources"] = client.V1ResourceRequirements(
+                requests=resource_requests,
+                limits=resource_limits,
+            )
+
+        container = client.V1Container(**v1_container_args)
+        logger.debug("Creating podspec definition ...")
+        pod_template.template.spec = client.V1PodSpec(
+            containers=[container],
+            restart_policy="Never",
+            node_selector=node_narrowing.get("node_selector", dict()),
+            tolerations=node_narrowing.get("tolerations", list()),
+            service_account_name=service_account_name,
+        )
+
+        logger.debug("Creating job definition ...")
+        v1_job_body.spec = client.V1JobSpec(
+            backoff_limit=config.k8s_job_retry_count(),
+            completions=1,
+            ttl_seconds_after_finished=config.k8s_job_cleanup_ttl_seconds_after_completion(),
+            template=pod_template.template,
+        )
+        logger.debug("Finished creating base job definition ...")
+        return v1_job_body
+
+    def job_with_all_transforms() -> client.models.V1Job:
+        return transform_job(assemble_base_job())
+
+    if dry_run:
+        job_with_all_transforms()
+        logger.info("Dry run assembly successful; not launching...")
+        return
+
+    @k8s_sdk_retry()
+    def launch_job() -> client.models.V1Job:
+        with _SIMULTANEOUS_LAUNCHES:
+            upsert_namespace(config.k8s_namespace())
+            # we do the job transform after actually upserting the namespace so that
+            # the transform can use the namespace if necessary.
+            return client.BatchV1Api().create_namespaced_job(
+                namespace=config.k8s_namespace(), body=job_with_all_transforms()
+            )
+
+    job = launch_job()
+    logger.info(LAUNCHED(f"Job {job_num} launched!") + f" on {container_image}")
+    if not suppress_logs:
+        threading.Thread(  # fire and forget a log watching thread
+            target=JobLogWatcher(job.metadata.name, len(job.spec.template.spec.containers)).start,
+            daemon=True,
+        ).start()
+
+    if not fire_and_forget:
+
+        def counts() -> str:
+            launched = _LAUNCH_COUNT.value
+            return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"
+
+        job_name = job.metadata.name
+        del job  # trying to save memory here while we wait...
+        if not wait_for_job(job_name, short_name=job_num):
+            logger.error(FAILED(f"Job {job_num} Failed! {counts()}"))
+            raise K8sJobFailedError(f"Job {job_name} failed.")
+        logger.info(COMPLETE(f"Job {job_num} Complete! {counts()}"))
+
+
+def shim(
+    container_image: ty.Union[str, ty.Callable[[], str]],
+    disable_remote: ty.Callable[[], bool] = lambda: False,
+    **outer_kwargs: ty.Any,
+) -> ty.Callable[[ty.Sequence[str]], None]:
+    """Return a closure that can launch the given configuration and run a mops pure function.
+
+    Now supports callables that return a container image name; the
+    goal being to allow applications to perform this lazily on the
+    first actual use of the k8s runtime shim. The passed callable will be
+    called each time, so if you want it to be called only once, you'll
+    need to wrap it yourself.
+
+    Supports an optional callable argument `disable_remote` which when evaluated to True
+    causes the mops pure function to be run in a local shell.
+    """
+    assert (
+        "args" not in outer_kwargs
+    ), "Passing 'args' as a keyword argument will cause conflicts with the closure."
+
+    if disable_remote():
+        return samethread_shim
+
+    if isinstance(container_image, str):
+        get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
+    else:
+        get_container_image = container_image
+
+    def launch_container_on_k8s_with_args(args: ty.Sequence[str], **inner_kwargs: ty.Any) -> None:
+        assert "args" not in inner_kwargs
+        launch(
+            get_container_image(),
+            ["python", "-m", "thds.mops.pure.core.entry.main", *args],
+            **{**outer_kwargs, **inner_kwargs},
+        )
+
+    return launch_container_on_k8s_with_args
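A condensed usage sketch of the two entry points above (the image name, arguments, and prefix are hypothetical):

try:
    launch(
        "myregistry.azurecr.io/myapp:1.2.3",     # fully qualified image tag
        ["python", "-m", "myapp.entrypoint"],    # container args, as with `docker run`
        name_prefix="demo",
    )
except K8sJobFailedError:
    pass  # the Job reached a Failed terminal state

# Or build a reusable runtime shim for mops pure functions; keyword arguments
# are captured and forwarded to launch() on every invocation of the closure.
k8s_shim = shim(lambda: "myregistry.azurecr.io/myapp:1.2.3", name_prefix="demo")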
thds/mops/k8s/logging.py
ADDED
@@ -0,0 +1,239 @@
+"""Handles things having to do with getting logs out of the Pods of a Job."""
+
+import enum
+import random
+import threading
+import time
+import typing as ty
+from timeit import default_timer
+
+import cachetools
+import urllib3.exceptions
+from kubernetes import client, watch
+
+from thds import core
+from thds.core.log import logger_context
+
+from .._utils.colorize import colorized, make_colorized_out, next_color
+from .._utils.locked_cache import locked_cached
+from . import config
+from ._shared import logger
+from .jobs import get_job
+from .retry import k8s_sdk_retry
+
+NO_K8S_LOGS = core.config.item("mops.no_k8s_logs", parse=core.config.tobool, default=False)
+# non-empty if you want to completely disable k8s pod logs.
+K8S_LOG_POD_FRACTION = core.config.item("mops.k8s.log_pod_fraction", parse=float, default=1.0)
+# fraction of pods to log. 1.0 means all pods.
+
+BOINK = colorized(fg="white", bg="magenta")
+# this module has tons of logs. occasionally you want to find a needle
+# in that haystack when you're debugging something. Wrap the logged
+# string in this and it'll stand out.
+
+
+class JobLogWatcher:
+    """Will spawn one or more daemon threads.
+
+    Each pod scraped will get its own randomly-selected ANSI color for
+    logs printed to the terminal.
+
+    When pods enter a failure state, a new check for pods will be
+    launched, in the hopes that the Job is planning to create new Pods
+    to replace them.
+
+    If the Job goes away entirely, this may or may not eventually
+    terminate. Because the threads are daemon threads, this will not
+    affect the logic of your program, but it's possible you may see
+    some spurious logging messages.
+    """
+
+    def __init__(self, job_name: str, num_pods_expected: int = 1) -> None:
+        self.job_name = job_name
+        self.num_pods_expected = num_pods_expected
+        self.pods_being_scraped: ty.Set[str] = set()
+        self.pod_colors: ty.Dict[str, ty.Callable[[str], ty.Any]] = dict()
+        self.job_pods_discovery_lock = threading.Lock()
+
+    @k8s_sdk_retry()
+    @core.scope.bound
+    def start(self, failed_pod_name: str = "") -> None:
+        """Call this one time - it will spawn threads as needed."""
+        if NO_K8S_LOGS():
+            return
+
+        if random.random() > K8S_LOG_POD_FRACTION():
+            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
+            return
+
+        core.scope.enter(self.job_pods_discovery_lock)
+        # we lock here because some of the threads we spawn may
+        # eventually call this same method, and we only want one
+        # instance of this running at a time.
+        core.scope.enter(logger_context(log=self.job_name))
+        logger.debug("Starting log watcher")
+        if failed_pod_name:
+            logger.info(
+                BOINK(f"Failed to scrape logs in pod {failed_pod_name}, looking for new pods...")
+            )
+            self.pods_being_scraped.discard(failed_pod_name)
+            # this one can be retried if it's still out there.
+            time.sleep(config.k8s_monitor_delay())
+        for pod in _yield_running_pods_for_job(
+            self.job_name,
+            self.num_pods_expected if not self.pods_being_scraped else 1,
+        ):
+            pod_name = pod.metadata.name
+            if pod_name not in self.pods_being_scraped:
+                # don't start new threads for pods we've already previously discovered - they have their own thread.
+                self.pods_being_scraped.add(pod_name)
+                if pod_name not in self.pod_colors:
+                    self.pod_colors[pod_name] = make_colorized_out(
+                        colorized(fg=next_color()), fmt_str=pod_name + " {}"
+                    )
+                log_thread = threading.Thread(
+                    target=_scrape_pod_logs,
+                    args=(
+                        self.pod_colors[pod_name],
+                        pod_name,
+                        self.start,
+                    ),
+                    daemon=True,
+                )
+                log_thread.start()
+
+
+# we really don't want many threads calling the K8S API a billion times all at once
+@locked_cached(cachetools.TTLCache(maxsize=1, ttl=2))
+def _list_pods_in_our_namespace() -> ty.List[client.models.V1Pod]:
+    return client.CoreV1Api().list_namespaced_pod(namespace=config.k8s_namespace()).items
+
+
+class K8sPodStatus(enum.Enum):
+    PENDING = "Pending"
+    RUNNING = "Running"
+    SUCCEEDED = "Succeeded"
+    FAILED = "Failed"
+    UNKNOWN = "Unknown"
+
+
+def _yield_running_pods_for_job(
+    job_name: str, expected_number_of_pods: int = 1
+) -> ty.Iterator[client.models.V1Pod]:
+    """TODO: stop polling if the Job cannot be found at all."""
+    attempt = 0
+    yielded = 0
+    logger.debug("Polling for pods created by job: %s", job_name)
+    while attempt < config.k8s_monitor_max_attempts():
+        for pod in _list_pods_in_our_namespace():
+            owner_refs = pod.metadata.owner_references
+            if not owner_refs:
+                # this is a rare and undocumented case where a pod
+                # will have owner_references=None if it was manually created.
+                # since we're looking for pods created by jobs, we can safely skip these.
+                continue
+
+            if len(owner_refs) > 1:
+                logger.warning("Found multiple owner references for a pod. Taking first one...")
+            owner_ref = owner_refs[0]
+            if owner_ref.name == job_name:
+                if pod.status.phase in {
+                    K8sPodStatus.RUNNING.value,
+                    K8sPodStatus.UNKNOWN.value,
+                }:
+                    logger.debug(f"Found a pod {pod.metadata.name} in phase {pod.status.phase}")
+                    yielded += 1
+                    yield pod
+                if yielded >= expected_number_of_pods:
+                    logger.debug("Found all expected running pods.")
+                    return
+        if not get_job(job_name):
+            logger.warning("Job not found; not a good sign for pod logs")
+            attempt += 50
+        logger.debug("Didn't find enough pods yet, sleeping for a moment...")
+        time.sleep(config.k8s_monitor_delay())
+        attempt += 1
+
+
+def _get_pod_phase(pod_name: str) -> str:
+    return (
+        client.CoreV1Api()
+        .read_namespaced_pod(
+            namespace=config.k8s_namespace(),
+            name=pod_name,
+            _request_timeout=(10, config.k8s_job_timeout_seconds()),
+        )
+        .status.phase
+    )
+
+
+def _await_pod_phases(phases: ty.Set[K8sPodStatus], pod_name: str) -> str:
+    while True:
+        phase = _get_pod_phase(pod_name)
+        if phase in {phase.value for phase in phases}:
+            return phase
+        time.sleep(config.k8s_monitor_delay())
+
+
+@core.scope.bound
+def _scrape_pod_logs(
+    out: ty.Callable[[str], ty.Any],
+    pod_name: str,
+    failure_callback: ty.Callable[[str], ty.Any],
+) -> None:
+    """Contains its own retry error boundary b/c this is notoriously unreliable."""
+    core.scope.enter(logger_context(log=pod_name))
+
+    last_scraped_at = default_timer()
+    base_kwargs = dict(
+        name=pod_name,
+        namespace=config.k8s_namespace(),
+        _request_timeout=(10, config.k8s_job_timeout_seconds()),
+        # i'm occasionally seeing the `stream()` call below hang
+        # indefinitely if logs don't come back from the pod for a
+        # while. Which is ironic, since most of this code is here to
+        # help us make sure we keep retrying if no logs happen on the
+        # pod for a while, since frequently `stream()` will just end
+        # quietly when that happens. In any case, at this point,
+        # we're better-equipped to handle all kinds of retries, so
+        # using the (connect, read) _request timeout tuple is probably
+        # what we want to try next.
+    )
+
+    def get_retry_kwargs(_: int) -> ty.Tuple[tuple, dict]:
+        return tuple(), dict(base_kwargs, since_seconds=int(default_timer() - last_scraped_at))
+
+    def scrape_logs(*_args: ty.Any, **kwargs: ty.Any) -> None:
+        nonlocal last_scraped_at
+        _await_pod_phases(
+            {K8sPodStatus.RUNNING, K8sPodStatus.SUCCEEDED, K8sPodStatus.FAILED},
+            pod_name,
+        )
+        logger.debug("Watching pod log stream...")
+        while True:
+            for e in watch.Watch().stream(
+                client.CoreV1Api().read_namespaced_pod_log,
+                **kwargs,
+            ):
+                out(e)
+                last_scraped_at = default_timer()
+            time.sleep(config.k8s_monitor_delay())
+            pod_phase = _get_pod_phase(pod_name)
+            if pod_phase == K8sPodStatus.SUCCEEDED.value:
+                logger.debug("Done scraping pod logs")
+                return
+            if pod_phase == K8sPodStatus.FAILED.value:
+                logger.warning("Pod failed - calling callback")
+                failure_callback(pod_name)
+                return
+            logger.debug("Pod is not complete - will retry the log watch")
+
+    def should_retry(ex: Exception) -> bool:
+        return isinstance(ex, urllib3.exceptions.ReadTimeoutError)
+
+    try:
+        k8s_sdk_retry(get_retry_kwargs, should_retry=should_retry)(scrape_logs)(**base_kwargs)
+    except Exception:
+        logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
+        # at least let the caller know something went horribly wrong
+        failure_callback(pod_name)
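For reference, this is essentially how `launch()` wires the watcher up, one per Job, on a daemon thread (the job name here is hypothetical):

import threading

watcher = JobLogWatcher("mops-job-abc123", num_pods_expected=1)
threading.Thread(target=watcher.start, daemon=True).start()

# Scraping can be tuned via the config items at the top of this module:
# mops.no_k8s_logs disables pod log scraping entirely, and
# mops.k8s.log_pod_fraction samples only a fraction of pods.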