thds.mops 3.9.20250722163657__py3-none-any.whl → 3.9.20250722200009__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/mops/impure/runner.py +1 -1
- thds/mops/k8s/__init__.py +3 -1
- thds/mops/k8s/{launch.py → _launch.py} +56 -57
- thds/mops/k8s/batching.py +198 -0
- thds/mops/k8s/config.py +1 -1
- thds/mops/k8s/counts.py +28 -0
- thds/mops/k8s/job_future.py +109 -0
- thds/mops/k8s/jobs.py +4 -0
- thds/mops/k8s/logging.py +37 -5
- thds/mops/k8s/uncertain_future.py +160 -0
- thds/mops/k8s/watch.py +120 -62
- thds/mops/pure/__init__.py +2 -1
- thds/mops/pure/_magic/sauce.py +11 -3
- thds/mops/pure/_magic/shims.py +2 -2
- thds/mops/pure/core/deferred_work.py +15 -12
- thds/mops/pure/core/entry/runner_registry.py +1 -10
- thds/mops/pure/core/lock/__init__.py +1 -0
- thds/mops/pure/core/lock/_acquire.py +2 -2
- thds/mops/pure/core/lock/maintain.py +22 -3
- thds/mops/pure/core/lock/write.py +19 -19
- thds/mops/pure/core/memo/__init__.py +1 -1
- thds/mops/pure/core/memo/results.py +5 -4
- thds/mops/pure/core/use_runner.py +21 -7
- thds/mops/pure/pickling/mprunner.py +21 -14
- thds/mops/pure/pickling/pickles.py +19 -8
- thds/mops/pure/pickling/remote.py +3 -1
- thds/mops/pure/runner/get_results.py +106 -0
- thds/mops/pure/runner/local.py +58 -87
- thds/mops/pure/runner/shim_builder.py +7 -7
- thds/mops/pure/runner/simple_shims.py +7 -0
- thds/mops/pure/runner/types.py +15 -4
- thds/mops/pure/tools/summarize/run_summary.py +9 -8
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/METADATA +1 -1
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/RECORD +37 -32
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/WHEEL +0 -0
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/entry_points.txt +0 -0
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/top_level.txt +0 -0
thds/mops/impure/runner.py
CHANGED
@@ -67,7 +67,7 @@ class KeyedLocalRunner(MemoizingPicklingRunner):
             redirect=lambda _f, _args, _kwargs: _perform_original_invocation,
         )

-    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
+    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:  # type: ignore[valid-type]
         actual_function_to_call = self._pre_pickle_redirect(raw_func, raw_args, raw_kwargs)
         with _ORIGINAL_F_ARGS_KWARGS.set((actual_function_to_call, raw_args, raw_kwargs)):
             return super().__call__(*self._impure_keyfunc(raw_func, raw_args, raw_kwargs))
thds/mops/k8s/__init__.py
CHANGED
@@ -7,8 +7,10 @@ except ModuleNotFoundError as mnf:
         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
     ) from mnf

+from . import batching, counts, job_future  # noqa: F401
+from ._launch import launch, shim  # noqa
 from .container_registry import autocr  # noqa: F401
-from .
+from .job_future import K8sJobFailedError  # noqa: F401
 from .node_selection import (  # noqa
     NodeNarrowing,
     ResourceDefinition,
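Note: for orientation, a minimal usage sketch of the re-exported surface after this change (launch now returns a thds.core LazyFuture[bool], and K8sJobFailedError now lives in job_future). The image tag, module path, and prefix below are hypothetical placeholders, not taken from the diff.

    from thds.mops import k8s

    future = k8s.launch(
        "myregistry.example.com/myimage:latest",  # hypothetical container image
        ["python", "-m", "my_package.my_task"],   # hypothetical container args
        name_prefix="demo",
        dry_run=True,  # per the diff below, dry_run now returns an already-resolved LazyFuture
    )
    # A Job that terminates in a Failed state surfaces as k8s.K8sJobFailedError
    # once the future is resolved (per the updated launch docstring).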
thds/mops/k8s/{launch.py → _launch.py}
RENAMED
@@ -4,41 +4,22 @@ import os
 import threading
 import typing as ty
 import uuid
+from functools import partial

 from kubernetes import client

-from thds
-from thds.core.log import logger_context
+from thds import core
 from thds.mops.pure.runner.simple_shims import samethread_shim
 from thds.termtool.colorize import colorized

-from . import config
+from . import config, counts, job_future, logging
 from ._shared import logger
 from .auth import load_config, upsert_namespace
-from .logging import JobLogWatcher
 from .node_selection import NodeNarrowing, ResourceDefinition
 from .retry import k8s_sdk_retry
 from .thds_std import embed_thds_auth
-from .wait_job import wait_for_job

 LAUNCHED = colorized(fg="white", bg="green")
-COMPLETE = colorized(fg="white", bg="blue")
-FAILED = colorized(fg="white", bg="red")
-
-
-class K8sJobFailedError(Exception):
-    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
-
-
-class Counter:
-    def __init__(self) -> None:
-        self.value = 0
-        self._lock = threading.Lock()
-
-    def inc(self) -> int:
-        with self._lock:
-            self.value += 1
-            return self.value


 def sanitize_str(name: str) -> str:
@@ -49,7 +30,7 @@ def sanitize_str(name: str) -> str:

 def construct_job_name(user_prefix: str, job_num: str) -> str:
     # we want some consistency here, but also some randomness in case the prefixes don't exist or aren't unique.
-    mops_name_part = "-".join([str(os.getpid()),
+    mops_name_part = "-".join([sanitize_str(job_num), str(os.getpid()), str(uuid.uuid4())[:8]])
     if len(mops_name_part) > 63:
         # this should be _impossible_, because having a job num longer than even 20 digits would be an impossibly large
         # number of jobs. but just in case, we'll truncate it to the last 63 characters.
@@ -65,12 +46,11 @@ def construct_job_name(user_prefix: str, job_num: str) -> str:
     return name


-_LAUNCH_COUNT = Counter()
-_FINISH_COUNT = Counter()
 _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
+JOB_NAME = core.stack_context.StackContext("job_name", "")


-@scope.bound
+@core.scope.bound
 def launch(
     container_image: str,
     args: ty.Sequence[str],
@@ -81,38 +61,46 @@ def launch(
     # arguments below are for launching; arguments above are for
     # building. these should get separated in a future change.
     name_prefix: str = "",
+    full_name: str = "",
     dry_run: bool = False,
-    fire_and_forget: bool = False,
     suppress_logs: bool = False,
     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
     # this is a default for now. later if we share this code we'll need to have a wrapper interface
     service_account_name: str = "",
-) ->
+) -> core.futures.LazyFuture[bool]:
     """Launch a Kubernetes job.

     Required parameters are the container_image and the arguments to
     that image, just as if you were running this directly with Docker.

-
-
-    if the Job succeeds.
+    Returns a Future that will resolve to True when the Job completes successfully, or
+    raise K8sJobFailedError if the Job fails.

     `name_prefix` is an optional parameter for debugging/developer
     convenience. A generated suffix will be added to it.
-
     """
     if not container_image:
         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
-
-
-
+
+    full_name = full_name or JOB_NAME()
+    # in certain cases, it may be necessary to set the job name
+    # via a StackContext, so we check that here, and prefer it over name_prefix.
+
+    if full_name and name_prefix:
+        raise ValueError("You cannot specify both full_name and name_prefix; use one or the other.")
+
+    if not full_name:
+        name = construct_job_name(name_prefix, counts.to_name(counts.inc(counts.LAUNCH_COUNT)))
+    else:
+        name = full_name
+
+    core.scope.enter(core.log.logger_context(job=name))
     node_narrowing = node_narrowing or dict()

     # TODO move this entire function out to be separately callable
     @k8s_sdk_retry()
     def assemble_base_job() -> client.models.V1Job:
         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
-        logger.debug("Fire and forget: %s", fire_and_forget)
         logger.debug("Loading kube configs ...")
         load_config()
         logger.debug("Populating job object ...")
@@ -185,7 +173,7 @@ def launch(
     if dry_run:
         job_with_all_transforms()
         logger.info("Dry run assembly successful; not launching...")
-        return
+        return core.futures.LazyFuture(partial(core.futures.ResolvedFuture, True))

     @k8s_sdk_retry()
     def launch_job() -> client.models.V1Job:
@@ -198,32 +186,41 @@ def launch(
         )

     job = launch_job()
-    logger.info(LAUNCHED(f"Job {
+    logger.info(LAUNCHED(f"Job {name} launched!") + f" on {container_image}")
+    return core.futures.make_lazy(_launch_logs_and_create_future)(  # see below for implementation
+        job.metadata.name,
+        num_pods_expected=len(job.spec.template.spec.containers),
+        namespace=config.k8s_namespace(),
+        suppress_logs=suppress_logs,
+    )
+
+
+# this function has to be a top level def because it will sometimes be transferred across process boundaries,
+# and Python/pickle in its infinite wisdom does not allow nested functions to be pickled.
+def _launch_logs_and_create_future(
+    job_name: str, *, num_pods_expected: int, namespace: str, suppress_logs: bool
+) -> core.futures.PFuture[bool]:
     if not suppress_logs:
-
-
-            daemon=True,
-        ).start()
-
-    if not fire_and_forget:
+        logging.maybe_start_job_thread(job_name, num_pods_expected)
+    return job_future.make_job_completion_future(job_name, namespace=namespace)

-def counts() -> str:
-    launched = _LAUNCH_COUNT.value
-    return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"

-
-
-
-
-
-
+def create_lazy_job_logging_future(
+    job_name: str, *, namespace: str = "", num_pods_expected: int = 1
+) -> core.futures.LazyFuture[bool]:
+    return core.futures.make_lazy(_launch_logs_and_create_future)(
+        job_name,
+        num_pods_expected=num_pods_expected,
+        namespace=namespace or config.k8s_namespace(),
+        suppress_logs=False,
+    )


 def shim(
     container_image: ty.Union[str, ty.Callable[[], str]],
     disable_remote: ty.Callable[[], bool] = lambda: False,
     **outer_kwargs: ty.Any,
-) -> ty.Callable[[ty.Sequence[str]],
+) -> ty.Callable[[ty.Sequence[str]], core.futures.LazyFuture[bool]]:
     """Return a closure that can launch the given configuration and run a mops pure function.

     Now supports callables that return a container image name; the
@@ -240,16 +237,18 @@ def shim(
     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."

     if disable_remote():
-        return samethread_shim
+        return samethread_shim  # type: ignore[return-value]

     if isinstance(container_image, str):
         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
     else:
         get_container_image = container_image

-    def launch_container_on_k8s_with_args(
+    def launch_container_on_k8s_with_args(
+        args: ty.Sequence[str], **inner_kwargs: ty.Any
+    ) -> core.futures.LazyFuture[bool]:
         assert "args" not in inner_kwargs
-        launch(
+        return launch(
             get_container_image(),
             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
             **{**outer_kwargs, **inner_kwargs},
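Note: a hedged sketch of the two job-naming paths introduced above (the job name is hypothetical): pass full_name explicitly, or set it through the new JOB_NAME StackContext, which launch() now consults and which the batching module below relies on.

    from thds.mops.k8s import _launch

    # Option 1: explicit full_name (now mutually exclusive with name_prefix):
    #     _launch.launch(image, args, full_name="demo-job-0001")

    # Option 2: the JOB_NAME StackContext, as the batching shim's _process_batch does:
    with _launch.JOB_NAME.set("demo-job-0001"):
        ...  # any launch() call under this context picks the name up via JOB_NAME()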
thds/mops/k8s/batching.py
ADDED
@@ -0,0 +1,198 @@
+"""The basic idea of this module is that different threads can submit _parts_ of a job to a batcher,
+and immediately get the job name back, while the batcher itself defers creating the job until the
+batch is full, or when the process exits.
+
+The theory is that will get used in processes whose only responsibility is to create jobs,
+so waiting on atexit to create the final batch is not an issue.
+
+If you want a batcher that has a more context-manager-like behavior, you can write one of
+those, but it wouldn't work well with a concurrent.futures Executor-style approach, since
+those don't have an explicit shutdown procedure that we can hook to call __exit__.
+"""
+
+import atexit
+import concurrent.futures
+import itertools
+import multiprocessing
+import threading
+import typing as ty
+
+from thds.core import cpus, futures, log
+
+from . import _launch, counts
+
+T = ty.TypeVar("T")
+logger = log.getLogger(__name__)
+
+
+class _AtExitBatcher(ty.Generic[T]):
+    def __init__(self, batch_processor: ty.Callable[[ty.Collection[T]], None]) -> None:
+        self.batch: list[T] = []
+        self._registered = False
+        self._lock = threading.RLock()
+        self._batch_processor = batch_processor
+
+    def add(self, item: T) -> None:
+        with self._lock:
+            if not self._registered:
+                atexit.register(self.process)
+                # ensure we flush on process exit, since we don't know how many items are coming
+                self._registered = True
+            self.batch.append(item)
+
+    def process(self) -> None:
+        if self.batch:
+            with self._lock:
+                if self.batch:
+                    self._batch_processor(self.batch)
+                    self.batch = []
+
+
+class K8sJobBatchingShim(_AtExitBatcher[str]):
+    """Thread-safe for use within a single process by multiple threads."""
+
+    def __init__(
+        self,
+        submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+        max_batch_size: int,
+        job_counter: counts.MpValue[int],
+        name_prefix: str = "",
+    ) -> None:
+        """submit_func in particular should be a closure around whatever setup you need to
+        do to call back into a function that is locally wrapped with a k8s shim that will
+        ultimately call k8s.launch. Notably, you
+        """
+        super().__init__(self._process_batch)
+        self._max_batch_size = max_batch_size
+        self._job_counter = job_counter
+        self._job_name = ""
+        self._name_prefix = name_prefix
+        self._submit_func = submit_func
+
+    def _get_new_name(self) -> str:
+        # counts.inc takes a multiprocess lock. do not forget this!
+        job_num = counts.inc(self._job_counter)
+        return _launch.construct_job_name(self._name_prefix, counts.to_name(job_num))
+
+    def add_to_named_job(self, mops_invocation: ty.Sequence[str]) -> str:
+        """Returns job name for the invocation."""
+        with self._lock:
+            if not self._job_name:
+                self._job_name = self._get_new_name()
+            if len(self.batch) >= self._max_batch_size:
+                self.process()
+                self._job_name = self._get_new_name()
+            super().add(" ".join(mops_invocation))
+            return self._job_name
+
+    def _process_batch(self, batch: ty.Collection[str]) -> None:
+        with _launch.JOB_NAME.set(self._job_name):
+            log_lvl = logger.warning if len(batch) < self._max_batch_size else logger.info
+            log_lvl(f"Processing batch of len {len(batch)} with job name {self._job_name}")
+            self._submit_func(batch)
+
+
+F = ty.TypeVar("F", bound=ty.Callable)
+FunctionDecorator = ty.Callable[[F], F]
+
+
+_BATCHER: ty.Optional[K8sJobBatchingShim] = None
+
+
+def init_batcher(
+    submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    # for use with multiprocessing pool initializer
+    global _BATCHER
+    if _BATCHER is not None:
+        logger.warning("Batcher is already initialized; reinitializing will reset the job name.")
+        return
+
+    _BATCHER = K8sJobBatchingShim(submit_func, func_max_batch_size, job_counter, name_prefix)
+
+
+def init_batcher_with_unpicklable_submit_func(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    """Use this if you want to have an unpicklable submit function - because applying make_submit_func(submit_func_arg)
+    will happen inside the pool worker process after all the pickling/unpickling has happened.
+    """
+    return init_batcher(
+        make_submit_func(submit_func_arg), func_max_batch_size, job_counter, name_prefix=name_prefix
+    )
+
+
+def make_counting_process_pool_executor(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    max_batch_size: int,
+    name_prefix: str = "",
+    max_workers: int = 0,
+) -> concurrent.futures.ProcessPoolExecutor:
+    """Creates a ProcessPoolExecutor that uses the batching shim for job submission.
+
+    We are introducing this because we see segfaults prior to Python 3.12 related to this issue:
+    https://github.com/python/cpython/issues/77377
+
+    And it would seem that this had to do with creating mp.Values using a 'fork' start
+    method, and then passing those to a ProcessPoolExecutor with
+    mp_context=multiprolcessing.get_context('spawn'). So we can help you avoid that by creating
+    the mp.Value for you, alongside its ProcessPoolExecutor.
+
+    NOTE!!
+
+    You should only have one of these per process at a time, because we're doing spooky
+    things with the Job Counter. In fact, you should probably only create one of these
+    _ever_ within a single logical 'application'.
+
+    If you fail to heed this advice, you will get weird launched/finished counts at a
+    minimum. Although these job counts are not mission-critical, you _will_ be confused.
+    """
+    start_method: str = "spawn"
+    # 'spawn' prevents weird batch processing deadlocks that seem to only happen on Linux with 'fork'.
+    # it is strongly recommended to use 'spawn' for this reason.
+
+    mp_context = multiprocessing.get_context(start_method)
+    launch_count = mp_context.Value("i", 0)
+    # even though i want to assign this to a global, I also want to prevent
+    # any possible race condition where i somehow use a different thread's LAUNCH_COUNT
+    # when i create the ProcessPoolExecutor a few lines below.
+    counts.LAUNCH_COUNT = launch_count
+    counts.FINISH_COUNT = mp_context.Value("i", 0)  # we don't use this here; we just reset it to zero.
+    # SPOOKY - reset the global finish counter and make it be the same 'type'
+    return concurrent.futures.ProcessPoolExecutor(
+        max_workers=max_workers or cpus.available_cpu_count(),
+        initializer=init_batcher_with_unpicklable_submit_func,
+        initargs=(make_submit_func, submit_func_arg, max_batch_size, launch_count, name_prefix),
+        mp_context=mp_context,
+    )
+
+
+def shim(args: ty.Sequence[str]) -> futures.PFuture[bool]:
+    # This thing needs to return a lazy Uncertain Future that contains a job name, so that Job can be polled on
+    # ... but the job does not exist yet! So the batcher is in charge of creating the job name
+    # upfront, and then ensuring that it gets used when the job is created.
+    assert _BATCHER is not None, "Batcher must be initialized before using the batching shim."
+    job_name = _BATCHER.add_to_named_job(args)
+    return _launch.create_lazy_job_logging_future(job_name)
+
+
+def batched(iterable: ty.Iterable[T], n: int, *, strict: bool = False) -> ty.Iterator[tuple[T, ...]]:
+    """Just a utility for pre-batching if you're using multiprocessing to create batches."""
+    # TODO get rid of this when we go to Python 3.12+ which has itertools.batched
+    #
+    # batched('ABCDEFG', 3) → ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    iterator = iter(iterable)
+    while batch := tuple(itertools.islice(iterator, n)):
+        if strict and len(batch) != n:
+            raise ValueError("batched(): incomplete batch")
+        yield batch
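Note: a hedged, single-process sketch of the batching flow added above; submit_batch is a hypothetical stand-in for the closure that would ultimately call back into a k8s-shimmed mops function.

    import typing as ty

    from thds.mops.k8s import batching, counts


    def submit_batch(invocations: ty.Collection[str]) -> None:
        # hypothetical: a real submit_func would re-run these invocations under a k8s shim
        print(f"would launch one Job covering {len(invocations)} invocations")


    batching.init_batcher(submit_batch, func_max_batch_size=50, job_counter=counts.LAUNCH_COUNT)
    fut = batching.shim(["--spec", "my_module.my_func"])  # hypothetical mops invocation args
    # the invocation is queued under a pre-assigned job name; the Job itself is only
    # created once the batch reaches 50 entries or at process exit (atexit).

    # the pre-batching helper mirrors itertools.batched from Python 3.12+:
    assert list(batching.batched("ABCDEFG", 3)) == [("A", "B", "C"), ("D", "E", "F"), ("G",)]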
thds/mops/k8s/config.py
CHANGED
@@ -10,7 +10,7 @@ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_
 # environment variable. it will not affect how your namespace is selected in the first
 # place.

-k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds",
+k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 5 * 60, parse=int)
 k8s_acr_url = config.item("mops.k8s.acr.url", "")
 k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
 k8s_job_cleanup_ttl_seconds_after_completion = config.item(
thds/mops/k8s/counts.py
ADDED
@@ -0,0 +1,28 @@
+import multiprocessing as mp
+import typing as ty
+
+T = ty.TypeVar("T")
+
+
+class MpValue(ty.Protocol[T]):
+    def get_lock(self) -> ty.Any:
+        ...
+
+    value: T
+
+
+def inc(mp_val: MpValue[int]) -> int:
+    with mp_val.get_lock():
+        mp_val.value += 1
+        return mp_val.value
+
+
+LAUNCH_COUNT = mp.Value("i", 0)
+FINISH_COUNT = mp.Value("i", 0)
+# these are spooky - they're global and mutable, and may in fact get overwritten by code
+# using specific multiprocessing contexts.
+
+
+def to_name(count: int) -> str:
+    """Convert a count to a name."""
+    return f"{count:0>4}"
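Note: a quick illustration of the two helpers above: inc() bumps the value under the multiprocessing lock, and to_name() zero-pads to four digits.

    from thds.mops.k8s import counts

    n = counts.inc(counts.LAUNCH_COUNT)  # 1 on the first call in a fresh process
    print(counts.to_name(n))             # -> "0001"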
thds/mops/k8s/job_future.py
ADDED
@@ -0,0 +1,109 @@
+import threading
+import typing as ty
+
+from kubernetes import client
+
+from thds.core import futures, log
+from thds.termtool.colorize import colorized
+
+from . import config, counts, uncertain_future
+from .jobs import is_job_failed, is_job_succeeded, job_source
+
+logger = log.getLogger(__name__)
+
+UNUSUAL = colorized(fg="white", bg="yellow")
+SUCCEEDED = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+_FINISHED_JOBS = set[str]()
+_FINISHED_JOBS_LOCK = threading.Lock()
+
+
+def _check_newly_finished(job_name: str, namespace: str = "") -> str:
+    # I don't believe it's possible to ever have a Job that both succeeds and fails.
+    namespace = namespace or config.k8s_namespace()
+    job_full = f"{namespace}/{job_name}"
+    if job_full in _FINISHED_JOBS:
+        return ""
+
+    with _FINISHED_JOBS_LOCK:
+        if job_full in _FINISHED_JOBS:
+            return ""
+
+        _FINISHED_JOBS.add(job_full)
+
+        launched = counts.LAUNCH_COUNT.value
+        return f"- ({launched - counts.inc(counts.FINISH_COUNT)} unfinished of {launched})"
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+def make_job_completion_future(job_name: str, *, namespace: str = "") -> futures.PFuture[bool]:
+    """This is a natural boundary for a serializable lazy future - something that represents
+    work being done across process boundaries (since Kubernetes jobs will be listed via an API.
+
+    If True is returned, the Job has definitely succeeded.
+
+    If False is returned, the Job may have succeeded but we saw no evidence of it.
+
+    If the Job definitely failed, an Exception will be raised.
+    """
+
+    JOB_SEEN = False
+
+    def job_completion_interpreter(
+        job: ty.Optional[client.models.V1Job], last_seen_at: float
+    ) -> ty.Union[uncertain_future.NotYetDone, bool]:
+        nonlocal JOB_SEEN
+        if not job:
+            if JOB_SEEN:
+                logger.warning(
+                    UNUSUAL(f"Previously-seen job {job_name} no longer exists - assuming success!")
+                )
+                # we hereby indicate an unusual success to the Future waiter.
+                return False
+
+            time_since_last_seen = uncertain_future.official_timer() - last_seen_at
+            if time_since_last_seen > config.k8s_watch_object_stale_seconds():
+                # this is 5 minutes by default as of 2025-07-15.
+                raise TimeoutError(
+                    f"Job {job_name} has not been seen for {time_since_last_seen:.1f} seconds - assuming failure!"
+                )
+
+            # we don't know what's going on but things aren't truly stale yet.
+            return uncertain_future.NotYetDone()
+
+        JOB_SEEN = True
+
+        if is_job_succeeded(job):
+            newly_succeeded = _check_newly_finished(job_name, namespace)
+            if newly_succeeded:
+                logger.info(SUCCEEDED(f"Job {job_name} Succeeded! {newly_succeeded}"))
+            return True
+
+        if is_job_failed(job):
+            newly_failed = _check_newly_finished(job_name, namespace)
+            if newly_failed:
+                logger.error(FAILED(f"Job {job_name} Failed! {newly_failed}"))
+            raise K8sJobFailedError(f"Job {job_name} has failed with status: {job.status}")
+
+        return uncertain_future.NotYetDone()  # job is still in progress
+
+    return job_source().create_future(
+        job_completion_interpreter,
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
+
+
+def make_lazy_completion_future(job_name: str, *, namespace: str = "") -> futures.LazyFuture[bool]:
+    """This is a convenience function that will create a job completion future and then
+    immediately process it, returning the result. See docs on function above.
+    """
+    return futures.make_lazy(make_job_completion_future)(
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
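Note: a hedged sketch of the completion contract documented above (the job name and namespace are hypothetical); following the lazy-futures pattern used throughout this release, nothing should be polled at construction time.

    from thds.mops.k8s import job_future

    lazy = job_future.make_lazy_completion_future("demo-job-0001", namespace="my-namespace")
    # When the future is eventually resolved (e.g. by mops' runner machinery):
    #   True  -> the Job was seen to succeed
    #   False -> a previously-seen Job disappeared; success is assumed
    #   K8sJobFailedError / TimeoutError -> observed failure, or no sighting for longer
    #   than mops.k8s.watch.object_stale_seconds (default 5 * 60 seconds)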
thds/mops/k8s/jobs.py
CHANGED
@@ -25,6 +25,10 @@ _JOB_SOURCE = WatchingObjectSource(
 )


+def job_source() -> WatchingObjectSource[client.models.V1Job]:
+    return _JOB_SOURCE
+
+
 def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
     return _JOB_SOURCE.get(job_name, namespace=namespace)

thds/mops/k8s/logging.py
CHANGED
@@ -32,6 +32,17 @@ BOINK = colorized(fg="white", bg="magenta")
 # string in this and it'll stand out.


+def should_log(job_name: str) -> bool:
+    if NO_K8S_LOGS():
+        return False
+
+    if random.random() > K8S_LOG_POD_FRACTION():
+        logger.info(f"Skipping log watcher for {job_name} due to fraction.")
+        return False
+
+    return True
+
+
 class JobLogWatcher:
     """Will spawn one or more daemon threads.

@@ -59,11 +70,7 @@ class JobLogWatcher:
     @core.scope.bound
     def start(self, failed_pod_name: str = "") -> None:
         """Call this one time - it will spawn threads as needed."""
-        if
-            return
-
-        if random.random() > K8S_LOG_POD_FRACTION():
-            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
+        if not should_log(self.job_name):
             return

         core.scope.enter(self.job_pods_discovery_lock)
@@ -245,3 +252,28 @@ def _scrape_pod_logs(
         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
         # at least let the caller know something went horribly wrong
         failure_callback(pod_name)
+
+
+_JOB_LOG_THREADS: set[str] = set()
+_JOB_LOG_THREAD_COUNT: int = 0
+_JOB_LOG_THREADS_LOCK = threading.Lock()
+
+
+def maybe_start_job_thread(job_name: str, num_pods_expected: int = 1) -> bool:
+    """Starts a thread to watch the logs of a job. Makes sure we only start one thread per
+    job even if there are multiple calls to this function.
+    """
+    if job_name not in _JOB_LOG_THREADS:
+        with _JOB_LOG_THREADS_LOCK:
+            if job_name not in _JOB_LOG_THREADS:
+                # double-checked locking to avoid creating multiple threads for the same job
+                _JOB_LOG_THREADS.add(job_name)
+                if should_log(job_name):
+                    global _JOB_LOG_THREAD_COUNT
+                    _JOB_LOG_THREAD_COUNT += 1
+                    logger.info(f"Starting log watcher {_JOB_LOG_THREAD_COUNT} for job {job_name}")
+                    threading.Thread(
+                        target=JobLogWatcher(job_name, num_pods_expected).start, daemon=True
+                    ).start()
+                    return True
+    return False