thds.mops 3.9.20250722163657-py3-none-any.whl → 3.9.20250722164625-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their public registries.

Potentially problematic release.

Files changed (37)
  1. thds/mops/impure/runner.py +1 -1
  2. thds/mops/k8s/__init__.py +3 -1
  3. thds/mops/k8s/{launch.py → _launch.py} +56 -57
  4. thds/mops/k8s/batching.py +198 -0
  5. thds/mops/k8s/config.py +1 -1
  6. thds/mops/k8s/counts.py +28 -0
  7. thds/mops/k8s/job_future.py +109 -0
  8. thds/mops/k8s/jobs.py +4 -0
  9. thds/mops/k8s/logging.py +37 -5
  10. thds/mops/k8s/uncertain_future.py +160 -0
  11. thds/mops/k8s/watch.py +120 -62
  12. thds/mops/pure/__init__.py +2 -1
  13. thds/mops/pure/_magic/sauce.py +11 -3
  14. thds/mops/pure/_magic/shims.py +2 -2
  15. thds/mops/pure/core/deferred_work.py +15 -12
  16. thds/mops/pure/core/entry/runner_registry.py +1 -10
  17. thds/mops/pure/core/lock/__init__.py +1 -0
  18. thds/mops/pure/core/lock/_acquire.py +2 -2
  19. thds/mops/pure/core/lock/maintain.py +22 -3
  20. thds/mops/pure/core/lock/write.py +19 -19
  21. thds/mops/pure/core/memo/__init__.py +1 -1
  22. thds/mops/pure/core/memo/results.py +5 -4
  23. thds/mops/pure/core/use_runner.py +21 -7
  24. thds/mops/pure/pickling/mprunner.py +21 -14
  25. thds/mops/pure/pickling/pickles.py +19 -8
  26. thds/mops/pure/pickling/remote.py +3 -1
  27. thds/mops/pure/runner/get_results.py +106 -0
  28. thds/mops/pure/runner/local.py +58 -87
  29. thds/mops/pure/runner/shim_builder.py +7 -7
  30. thds/mops/pure/runner/simple_shims.py +7 -0
  31. thds/mops/pure/runner/types.py +15 -4
  32. thds/mops/pure/tools/summarize/run_summary.py +9 -8
  33. {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722164625.dist-info}/METADATA +1 -1
  34. {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722164625.dist-info}/RECORD +37 -32
  35. {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722164625.dist-info}/WHEEL +0 -0
  36. {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722164625.dist-info}/entry_points.txt +0 -0
  37. {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722164625.dist-info}/top_level.txt +0 -0
thds/mops/impure/runner.py CHANGED
@@ -67,7 +67,7 @@ class KeyedLocalRunner(MemoizingPicklingRunner):
             redirect=lambda _f, _args, _kwargs: _perform_original_invocation,
         )
 
-    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
+    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:  # type: ignore[valid-type]
         actual_function_to_call = self._pre_pickle_redirect(raw_func, raw_args, raw_kwargs)
         with _ORIGINAL_F_ARGS_KWARGS.set((actual_function_to_call, raw_args, raw_kwargs)):
             return super().__call__(*self._impure_keyfunc(raw_func, raw_args, raw_kwargs))
thds/mops/k8s/__init__.py CHANGED
@@ -7,8 +7,10 @@ except ModuleNotFoundError as mnf:
         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
     ) from mnf
 
+from . import batching, counts, job_future  # noqa: F401
+from ._launch import launch, shim  # noqa
 from .container_registry import autocr  # noqa: F401
-from .launch import K8sJobFailedError, launch, shim  # noqa
+from .job_future import K8sJobFailedError  # noqa: F401
 from .node_selection import (  # noqa
     NodeNarrowing,
     ResourceDefinition,
thds/mops/k8s/{launch.py → _launch.py} RENAMED
@@ -4,41 +4,22 @@ import os
 import threading
 import typing as ty
 import uuid
+from functools import partial
 
 from kubernetes import client
 
-from thds.core import scope
-from thds.core.log import logger_context
+from thds import core
 from thds.mops.pure.runner.simple_shims import samethread_shim
 from thds.termtool.colorize import colorized
 
-from . import config
+from . import config, counts, job_future, logging
 from ._shared import logger
 from .auth import load_config, upsert_namespace
-from .logging import JobLogWatcher
 from .node_selection import NodeNarrowing, ResourceDefinition
 from .retry import k8s_sdk_retry
 from .thds_std import embed_thds_auth
-from .wait_job import wait_for_job
 
 LAUNCHED = colorized(fg="white", bg="green")
-COMPLETE = colorized(fg="white", bg="blue")
-FAILED = colorized(fg="white", bg="red")
-
-
-class K8sJobFailedError(Exception):
-    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
-
-
-class Counter:
-    def __init__(self) -> None:
-        self.value = 0
-        self._lock = threading.Lock()
-
-    def inc(self) -> int:
-        with self._lock:
-            self.value += 1
-            return self.value
 
 
 def sanitize_str(name: str) -> str:
@@ -49,7 +30,7 @@ def sanitize_str(name: str) -> str:
 
 
 def construct_job_name(user_prefix: str, job_num: str) -> str:
     # we want some consistency here, but also some randomness in case the prefixes don't exist or aren't unique.
-    mops_name_part = "-".join([str(os.getpid()), sanitize_str(job_num), str(uuid.uuid4())[:8]])
+    mops_name_part = "-".join([sanitize_str(job_num), str(os.getpid()), str(uuid.uuid4())[:8]])
     if len(mops_name_part) > 63:
         # this should be _impossible_, because having a job num longer than even 20 digits would be an impossibly large
         # number of jobs. but just in case, we'll truncate it to the last 63 characters.
@@ -65,12 +46,11 @@ def construct_job_name(user_prefix: str, job_num: str) -> str:
     return name
 
 
-_LAUNCH_COUNT = Counter()
-_FINISH_COUNT = Counter()
 _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
+JOB_NAME = core.stack_context.StackContext("job_name", "")
 
 
-@scope.bound
+@core.scope.bound
 def launch(
     container_image: str,
     args: ty.Sequence[str],
@@ -81,38 +61,46 @@ def launch(
     # arguments below are for launching; arguments above are for
     # building. these should get separated in a future change.
     name_prefix: str = "",
+    full_name: str = "",
     dry_run: bool = False,
-    fire_and_forget: bool = False,
     suppress_logs: bool = False,
     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
     # this is a default for now. later if we share this code we'll need to have a wrapper interface
     service_account_name: str = "",
-) -> None:
+) -> core.futures.LazyFuture[bool]:
     """Launch a Kubernetes job.
 
     Required parameters are the container_image and the arguments to
     that image, just as if you were running this directly with Docker.
 
-    Unless fire_and_forget=True, will poll until Job completes and
-    will raise K8sJobFailedError if the Job fails. None is returned
-    if the Job succeeds.
+    Returns a Future that will resolve to True when the Job completes successfully, or
+    raise K8sJobFailedError if the Job fails.
 
     `name_prefix` is an optional parameter for debugging/developer
     convenience. A generated suffix will be added to it.
-
     """
     if not container_image:
         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
-    job_num = f"{_LAUNCH_COUNT.inc():0>3}"
-    name = construct_job_name(name_prefix, job_num)
-    scope.enter(logger_context(job=name))
+
+    full_name = full_name or JOB_NAME()
+    # in certain cases, it may be necessary to set the job name
+    # via a StackContext, so we check that here, and prefer it over name_prefix.
+
+    if full_name and name_prefix:
+        raise ValueError("You cannot specify both full_name and name_prefix; use one or the other.")
+
+    if not full_name:
+        name = construct_job_name(name_prefix, counts.to_name(counts.inc(counts.LAUNCH_COUNT)))
+    else:
+        name = full_name
+
+    core.scope.enter(core.log.logger_context(job=name))
     node_narrowing = node_narrowing or dict()
 
     # TODO move this entire function out to be separately callable
     @k8s_sdk_retry()
     def assemble_base_job() -> client.models.V1Job:
         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
-        logger.debug("Fire and forget: %s", fire_and_forget)
         logger.debug("Loading kube configs ...")
         load_config()
         logger.debug("Populating job object ...")
@@ -185,7 +173,7 @@ def launch(
     if dry_run:
         job_with_all_transforms()
         logger.info("Dry run assembly successful; not launching...")
-        return
+        return core.futures.LazyFuture(partial(core.futures.ResolvedFuture, True))
 
     @k8s_sdk_retry()
     def launch_job() -> client.models.V1Job:
@@ -198,32 +186,41 @@ def launch(
     )
 
     job = launch_job()
-    logger.info(LAUNCHED(f"Job {job_num} launched!") + f" on {container_image}")
+    logger.info(LAUNCHED(f"Job {name} launched!") + f" on {container_image}")
+    return core.futures.make_lazy(_launch_logs_and_create_future)(  # see below for implementation
+        job.metadata.name,
+        num_pods_expected=len(job.spec.template.spec.containers),
+        namespace=config.k8s_namespace(),
+        suppress_logs=suppress_logs,
+    )
+
+
+# this function has to be a top level def because it will sometimes be transferred across process boundaries,
+# and Python/pickle in its infinite wisdom does not allow nested functions to be pickled.
+def _launch_logs_and_create_future(
+    job_name: str, *, num_pods_expected: int, namespace: str, suppress_logs: bool
+) -> core.futures.PFuture[bool]:
     if not suppress_logs:
-        threading.Thread(  # fire and forget a log watching thread
-            target=JobLogWatcher(job.metadata.name, len(job.spec.template.spec.containers)).start,
-            daemon=True,
-        ).start()
-
-    if not fire_and_forget:
+        logging.maybe_start_job_thread(job_name, num_pods_expected)
+    return job_future.make_job_completion_future(job_name, namespace=namespace)
 
-        def counts() -> str:
-            launched = _LAUNCH_COUNT.value
-            return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"
 
-        job_name = job.metadata.name
-        del job  # trying to save memory here while we wait...
-        if not wait_for_job(job_name, short_name=job_num):
-            logger.error(FAILED(f"Job {job_num} Failed! {counts()}"))
-            raise K8sJobFailedError(f"Job {job_name} failed.")
-        logger.info(COMPLETE(f"Job {job_num} Complete! {counts()}"))
+def create_lazy_job_logging_future(
+    job_name: str, *, namespace: str = "", num_pods_expected: int = 1
+) -> core.futures.LazyFuture[bool]:
+    return core.futures.make_lazy(_launch_logs_and_create_future)(
+        job_name,
+        num_pods_expected=num_pods_expected,
+        namespace=namespace or config.k8s_namespace(),
+        suppress_logs=False,
+    )
 
 
 def shim(
     container_image: ty.Union[str, ty.Callable[[], str]],
     disable_remote: ty.Callable[[], bool] = lambda: False,
     **outer_kwargs: ty.Any,
-) -> ty.Callable[[ty.Sequence[str]], None]:
+) -> ty.Callable[[ty.Sequence[str]], core.futures.LazyFuture[bool]]:
     """Return a closure that can launch the given configuration and run a mops pure function.
 
     Now supports callables that return a container image name; the
@@ -240,16 +237,18 @@ def shim(
     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."
 
     if disable_remote():
-        return samethread_shim
+        return samethread_shim  # type: ignore[return-value]
 
     if isinstance(container_image, str):
         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
     else:
         get_container_image = container_image
 
-    def launch_container_on_k8s_with_args(args: ty.Sequence[str], **inner_kwargs: ty.Any) -> None:
+    def launch_container_on_k8s_with_args(
+        args: ty.Sequence[str], **inner_kwargs: ty.Any
+    ) -> core.futures.LazyFuture[bool]:
         assert "args" not in inner_kwargs
-        launch(
+        return launch(
             get_container_image(),
             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
             **{**outer_kwargs, **inner_kwargs},
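
The shift above is the core of this release: `launch` (and the closure returned by `shim`) no longer blocks until the Job finishes; it returns a `core.futures.LazyFuture[bool]` that resolves to True on success and raises `K8sJobFailedError` on failure. Below is a minimal stdlib sketch of the resulting control flow; `fake_launch` is hypothetical and only stands in for the real call, since the waiting API of `thds.core.futures` is not part of this diff.

import concurrent.futures
import time

_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=4)


def fake_launch(container_image: str, args: list) -> "concurrent.futures.Future[bool]":
    # stand-in for k8s.launch: kick off "remote" work and return immediately
    def _run() -> bool:
        time.sleep(0.1)  # pretend the Job is running somewhere else
        return True

    return _POOL.submit(_run)


if __name__ == "__main__":
    # fan out first; nothing blocks at launch time any more
    futs = [fake_launch("registry.example/app:tag", ["task", str(i)]) for i in range(3)]
    # resolve later; a failed Job surfaces here (K8sJobFailedError in the real code)
    print([f.result() for f in futs])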
thds/mops/k8s/batching.py ADDED
@@ -0,0 +1,198 @@
+"""The basic idea of this module is that different threads can submit _parts_ of a job to a batcher,
+and immediately get the job name back, while the batcher itself defers creating the job until the
+batch is full, or when the process exits.
+
+The theory is that this will get used in processes whose only responsibility is to create jobs,
+so waiting on atexit to create the final batch is not an issue.
+
+If you want a batcher that has a more context-manager-like behavior, you can write one of
+those, but it wouldn't work well with a concurrent.futures Executor-style approach, since
+those don't have an explicit shutdown procedure that we can hook to call __exit__.
+"""
+
+import atexit
+import concurrent.futures
+import itertools
+import multiprocessing
+import threading
+import typing as ty
+
+from thds.core import cpus, futures, log
+
+from . import _launch, counts
+
+T = ty.TypeVar("T")
+logger = log.getLogger(__name__)
+
+
+class _AtExitBatcher(ty.Generic[T]):
+    def __init__(self, batch_processor: ty.Callable[[ty.Collection[T]], None]) -> None:
+        self.batch: list[T] = []
+        self._registered = False
+        self._lock = threading.RLock()
+        self._batch_processor = batch_processor
+
+    def add(self, item: T) -> None:
+        with self._lock:
+            if not self._registered:
+                atexit.register(self.process)
+                # ensure we flush on process exit, since we don't know how many items are coming
+                self._registered = True
+            self.batch.append(item)
+
+    def process(self) -> None:
+        if self.batch:
+            with self._lock:
+                if self.batch:
+                    self._batch_processor(self.batch)
+                    self.batch = []
+
+
+class K8sJobBatchingShim(_AtExitBatcher[str]):
+    """Thread-safe for use within a single process by multiple threads."""
+
+    def __init__(
+        self,
+        submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+        max_batch_size: int,
+        job_counter: counts.MpValue[int],
+        name_prefix: str = "",
+    ) -> None:
+        """submit_func in particular should be a closure around whatever setup you need to
+        do to call back into a function that is locally wrapped with a k8s shim that will
+        ultimately call k8s.launch. Notably, you
+        """
+        super().__init__(self._process_batch)
+        self._max_batch_size = max_batch_size
+        self._job_counter = job_counter
+        self._job_name = ""
+        self._name_prefix = name_prefix
+        self._submit_func = submit_func
+
+    def _get_new_name(self) -> str:
+        # counts.inc takes a multiprocess lock. do not forget this!
+        job_num = counts.inc(self._job_counter)
+        return _launch.construct_job_name(self._name_prefix, counts.to_name(job_num))
+
+    def add_to_named_job(self, mops_invocation: ty.Sequence[str]) -> str:
+        """Returns job name for the invocation."""
+        with self._lock:
+            if not self._job_name:
+                self._job_name = self._get_new_name()
+            if len(self.batch) >= self._max_batch_size:
+                self.process()
+                self._job_name = self._get_new_name()
+            super().add(" ".join(mops_invocation))
+            return self._job_name
+
+    def _process_batch(self, batch: ty.Collection[str]) -> None:
+        with _launch.JOB_NAME.set(self._job_name):
+            log_lvl = logger.warning if len(batch) < self._max_batch_size else logger.info
+            log_lvl(f"Processing batch of len {len(batch)} with job name {self._job_name}")
+            self._submit_func(batch)
+
+
+F = ty.TypeVar("F", bound=ty.Callable)
+FunctionDecorator = ty.Callable[[F], F]
+
+
+_BATCHER: ty.Optional[K8sJobBatchingShim] = None
+
+
+def init_batcher(
+    submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    # for use with multiprocessing pool initializer
+    global _BATCHER
+    if _BATCHER is not None:
+        logger.warning("Batcher is already initialized; reinitializing will reset the job name.")
+        return
+
+    _BATCHER = K8sJobBatchingShim(submit_func, func_max_batch_size, job_counter, name_prefix)
+
+
+def init_batcher_with_unpicklable_submit_func(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    """Use this if you want to have an unpicklable submit function - because applying make_submit_func(submit_func_arg)
+    will happen inside the pool worker process after all the pickling/unpickling has happened.
+    """
+    return init_batcher(
+        make_submit_func(submit_func_arg), func_max_batch_size, job_counter, name_prefix=name_prefix
+    )
+
+
+def make_counting_process_pool_executor(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    max_batch_size: int,
+    name_prefix: str = "",
+    max_workers: int = 0,
+) -> concurrent.futures.ProcessPoolExecutor:
+    """Creates a ProcessPoolExecutor that uses the batching shim for job submission.
+
+    We are introducing this because we see segfaults prior to Python 3.12 related to this issue:
+    https://github.com/python/cpython/issues/77377
+
+    And it would seem that this had to do with creating mp.Values using a 'fork' start
+    method, and then passing those to a ProcessPoolExecutor with
+    mp_context=multiprocessing.get_context('spawn'). So we can help you avoid that by creating
+    the mp.Value for you, alongside its ProcessPoolExecutor.
+
+    NOTE!!
+
+    You should only have one of these per process at a time, because we're doing spooky
+    things with the Job Counter. In fact, you should probably only create one of these
+    _ever_ within a single logical 'application'.
+
+    If you fail to heed this advice, you will get weird launched/finished counts at a
+    minimum. Although these job counts are not mission-critical, you _will_ be confused.
+    """
+    start_method: str = "spawn"
+    # 'spawn' prevents weird batch processing deadlocks that seem to only happen on Linux with 'fork'.
+    # it is strongly recommended to use 'spawn' for this reason.
+
+    mp_context = multiprocessing.get_context(start_method)
+    launch_count = mp_context.Value("i", 0)
+    # even though i want to assign this to a global, I also want to prevent
+    # any possible race condition where i somehow use a different thread's LAUNCH_COUNT
+    # when i create the ProcessPoolExecutor a few lines below.
+    counts.LAUNCH_COUNT = launch_count
+    counts.FINISH_COUNT = mp_context.Value("i", 0)  # we don't use this here; we just reset it to zero.
+    # SPOOKY - reset the global finish counter and make it be the same 'type'
+    return concurrent.futures.ProcessPoolExecutor(
+        max_workers=max_workers or cpus.available_cpu_count(),
+        initializer=init_batcher_with_unpicklable_submit_func,
+        initargs=(make_submit_func, submit_func_arg, max_batch_size, launch_count, name_prefix),
+        mp_context=mp_context,
+    )
+
+
+def shim(args: ty.Sequence[str]) -> futures.PFuture[bool]:
+    # This thing needs to return a lazy Uncertain Future that contains a job name, so that Job can be polled on
+    # ... but the job does not exist yet! So the batcher is in charge of creating the job name
+    # upfront, and then ensuring that it gets used when the job is created.
+    assert _BATCHER is not None, "Batcher must be initialized before using the batching shim."
+    job_name = _BATCHER.add_to_named_job(args)
+    return _launch.create_lazy_job_logging_future(job_name)
+
+
+def batched(iterable: ty.Iterable[T], n: int, *, strict: bool = False) -> ty.Iterator[tuple[T, ...]]:
+    """Just a utility for pre-batching if you're using multiprocessing to create batches."""
+    # TODO get rid of this when we go to Python 3.12+ which has itertools.batched
+    #
+    # batched('ABCDEFG', 3) → ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    iterator = iter(iterable)
+    while batch := tuple(itertools.islice(iterator, n)):
+        if strict and len(batch) != n:
+            raise ValueError("batched(): incomplete batch")
+        yield batch
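
The new batcher hands back a Job name immediately and defers creating the Job until the batch fills or the process exits (via atexit). Below is a self-contained sketch of just that defer-until-full-or-exit mechanism; it deliberately omits the real K8sJobBatchingShim's multiprocess job counter and the k8s-shimmed submit_func, and all names in it are illustrative.

import atexit
import threading
import typing as ty

T = ty.TypeVar("T")


class AtExitBatcher(ty.Generic[T]):
    def __init__(self, process_batch: ty.Callable[[ty.List[T]], None], max_size: int) -> None:
        self._process_batch = process_batch
        self._max_size = max_size
        self._items: ty.List[T] = []
        self._lock = threading.Lock()
        atexit.register(self.flush)  # whatever is left gets flushed at interpreter exit

    def add(self, item: T) -> None:
        with self._lock:
            self._items.append(item)
            if len(self._items) >= self._max_size:
                self._flush_locked()

    def flush(self) -> None:
        with self._lock:
            self._flush_locked()

    def _flush_locked(self) -> None:
        if self._items:
            self._process_batch(self._items)
            self._items = []


batcher = AtExitBatcher(lambda batch: print("submitting batch:", batch), max_size=3)
for i in range(7):
    batcher.add(f"invocation-{i}")  # the final partial batch is submitted by the atexit hook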
thds/mops/k8s/config.py CHANGED
@@ -10,7 +10,7 @@ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_
10
10
  # environment variable. it will not affect how your namespace is selected in the first
11
11
  # place.
12
12
 
13
- k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 30 * 60, parse=int)
13
+ k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 5 * 60, parse=int)
14
14
  k8s_acr_url = config.item("mops.k8s.acr.url", "")
15
15
  k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
16
16
  k8s_job_cleanup_ttl_seconds_after_completion = config.item(
thds/mops/k8s/counts.py ADDED
@@ -0,0 +1,28 @@
+import multiprocessing as mp
+import typing as ty
+
+T = ty.TypeVar("T")
+
+
+class MpValue(ty.Protocol[T]):
+    def get_lock(self) -> ty.Any:
+        ...
+
+    value: T
+
+
+def inc(mp_val: MpValue[int]) -> int:
+    with mp_val.get_lock():
+        mp_val.value += 1
+        return mp_val.value
+
+
+LAUNCH_COUNT = mp.Value("i", 0)
+FINISH_COUNT = mp.Value("i", 0)
+# these are spooky - they're global and mutable, and may in fact get overwritten by code
+# using specific multiprocessing contexts.
+
+
+def to_name(count: int) -> str:
+    """Convert a count to a name."""
+    return f"{count:0>4}"
thds/mops/k8s/job_future.py ADDED
@@ -0,0 +1,109 @@
+import threading
+import typing as ty
+
+from kubernetes import client
+
+from thds.core import futures, log
+from thds.termtool.colorize import colorized
+
+from . import config, counts, uncertain_future
+from .jobs import is_job_failed, is_job_succeeded, job_source
+
+logger = log.getLogger(__name__)
+
+UNUSUAL = colorized(fg="white", bg="yellow")
+SUCCEEDED = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+_FINISHED_JOBS = set[str]()
+_FINISHED_JOBS_LOCK = threading.Lock()
+
+
+def _check_newly_finished(job_name: str, namespace: str = "") -> str:
+    # I don't believe it's possible to ever have a Job that both succeeds and fails.
+    namespace = namespace or config.k8s_namespace()
+    job_full = f"{namespace}/{job_name}"
+    if job_full in _FINISHED_JOBS:
+        return ""
+
+    with _FINISHED_JOBS_LOCK:
+        if job_full in _FINISHED_JOBS:
+            return ""
+
+        _FINISHED_JOBS.add(job_full)
+
+    launched = counts.LAUNCH_COUNT.value
+    return f"- ({launched - counts.inc(counts.FINISH_COUNT)} unfinished of {launched})"
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+def make_job_completion_future(job_name: str, *, namespace: str = "") -> futures.PFuture[bool]:
+    """This is a natural boundary for a serializable lazy future - something that represents
+    work being done across process boundaries (since Kubernetes jobs will be listed via an API).
+
+    If True is returned, the Job has definitely succeeded.
+
+    If False is returned, the Job may have succeeded but we saw no evidence of it.
+
+    If the Job definitely failed, an Exception will be raised.
+    """
+
+    JOB_SEEN = False
+
+    def job_completion_interpreter(
+        job: ty.Optional[client.models.V1Job], last_seen_at: float
+    ) -> ty.Union[uncertain_future.NotYetDone, bool]:
+        nonlocal JOB_SEEN
+        if not job:
+            if JOB_SEEN:
+                logger.warning(
+                    UNUSUAL(f"Previously-seen job {job_name} no longer exists - assuming success!")
+                )
+                # we hereby indicate an unusual success to the Future waiter.
+                return False
+
+            time_since_last_seen = uncertain_future.official_timer() - last_seen_at
+            if time_since_last_seen > config.k8s_watch_object_stale_seconds():
+                # this is 5 minutes by default as of 2025-07-15.
+                raise TimeoutError(
+                    f"Job {job_name} has not been seen for {time_since_last_seen:.1f} seconds - assuming failure!"
+                )
+
+            # we don't know what's going on but things aren't truly stale yet.
+            return uncertain_future.NotYetDone()
+
+        JOB_SEEN = True
+
+        if is_job_succeeded(job):
+            newly_succeeded = _check_newly_finished(job_name, namespace)
+            if newly_succeeded:
+                logger.info(SUCCEEDED(f"Job {job_name} Succeeded! {newly_succeeded}"))
+            return True
+
+        if is_job_failed(job):
+            newly_failed = _check_newly_finished(job_name, namespace)
+            if newly_failed:
+                logger.error(FAILED(f"Job {job_name} Failed! {newly_failed}"))
+            raise K8sJobFailedError(f"Job {job_name} has failed with status: {job.status}")
+
+        return uncertain_future.NotYetDone()  # job is still in progress
+
+    return job_source().create_future(
+        job_completion_interpreter,
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
+
+
+def make_lazy_completion_future(job_name: str, *, namespace: str = "") -> futures.LazyFuture[bool]:
+    """This is a convenience function that will create a job completion future and then
+    immediately process it, returning the result. See docs on function above.
+    """
+    return futures.make_lazy(make_job_completion_future)(
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
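
make_job_completion_future's contract is tri-state: True means confirmed success, False means the Job object disappeared before completion was observed (treated as an unusual success), and failure surfaces as a raised K8sJobFailedError (or TimeoutError once the watch goes stale). A hedged sketch of how a caller might fold those outcomes into one decision; `wait_for_result` is only a stand-in for however the surrounding runner resolves the PFuture, which this diff does not show.

class K8sJobFailedError(Exception):
    """Local stand-in so the sketch is self-contained."""


def interpret_job_outcome(wait_for_result) -> str:
    try:
        confirmed = wait_for_result()
    except K8sJobFailedError:
        return "failed"
    except TimeoutError:
        return "stale: job unseen for too long, treated as failed"
    return "succeeded" if confirmed else "assumed succeeded (job object disappeared)"


print(interpret_job_outcome(lambda: True))
print(interpret_job_outcome(lambda: False))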
thds/mops/k8s/jobs.py CHANGED
@@ -25,6 +25,10 @@ _JOB_SOURCE = WatchingObjectSource(
 )
 
 
+def job_source() -> WatchingObjectSource[client.models.V1Job]:
+    return _JOB_SOURCE
+
+
 def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
     return _JOB_SOURCE.get(job_name, namespace=namespace)
 
thds/mops/k8s/logging.py CHANGED
@@ -32,6 +32,17 @@ BOINK = colorized(fg="white", bg="magenta")
 # string in this and it'll stand out.
 
 
+def should_log(job_name: str) -> bool:
+    if NO_K8S_LOGS():
+        return False
+
+    if random.random() > K8S_LOG_POD_FRACTION():
+        logger.info(f"Skipping log watcher for {job_name} due to fraction.")
+        return False
+
+    return True
+
+
 class JobLogWatcher:
     """Will spawn one or more daemon threads.
 
@@ -59,11 +70,7 @@ class JobLogWatcher:
     @core.scope.bound
     def start(self, failed_pod_name: str = "") -> None:
         """Call this one time - it will spawn threads as needed."""
-        if NO_K8S_LOGS():
-            return
-
-        if random.random() > K8S_LOG_POD_FRACTION():
-            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
+        if not should_log(self.job_name):
             return
 
         core.scope.enter(self.job_pods_discovery_lock)
@@ -245,3 +252,28 @@ def _scrape_pod_logs(
         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
         # at least let the caller know something went horribly wrong
         failure_callback(pod_name)
+
+
+_JOB_LOG_THREADS: set[str] = set()
+_JOB_LOG_THREAD_COUNT: int = 0
+_JOB_LOG_THREADS_LOCK = threading.Lock()
+
+
+def maybe_start_job_thread(job_name: str, num_pods_expected: int = 1) -> bool:
+    """Starts a thread to watch the logs of a job. Makes sure we only start one thread per
+    job even if there are multiple calls to this function.
+    """
+    if job_name not in _JOB_LOG_THREADS:
+        with _JOB_LOG_THREADS_LOCK:
+            if job_name not in _JOB_LOG_THREADS:
+                # double-checked locking to avoid creating multiple threads for the same job
+                _JOB_LOG_THREADS.add(job_name)
+                if should_log(job_name):
+                    global _JOB_LOG_THREAD_COUNT
+                    _JOB_LOG_THREAD_COUNT += 1
+                    logger.info(f"Starting log watcher {_JOB_LOG_THREAD_COUNT} for job {job_name}")
+                    threading.Thread(
+                        target=JobLogWatcher(job_name, num_pods_expected).start, daemon=True
+                    ).start()
+                    return True
+    return False
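
maybe_start_job_thread relies on double-checked locking so that only the first caller for a given job name takes the lock and spawns a watcher thread. The same start-once-per-key pattern in isolation, with hypothetical names:

import threading

_started: set = set()
_started_lock = threading.Lock()


def start_once(key: str, start) -> bool:
    if key in _started:  # cheap unlocked check first
        return False
    with _started_lock:
        if key in _started:  # re-check under the lock
            return False
        _started.add(key)
        start(key)  # in the real code: spawn the JobLogWatcher daemon thread
        return True


print(start_once("job-a", lambda k: print("starting watcher for", k)))  # True
print(start_once("job-a", lambda k: None))  # False: already started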