thds.mops 3.9.20250722163657__py3-none-any.whl → 3.9.20250722200009__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/mops/impure/runner.py +1 -1
- thds/mops/k8s/__init__.py +3 -1
- thds/mops/k8s/{launch.py → _launch.py} +56 -57
- thds/mops/k8s/batching.py +198 -0
- thds/mops/k8s/config.py +1 -1
- thds/mops/k8s/counts.py +28 -0
- thds/mops/k8s/job_future.py +109 -0
- thds/mops/k8s/jobs.py +4 -0
- thds/mops/k8s/logging.py +37 -5
- thds/mops/k8s/uncertain_future.py +160 -0
- thds/mops/k8s/watch.py +120 -62
- thds/mops/pure/__init__.py +2 -1
- thds/mops/pure/_magic/sauce.py +11 -3
- thds/mops/pure/_magic/shims.py +2 -2
- thds/mops/pure/core/deferred_work.py +15 -12
- thds/mops/pure/core/entry/runner_registry.py +1 -10
- thds/mops/pure/core/lock/__init__.py +1 -0
- thds/mops/pure/core/lock/_acquire.py +2 -2
- thds/mops/pure/core/lock/maintain.py +22 -3
- thds/mops/pure/core/lock/write.py +19 -19
- thds/mops/pure/core/memo/__init__.py +1 -1
- thds/mops/pure/core/memo/results.py +5 -4
- thds/mops/pure/core/use_runner.py +21 -7
- thds/mops/pure/pickling/mprunner.py +21 -14
- thds/mops/pure/pickling/pickles.py +19 -8
- thds/mops/pure/pickling/remote.py +3 -1
- thds/mops/pure/runner/get_results.py +106 -0
- thds/mops/pure/runner/local.py +58 -87
- thds/mops/pure/runner/shim_builder.py +7 -7
- thds/mops/pure/runner/simple_shims.py +7 -0
- thds/mops/pure/runner/types.py +15 -4
- thds/mops/pure/tools/summarize/run_summary.py +9 -8
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/METADATA +1 -1
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/RECORD +37 -32
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/WHEEL +0 -0
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/entry_points.txt +0 -0
- {thds_mops-3.9.20250722163657.dist-info → thds_mops-3.9.20250722200009.dist-info}/top_level.txt +0 -0
thds/mops/impure/runner.py
CHANGED
@@ -67,7 +67,7 @@ class KeyedLocalRunner(MemoizingPicklingRunner):
             redirect=lambda _f, _args, _kwargs: _perform_original_invocation,
         )

-    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
+    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:  # type: ignore[valid-type]
         actual_function_to_call = self._pre_pickle_redirect(raw_func, raw_args, raw_kwargs)
         with _ORIGINAL_F_ARGS_KWARGS.set((actual_function_to_call, raw_args, raw_kwargs)):
             return super().__call__(*self._impure_keyfunc(raw_func, raw_args, raw_kwargs))
thds/mops/k8s/__init__.py
CHANGED
@@ -7,8 +7,10 @@ except ModuleNotFoundError as mnf:
         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
     ) from mnf

+from . import batching, counts, job_future  # noqa: F401
+from ._launch import launch, shim  # noqa
 from .container_registry import autocr  # noqa: F401
-from .
+from .job_future import K8sJobFailedError  # noqa: F401
 from .node_selection import (  # noqa
     NodeNarrowing,
     ResourceDefinition,
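Note: for orientation, a minimal usage sketch of the re-exported surface after this change (launch now returns a thds.core LazyFuture[bool], and K8sJobFailedError now lives in job_future). The image tag, module path, and prefix below are hypothetical placeholders, not taken from the diff.

    from thds.mops import k8s

    future = k8s.launch(
        "myregistry.example.com/myimage:latest",  # hypothetical container image
        ["python", "-m", "my_package.my_task"],   # hypothetical container args
        name_prefix="demo",
        dry_run=True,  # per the diff below, dry_run now returns an already-resolved LazyFuture
    )
    # A Job that terminates in a Failed state surfaces as k8s.K8sJobFailedError
    # once the future is resolved (per the updated launch docstring).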
thds/mops/k8s/{launch.py → _launch.py}
RENAMED
@@ -4,41 +4,22 @@ import os
 import threading
 import typing as ty
 import uuid
+from functools import partial

 from kubernetes import client

-from thds
-from thds.core.log import logger_context
+from thds import core
 from thds.mops.pure.runner.simple_shims import samethread_shim
 from thds.termtool.colorize import colorized

-from . import config
+from . import config, counts, job_future, logging
 from ._shared import logger
 from .auth import load_config, upsert_namespace
-from .logging import JobLogWatcher
 from .node_selection import NodeNarrowing, ResourceDefinition
 from .retry import k8s_sdk_retry
 from .thds_std import embed_thds_auth
-from .wait_job import wait_for_job

 LAUNCHED = colorized(fg="white", bg="green")
-COMPLETE = colorized(fg="white", bg="blue")
-FAILED = colorized(fg="white", bg="red")
-
-
-class K8sJobFailedError(Exception):
-    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
-
-
-class Counter:
-    def __init__(self) -> None:
-        self.value = 0
-        self._lock = threading.Lock()
-
-    def inc(self) -> int:
-        with self._lock:
-            self.value += 1
-            return self.value


 def sanitize_str(name: str) -> str:
@@ -49,7 +30,7 @@ def sanitize_str(name: str) -> str:

 def construct_job_name(user_prefix: str, job_num: str) -> str:
     # we want some consistency here, but also some randomness in case the prefixes don't exist or aren't unique.
-    mops_name_part = "-".join([str(os.getpid()),
+    mops_name_part = "-".join([sanitize_str(job_num), str(os.getpid()), str(uuid.uuid4())[:8]])
     if len(mops_name_part) > 63:
         # this should be _impossible_, because having a job num longer than even 20 digits would be an impossibly large
         # number of jobs. but just in case, we'll truncate it to the last 63 characters.
@@ -65,12 +46,11 @@ def construct_job_name(user_prefix: str, job_num: str) -> str:
     return name


-_LAUNCH_COUNT = Counter()
-_FINISH_COUNT = Counter()
 _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
+JOB_NAME = core.stack_context.StackContext("job_name", "")


-@scope.bound
+@core.scope.bound
 def launch(
     container_image: str,
     args: ty.Sequence[str],
@@ -81,38 +61,46 @@ def launch(
     # arguments below are for launching; arguments above are for
     # building. these should get separated in a future change.
     name_prefix: str = "",
+    full_name: str = "",
     dry_run: bool = False,
-    fire_and_forget: bool = False,
     suppress_logs: bool = False,
     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
     # this is a default for now. later if we share this code we'll need to have a wrapper interface
     service_account_name: str = "",
-) ->
+) -> core.futures.LazyFuture[bool]:
     """Launch a Kubernetes job.

     Required parameters are the container_image and the arguments to
     that image, just as if you were running this directly with Docker.

-
-
-    if the Job succeeds.
+    Returns a Future that will resolve to True when the Job completes successfully, or
+    raise K8sJobFailedError if the Job fails.

     `name_prefix` is an optional parameter for debugging/developer
     convenience. A generated suffix will be added to it.
-
     """
     if not container_image:
         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
-
-
-
+
+    full_name = full_name or JOB_NAME()
+    # in certain cases, it may be necessary to set the job name
+    # via a StackContext, so we check that here, and prefer it over name_prefix.
+
+    if full_name and name_prefix:
+        raise ValueError("You cannot specify both full_name and name_prefix; use one or the other.")
+
+    if not full_name:
+        name = construct_job_name(name_prefix, counts.to_name(counts.inc(counts.LAUNCH_COUNT)))
+    else:
+        name = full_name
+
+    core.scope.enter(core.log.logger_context(job=name))
     node_narrowing = node_narrowing or dict()

     # TODO move this entire function out to be separately callable
     @k8s_sdk_retry()
     def assemble_base_job() -> client.models.V1Job:
         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
-        logger.debug("Fire and forget: %s", fire_and_forget)
         logger.debug("Loading kube configs ...")
         load_config()
         logger.debug("Populating job object ...")
@@ -185,7 +173,7 @@ def launch(
     if dry_run:
         job_with_all_transforms()
         logger.info("Dry run assembly successful; not launching...")
-        return
+        return core.futures.LazyFuture(partial(core.futures.ResolvedFuture, True))

     @k8s_sdk_retry()
     def launch_job() -> client.models.V1Job:
@@ -198,32 +186,41 @@ def launch(
         )

     job = launch_job()
-    logger.info(LAUNCHED(f"Job {
+    logger.info(LAUNCHED(f"Job {name} launched!") + f" on {container_image}")
+    return core.futures.make_lazy(_launch_logs_and_create_future)(  # see below for implementation
+        job.metadata.name,
+        num_pods_expected=len(job.spec.template.spec.containers),
+        namespace=config.k8s_namespace(),
+        suppress_logs=suppress_logs,
+    )
+
+
+# this function has to be a top level def because it will sometimes be transferred across process boundaries,
+# and Python/pickle in its infinite wisdom does not allow nested functions to be pickled.
+def _launch_logs_and_create_future(
+    job_name: str, *, num_pods_expected: int, namespace: str, suppress_logs: bool
+) -> core.futures.PFuture[bool]:
     if not suppress_logs:
-
-
-            daemon=True,
-        ).start()
-
-    if not fire_and_forget:
+        logging.maybe_start_job_thread(job_name, num_pods_expected)
+    return job_future.make_job_completion_future(job_name, namespace=namespace)

-def counts() -> str:
-    launched = _LAUNCH_COUNT.value
-    return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"

-
-
-
-
-
-
+def create_lazy_job_logging_future(
+    job_name: str, *, namespace: str = "", num_pods_expected: int = 1
+) -> core.futures.LazyFuture[bool]:
+    return core.futures.make_lazy(_launch_logs_and_create_future)(
+        job_name,
+        num_pods_expected=num_pods_expected,
+        namespace=namespace or config.k8s_namespace(),
+        suppress_logs=False,
+    )


 def shim(
     container_image: ty.Union[str, ty.Callable[[], str]],
     disable_remote: ty.Callable[[], bool] = lambda: False,
     **outer_kwargs: ty.Any,
-) -> ty.Callable[[ty.Sequence[str]],
+) -> ty.Callable[[ty.Sequence[str]], core.futures.LazyFuture[bool]]:
     """Return a closure that can launch the given configuration and run a mops pure function.

     Now supports callables that return a container image name; the
@@ -240,16 +237,18 @@ def shim(
     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."

     if disable_remote():
-        return samethread_shim
+        return samethread_shim  # type: ignore[return-value]

     if isinstance(container_image, str):
         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
     else:
         get_container_image = container_image

-    def launch_container_on_k8s_with_args(
+    def launch_container_on_k8s_with_args(
+        args: ty.Sequence[str], **inner_kwargs: ty.Any
+    ) -> core.futures.LazyFuture[bool]:
         assert "args" not in inner_kwargs
-        launch(
+        return launch(
             get_container_image(),
             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
             **{**outer_kwargs, **inner_kwargs},
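Note: a hedged sketch of the two job-naming paths introduced above (the job name is hypothetical): pass full_name explicitly, or set it through the new JOB_NAME StackContext, which launch() now consults and which the batching module below relies on.

    from thds.mops.k8s import _launch

    # Option 1: explicit full_name (now mutually exclusive with name_prefix):
    #     _launch.launch(image, args, full_name="demo-job-0001")

    # Option 2: the JOB_NAME StackContext, as the batching shim's _process_batch does:
    with _launch.JOB_NAME.set("demo-job-0001"):
        ...  # any launch() call under this context picks the name up via JOB_NAME()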
thds/mops/k8s/batching.py
ADDED
@@ -0,0 +1,198 @@
+"""The basic idea of this module is that different threads can submit _parts_ of a job to a batcher,
+and immediately get the job name back, while the batcher itself defers creating the job until the
+batch is full, or when the process exits.
+
+The theory is that will get used in processes whose only responsibility is to create jobs,
+so waiting on atexit to create the final batch is not an issue.
+
+If you want a batcher that has a more context-manager-like behavior, you can write one of
+those, but it wouldn't work well with a concurrent.futures Executor-style approach, since
+those don't have an explicit shutdown procedure that we can hook to call __exit__.
+"""
+
+import atexit
+import concurrent.futures
+import itertools
+import multiprocessing
+import threading
+import typing as ty
+
+from thds.core import cpus, futures, log
+
+from . import _launch, counts
+
+T = ty.TypeVar("T")
+logger = log.getLogger(__name__)
+
+
+class _AtExitBatcher(ty.Generic[T]):
+    def __init__(self, batch_processor: ty.Callable[[ty.Collection[T]], None]) -> None:
+        self.batch: list[T] = []
+        self._registered = False
+        self._lock = threading.RLock()
+        self._batch_processor = batch_processor
+
+    def add(self, item: T) -> None:
+        with self._lock:
+            if not self._registered:
+                atexit.register(self.process)
+                # ensure we flush on process exit, since we don't know how many items are coming
+                self._registered = True
+            self.batch.append(item)
+
+    def process(self) -> None:
+        if self.batch:
+            with self._lock:
+                if self.batch:
+                    self._batch_processor(self.batch)
+                    self.batch = []
+
+
+class K8sJobBatchingShim(_AtExitBatcher[str]):
+    """Thread-safe for use within a single process by multiple threads."""
+
+    def __init__(
+        self,
+        submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+        max_batch_size: int,
+        job_counter: counts.MpValue[int],
+        name_prefix: str = "",
+    ) -> None:
+        """submit_func in particular should be a closure around whatever setup you need to
+        do to call back into a function that is locally wrapped with a k8s shim that will
+        ultimately call k8s.launch. Notably, you
+        """
+        super().__init__(self._process_batch)
+        self._max_batch_size = max_batch_size
+        self._job_counter = job_counter
+        self._job_name = ""
+        self._name_prefix = name_prefix
+        self._submit_func = submit_func
+
+    def _get_new_name(self) -> str:
+        # counts.inc takes a multiprocess lock. do not forget this!
+        job_num = counts.inc(self._job_counter)
+        return _launch.construct_job_name(self._name_prefix, counts.to_name(job_num))
+
+    def add_to_named_job(self, mops_invocation: ty.Sequence[str]) -> str:
+        """Returns job name for the invocation."""
+        with self._lock:
+            if not self._job_name:
+                self._job_name = self._get_new_name()
+            if len(self.batch) >= self._max_batch_size:
+                self.process()
+                self._job_name = self._get_new_name()
+            super().add(" ".join(mops_invocation))
+            return self._job_name
+
+    def _process_batch(self, batch: ty.Collection[str]) -> None:
+        with _launch.JOB_NAME.set(self._job_name):
+            log_lvl = logger.warning if len(batch) < self._max_batch_size else logger.info
+            log_lvl(f"Processing batch of len {len(batch)} with job name {self._job_name}")
+            self._submit_func(batch)
+
+
+F = ty.TypeVar("F", bound=ty.Callable)
+FunctionDecorator = ty.Callable[[F], F]
+
+
+_BATCHER: ty.Optional[K8sJobBatchingShim] = None
+
+
+def init_batcher(
+    submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    # for use with multiprocessing pool initializer
+    global _BATCHER
+    if _BATCHER is not None:
+        logger.warning("Batcher is already initialized; reinitializing will reset the job name.")
+        return
+
+    _BATCHER = K8sJobBatchingShim(submit_func, func_max_batch_size, job_counter, name_prefix)
+
+
+def init_batcher_with_unpicklable_submit_func(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    func_max_batch_size: int,
+    job_counter: counts.MpValue[int],
+    name_prefix: str = "",
+) -> None:
+    """Use this if you want to have an unpicklable submit function - because applying make_submit_func(submit_func_arg)
+    will happen inside the pool worker process after all the pickling/unpickling has happened.
+    """
+    return init_batcher(
+        make_submit_func(submit_func_arg), func_max_batch_size, job_counter, name_prefix=name_prefix
+    )
+
+
+def make_counting_process_pool_executor(
+    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
+    submit_func_arg: T,
+    max_batch_size: int,
+    name_prefix: str = "",
+    max_workers: int = 0,
+) -> concurrent.futures.ProcessPoolExecutor:
+    """Creates a ProcessPoolExecutor that uses the batching shim for job submission.
+
+    We are introducing this because we see segfaults prior to Python 3.12 related to this issue:
+    https://github.com/python/cpython/issues/77377
+
+    And it would seem that this had to do with creating mp.Values using a 'fork' start
+    method, and then passing those to a ProcessPoolExecutor with
+    mp_context=multiprolcessing.get_context('spawn'). So we can help you avoid that by creating
+    the mp.Value for you, alongside its ProcessPoolExecutor.
+
+    NOTE!!
+
+    You should only have one of these per process at a time, because we're doing spooky
+    things with the Job Counter. In fact, you should probably only create one of these
+    _ever_ within a single logical 'application'.
+
+    If you fail to heed this advice, you will get weird launched/finished counts at a
+    minimum. Although these job counts are not mission-critical, you _will_ be confused.
+    """
+    start_method: str = "spawn"
+    # 'spawn' prevents weird batch processing deadlocks that seem to only happen on Linux with 'fork'.
+    # it is strongly recommended to use 'spawn' for this reason.
+
+    mp_context = multiprocessing.get_context(start_method)
+    launch_count = mp_context.Value("i", 0)
+    # even though i want to assign this to a global, I also want to prevent
+    # any possible race condition where i somehow use a different thread's LAUNCH_COUNT
+    # when i create the ProcessPoolExecutor a few lines below.
+    counts.LAUNCH_COUNT = launch_count
+    counts.FINISH_COUNT = mp_context.Value("i", 0)  # we don't use this here; we just reset it to zero.
+    # SPOOKY - reset the global finish counter and make it be the same 'type'
+    return concurrent.futures.ProcessPoolExecutor(
+        max_workers=max_workers or cpus.available_cpu_count(),
+        initializer=init_batcher_with_unpicklable_submit_func,
+        initargs=(make_submit_func, submit_func_arg, max_batch_size, launch_count, name_prefix),
+        mp_context=mp_context,
+    )
+
+
+def shim(args: ty.Sequence[str]) -> futures.PFuture[bool]:
+    # This thing needs to return a lazy Uncertain Future that contains a job name, so that Job can be polled on
+    # ... but the job does not exist yet! So the batcher is in charge of creating the job name
+    # upfront, and then ensuring that it gets used when the job is created.
+    assert _BATCHER is not None, "Batcher must be initialized before using the batching shim."
+    job_name = _BATCHER.add_to_named_job(args)
+    return _launch.create_lazy_job_logging_future(job_name)
+
+
+def batched(iterable: ty.Iterable[T], n: int, *, strict: bool = False) -> ty.Iterator[tuple[T, ...]]:
+    """Just a utility for pre-batching if you're using multiprocessing to create batches."""
+    # TODO get rid of this when we go to Python 3.12+ which has itertools.batched
+    #
+    # batched('ABCDEFG', 3) → ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    iterator = iter(iterable)
+    while batch := tuple(itertools.islice(iterator, n)):
+        if strict and len(batch) != n:
+            raise ValueError("batched(): incomplete batch")
+        yield batch
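Note: a hedged, single-process sketch of the batching flow added above; submit_batch is a hypothetical stand-in for the closure that would ultimately call back into a k8s-shimmed mops function.

    import typing as ty

    from thds.mops.k8s import batching, counts


    def submit_batch(invocations: ty.Collection[str]) -> None:
        # hypothetical: a real submit_func would re-run these invocations under a k8s shim
        print(f"would launch one Job covering {len(invocations)} invocations")


    batching.init_batcher(submit_batch, func_max_batch_size=50, job_counter=counts.LAUNCH_COUNT)
    fut = batching.shim(["--spec", "my_module.my_func"])  # hypothetical mops invocation args
    # the invocation is queued under a pre-assigned job name; the Job itself is only
    # created once the batch reaches 50 entries or at process exit (atexit).

    # the pre-batching helper mirrors itertools.batched from Python 3.12+:
    assert list(batching.batched("ABCDEFG", 3)) == [("A", "B", "C"), ("D", "E", "F"), ("G",)]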
thds/mops/k8s/config.py
CHANGED
@@ -10,7 +10,7 @@ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_
 # environment variable. it will not affect how your namespace is selected in the first
 # place.

-k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds",
+k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 5 * 60, parse=int)
 k8s_acr_url = config.item("mops.k8s.acr.url", "")
 k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
 k8s_job_cleanup_ttl_seconds_after_completion = config.item(
thds/mops/k8s/counts.py
ADDED
@@ -0,0 +1,28 @@
+import multiprocessing as mp
+import typing as ty
+
+T = ty.TypeVar("T")
+
+
+class MpValue(ty.Protocol[T]):
+    def get_lock(self) -> ty.Any:
+        ...
+
+    value: T
+
+
+def inc(mp_val: MpValue[int]) -> int:
+    with mp_val.get_lock():
+        mp_val.value += 1
+        return mp_val.value
+
+
+LAUNCH_COUNT = mp.Value("i", 0)
+FINISH_COUNT = mp.Value("i", 0)
+# these are spooky - they're global and mutable, and may in fact get overwritten by code
+# using specific multiprocessing contexts.
+
+
+def to_name(count: int) -> str:
+    """Convert a count to a name."""
+    return f"{count:0>4}"
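Note: a quick illustration of the two helpers above: inc() bumps the value under the multiprocessing lock, and to_name() zero-pads to four digits.

    from thds.mops.k8s import counts

    n = counts.inc(counts.LAUNCH_COUNT)  # 1 on the first call in a fresh process
    print(counts.to_name(n))             # -> "0001"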
thds/mops/k8s/job_future.py
ADDED
@@ -0,0 +1,109 @@
+import threading
+import typing as ty
+
+from kubernetes import client
+
+from thds.core import futures, log
+from thds.termtool.colorize import colorized
+
+from . import config, counts, uncertain_future
+from .jobs import is_job_failed, is_job_succeeded, job_source
+
+logger = log.getLogger(__name__)
+
+UNUSUAL = colorized(fg="white", bg="yellow")
+SUCCEEDED = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+_FINISHED_JOBS = set[str]()
+_FINISHED_JOBS_LOCK = threading.Lock()
+
+
+def _check_newly_finished(job_name: str, namespace: str = "") -> str:
+    # I don't believe it's possible to ever have a Job that both succeeds and fails.
+    namespace = namespace or config.k8s_namespace()
+    job_full = f"{namespace}/{job_name}"
+    if job_full in _FINISHED_JOBS:
+        return ""
+
+    with _FINISHED_JOBS_LOCK:
+        if job_full in _FINISHED_JOBS:
+            return ""
+
+        _FINISHED_JOBS.add(job_full)
+
+        launched = counts.LAUNCH_COUNT.value
+        return f"- ({launched - counts.inc(counts.FINISH_COUNT)} unfinished of {launched})"
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+def make_job_completion_future(job_name: str, *, namespace: str = "") -> futures.PFuture[bool]:
+    """This is a natural boundary for a serializable lazy future - something that represents
+    work being done across process boundaries (since Kubernetes jobs will be listed via an API.
+
+    If True is returned, the Job has definitely succeeded.
+
+    If False is returned, the Job may have succeeded but we saw no evidence of it.
+
+    If the Job definitely failed, an Exception will be raised.
+    """
+
+    JOB_SEEN = False
+
+    def job_completion_interpreter(
+        job: ty.Optional[client.models.V1Job], last_seen_at: float
+    ) -> ty.Union[uncertain_future.NotYetDone, bool]:
+        nonlocal JOB_SEEN
+        if not job:
+            if JOB_SEEN:
+                logger.warning(
+                    UNUSUAL(f"Previously-seen job {job_name} no longer exists - assuming success!")
+                )
+                # we hereby indicate an unusual success to the Future waiter.
+                return False
+
+            time_since_last_seen = uncertain_future.official_timer() - last_seen_at
+            if time_since_last_seen > config.k8s_watch_object_stale_seconds():
+                # this is 5 minutes by default as of 2025-07-15.
+                raise TimeoutError(
+                    f"Job {job_name} has not been seen for {time_since_last_seen:.1f} seconds - assuming failure!"
+                )
+
+            # we don't know what's going on but things aren't truly stale yet.
+            return uncertain_future.NotYetDone()
+
+        JOB_SEEN = True
+
+        if is_job_succeeded(job):
+            newly_succeeded = _check_newly_finished(job_name, namespace)
+            if newly_succeeded:
+                logger.info(SUCCEEDED(f"Job {job_name} Succeeded! {newly_succeeded}"))
+            return True
+
+        if is_job_failed(job):
+            newly_failed = _check_newly_finished(job_name, namespace)
+            if newly_failed:
+                logger.error(FAILED(f"Job {job_name} Failed! {newly_failed}"))
+            raise K8sJobFailedError(f"Job {job_name} has failed with status: {job.status}")
+
+        return uncertain_future.NotYetDone()  # job is still in progress
+
+    return job_source().create_future(
+        job_completion_interpreter,
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
+
+
+def make_lazy_completion_future(job_name: str, *, namespace: str = "") -> futures.LazyFuture[bool]:
+    """This is a convenience function that will create a job completion future and then
+    immediately process it, returning the result. See docs on function above.
+    """
+    return futures.make_lazy(make_job_completion_future)(
+        job_name,
+        namespace=namespace or config.k8s_namespace(),
+    )
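Note: a hedged sketch of the completion contract documented above (the job name and namespace are hypothetical); following the lazy-futures pattern used throughout this release, nothing should be polled at construction time.

    from thds.mops.k8s import job_future

    lazy = job_future.make_lazy_completion_future("demo-job-0001", namespace="my-namespace")
    # When the future is eventually resolved (e.g. by mops' runner machinery):
    #   True  -> the Job was seen to succeed
    #   False -> a previously-seen Job disappeared; success is assumed
    #   K8sJobFailedError / TimeoutError -> observed failure, or no sighting for longer
    #   than mops.k8s.watch.object_stale_seconds (default 5 * 60 seconds)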
thds/mops/k8s/jobs.py
CHANGED
@@ -25,6 +25,10 @@ _JOB_SOURCE = WatchingObjectSource(
 )


+def job_source() -> WatchingObjectSource[client.models.V1Job]:
+    return _JOB_SOURCE
+
+
 def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
     return _JOB_SOURCE.get(job_name, namespace=namespace)

thds/mops/k8s/logging.py
CHANGED
@@ -32,6 +32,17 @@ BOINK = colorized(fg="white", bg="magenta")
 # string in this and it'll stand out.


+def should_log(job_name: str) -> bool:
+    if NO_K8S_LOGS():
+        return False
+
+    if random.random() > K8S_LOG_POD_FRACTION():
+        logger.info(f"Skipping log watcher for {job_name} due to fraction.")
+        return False
+
+    return True
+
+
 class JobLogWatcher:
     """Will spawn one or more daemon threads.

@@ -59,11 +70,7 @@ class JobLogWatcher:
     @core.scope.bound
     def start(self, failed_pod_name: str = "") -> None:
         """Call this one time - it will spawn threads as needed."""
-        if
-            return
-
-        if random.random() > K8S_LOG_POD_FRACTION():
-            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
+        if not should_log(self.job_name):
             return

         core.scope.enter(self.job_pods_discovery_lock)
@@ -245,3 +252,28 @@ def _scrape_pod_logs(
         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
         # at least let the caller know something went horribly wrong
         failure_callback(pod_name)
+
+
+_JOB_LOG_THREADS: set[str] = set()
+_JOB_LOG_THREAD_COUNT: int = 0
+_JOB_LOG_THREADS_LOCK = threading.Lock()
+
+
+def maybe_start_job_thread(job_name: str, num_pods_expected: int = 1) -> bool:
+    """Starts a thread to watch the logs of a job. Makes sure we only start one thread per
+    job even if there are multiple calls to this function.
+    """
+    if job_name not in _JOB_LOG_THREADS:
+        with _JOB_LOG_THREADS_LOCK:
+            if job_name not in _JOB_LOG_THREADS:
+                # double-checked locking to avoid creating multiple threads for the same job
+                _JOB_LOG_THREADS.add(job_name)
+                if should_log(job_name):
+                    global _JOB_LOG_THREAD_COUNT
+                    _JOB_LOG_THREAD_COUNT += 1
+                    logger.info(f"Starting log watcher {_JOB_LOG_THREAD_COUNT} for job {job_name}")
+                    threading.Thread(
+                        target=JobLogWatcher(job_name, num_pods_expected).start, daemon=True
+                    ).start()
+                    return True
+    return False