thds.mops 3.9.20250722200009__py3-none-any.whl → 3.9.20250722213952__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/mops/impure/runner.py +1 -1
- thds/mops/k8s/__init__.py +1 -3
- thds/mops/k8s/config.py +1 -1
- thds/mops/k8s/jobs.py +0 -4
- thds/mops/k8s/{_launch.py → launch.py} +57 -56
- thds/mops/k8s/logging.py +5 -37
- thds/mops/k8s/watch.py +62 -120
- thds/mops/pure/__init__.py +1 -2
- thds/mops/pure/_magic/sauce.py +3 -11
- thds/mops/pure/_magic/shims.py +2 -2
- thds/mops/pure/core/deferred_work.py +12 -15
- thds/mops/pure/core/entry/runner_registry.py +10 -1
- thds/mops/pure/core/lock/__init__.py +0 -1
- thds/mops/pure/core/lock/_acquire.py +2 -2
- thds/mops/pure/core/lock/maintain.py +3 -22
- thds/mops/pure/core/lock/write.py +19 -19
- thds/mops/pure/core/memo/__init__.py +1 -1
- thds/mops/pure/core/memo/results.py +4 -5
- thds/mops/pure/core/use_runner.py +7 -21
- thds/mops/pure/pickling/mprunner.py +14 -21
- thds/mops/pure/pickling/pickles.py +8 -19
- thds/mops/pure/pickling/remote.py +1 -3
- thds/mops/pure/runner/local.py +87 -58
- thds/mops/pure/runner/shim_builder.py +7 -7
- thds/mops/pure/runner/simple_shims.py +0 -7
- thds/mops/pure/runner/types.py +4 -15
- thds/mops/pure/tools/summarize/run_summary.py +8 -9
- {thds_mops-3.9.20250722200009.dist-info → thds_mops-3.9.20250722213952.dist-info}/METADATA +1 -1
- {thds_mops-3.9.20250722200009.dist-info → thds_mops-3.9.20250722213952.dist-info}/RECORD +32 -37
- thds/mops/k8s/batching.py +0 -198
- thds/mops/k8s/counts.py +0 -28
- thds/mops/k8s/job_future.py +0 -109
- thds/mops/k8s/uncertain_future.py +0 -160
- thds/mops/pure/runner/get_results.py +0 -106
- {thds_mops-3.9.20250722200009.dist-info → thds_mops-3.9.20250722213952.dist-info}/WHEEL +0 -0
- {thds_mops-3.9.20250722200009.dist-info → thds_mops-3.9.20250722213952.dist-info}/entry_points.txt +0 -0
- {thds_mops-3.9.20250722200009.dist-info → thds_mops-3.9.20250722213952.dist-info}/top_level.txt +0 -0
thds/mops/impure/runner.py
CHANGED
@@ -67,7 +67,7 @@ class KeyedLocalRunner(MemoizingPicklingRunner):
             redirect=lambda _f, _args, _kwargs: _perform_original_invocation,
         )

-    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
+    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
         actual_function_to_call = self._pre_pickle_redirect(raw_func, raw_args, raw_kwargs)
         with _ORIGINAL_F_ARGS_KWARGS.set((actual_function_to_call, raw_args, raw_kwargs)):
             return super().__call__(*self._impure_keyfunc(raw_func, raw_args, raw_kwargs))
thds/mops/k8s/__init__.py
CHANGED
@@ -7,10 +7,8 @@ except ModuleNotFoundError as mnf:
         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
     ) from mnf

-from . import batching, counts, job_future  # noqa: F401
-from ._launch import launch, shim  # noqa
 from .container_registry import autocr  # noqa: F401
-from .
+from .launch import K8sJobFailedError, launch, shim  # noqa
 from .node_selection import (  # noqa
     NodeNarrowing,
     ResourceDefinition,
thds/mops/k8s/config.py
CHANGED
@@ -10,7 +10,7 @@ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_
 # environment variable. it will not affect how your namespace is selected in the first
 # place.

-k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds",
+k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 30 * 60, parse=int)
 k8s_acr_url = config.item("mops.k8s.acr.url", "")
 k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
 k8s_job_cleanup_ttl_seconds_after_completion = config.item(
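
Note: the default staleness window for watched objects is now explicitly 30 minutes. A minimal sketch of reading the item, assuming the `thds.core.config` accessor behavior that watch.py relies on (items are called with no arguments to obtain their current value):

    from thds.mops.k8s import config

    # config.item(...) registers a named setting with a default; calling the
    # item returns the current value: 30 * 60 = 1800 seconds unless overridden.
    stale_seconds = config.k8s_watch_object_stale_seconds()
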
thds/mops/k8s/jobs.py
CHANGED
@@ -25,10 +25,6 @@ _JOB_SOURCE = WatchingObjectSource(
 )


-def job_source() -> WatchingObjectSource[client.models.V1Job]:
-    return _JOB_SOURCE
-
-
 def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
     return _JOB_SOURCE.get(job_name, namespace=namespace)

thds/mops/k8s/{_launch.py → launch.py}
RENAMED
@@ -4,22 +4,41 @@ import os
 import threading
 import typing as ty
 import uuid
-from functools import partial

 from kubernetes import client

-from thds import
+from thds.core import scope
+from thds.core.log import logger_context
 from thds.mops.pure.runner.simple_shims import samethread_shim
 from thds.termtool.colorize import colorized

-from . import config
+from . import config
 from ._shared import logger
 from .auth import load_config, upsert_namespace
+from .logging import JobLogWatcher
 from .node_selection import NodeNarrowing, ResourceDefinition
 from .retry import k8s_sdk_retry
 from .thds_std import embed_thds_auth
+from .wait_job import wait_for_job

 LAUNCHED = colorized(fg="white", bg="green")
+COMPLETE = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+class Counter:
+    def __init__(self) -> None:
+        self.value = 0
+        self._lock = threading.Lock()
+
+    def inc(self) -> int:
+        with self._lock:
+            self.value += 1
+            return self.value


 def sanitize_str(name: str) -> str:
@@ -30,7 +49,7 @@ def sanitize_str(name: str) -> str:

 def construct_job_name(user_prefix: str, job_num: str) -> str:
     # we want some consistency here, but also some randomness in case the prefixes don't exist or aren't unique.
-    mops_name_part = "-".join([
+    mops_name_part = "-".join([str(os.getpid()), sanitize_str(job_num), str(uuid.uuid4())[:8]])
     if len(mops_name_part) > 63:
         # this should be _impossible_, because having a job num longer than even 20 digits would be an impossibly large
         # number of jobs. but just in case, we'll truncate it to the last 63 characters.
@@ -46,11 +65,12 @@ def construct_job_name(user_prefix: str, job_num: str) -> str:
     return name


+_LAUNCH_COUNT = Counter()
+_FINISH_COUNT = Counter()
 _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
-JOB_NAME = core.stack_context.StackContext("job_name", "")


-@
+@scope.bound
 def launch(
     container_image: str,
     args: ty.Sequence[str],
@@ -61,46 +81,38 @@ def launch(
     # arguments below are for launching; arguments above are for
     # building. these should get separated in a future change.
     name_prefix: str = "",
-    full_name: str = "",
     dry_run: bool = False,
+    fire_and_forget: bool = False,
     suppress_logs: bool = False,
     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
     # this is a default for now. later if we share this code we'll need to have a wrapper interface
     service_account_name: str = "",
-) ->
+) -> None:
     """Launch a Kubernetes job.

     Required parameters are the container_image and the arguments to
     that image, just as if you were running this directly with Docker.

-
-    raise K8sJobFailedError if the Job fails.
+    Unless fire_and_forget=True, will poll until Job completes and
+    will raise K8sJobFailedError if the Job fails. None is returned
+    if the Job succeeds.

     `name_prefix` is an optional parameter for debugging/developer
     convenience. A generated suffix will be added to it.
+
     """
     if not container_image:
         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
-
-
-
-    # via a StackContext, so we check that here, and prefer it over name_prefix.
-
-    if full_name and name_prefix:
-        raise ValueError("You cannot specify both full_name and name_prefix; use one or the other.")
-
-    if not full_name:
-        name = construct_job_name(name_prefix, counts.to_name(counts.inc(counts.LAUNCH_COUNT)))
-    else:
-        name = full_name
-
-    core.scope.enter(core.log.logger_context(job=name))
+    job_num = f"{_LAUNCH_COUNT.inc():0>3}"
+    name = construct_job_name(name_prefix, job_num)
+    scope.enter(logger_context(job=name))
     node_narrowing = node_narrowing or dict()

     # TODO move this entire function out to be separately callable
     @k8s_sdk_retry()
     def assemble_base_job() -> client.models.V1Job:
         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
+        logger.debug("Fire and forget: %s", fire_and_forget)
         logger.debug("Loading kube configs ...")
         load_config()
         logger.debug("Populating job object ...")
@@ -173,7 +185,7 @@ def launch(
     if dry_run:
         job_with_all_transforms()
         logger.info("Dry run assembly successful; not launching...")
-        return
+        return

     @k8s_sdk_retry()
     def launch_job() -> client.models.V1Job:
@@ -186,41 +198,32 @@ def launch(
         )

     job = launch_job()
-    logger.info(LAUNCHED(f"Job {
-    return core.futures.make_lazy(_launch_logs_and_create_future)(  # see below for implementation
-        job.metadata.name,
-        num_pods_expected=len(job.spec.template.spec.containers),
-        namespace=config.k8s_namespace(),
-        suppress_logs=suppress_logs,
-    )
-
-
-# this function has to be a top level def because it will sometimes be transferred across process boundaries,
-# and Python/pickle in its infinite wisdom does not allow nested functions to be pickled.
-def _launch_logs_and_create_future(
-    job_name: str, *, num_pods_expected: int, namespace: str, suppress_logs: bool
-) -> core.futures.PFuture[bool]:
+    logger.info(LAUNCHED(f"Job {job_num} launched!") + f" on {container_image}")
     if not suppress_logs:
-
-
+        threading.Thread(  # fire and forget a log watching thread
+            target=JobLogWatcher(job.metadata.name, len(job.spec.template.spec.containers)).start,
+            daemon=True,
+        ).start()
+
+    if not fire_and_forget:

+        def counts() -> str:
+            launched = _LAUNCH_COUNT.value
+            return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"

-
-
-
-
-
-
-            namespace=namespace or config.k8s_namespace(),
-            suppress_logs=False,
-        )
+        job_name = job.metadata.name
+        del job  # trying to save memory here while we wait...
+        if not wait_for_job(job_name, short_name=job_num):
+            logger.error(FAILED(f"Job {job_num} Failed! {counts()}"))
+            raise K8sJobFailedError(f"Job {job_name} failed.")
+        logger.info(COMPLETE(f"Job {job_num} Complete! {counts()}"))


 def shim(
     container_image: ty.Union[str, ty.Callable[[], str]],
     disable_remote: ty.Callable[[], bool] = lambda: False,
     **outer_kwargs: ty.Any,
-) -> ty.Callable[[ty.Sequence[str]],
+) -> ty.Callable[[ty.Sequence[str]], None]:
     """Return a closure that can launch the given configuration and run a mops pure function.

     Now supports callables that return a container image name; the
@@ -237,18 +240,16 @@ def shim(
     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."

     if disable_remote():
-        return samethread_shim
+        return samethread_shim

     if isinstance(container_image, str):
         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
     else:
         get_container_image = container_image

-    def launch_container_on_k8s_with_args(
-        args: ty.Sequence[str], **inner_kwargs: ty.Any
-    ) -> core.futures.LazyFuture[bool]:
+    def launch_container_on_k8s_with_args(args: ty.Sequence[str], **inner_kwargs: ty.Any) -> None:
         assert "args" not in inner_kwargs
-
+        launch(
             get_container_image(),
             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
             **{**outer_kwargs, **inner_kwargs},
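
Taken together: launch is now a plain blocking call rather than a future factory, and the closure returned by shim returns None. A minimal usage sketch against the new signature (the image tag and module path are hypothetical, not taken from the package):

    from thds.mops.k8s import K8sJobFailedError, launch

    try:
        launch(
            "myregistry.azurecr.io/myimage:latest",  # hypothetical image
            ["python", "-m", "mypkg.task"],  # hypothetical container args
            name_prefix="demo",
        )  # blocks in wait_for_job; returns None on success
    except K8sJobFailedError:
        ...  # the Job terminated in a Failed state

    # Or submit and return immediately, skipping the wait and the exception:
    launch(
        "myregistry.azurecr.io/myimage:latest",
        ["python", "-m", "mypkg.task"],
        fire_and_forget=True,
    )

Job numbering now comes from the module's thread-safe Counter: _LAUNCH_COUNT.inc() yields 1, 2, 3, ..., which the format spec in f"{_LAUNCH_COUNT.inc():0>3}" zero-pads to "001", "002", ... for the short job name.
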
thds/mops/k8s/logging.py
CHANGED
@@ -32,17 +32,6 @@ BOINK = colorized(fg="white", bg="magenta")
 # string in this and it'll stand out.


-def should_log(job_name: str) -> bool:
-    if NO_K8S_LOGS():
-        return False
-
-    if random.random() > K8S_LOG_POD_FRACTION():
-        logger.info(f"Skipping log watcher for {job_name} due to fraction.")
-        return False
-
-    return True
-
-
 class JobLogWatcher:
     """Will spawn one or more daemon threads.

@@ -70,7 +59,11 @@ class JobLogWatcher:
     @core.scope.bound
     def start(self, failed_pod_name: str = "") -> None:
         """Call this one time - it will spawn threads as needed."""
-        if
+        if NO_K8S_LOGS():
+            return
+
+        if random.random() > K8S_LOG_POD_FRACTION():
+            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
             return

         core.scope.enter(self.job_pods_discovery_lock)
@@ -252,28 +245,3 @@ def _scrape_pod_logs(
         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
         # at least let the caller know something went horribly wrong
         failure_callback(pod_name)
-
-
-_JOB_LOG_THREADS: set[str] = set()
-_JOB_LOG_THREAD_COUNT: int = 0
-_JOB_LOG_THREADS_LOCK = threading.Lock()
-
-
-def maybe_start_job_thread(job_name: str, num_pods_expected: int = 1) -> bool:
-    """Starts a thread to watch the logs of a job. Makes sure we only start one thread per
-    job even if there are multiple calls to this function.
-    """
-    if job_name not in _JOB_LOG_THREADS:
-        with _JOB_LOG_THREADS_LOCK:
-            if job_name not in _JOB_LOG_THREADS:
-                # double-checked locking to avoid creating multiple threads for the same job
-                _JOB_LOG_THREADS.add(job_name)
-                if should_log(job_name):
-                    global _JOB_LOG_THREAD_COUNT
-                    _JOB_LOG_THREAD_COUNT += 1
-                    logger.info(f"Starting log watcher {_JOB_LOG_THREAD_COUNT} for job {job_name}")
-                    threading.Thread(
-                        target=JobLogWatcher(job_name, num_pods_expected).start, daemon=True
-                    ).start()
-                return True
-    return False
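
The sampling that used to live in the module-level should_log helper now happens inside JobLogWatcher.start itself, so callers (like the new launch.py) can spawn a watcher thread unconditionally. The gate in isolation, as a sketch with an invented function name:

    import random

    def sampled(fraction: float) -> bool:
        # mirrors the check in JobLogWatcher.start: a job's logs are watched
        # only when a uniform draw lands within the configured fraction
        return random.random() <= fraction

    watch_logs = sampled(0.25)  # e.g. roughly one in four jobs gets a watcher
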
thds/mops/k8s/watch.py
CHANGED
@@ -12,20 +12,17 @@ import urllib3
 from kubernetes import client
 from kubernetes import watch as k8s_watch

-from thds.core import
+from thds.core import scope
 from thds.core.log import getLogger, logger_context
 from thds.termtool.colorize import colorized

 from . import config
 from .auth import load_config
 from .too_old_resource_version import parse_too_old_resource_version
-from .uncertain_future import FutureInterpreter, UncertainFuturesTracker

 logger = getLogger(__name__)

 T = ty.TypeVar("T")
-K = ty.TypeVar("K")
-R = ty.TypeVar("R")


 class V1List(ty.Protocol[T]):
@@ -118,6 +115,10 @@ def callback_events(
         break


+def _make_name(namespace: str, name: str) -> str:
+    return f"{namespace}/{name}"
+
+
 def _default_get_name(obj: ty.Any) -> str:
     return obj.metadata.name

@@ -147,15 +148,9 @@ class OneShotLimiter:
         self._names.add(name)


-def _watch_timer() -> float:
-    # in this context, monotonicity (actual timing) is most useful because we don't need sentinels.
-    return time.monotonic()
-
-
 def is_stale(api_last_update_time: float, obj_last_seen_time: float) -> bool:
-    now =
+    now = time.monotonic()
     allowed_stale_seconds = config.k8s_watch_object_stale_seconds()
-    # about 5 minutes by default as of 2025-07-15.
     if (time_since_api_update := now - api_last_update_time) > allowed_stale_seconds:  # noqa: F841
         # we haven't heard anything from the API in a while; probably
         # the API is down. Ignore object staleness to avoid false positives.
@@ -228,146 +223,93 @@ def watch_forever(
         break


-class
-    """
-
+class WatchingObjectSource(ty.Generic[T]):
+    """Efficiently 'get' objects by reliably watching for changes to all such objects in a given namespace.
+
+    This is network-efficient for observing many different objects,
+    but not memory efficient if you really only need to fetch details
+    for a few objects.
     """

     def __init__(
         self,
-
+        get_list_method: GetListMethod[T],
+        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
+            ty.Callable[[T], str], _default_get_name
+        ),
+        backup_fetch: ty.Optional[ty.Callable[[str, str], T]] = None,
+        typename: str = "object",
+        starting: ty.Callable[[str], str] = STARTING,
     ) -> None:
-        self.
+        self.get_list_method = get_list_method
+        self.get_name = get_name
+        self.backup_fetch = backup_fetch
+        self.typename = typename
+        self._objs_by_name: ty.Dict[str, T] = dict()
         # ^ is a possibly big/expensive local cache of the most recent
         # state for all of the event type in the namespace. Don't use
         # this class if you can't afford the memory overhead of
         # observing everything in your namespace and keeping the last
         # known copy of everything forever.
-        self.
+        self._last_seen_time_by_name: ty.Dict[str, float] = dict()
         self._last_api_update_time = 0.0
-        self.
+        self._limiter = OneShotLimiter()

-    def
-
-
-
-
+    def _start_thread(self, namespace: str) -> None:
+        create_watch_thread(
+            self.get_list_method, self._add_object, namespace, typename=self.typename
+        ).start()
+
+    def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
+        """This is where we receive updates from the k8s API."""
+        self._last_api_update_time = time.monotonic()
+
+        if not obj:
+            logger.warning(f"Received null/empty {self.typename}")
+            return

-
-
+        name = _make_name(namespace, self.get_name(obj))
+        logger.debug(f"{self.typename} {name} updated")
+        self._last_seen_time_by_name[name] = time.monotonic()
+        self._objs_by_name[name] = obj
+
+    def _is_stale(self, name: str) -> bool:
+        return is_stale(self._last_api_update_time, self._last_seen_time_by_name.get(name) or 0)
+
+    @scope.bound
+    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
+        namespace = namespace or config.k8s_namespace()
+        name = _make_name(namespace, obj_name)
+        scope.enter(logger_context(name=obj_name, namespace=namespace))

-    def get(self, key: K) -> ty.Optional[T]:
         # first try is looking in our local cache
-        if (obj := self.
+        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
             return obj

         # second try is making sure the namespace watcher is running, sleeping, and then looking in the cache again.
         # This is much more efficient than a manual fetch.
+        self._limiter(namespace, self._start_thread)
         time.sleep(config.k8s_monitor_delay())
-        if (obj := self.
+        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
             return obj

         # if that doesn't work, try a manual fetch.
         if self.backup_fetch:
-            logger.warning(f"Manually fetching {
+            logger.warning(f"Manually fetching {self.typename}...")
             # doing a lot of manual fetches may indicate that the k8s API is having trouble keeping up...
             try:
-                if obj := self.backup_fetch(
-                    self.
+                if obj := self.backup_fetch(namespace, obj_name):
+                    self._add_object(namespace, obj, "FETCH")  # updates last seen, too
                     return obj

             except Exception:
-                logger.exception(f"Unexpected error during manual fetch of {
+                logger.exception(f"Unexpected error during manual fetch of {self.typename}.")

-        if self._is_stale(
+        if self._is_stale(name):
             logger.warning(
-                f"Could not refresh {
+                f"Could not refresh {name}, and our record of it is stale - dropping stale object!"
             )
-            self.
-            self.
+            self._objs_by_name.pop(name, None)
+            self._last_seen_time_by_name.pop(name, None)

         return None
-
-
-class WatchingObjectSource(ty.Generic[T]):
-    """Efficiently 'get' objects by launching a single thread to
-    watch for changes to all such objects in a given namespace.
-
-    Also provide a way to create a future that will be resolved according to the logic
-    provided by the caller whenever an object is updated, or if the object has not been
-    updated in a while.
-
-    Importantly, the Futures are only prevented from deadlocking (never awakening their
-    condition variable) by the fact that we very occasionally will go through the list
-    of seen objects and raise Exceptions for objects that have not been updated in a while.
-    This is vaguely akin to garbage collection, in that it will occasionally
-    cause a 'pause' in the watcher thread as it tries to collect stale objects.
-
-    This is network-efficient for observing many different objects,
-    but not memory efficient if you really only need to fetch details
-    for a few objects, because we retain the last known state for every observed object indefinitely.
-    """
-
-    def __init__(
-        self,
-        get_list_method: GetListMethod[T],
-        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
-            ty.Callable[[T], str], _default_get_name
-        ),
-        backup_fetch: ty.Optional[ty.Callable[[str, str], ty.Optional[T]]] = None,
-        typename: str = "object",
-    ) -> None:
-        self.get_list_method = get_list_method
-        self.get_name = get_name
-        self.typename = typename
-        self._limiter = OneShotLimiter()
-        self._uncertain_futures = UncertainFuturesTracker[tuple[str, str], T](
-            config.k8s_watch_object_stale_seconds()
-        )
-        self._seen_objects = _SeenObjectContainer[tuple[str, str], T](
-            lambda namespace_and_name: backup_fetch(*namespace_and_name) if backup_fetch else None
-        )
-
-    def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
-        """This is where we receive updates from the k8s API."""
-        if not obj:
-            logger.warning(f"Received null/empty {self.typename}")
-            return
-
-        key = (namespace, self.get_name(obj))
-        self._seen_objects.set_object(key, obj)
-        self._uncertain_futures.update(key, obj)
-        logger.debug("%s %s updated", self.typename, key)
-
-    def _start_namespace_watcher_thread(self, namespace: str) -> None:
-        create_watch_thread(
-            self.get_list_method, self._add_object, namespace, typename=self.typename
-        ).start()
-
-    @scope.bound
-    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
-        """May block for a little while if a manual fetch is required."""
-        namespace = namespace or config.k8s_namespace()
-        scope.enter(logger_context(name=obj_name, namespace=namespace))
-        self._limiter(namespace, self._start_namespace_watcher_thread)
-        return self._seen_objects.get((namespace, obj_name))
-
-    def create_future(
-        self,
-        interpreter: FutureInterpreter[T, R],
-        obj_name: str,
-        *,
-        namespace: str = "",
-    ) -> futures.PFuture[R]:
-        """Create a future that will be resolved when the object is available according to
-        the interpreter.
-
-        The FutureInterpreter must:
-        - raise an exception if it wishes the future to raise.
-        - return a Done with the result if it wishes the future to resolve successfully.
-        - return None if the status is still in progress.
-        """
-        namespace = namespace or config.k8s_namespace()
-        self._limiter(namespace, self._start_namespace_watcher_thread)
-        return self._uncertain_futures.create((namespace, obj_name), interpreter)
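
The consolidated WatchingObjectSource replaces the _SeenObjectContainer/UncertainFuturesTracker pair with two plain dicts keyed by "namespace/name" strings. A sketch of jobs.py-style usage; the kubernetes client methods are assumptions based on the standard BatchV1Api, not taken from this diff:

    from kubernetes import client

    from thds.mops.k8s.watch import WatchingObjectSource

    batch = client.BatchV1Api()
    job_source = WatchingObjectSource(
        batch.list_namespaced_job,  # feeds the namespace watch thread
        backup_fetch=lambda ns, name: batch.read_namespaced_job(name, ns),
        typename="job",
    )

    # Served from the local cache when fresh; otherwise the source starts the
    # namespace watcher, sleeps k8s_monitor_delay(), and finally falls back
    # to backup_fetch before dropping any stale record and returning None.
    job = job_source.get("my-job-name", namespace="my-namespace")  # hypothetical names
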
thds/mops/pure/__init__.py
CHANGED
@@ -7,7 +7,6 @@
 from . import adls  # noqa
 from ._magic.api import magic  # noqa
 from .core.entry import register_entry_handler
-from .core.lock.maintain import no_maintain as no_maintain_locks  # noqa: F401
 from .core.memo import results  # noqa
 from .core.memo.function_memospace import (  # noqa
     add_pipeline_memospace_handlers,
@@ -22,7 +21,7 @@ from .core.use_runner import use_runner  # noqa
 from .pickling.memoize_only import memoize_in  # noqa
 from .pickling.mprunner import MemoizingPicklingRunner  # noqa
 from .runner.simple_shims import samethread_shim, subprocess_shim  # noqa
-from .runner.types import
+from .runner.types import Shim, ShimBuilder  # noqa


 def _register_things() -> None:
thds/mops/pure/_magic/sauce.py
CHANGED
@@ -6,7 +6,7 @@ import typing as ty

 from typing_extensions import ParamSpec

-from thds.core import
+from thds.core import stack_context
 from thds.mops._utils import config_tree

 from ..core import file_blob_store, pipeline_id, pipeline_id_mask, uris
@@ -109,7 +109,7 @@ class Magic(ty.Generic[P, R]):
     def _is_off(self) -> bool:
         return self._shim_builder_or_off is None

-    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
+    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
         # this can be set using a stack-local context, or set globally as specifically
         # or generally as the user needs. We prefer stack local over everything else.
         sb = self._shim_builder_or_off
@@ -123,16 +123,8 @@ class Magic(ty.Generic[P, R]):
     def _pipeline_id(self) -> str:
         return self.config.pipeline_id.getv(self._func_config_path)

-    def submit(self, *args: P.args, **kwargs: P.kwargs) -> futures.PFuture[R]:
-        """A futures-based interface that doesn't block on the result of the wrapped
-        function call, but returns a PFuture once either a result has been found or a
-        new invocation has been started.
-        """
-        with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
-            return self.runner.submit(self.__wrapped__, *args, **kwargs)
-
     def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R:
-        """This is the wrapped function
+        """This is the wrapped function."""
         with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
             return self._func(*args, **kwargs)
thds/mops/pure/_magic/shims.py
CHANGED
@@ -4,14 +4,14 @@ from thds import core

 from ..runner.shim_builder import make_builder
 from ..runner.simple_shims import samethread_shim, subprocess_shim
-from ..runner.types import
+from ..runner.types import Shim, ShimBuilder

 ShimName = ty.Literal[
     "samethread",  # memoization and coordination, but run in the same thread as the caller.
     "subprocess",  # memoization and coordination, but transfer to a subprocess rather than remote.
     "off",  # equivalent to None - disables use of mops.
 ]
-ShimOrBuilder = ty.Union[ShimBuilder, Shim
+ShimOrBuilder = ty.Union[ShimBuilder, Shim]
 logger = core.log.getLogger(__name__)

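
For reference, the three ShimName literals correspond to the shims imported above; a hypothetical resolver (not the module's actual one) makes the mapping concrete, using the module's own names:

    def resolve(name: ShimName) -> ty.Optional[ShimOrBuilder]:
        if name == "off":
            return None  # equivalent to passing None: disables use of mops
        return {"samethread": samethread_shim, "subprocess": subprocess_shim}[name]
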