thds.mops 3.9.20250721225429__py3-none-any.whl → 3.9.20250721231027__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/mops/impure/runner.py +1 -1
- thds/mops/k8s/__init__.py +3 -1
- thds/mops/k8s/{launch.py → _launch.py} +56 -57
- thds/mops/k8s/batching.py +198 -0
- thds/mops/k8s/config.py +1 -1
- thds/mops/k8s/counts.py +28 -0
- thds/mops/k8s/job_future.py +109 -0
- thds/mops/k8s/jobs.py +4 -0
- thds/mops/k8s/logging.py +37 -5
- thds/mops/k8s/uncertain_future.py +160 -0
- thds/mops/k8s/watch.py +120 -62
- thds/mops/pure/__init__.py +2 -1
- thds/mops/pure/_magic/sauce.py +11 -3
- thds/mops/pure/_magic/shims.py +2 -2
- thds/mops/pure/core/deferred_work.py +0 -8
- thds/mops/pure/core/entry/runner_registry.py +1 -10
- thds/mops/pure/core/lock/__init__.py +1 -0
- thds/mops/pure/core/lock/_acquire.py +2 -2
- thds/mops/pure/core/lock/maintain.py +22 -3
- thds/mops/pure/core/lock/write.py +19 -19
- thds/mops/pure/core/memo/__init__.py +1 -1
- thds/mops/pure/core/memo/results.py +5 -4
- thds/mops/pure/core/use_runner.py +21 -7
- thds/mops/pure/pickling/mprunner.py +21 -14
- thds/mops/pure/pickling/pickles.py +19 -8
- thds/mops/pure/pickling/remote.py +3 -1
- thds/mops/pure/runner/get_results.py +106 -0
- thds/mops/pure/runner/local.py +58 -87
- thds/mops/pure/runner/shim_builder.py +7 -7
- thds/mops/pure/runner/simple_shims.py +7 -0
- thds/mops/pure/runner/types.py +15 -4
- thds/mops/pure/tools/summarize/run_summary.py +9 -8
- {thds_mops-3.9.20250721225429.dist-info → thds_mops-3.9.20250721231027.dist-info}/METADATA +1 -1
- {thds_mops-3.9.20250721225429.dist-info → thds_mops-3.9.20250721231027.dist-info}/RECORD +37 -32
- {thds_mops-3.9.20250721225429.dist-info → thds_mops-3.9.20250721231027.dist-info}/WHEEL +0 -0
- {thds_mops-3.9.20250721225429.dist-info → thds_mops-3.9.20250721231027.dist-info}/entry_points.txt +0 -0
- {thds_mops-3.9.20250721225429.dist-info → thds_mops-3.9.20250721231027.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import collections
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
import typing as ty
|
|
5
|
+
|
|
6
|
+
# we use concurrent.futures.Future as an implementation detail, but it's communicated
|
|
7
|
+
# as core.futures.PFuture to give us the flexibility to change the implementation later if needed.
|
|
8
|
+
from concurrent.futures import Future
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from uuid import uuid4
|
|
11
|
+
|
|
12
|
+
from typing_extensions import Self
|
|
13
|
+
|
|
14
|
+
from thds import core
|
|
15
|
+
|
|
16
|
+
R_0 = ty.TypeVar("R_0", contravariant=True) # R-naught - the thing that might resolve a Future.
|
|
17
|
+
# a value for this type may never be None.
|
|
18
|
+
|
|
19
|
+
R = ty.TypeVar("R")
|
|
20
|
+
# the Result type of the Future. These are allowed to be None, since some Futures may
|
|
21
|
+
# resolve but not return a value.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NotYetDone:
    """Sentinel type: an interpreter returns an instance of this to say 'still in progress'.

    It carries no data; only its type matters (callers check it with isinstance).
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_LastSeenAt = float  # type alias for the last seen time of the observed object.
# NOTE: this is an official_timer() reading, i.e. time.monotonic() seconds - it is NOT
# seconds since the epoch, and is only meaningful when compared to other such readings.


FutureInterpreter = ty.Callable[[ty.Optional[R_0], _LastSeenAt], ty.Union[R, NotYetDone]]
# a FutureInterpreter is a function that takes an object R_0 (or None, when it is being
# called only because the object has not been seen for a while) and the time.monotonic() at
# which it was last seen, and returns either NotYetDone (if the status is still in progress) or
# the actual Future result of type R, or, if the status is failure,
# _raises_ an appropriate Exception.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _FutureInterpretationShim(ty.Generic[R_0, R]):
    """Pairs a Future with the interpreter that decides when (and how) it resolves.

    Calling the shim runs the interpreter once and, if the interpreter reaches a
    verdict, resolves ``self.future`` with either a result or an exception.
    """

    def __init__(self, interpreter: FutureInterpreter[R_0, ty.Union[NotYetDone, R]]) -> None:
        self.future = Future[R]()
        self._interpreter = interpreter
        self._id = uuid4().hex  # has an id so it can be hashed and therefore easily found in a set

    def __hash__(self) -> int:
        return hash(self._id)

    def __call__(self, r_0: ty.Optional[R_0], last_seen_at: float) -> ty.Optional[Self]:
        """First and foremost - this _must_ be treated as an object that the creator
        is ultimately responsible for calling on a semi-regular basis. It represents a
        likely deadlock for the holder of the Future if it is never called.

        Return None if the Future is still in progress and should not be unregistered.
        Return self if the Future is done and should be unregistered.
        """
        if self.future.done():
            # Already resolved by an earlier call, or cancelled by its holder. Setting a
            # result/exception again would raise InvalidStateError into the caller, so
            # just report that this shim can be unregistered.
            return self

        try:
            interpretation = self._interpreter(r_0, last_seen_at)
        except Exception as e:
            # the interpreter signals failure by raising; hand that to the Future's holder.
            self.future.set_exception(e)
            return self

        if isinstance(interpretation, NotYetDone):
            return None  # do nothing and do not unregister - the status is still in progress.

        self.future.set_result(interpretation)
        return self
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
K = ty.TypeVar("K") # Key type for the UncertainFuturesTracker
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class _FuturesState(ty.Generic[R_0]):
    """State for one observed key: every pending Future shim registered against it, plus
    the timer reading at which the underlying object was last observed.
    """

    # shims whose Futures are still unresolved; several callers may await the same key.
    futshims: list[_FutureInterpretationShim[R_0, ty.Any]]
    # official_timer() reading of the last observation (compared against the staleness window).
    last_seen_at: float
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def official_timer() -> float:
    """Return the clock reading used for all staleness bookkeeping in this module.

    The absolute value has no meaning; only differences between readings matter,
    which is why a monotonic clock is used.
    """
    reading = time.monotonic()
    return reading
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class UncertainFuturesTracker(ty.Generic[K, R_0]):
    """This class represents a kind of Future where we cannot be guaranteed that we will ever see
    any further information about it, because we do not control the source of the data.

    A good example would be a Kubernetes object that we are watching - we may _think_ that a Job will be created,
    but there are race conditions galore in terms of actually looking for that object.

    However, if we _do_ see it at some point, then we can interpret future 'missingness'
    as a tentative success.

    The danger with this uncertainty is that Futures represent implicit deadlocks - if we
    never resolve the Future, then a caller may be waiting for it forever. Therefore, we
    ask the original requestor of the Future to specify how long they are willing to wait
    to get a result, after which point we will resolve the Future as an exception.

    Internally, keys are kept in an OrderedDict ordered (approximately) from least- to
    most-recently seen, so that the staleness sweep in `update` can stop at the first
    entry that is not yet stale.
    """

    def __init__(self, allowed_stale_seconds: float) -> None:
        self._keyed_futures_state = collections.OrderedDict[K, _FuturesState[R_0]]()
        self._lock = threading.Lock()  # i don't trust ordered dict operations to be thread-safe.
        self._check_stale_seconds = allowed_stale_seconds

    def create(self, key: K, interpreter: FutureInterpreter[R_0, R]) -> core.futures.PFuture[R]:
        """Register a new Future for `key`, to be resolved by `interpreter`.

        The Future only makes progress when `update` is called (for this key, or for any
        key once this one has gone stale).
        """
        futshim = _FutureInterpretationShim(interpreter)
        with self._lock:
            fut_state = self._keyed_futures_state.get(key)
            if fut_state is None:
                self._keyed_futures_state[key] = _FuturesState(
                    [futshim],
                    last_seen_at=official_timer() + self._check_stale_seconds,
                    # we provide a double margin for objects that we have never seen before.
                )
                # The new entry deliberately stays at the END of the ordered dict: its
                # future-dated last_seen_at is the largest in the dict. Putting it at the
                # front would make the sweep in `update` hit a non-stale entry first and
                # stop, hiding every genuinely stale entry behind it.
            else:
                fut_state.futshims.append(futshim)

        return futshim.future

    def update(self, key: ty.Optional[K], r_0: ty.Optional[R_0]) -> None:
        """Update the keyed Futures based on their interpreters.

        Also check any stale Futures - Futures that have not seen an update (via their key) in a while.
        Their interpreters are called with None in place of the object.

        If `key` is None, no key is marked as seen; only the staleness check runs.
        """

        def check_resolution(fut_state: _FuturesState[R_0], inner_r_0: ty.Optional[R_0]) -> None:
            with self._lock:
                # snapshot, so a concurrent create() appending a shim cannot interfere.
                pending_shims = list(fut_state.futshims)
            if not pending_shims:
                return  # nothing to resolve; skip dispatching an empty batch.

            for future_shim_that_is_done in core.parallel.yield_results(
                [
                    core.thunks.thunking(futshim)(inner_r_0, fut_state.last_seen_at)
                    for futshim in pending_shims
                ],
                progress_logger=core.log.getLogger(__name__).debug,
                named="UncertainFuturesTracker.update",
            ):
                if future_shim_that_is_done is None:
                    continue
                # the Future is done, so we can remove it from the list of Futures.
                with self._lock:
                    # another thread running update() may already have removed it.
                    if future_shim_that_is_done in fut_state.futshims:
                        fut_state.futshims.remove(future_shim_that_is_done)

        if key is not None:
            with self._lock:
                now = official_timer()
                fut_state = self._keyed_futures_state.get(key)
                if fut_state is None:
                    fut_state = _FuturesState(list(), last_seen_at=now)
                    self._keyed_futures_state[key] = fut_state
                else:
                    # maintain our ordered dict so we can handle garbage collection of stale Futures.
                    self._keyed_futures_state.move_to_end(key)
                    fut_state.last_seen_at = now

            # interpreters may be slow, so they run outside the lock.
            check_resolution(fut_state, r_0)

        # 'garbage collect' any Futures that haven't been updated in a while.
        # The dict is only walked while holding the lock: create() and update() on other
        # threads mutate it, and mutating an OrderedDict during iteration raises RuntimeError.
        stale_states: list[_FuturesState[R_0]] = []
        with self._lock:
            now = official_timer()
            for futs_state in self._keyed_futures_state.values():
                if futs_state.last_seen_at + self._check_stale_seconds < now:
                    stale_states.append(futs_state)
                else:  # these are ordered, so once we see one that's not stale, we can stop checking.
                    # this prevents us from having to do O(N) checks for every update.
                    break

        for futs_state in stale_states:
            check_resolution(futs_state, None)
|
thds/mops/k8s/watch.py
CHANGED
|
@@ -12,17 +12,20 @@ import urllib3
|
|
|
12
12
|
from kubernetes import client
|
|
13
13
|
from kubernetes import watch as k8s_watch
|
|
14
14
|
|
|
15
|
-
from thds.core import scope
|
|
15
|
+
from thds.core import futures, scope
|
|
16
16
|
from thds.core.log import getLogger, logger_context
|
|
17
17
|
from thds.termtool.colorize import colorized
|
|
18
18
|
|
|
19
19
|
from . import config
|
|
20
20
|
from .auth import load_config
|
|
21
21
|
from .too_old_resource_version import parse_too_old_resource_version
|
|
22
|
+
from .uncertain_future import FutureInterpreter, UncertainFuturesTracker
|
|
22
23
|
|
|
23
24
|
logger = getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
T = ty.TypeVar("T")
|
|
27
|
+
K = ty.TypeVar("K")
|
|
28
|
+
R = ty.TypeVar("R")
|
|
26
29
|
|
|
27
30
|
|
|
28
31
|
class V1List(ty.Protocol[T]):
|
|
@@ -115,10 +118,6 @@ def callback_events(
|
|
|
115
118
|
break
|
|
116
119
|
|
|
117
120
|
|
|
118
|
-
def _make_name(namespace: str, name: str) -> str:
|
|
119
|
-
return f"{namespace}/{name}"
|
|
120
|
-
|
|
121
|
-
|
|
122
121
|
def _default_get_name(obj: ty.Any) -> str:
|
|
123
122
|
return obj.metadata.name
|
|
124
123
|
|
|
@@ -148,9 +147,15 @@ class OneShotLimiter:
|
|
|
148
147
|
self._names.add(name)
|
|
149
148
|
|
|
150
149
|
|
|
150
|
+
def _watch_timer() -> float:
|
|
151
|
+
# in this context, monotonicity (actual timing) is most useful because we don't need sentinels.
|
|
152
|
+
return time.monotonic()
|
|
153
|
+
|
|
154
|
+
|
|
151
155
|
def is_stale(api_last_update_time: float, obj_last_seen_time: float) -> bool:
|
|
152
|
-
now =
|
|
156
|
+
now = _watch_timer()
|
|
153
157
|
allowed_stale_seconds = config.k8s_watch_object_stale_seconds()
|
|
158
|
+
# about 5 minutes by default as of 2025-07-15.
|
|
154
159
|
if (time_since_api_update := now - api_last_update_time) > allowed_stale_seconds: # noqa: F841
|
|
155
160
|
# we haven't heard anything from the API in a while; probably
|
|
156
161
|
# the API is down. Ignore object staleness to avoid false positives.
|
|
@@ -223,93 +228,146 @@ def watch_forever(
|
|
|
223
228
|
break
|
|
224
229
|
|
|
225
230
|
|
|
226
|
-
class
|
|
227
|
-
"""
|
|
228
|
-
|
|
229
|
-
This is network-efficient for observing many different objects,
|
|
230
|
-
but not memory efficient if you really only need to fetch details
|
|
231
|
-
for a few objects.
|
|
231
|
+
class _SeenObjectContainer(ty.Generic[K, T]):
|
|
232
|
+
"""Splits some of the logic for 'get' out of WatchingObjectSource
|
|
233
|
+
so that we can have it be a simpler container for both this and the UncertainFuturesTracker.
|
|
232
234
|
"""
|
|
233
235
|
|
|
234
236
|
def __init__(
|
|
235
237
|
self,
|
|
236
|
-
|
|
237
|
-
get_name: ty.Callable[[T], str] = ty.cast( # noqa: B008
|
|
238
|
-
ty.Callable[[T], str], _default_get_name
|
|
239
|
-
),
|
|
240
|
-
backup_fetch: ty.Optional[ty.Callable[[str, str], T]] = None,
|
|
241
|
-
typename: str = "object",
|
|
242
|
-
starting: ty.Callable[[str], str] = STARTING,
|
|
238
|
+
backup_fetch: ty.Optional[ty.Callable[[K], ty.Optional[T]]] = None,
|
|
243
239
|
) -> None:
|
|
244
|
-
self.
|
|
245
|
-
self.get_name = get_name
|
|
246
|
-
self.backup_fetch = backup_fetch
|
|
247
|
-
self.typename = typename
|
|
248
|
-
self._objs_by_name: ty.Dict[str, T] = dict()
|
|
240
|
+
self._objs: ty.Dict[K, T] = dict()
|
|
249
241
|
# ^ is a possibly big/expensive local cache of the most recent
|
|
250
242
|
# state for all of the event type in the namespace. Don't use
|
|
251
243
|
# this class if you can't afford the memory overhead of
|
|
252
244
|
# observing everything in your namespace and keeping the last
|
|
253
245
|
# known copy of everything forever.
|
|
254
|
-
self.
|
|
246
|
+
self._last_seen_times: ty.Dict[K, float] = dict()
|
|
255
247
|
self._last_api_update_time = 0.0
|
|
256
|
-
self.
|
|
257
|
-
|
|
258
|
-
def _start_thread(self, namespace: str) -> None:
|
|
259
|
-
create_watch_thread(
|
|
260
|
-
self.get_list_method, self._add_object, namespace, typename=self.typename
|
|
261
|
-
).start()
|
|
262
|
-
|
|
263
|
-
def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
|
|
264
|
-
"""This is where we receive updates from the k8s API."""
|
|
265
|
-
self._last_api_update_time = time.monotonic()
|
|
266
|
-
|
|
267
|
-
if not obj:
|
|
268
|
-
logger.warning(f"Received null/empty {self.typename}")
|
|
269
|
-
return
|
|
248
|
+
self.backup_fetch = backup_fetch
|
|
270
249
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
self.
|
|
250
|
+
def set_object(self, key: K, obj: T) -> None:
|
|
251
|
+
"""Set an object in the cache, updating the last seen time."""
|
|
252
|
+
now = _watch_timer()
|
|
253
|
+
self._last_api_update_time = now
|
|
254
|
+
self._last_seen_times[key] = now
|
|
255
|
+
self._objs[key] = obj
|
|
275
256
|
|
|
276
|
-
def _is_stale(self,
|
|
277
|
-
return is_stale(self._last_api_update_time, self.
|
|
278
|
-
|
|
279
|
-
@scope.bound
|
|
280
|
-
def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
|
|
281
|
-
namespace = namespace or config.k8s_namespace()
|
|
282
|
-
name = _make_name(namespace, obj_name)
|
|
283
|
-
scope.enter(logger_context(name=obj_name, namespace=namespace))
|
|
257
|
+
def _is_stale(self, key: K) -> bool:
|
|
258
|
+
return is_stale(self._last_api_update_time, self._last_seen_times.get(key) or 0)
|
|
284
259
|
|
|
260
|
+
def get(self, key: K) -> ty.Optional[T]:
|
|
285
261
|
# first try is looking in our local cache
|
|
286
|
-
if (obj := self.
|
|
262
|
+
if (obj := self._objs.get(key)) and not self._is_stale(key):
|
|
287
263
|
return obj
|
|
288
264
|
|
|
289
265
|
# second try is making sure the namespace watcher is running, sleeping, and then looking in the cache again.
|
|
290
266
|
# This is much more efficient than a manual fetch.
|
|
291
|
-
self._limiter(namespace, self._start_thread)
|
|
292
267
|
time.sleep(config.k8s_monitor_delay())
|
|
293
|
-
if (obj := self.
|
|
268
|
+
if (obj := self._objs.get(key)) and not self._is_stale(key):
|
|
294
269
|
return obj
|
|
295
270
|
|
|
296
271
|
# if that doesn't work, try a manual fetch.
|
|
297
272
|
if self.backup_fetch:
|
|
298
|
-
logger.warning(f"Manually fetching {
|
|
273
|
+
logger.warning(f"Manually fetching {key}...")
|
|
299
274
|
# doing a lot of manual fetches may indicate that the k8s API is having trouble keeping up...
|
|
300
275
|
try:
|
|
301
|
-
if obj := self.backup_fetch(
|
|
302
|
-
self.
|
|
276
|
+
if obj := self.backup_fetch(key):
|
|
277
|
+
self.set_object(key, obj) # updates last seen, too
|
|
303
278
|
return obj
|
|
304
279
|
|
|
305
280
|
except Exception:
|
|
306
|
-
logger.exception(f"Unexpected error during manual fetch of {
|
|
281
|
+
logger.exception(f"Unexpected error during manual fetch of {key}.")
|
|
307
282
|
|
|
308
|
-
if self._is_stale(
|
|
283
|
+
if self._is_stale(key):
|
|
309
284
|
logger.warning(
|
|
310
|
-
f"Could not refresh {
|
|
285
|
+
f"Could not refresh {key}, and our record of it is stale - dropping stale object!"
|
|
311
286
|
)
|
|
312
|
-
self.
|
|
313
|
-
self.
|
|
287
|
+
self._objs.pop(key, None)
|
|
288
|
+
self._last_seen_times.pop(key, None)
|
|
314
289
|
|
|
315
290
|
return None
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
class WatchingObjectSource(ty.Generic[T]):
    """Efficiently 'get' objects by launching a single thread to
    watch for changes to all such objects in a given namespace.

    Also provide a way to create a future that will be resolved according to the logic
    provided by the caller whenever an object is updated, or if the object has not been
    updated in a while.

    Importantly, the Futures are only prevented from deadlocking (never awakening their
    condition variable) by the fact that we very occasionally will go through the list
    of seen objects and raise Exceptions for objects that have not been updated in a while.
    This is vaguely akin to garbage collection, in that it will occasionally
    cause a 'pause' in the watcher thread as it tries to collect stale objects.

    This is network-efficient for observing many different objects,
    but not memory efficient if you really only need to fetch details
    for a few objects, because we retain the last known state for every observed object indefinitely.
    """

    def __init__(
        self,
        get_list_method: GetListMethod[T],
        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
            ty.Callable[[T], str], _default_get_name
        ),
        backup_fetch: ty.Optional[ty.Callable[[str, str], ty.Optional[T]]] = None,
        typename: str = "object",
    ) -> None:
        """
        get_list_method: the k8s list call that the watcher thread streams events from.
        get_name: extracts the object's name, which (with the namespace) forms its key.
        backup_fetch: optional (namespace, name) -> object fallback used by `get` when the
            watcher has no fresh copy of the object.
        typename: human-readable type name, used only in log messages and thread naming.
        """
        self.get_list_method = get_list_method
        self.get_name = get_name
        self.typename = typename
        self._limiter = OneShotLimiter()
        self._uncertain_futures = UncertainFuturesTracker[tuple[str, str], T](
            config.k8s_watch_object_stale_seconds()
        )

        # Only hand the container a fetcher when the caller actually supplied one.
        # Wrapping None in an always-truthy lambda made the container believe a manual
        # fetch was available, so it logged a 'Manually fetching' warning for nothing.
        fetch_by_key: ty.Optional[ty.Callable[[tuple[str, str]], ty.Optional[T]]] = None
        if backup_fetch is not None:
            fetch = backup_fetch  # bound non-Optional name for the closure below

            def fetch_by_key(namespace_and_name: tuple[str, str]) -> ty.Optional[T]:
                return fetch(*namespace_and_name)

        self._seen_objects = _SeenObjectContainer[tuple[str, str], T](fetch_by_key)

    def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
        """This is where we receive updates from the k8s API."""
        if not obj:
            logger.warning(f"Received null/empty {self.typename}")
            return

        key = (namespace, self.get_name(obj))
        self._seen_objects.set_object(key, obj)
        # this is what drives resolution of any Futures registered for this key.
        self._uncertain_futures.update(key, obj)
        logger.debug("%s %s updated", self.typename, key)

    def _start_namespace_watcher_thread(self, namespace: str) -> None:
        create_watch_thread(
            self.get_list_method, self._add_object, namespace, typename=self.typename
        ).start()

    @scope.bound
    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
        """May block for a little while if a manual fetch is required."""
        namespace = namespace or config.k8s_namespace()
        scope.enter(logger_context(name=obj_name, namespace=namespace))
        self._limiter(namespace, self._start_namespace_watcher_thread)
        return self._seen_objects.get((namespace, obj_name))

    def create_future(
        self,
        interpreter: FutureInterpreter[T, R],
        obj_name: str,
        *,
        namespace: str = "",
    ) -> futures.PFuture[R]:
        """Create a future that will be resolved when the object is available according to
        the interpreter.

        The FutureInterpreter must:
        - raise an exception if it wishes the future to raise.
        - return the result if it wishes the future to resolve successfully. Any value
          that is not a NotYetDone instance counts as a result - including None.
        - return a `uncertain_future.NotYetDone` instance if the status is still in progress.
        """
        namespace = namespace or config.k8s_namespace()
        self._limiter(namespace, self._start_namespace_watcher_thread)
        return self._uncertain_futures.create((namespace, obj_name), interpreter)
|
thds/mops/pure/__init__.py
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
from . import adls # noqa
|
|
8
8
|
from ._magic.api import magic # noqa
|
|
9
9
|
from .core.entry import register_entry_handler
|
|
10
|
+
from .core.lock.maintain import no_maintain as no_maintain_locks # noqa: F401
|
|
10
11
|
from .core.memo import results # noqa
|
|
11
12
|
from .core.memo.function_memospace import ( # noqa
|
|
12
13
|
add_pipeline_memospace_handlers,
|
|
@@ -21,7 +22,7 @@ from .core.use_runner import use_runner # noqa
|
|
|
21
22
|
from .pickling.memoize_only import memoize_in # noqa
|
|
22
23
|
from .pickling.mprunner import MemoizingPicklingRunner # noqa
|
|
23
24
|
from .runner.simple_shims import samethread_shim, subprocess_shim # noqa
|
|
24
|
-
from .runner.types import Shim, ShimBuilder # noqa
|
|
25
|
+
from .runner.types import FutureShim, Shim, ShimBuilder # noqa
|
|
25
26
|
|
|
26
27
|
|
|
27
28
|
def _register_things() -> None:
|
thds/mops/pure/_magic/sauce.py
CHANGED
|
@@ -6,7 +6,7 @@ import typing as ty
|
|
|
6
6
|
|
|
7
7
|
from typing_extensions import ParamSpec
|
|
8
8
|
|
|
9
|
-
from thds.core import stack_context
|
|
9
|
+
from thds.core import futures, stack_context
|
|
10
10
|
from thds.mops._utils import config_tree
|
|
11
11
|
|
|
12
12
|
from ..core import file_blob_store, pipeline_id, pipeline_id_mask, uris
|
|
@@ -109,7 +109,7 @@ class Magic(ty.Generic[P, R]):
|
|
|
109
109
|
def _is_off(self) -> bool:
|
|
110
110
|
return self._shim_builder_or_off is None
|
|
111
111
|
|
|
112
|
-
def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
|
|
112
|
+
def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim: # type: ignore[valid-type]
|
|
113
113
|
# this can be set using a stack-local context, or set globally as specifically
|
|
114
114
|
# or generally as the user needs. We prefer stack local over everything else.
|
|
115
115
|
sb = self._shim_builder_or_off
|
|
@@ -123,8 +123,16 @@ class Magic(ty.Generic[P, R]):
|
|
|
123
123
|
def _pipeline_id(self) -> str:
|
|
124
124
|
return self.config.pipeline_id.getv(self._func_config_path)
|
|
125
125
|
|
|
126
|
+
def submit(self, *args: P.args, **kwargs: P.kwargs) -> futures.PFuture[R]:
    """Futures-based counterpart to calling the wrapped function directly.

    Does not block on the result of the wrapped function call; instead returns a
    PFuture once either a result has been found or a new invocation has been started.
    """
    with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
        pending = self.runner.submit(self.__wrapped__, *args, **kwargs)
    return pending
|
|
133
|
+
|
|
126
134
|
def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R:
|
|
127
|
-
"""This is the wrapped function."""
|
|
135
|
+
"""This is the wrapped function - call this as though it were the function itself."""
|
|
128
136
|
with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
|
|
129
137
|
return self._func(*args, **kwargs)
|
|
130
138
|
|
thds/mops/pure/_magic/shims.py
CHANGED
|
@@ -4,14 +4,14 @@ from thds import core
|
|
|
4
4
|
|
|
5
5
|
from ..runner.shim_builder import make_builder
|
|
6
6
|
from ..runner.simple_shims import samethread_shim, subprocess_shim
|
|
7
|
-
from ..runner.types import Shim, ShimBuilder
|
|
7
|
+
from ..runner.types import FutureShim, Shim, ShimBuilder
|
|
8
8
|
|
|
9
9
|
ShimName = ty.Literal[
|
|
10
10
|
"samethread", # memoization and coordination, but run in the same thread as the caller.
|
|
11
11
|
"subprocess", # memoization and coordination, but transfer to a subprocess rather than remote.
|
|
12
12
|
"off", # equivalent to None - disables use of mops.
|
|
13
13
|
]
|
|
14
|
-
ShimOrBuilder = ty.Union[ShimBuilder, Shim]
|
|
14
|
+
ShimOrBuilder = ty.Union[ShimBuilder, Shim, FutureShim]
|
|
15
15
|
logger = core.log.getLogger(__name__)
|
|
16
16
|
|
|
17
17
|
|
|
@@ -32,15 +32,7 @@ def open_context() -> ty.Iterator[None]:
|
|
|
32
32
|
The idea is that you'd call perform_all() inside your Shim which transfers
|
|
33
33
|
execution to a remote environment, but _not_ call it if you're transferring execution
|
|
34
34
|
to a local environment, as the upload will not be needed.
|
|
35
|
-
|
|
36
|
-
This is not re-entrant. If this is called while the dictionary is non-empty, an
|
|
37
|
-
exception will be raised. This is only because I can think of no reason why anyone
|
|
38
|
-
would want it to be re-entrant, so it seems better to raise an error. If for some
|
|
39
|
-
reason re-entrancy were desired, we could just silently pass if the dictionary already
|
|
40
|
-
has deferred work.
|
|
41
35
|
"""
|
|
42
|
-
existing_work = _DEFERRED_INVOCATION_WORK()
|
|
43
|
-
assert existing_work is None, f"deferred work context is not re-entrant! {existing_work}"
|
|
44
36
|
with _DEFERRED_INVOCATION_WORK.set(dict()):
|
|
45
37
|
logger.debug("Opening deferred work context")
|
|
46
38
|
yield
|
|
@@ -5,14 +5,6 @@ In practice we only have a single Runner type registered, the MemoizingPicklingR
|
|
|
5
5
|
|
|
6
6
|
import typing as ty
|
|
7
7
|
|
|
8
|
-
from thds.core import stack_context
|
|
9
|
-
|
|
10
|
-
RUNNER_ENTRY_COUNT = stack_context.StackContext("runner_entry_count", 0)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def entry_count() -> int:
|
|
14
|
-
return RUNNER_ENTRY_COUNT()
|
|
15
|
-
|
|
16
8
|
|
|
17
9
|
class EntryHandler(ty.Protocol):
|
|
18
10
|
def __call__(self, *__args: str) -> ty.Any:
|
|
@@ -27,5 +19,4 @@ def register_entry_handler(name: str, mh: EntryHandler) -> None:
|
|
|
27
19
|
|
|
28
20
|
|
|
29
21
|
def run_named_entry_handler(name: str, *args: str) -> None:
|
|
30
|
-
|
|
31
|
-
ENTRY_HANDLERS[name](*args)
|
|
22
|
+
ENTRY_HANDLERS[name](*args)
|
|
@@ -32,7 +32,7 @@ from thds.core import log
|
|
|
32
32
|
from . import _funcs
|
|
33
33
|
from .read import get_writer_id, make_read_lockfile
|
|
34
34
|
from .types import LockAcquired, LockContents
|
|
35
|
-
from .write import
|
|
35
|
+
from .write import LockEmitter, LockfileWriter
|
|
36
36
|
|
|
37
37
|
logger = log.getLogger(__name__)
|
|
38
38
|
|
|
@@ -106,7 +106,7 @@ def acquire( # noqa: C901
|
|
|
106
106
|
lockfile_writer = LockfileWriter(
|
|
107
107
|
my_writer_id,
|
|
108
108
|
lock_dir_uri,
|
|
109
|
-
|
|
109
|
+
LockEmitter(my_writer_id, expire),
|
|
110
110
|
expire.total_seconds(),
|
|
111
111
|
debug=debug,
|
|
112
112
|
)
|
|
@@ -15,12 +15,14 @@ from datetime import datetime, timedelta
|
|
|
15
15
|
from functools import partial
|
|
16
16
|
from threading import Thread
|
|
17
17
|
|
|
18
|
-
from thds.core import log
|
|
18
|
+
from thds.core import config, log
|
|
19
19
|
|
|
20
20
|
from ._funcs import make_lock_uri
|
|
21
21
|
from .read import get_writer_id, make_read_lockfile
|
|
22
22
|
from .types import LockAcquired
|
|
23
|
-
from .write import
|
|
23
|
+
from .write import LockEmitter, LockfileWriter
|
|
24
|
+
|
|
25
|
+
MAINTAIN_LOCKS = config.item("thds.mops.pure.local.maintain_locks", default=True, parse=config.tobool)
|
|
24
26
|
|
|
25
27
|
logger = log.getLogger(__name__)
|
|
26
28
|
|
|
@@ -103,7 +105,7 @@ def remote_lock_maintain(lock_dir_uri: str, expected_writer_id: str = "") -> Loc
|
|
|
103
105
|
lockfile_writer = LockfileWriter(
|
|
104
106
|
current_writer_id,
|
|
105
107
|
lock_dir_uri,
|
|
106
|
-
|
|
108
|
+
LockEmitter(get_writer_id(lock_contents), timedelta(seconds=expire_s)),
|
|
107
109
|
expire_s,
|
|
108
110
|
writer_name="remote",
|
|
109
111
|
)
|
|
@@ -148,3 +150,20 @@ def launch_daemon_lock_maintainer(lock_acq: LockAcquired) -> ty.Callable[[], Non
|
|
|
148
150
|
lock_acq.release()
|
|
149
151
|
|
|
150
152
|
return stop_maintaining
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def maintain_to_release(
    acquired_lock: LockAcquired,
) -> ty.Callable[[], None]:
    """Hand back the callable that releases `acquired_lock`.

    When the MAINTAIN_LOCKS config item is enabled, lock maintenance is started via
    launch_daemon_lock_maintainer and whatever it returns is passed through. Otherwise
    nothing is started and the lock's own `release` method is returned.
    """
    if not MAINTAIN_LOCKS():
        return acquired_lock.release

    return launch_daemon_lock_maintainer(acquired_lock)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def no_maintain() -> None:
    """Globally set the MAINTAIN_LOCKS config item to False."""
    MAINTAIN_LOCKS.set_global(False)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import typing as ty
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
from datetime import datetime, timedelta
|
|
4
5
|
|
|
5
6
|
from thds.core import hostname, log
|
|
@@ -10,38 +11,37 @@ from .types import LockContents
|
|
|
10
11
|
logger = log.getLogger(__name__)
|
|
11
12
|
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
write_count = 0
|
|
18
|
-
first_written_at = ""
|
|
14
|
+
@dataclass
|
|
15
|
+
class LockEmitter:
|
|
16
|
+
writer_id: str
|
|
17
|
+
expire: timedelta
|
|
19
18
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
), f"{writer_id} should not contain a slash - maybe you passed a URI instead?"
|
|
19
|
+
write_count: int = 0
|
|
20
|
+
first_written_at: str = ""
|
|
23
21
|
|
|
24
|
-
def
|
|
25
|
-
|
|
26
|
-
|
|
22
|
+
def __post_init__(self) -> None:
|
|
23
|
+
assert (
|
|
24
|
+
"/" not in self.writer_id
|
|
25
|
+
), f"{self.writer_id} should not contain a slash - maybe you passed a URI instead?"
|
|
26
|
+
|
|
27
|
+
def __call__(self, first_acquired_at: ty.Optional[datetime]) -> LockContents:
|
|
28
|
+
self.write_count += 1
|
|
27
29
|
now = _funcs.utc_now().isoformat()
|
|
28
|
-
first_written_at = first_written_at or now
|
|
30
|
+
self.first_written_at = self.first_written_at or now
|
|
29
31
|
|
|
30
32
|
return {
|
|
31
|
-
"writer_id": writer_id,
|
|
33
|
+
"writer_id": self.writer_id,
|
|
32
34
|
"written_at": now,
|
|
33
|
-
"expire_s": expire.total_seconds(),
|
|
35
|
+
"expire_s": self.expire.total_seconds(),
|
|
34
36
|
# debug stuff:
|
|
35
|
-
"write_count": write_count,
|
|
37
|
+
"write_count": self.write_count,
|
|
36
38
|
"hostname": hostname.friendly(),
|
|
37
39
|
"pid": str(os.getpid()),
|
|
38
|
-
"first_written_at": first_written_at,
|
|
40
|
+
"first_written_at": self.first_written_at,
|
|
39
41
|
"first_acquired_at": first_acquired_at.isoformat() if first_acquired_at else "",
|
|
40
42
|
"released_at": "",
|
|
41
43
|
}
|
|
42
44
|
|
|
43
|
-
return lock_contents
|
|
44
|
-
|
|
45
45
|
|
|
46
46
|
class LockfileWriter:
|
|
47
47
|
"""The core purpose of this class is to allow setting of first_acquired_at immediately
|