thds.mops 3.9.20250722150738__py3-none-any.whl → 3.9.20250722163657__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- thds/mops/impure/runner.py +1 -1
- thds/mops/k8s/__init__.py +1 -3
- thds/mops/k8s/config.py +1 -1
- thds/mops/k8s/jobs.py +0 -4
- thds/mops/k8s/{_launch.py → launch.py} +57 -56
- thds/mops/k8s/logging.py +5 -37
- thds/mops/k8s/watch.py +62 -120
- thds/mops/pure/__init__.py +1 -2
- thds/mops/pure/_magic/sauce.py +3 -11
- thds/mops/pure/_magic/shims.py +2 -2
- thds/mops/pure/core/deferred_work.py +12 -15
- thds/mops/pure/core/entry/runner_registry.py +10 -1
- thds/mops/pure/core/lock/__init__.py +0 -1
- thds/mops/pure/core/lock/_acquire.py +2 -2
- thds/mops/pure/core/lock/maintain.py +3 -22
- thds/mops/pure/core/lock/write.py +19 -19
- thds/mops/pure/core/memo/__init__.py +1 -1
- thds/mops/pure/core/memo/results.py +4 -5
- thds/mops/pure/core/use_runner.py +7 -21
- thds/mops/pure/pickling/mprunner.py +14 -21
- thds/mops/pure/pickling/pickles.py +8 -19
- thds/mops/pure/pickling/remote.py +1 -3
- thds/mops/pure/runner/local.py +87 -58
- thds/mops/pure/runner/shim_builder.py +7 -7
- thds/mops/pure/runner/simple_shims.py +0 -7
- thds/mops/pure/runner/types.py +4 -15
- thds/mops/pure/tools/summarize/run_summary.py +8 -9
- {thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/METADATA +1 -1
- {thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/RECORD +32 -37
- thds/mops/k8s/batching.py +0 -198
- thds/mops/k8s/counts.py +0 -28
- thds/mops/k8s/job_future.py +0 -109
- thds/mops/k8s/uncertain_future.py +0 -160
- thds/mops/pure/runner/get_results.py +0 -106
- {thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/WHEEL +0 -0
- {thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/entry_points.txt +0 -0
- {thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/top_level.txt +0 -0
thds/mops/k8s/batching.py
DELETED
@@ -1,198 +0,0 @@
-"""The basic idea of this module is that different threads can submit _parts_ of a job to a batcher,
-and immediately get the job name back, while the batcher itself defers creating the job until the
-batch is full, or when the process exits.
-
-The theory is that will get used in processes whose only responsibility is to create jobs,
-so waiting on atexit to create the final batch is not an issue.
-
-If you want a batcher that has a more context-manager-like behavior, you can write one of
-those, but it wouldn't work well with a concurrent.futures Executor-style approach, since
-those don't have an explicit shutdown procedure that we can hook to call __exit__.
-"""
-
-import atexit
-import concurrent.futures
-import itertools
-import multiprocessing
-import threading
-import typing as ty
-
-from thds.core import cpus, futures, log
-
-from . import _launch, counts
-
-T = ty.TypeVar("T")
-logger = log.getLogger(__name__)
-
-
-class _AtExitBatcher(ty.Generic[T]):
-    def __init__(self, batch_processor: ty.Callable[[ty.Collection[T]], None]) -> None:
-        self.batch: list[T] = []
-        self._registered = False
-        self._lock = threading.RLock()
-        self._batch_processor = batch_processor
-
-    def add(self, item: T) -> None:
-        with self._lock:
-            if not self._registered:
-                atexit.register(self.process)
-                # ensure we flush on process exit, since we don't know how many items are coming
-                self._registered = True
-            self.batch.append(item)
-
-    def process(self) -> None:
-        if self.batch:
-            with self._lock:
-                if self.batch:
-                    self._batch_processor(self.batch)
-                    self.batch = []
-
-
-class K8sJobBatchingShim(_AtExitBatcher[str]):
-    """Thread-safe for use within a single process by multiple threads."""
-
-    def __init__(
-        self,
-        submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
-        max_batch_size: int,
-        job_counter: counts.MpValue[int],
-        name_prefix: str = "",
-    ) -> None:
-        """submit_func in particular should be a closure around whatever setup you need to
-        do to call back into a function that is locally wrapped with a k8s shim that will
-        ultimately call k8s.launch. Notably, you
-        """
-        super().__init__(self._process_batch)
-        self._max_batch_size = max_batch_size
-        self._job_counter = job_counter
-        self._job_name = ""
-        self._name_prefix = name_prefix
-        self._submit_func = submit_func
-
-    def _get_new_name(self) -> str:
-        # counts.inc takes a multiprocess lock. do not forget this!
-        job_num = counts.inc(self._job_counter)
-        return _launch.construct_job_name(self._name_prefix, counts.to_name(job_num))
-
-    def add_to_named_job(self, mops_invocation: ty.Sequence[str]) -> str:
-        """Returns job name for the invocation."""
-        with self._lock:
-            if not self._job_name:
-                self._job_name = self._get_new_name()
-            if len(self.batch) >= self._max_batch_size:
-                self.process()
-                self._job_name = self._get_new_name()
-            super().add(" ".join(mops_invocation))
-            return self._job_name
-
-    def _process_batch(self, batch: ty.Collection[str]) -> None:
-        with _launch.JOB_NAME.set(self._job_name):
-            log_lvl = logger.warning if len(batch) < self._max_batch_size else logger.info
-            log_lvl(f"Processing batch of len {len(batch)} with job name {self._job_name}")
-            self._submit_func(batch)
-
-
-F = ty.TypeVar("F", bound=ty.Callable)
-FunctionDecorator = ty.Callable[[F], F]
-
-
-_BATCHER: ty.Optional[K8sJobBatchingShim] = None
-
-
-def init_batcher(
-    submit_func: ty.Callable[[ty.Collection[str]], ty.Any],
-    func_max_batch_size: int,
-    job_counter: counts.MpValue[int],
-    name_prefix: str = "",
-) -> None:
-    # for use with multiprocessing pool initializer
-    global _BATCHER
-    if _BATCHER is not None:
-        logger.warning("Batcher is already initialized; reinitializing will reset the job name.")
-        return
-
-    _BATCHER = K8sJobBatchingShim(submit_func, func_max_batch_size, job_counter, name_prefix)
-
-
-def init_batcher_with_unpicklable_submit_func(
-    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
-    submit_func_arg: T,
-    func_max_batch_size: int,
-    job_counter: counts.MpValue[int],
-    name_prefix: str = "",
-) -> None:
-    """Use this if you want to have an unpicklable submit function - because applying make_submit_func(submit_func_arg)
-    will happen inside the pool worker process after all the pickling/unpickling has happened.
-    """
-    return init_batcher(
-        make_submit_func(submit_func_arg), func_max_batch_size, job_counter, name_prefix=name_prefix
-    )
-
-
-def make_counting_process_pool_executor(
-    make_submit_func: ty.Callable[[T], ty.Callable[[ty.Collection[str]], ty.Any]],
-    submit_func_arg: T,
-    max_batch_size: int,
-    name_prefix: str = "",
-    max_workers: int = 0,
-) -> concurrent.futures.ProcessPoolExecutor:
-    """Creates a ProcessPoolExecutor that uses the batching shim for job submission.
-
-    We are introducing this because we see segfaults prior to Python 3.12 related to this issue:
-    https://github.com/python/cpython/issues/77377
-
-    And it would seem that this had to do with creating mp.Values using a 'fork' start
-    method, and then passing those to a ProcessPoolExecutor with
-    mp_context=multiprolcessing.get_context('spawn'). So we can help you avoid that by creating
-    the mp.Value for you, alongside its ProcessPoolExecutor.
-
-    NOTE!!
-
-    You should only have one of these per process at a time, because we're doing spooky
-    things with the Job Counter. In fact, you should probably only create one of these
-    _ever_ within a single logical 'application'.
-
-    If you fail to heed this advice, you will get weird launched/finished counts at a
-    minimum. Although these job counts are not mission-critical, you _will_ be confused.
-    """
-    start_method: str = "spawn"
-    # 'spawn' prevents weird batch processing deadlocks that seem to only happen on Linux with 'fork'.
-    # it is strongly recommended to use 'spawn' for this reason.
-
-    mp_context = multiprocessing.get_context(start_method)
-    launch_count = mp_context.Value("i", 0)
-    # even though i want to assign this to a global, I also want to prevent
-    # any possible race condition where i somehow use a different thread's LAUNCH_COUNT
-    # when i create the ProcessPoolExecutor a few lines below.
-    counts.LAUNCH_COUNT = launch_count
-    counts.FINISH_COUNT = mp_context.Value("i", 0)  # we don't use this here; we just reset it to zero.
-    # SPOOKY - reset the global finish counter and make it be the same 'type'
-    return concurrent.futures.ProcessPoolExecutor(
-        max_workers=max_workers or cpus.available_cpu_count(),
-        initializer=init_batcher_with_unpicklable_submit_func,
-        initargs=(make_submit_func, submit_func_arg, max_batch_size, launch_count, name_prefix),
-        mp_context=mp_context,
-    )
-
-
-def shim(args: ty.Sequence[str]) -> futures.PFuture[bool]:
-    # This thing needs to return a lazy Uncertain Future that contains a job name, so that Job can be polled on
-    # ... but the job does not exist yet! So the batcher is in charge of creating the job name
-    # upfront, and then ensuring that it gets used when the job is created.
-    assert _BATCHER is not None, "Batcher must be initialized before using the batching shim."
-    job_name = _BATCHER.add_to_named_job(args)
-    return _launch.create_lazy_job_logging_future(job_name)
-
-
-def batched(iterable: ty.Iterable[T], n: int, *, strict: bool = False) -> ty.Iterator[tuple[T, ...]]:
-    """Just a utility for pre-batching if you're using multiprocessing to create batches."""
-    # TODO get rid of this when we go to Python 3.12+ which has itertools.batched
-    #
-    # batched('ABCDEFG', 3) → ABC DEF G
-    if n < 1:
-        raise ValueError("n must be at least one")
-    iterator = iter(iterable)
-    while batch := tuple(itertools.islice(iterator, n)):
-        if strict and len(batch) != n:
-            raise ValueError("batched(): incomplete batch")
-        yield batch
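
The deleted batcher is a small, reusable pattern: accumulate items under a lock, flush when the batch fills, and register an atexit hook so the final partial batch is never lost. A minimal standalone sketch of that pattern, using only the standard library (the class and names below are illustrative, not the removed mops API):

import atexit
import threading
import typing as ty

T = ty.TypeVar("T")


class FlushOnExitBatcher(ty.Generic[T]):
    """Accumulates items; flushes when full and again at interpreter exit."""

    def __init__(self, process_batch: ty.Callable[[list[T]], None], max_size: int) -> None:
        self._process_batch = process_batch
        self._max_size = max_size
        self._batch: list[T] = []
        self._lock = threading.RLock()
        self._registered = False

    def add(self, item: T) -> None:
        with self._lock:
            if not self._registered:
                atexit.register(self.flush)  # the final partial batch flushes at exit
                self._registered = True
            self._batch.append(item)
            if len(self._batch) >= self._max_size:
                self.flush()

    def flush(self) -> None:
        with self._lock:
            if self._batch:
                self._process_batch(self._batch)
                self._batch = []


batcher = FlushOnExitBatcher(print, max_size=3)
for i in range(7):
    batcher.add(i)  # prints [0, 1, 2] then [3, 4, 5]; [6] prints at exit
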
thds/mops/k8s/counts.py
DELETED
@@ -1,28 +0,0 @@
-import multiprocessing as mp
-import typing as ty
-
-T = ty.TypeVar("T")
-
-
-class MpValue(ty.Protocol[T]):
    def get_lock(self) -> ty.Any:
        ...

    value: T
-
-
-def inc(mp_val: MpValue[int]) -> int:
-    with mp_val.get_lock():
-        mp_val.value += 1
-        return mp_val.value
-
-
-LAUNCH_COUNT = mp.Value("i", 0)
-FINISH_COUNT = mp.Value("i", 0)
-# these are spooky - they're global and mutable, and may in fact get overwritten by code
-# using specific multiprocessing contexts.
-
-
-def to_name(count: int) -> str:
-    """Convert a count to a name."""
-    return f"{count:0>4}"
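
The counter idiom here is standard multiprocessing fare: a shared ctypes Value incremented under its own lock so that concurrent processes never lose an update. A runnable sketch, independent of mops:

import multiprocessing as mp


def inc(counter) -> int:
    # mp.Value("i", 0) carries its own recursive lock; incrementing under it
    # makes the read-modify-write atomic across processes.
    with counter.get_lock():
        counter.value += 1
        return counter.value


def work(counter) -> None:
    for _ in range(1000):
        inc(counter)


if __name__ == "__main__":
    counter = mp.Value("i", 0)
    workers = [mp.Process(target=work, args=(counter,)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(counter.value)  # always 4000 - no increments lost to races
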
thds/mops/k8s/job_future.py
DELETED
@@ -1,109 +0,0 @@
-import threading
-import typing as ty
-
-from kubernetes import client
-
-from thds.core import futures, log
-from thds.termtool.colorize import colorized
-
-from . import config, counts, uncertain_future
-from .jobs import is_job_failed, is_job_succeeded, job_source
-
-logger = log.getLogger(__name__)
-
-UNUSUAL = colorized(fg="white", bg="yellow")
-SUCCEEDED = colorized(fg="white", bg="blue")
-FAILED = colorized(fg="white", bg="red")
-
-
-_FINISHED_JOBS = set[str]()
-_FINISHED_JOBS_LOCK = threading.Lock()
-
-
-def _check_newly_finished(job_name: str, namespace: str = "") -> str:
-    # I don't believe it's possible to ever have a Job that both succeeds and fails.
-    namespace = namespace or config.k8s_namespace()
-    job_full = f"{namespace}/{job_name}"
-    if job_full in _FINISHED_JOBS:
-        return ""
-
-    with _FINISHED_JOBS_LOCK:
-        if job_full in _FINISHED_JOBS:
-            return ""
-
-        _FINISHED_JOBS.add(job_full)
-
-    launched = counts.LAUNCH_COUNT.value
-    return f"- ({launched - counts.inc(counts.FINISH_COUNT)} unfinished of {launched})"
-
-
-class K8sJobFailedError(Exception):
-    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
-
-
-def make_job_completion_future(job_name: str, *, namespace: str = "") -> futures.PFuture[bool]:
-    """This is a natural boundary for a serializable lazy future - something that represents
-    work being done across process boundaries (since Kubernetes jobs will be listed via an API.
-
-    If True is returned, the Job has definitely succeeded.
-
-    If False is returned, the Job may have succeeded but we saw no evidence of it.
-
-    If the Job definitely failed, an Exception will be raised.
-    """
-
-    JOB_SEEN = False
-
-    def job_completion_interpreter(
-        job: ty.Optional[client.models.V1Job], last_seen_at: float
-    ) -> ty.Union[uncertain_future.NotYetDone, bool]:
-        nonlocal JOB_SEEN
-        if not job:
-            if JOB_SEEN:
-                logger.warning(
-                    UNUSUAL(f"Previously-seen job {job_name} no longer exists - assuming success!")
-                )
-                # we hereby indicate an unusual success to the Future waiter.
-                return False
-
-            time_since_last_seen = uncertain_future.official_timer() - last_seen_at
-            if time_since_last_seen > config.k8s_watch_object_stale_seconds():
-                # this is 5 minutes by default as of 2025-07-15.
-                raise TimeoutError(
-                    f"Job {job_name} has not been seen for {time_since_last_seen:.1f} seconds - assuming failure!"
-                )
-
-            # we don't know what's going on but things aren't truly stale yet.
-            return uncertain_future.NotYetDone()
-
-        JOB_SEEN = True
-
-        if is_job_succeeded(job):
-            newly_succeeded = _check_newly_finished(job_name, namespace)
-            if newly_succeeded:
-                logger.info(SUCCEEDED(f"Job {job_name} Succeeded! {newly_succeeded}"))
-            return True
-
-        if is_job_failed(job):
-            newly_failed = _check_newly_finished(job_name, namespace)
-            if newly_failed:
-                logger.error(FAILED(f"Job {job_name} Failed! {newly_failed}"))
-            raise K8sJobFailedError(f"Job {job_name} has failed with status: {job.status}")
-
-        return uncertain_future.NotYetDone()  # job is still in progress
-
-    return job_source().create_future(
-        job_completion_interpreter,
-        job_name,
-        namespace=namespace or config.k8s_namespace(),
-    )
-
-
-def make_lazy_completion_future(job_name: str, *, namespace: str = "") -> futures.LazyFuture[bool]:
-    """This is a convenience function that will create a job completion future and then
-    immediately process it, returning the result. See docs on function above.
-    """
-    return futures.make_lazy(make_job_completion_future)(
-        job_name,
-        namespace=namespace or config.k8s_namespace(),
-    )
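
The interpreter at the heart of this module mapped each observation of a Job (or its absence) onto one of four outcomes: still pending, definite success, tentative success after disappearance, or failure by raising. A standalone sketch of that contract, with a plain dict standing in for the V1Job object (names and the staleness constant are illustrative):

import time
import typing as ty


class NotYetDone:
    pass


class JobFailedError(Exception):
    pass


STALE_AFTER_SECONDS = 300.0  # mirrors the five-minute staleness default noted above


def interpret(
    job: ty.Optional[dict], last_seen_at: float, seen_before: bool
) -> ty.Union[NotYetDone, bool]:
    if job is None:
        if seen_before:
            return False  # previously-seen job vanished: tentative success
        if time.monotonic() - last_seen_at > STALE_AFTER_SECONDS:
            raise TimeoutError("job was never observed - assuming failure")
        return NotYetDone()  # not stale yet; keep polling
    if job.get("succeeded"):
        return True  # definite success
    if job.get("failed"):
        raise JobFailedError(str(job))
    return NotYetDone()  # observed and still running
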
thds/mops/k8s/uncertain_future.py
DELETED
@@ -1,160 +0,0 @@
-import collections
-import threading
-import time
-import typing as ty
-
-# we use concurrent.futures.Future as an implementation detail, but it's communicated
-# as core.futures.PFuture to give us the flexibility to change the implementation later if needed.
-from concurrent.futures import Future
-from dataclasses import dataclass
-from uuid import uuid4
-
-from typing_extensions import Self
-
-from thds import core
-
-R_0 = ty.TypeVar("R_0", contravariant=True)  # R-naught - the thing that might resolve a Future.
-# a value for this type may never be None.
-
-R = ty.TypeVar("R")
-# the Result type of the Future. These are allowed to be None, since some Futures may
-# resolve but not return a value.
-
-
-class NotYetDone:
-    pass
-
-
-_LastSeenAt = float  # type alias for the last seen time of the Future, in seconds since epoch
-
-
-FutureInterpreter = ty.Callable[[ty.Optional[R_0], _LastSeenAt], ty.Union[R, NotYetDone]]
-# a FutureInterpreter is a function that takes an object R_0 and the time.monotonic() at
-# which it was last seen, and returns either NotYetDone (if the status is still in progress) or
-# the actual Future result of type R, or, if the status is failure,
-# _raises_ an appropriate Exception.
-
-
-class _FutureInterpretationShim(ty.Generic[R_0, R]):
-    def __init__(self, interpreter: FutureInterpreter[R_0, ty.Union[NotYetDone, R]]) -> None:
-        self.future = Future[R]()
-        self._interpreter = interpreter
-        self._id = uuid4().hex  # has an id so it can be hashed and therefore easily found in a set
-
-    def __hash__(self) -> int:
-        return hash(self._id)
-
-    def __call__(self, r_0: ty.Optional[R_0], last_seen_at: float) -> ty.Optional[Self]:
-        """First and foremost - this _must_ be treated as an object that the creator
-        is ultimately responsible for calling on a semi-regular basis. It represents a
-        likely deadlock for the holder of the Future if it is never called.
-
-        Return False if the Future is still in progress and should not be unregistered.
-        Return True if the Future is done and should be unregistered.
-        """
-        try:
-            interpretation = self._interpreter(r_0, last_seen_at)
-            if isinstance(interpretation, NotYetDone):
-                return None  # do nothing and do not unregister - the status is still in progress.
-
-            self.future.set_result(interpretation)
-        except Exception as e:
-            self.future.set_exception(e)
-
-        return self
-
-
-K = ty.TypeVar("K")  # Key type for the UncertainFuturesTracker
-
-
-@dataclass
-class _FuturesState(ty.Generic[R_0]):
-    """Represents a single 'observable' that may have multiple Futures (and therefore interpretations) associated with it."""
-
-    futshims: list[_FutureInterpretationShim[R_0, ty.Any]]
-    last_seen_at: float
-
-
-def official_timer() -> float:
-    # we don't need any particular meaning to the time.
-    return time.monotonic()
-
-
-class UncertainFuturesTracker(ty.Generic[K, R_0]):
-    """This class represents a kind of Future where we cannot be guaranteed that we will ever see
-    any further information about it, because we do not control the source of the data.
-
-    A good example would be a Kubernetes object that we are watching - we may _think_ that a Job will be created,
-    but there are race conditions galore in terms of actually looking for that object.
-
-    However, if we _do_ see it at a some point, then we can interpret future 'missingness'
-    as a tentative success.
-
-    The danger with this uncertainty is that Futures represent implicit deadlocks - if we
-    never resolve the Future, then a caller may be waiting for it forever. Therefore, we
-    ask the original requestor of the Future to specify how long they are willing to wait
-    to get a result, after which point we will resolve the Future as an exception.
-    """
-
-    def __init__(self, allowed_stale_seconds: float) -> None:
-        self._keyed_futures_state = collections.OrderedDict[K, _FuturesState[R_0]]()
-        self._lock = threading.Lock()  # i don't trust ordered dict operations to be thread-safe.
-        self._check_stale_seconds = allowed_stale_seconds
-
-    def create(self, key: K, interpreter: FutureInterpreter[R_0, R]) -> core.futures.PFuture[R]:
-        futshim = _FutureInterpretationShim(interpreter)
-        with self._lock:
-            if key not in self._keyed_futures_state:
-                self._keyed_futures_state[key] = _FuturesState(
-                    [futshim],
-                    last_seen_at=official_timer() + self._check_stale_seconds,
-                    # we provide a double margin for objects that we have never seen before.
-                )
-                self._keyed_futures_state.move_to_end(key, last=False)
-                # never seen and therefore should be at the beginning (most stale)
-            else:
-                # maintain our ordered dict so we can handle garbage collection of stale Futures.
-                self._keyed_futures_state[key].futshims.append(futshim)
-
-        return futshim.future
-
-    def update(self, key: ty.Optional[K], r_0: ty.Optional[R_0]) -> None:
-        """Update the keyed Futures based on their interpreters.
-
-        Also check any stale Futures - Futures that have not seen an update (via their key) in a while.
-
-        If `key` is None, we will update all Futures that have been created so far.
-        """
-
-        def check_resolution(fut_state: _FuturesState[R_0], inner_r_0: ty.Optional[R_0]) -> None:
-            for future_shim_that_is_done in core.parallel.yield_results(
-                [
-                    core.thunks.thunking(futshim)(inner_r_0, fut_state.last_seen_at)
-                    for futshim in fut_state.futshims
-                ],
-                progress_logger=core.log.getLogger(__name__).debug,
-                named="UncertainFuturesTracker.update",
-            ):
-                if future_shim_that_is_done is not None:
-                    # the Future is done, so we can remove it from the list of Futures.
-                    fut_state.futshims.remove(future_shim_that_is_done)
-
-        if key is not None:
-            with self._lock:
-                if key not in self._keyed_futures_state:
-                    self._keyed_futures_state[key] = _FuturesState(list(), last_seen_at=official_timer())
-                else:
-                    # maintain our ordered dict so we can handle garbage collection of stale Futures.
-                    self._keyed_futures_state.move_to_end(key)
-                    self._keyed_futures_state[key].last_seen_at = official_timer()
-
-            fut_state = self._keyed_futures_state[key]
-            check_resolution(fut_state, r_0)
-
-        # 'garbage collect' any Futures that haven't been updated in a while.
-        for futs_state in self._keyed_futures_state.values():
-            if futs_state.last_seen_at + self._check_stale_seconds < official_timer():
-                check_resolution(futs_state, None)
-            else:  # these are ordered, so once we see one that's not stale, we can stop checking.
-                # this prevents us from having to do O(N) checks for every update.
-                break
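
Stripped of the locking and the ordered-dict staleness GC, the tracker reduces to: register (interpreter, Future) pairs under a key, and on every observation let each interpreter either resolve its Future, raise into it, or keep waiting. A compressed standalone sketch of that idea (all names illustrative):

import time
import typing as ty
from concurrent.futures import Future


class NotYetDone:
    pass


Interpreter = ty.Callable[[ty.Optional[object], float], ty.Union[object, NotYetDone]]


class Tracker:
    def __init__(self) -> None:
        self._futures: dict[str, list[tuple[Interpreter, Future]]] = {}
        self._last_seen: dict[str, float] = {}

    def create(self, key: str, interpreter: Interpreter) -> Future:
        fut: Future = Future()
        self._futures.setdefault(key, []).append((interpreter, fut))
        self._last_seen.setdefault(key, time.monotonic())
        return fut

    def update(self, key: str, observation: ty.Optional[object]) -> None:
        self._last_seen[key] = time.monotonic()
        still_pending = []
        for interpreter, fut in self._futures.get(key, []):
            try:
                result = interpreter(observation, self._last_seen[key])
                if isinstance(result, NotYetDone):
                    still_pending.append((interpreter, fut))  # keep polling
                else:
                    fut.set_result(result)
            except Exception as exc:
                fut.set_exception(exc)  # interpreters signal failure by raising
        self._futures[key] = still_pending


tracker = Tracker()
fut = tracker.create("job-0001", lambda obs, _: obs if obs else NotYetDone())
tracker.update("job-0001", None)    # still pending
tracker.update("job-0001", "done")  # resolves the future
print(fut.result())                 # -> "done"
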
thds/mops/pure/runner/get_results.py
DELETED
@@ -1,106 +0,0 @@
-import concurrent.futures
-import threading
-import typing as ty
-from dataclasses import dataclass
-from pathlib import Path
-
-from thds.core import futures, log
-
-from ...config import max_concurrent_network_ops
-from ..core import lock, memo
-from ..core.types import NoResultAfterShimSuccess
-from ..tools.summarize import run_summary
-from . import types
-
-
-class ResultAndInvocationType(ty.NamedTuple):
-    value_or_error: ty.Union[memo.results.Success, memo.results.Error]
-    invoc_type: run_summary.InvocationType
-
-
-def unwrap_value_or_error(
-    get_meta_and_result: types.GetMetaAndResult,
-    run_directory: ty.Optional[Path],
-    runner_prefix: str,
-    args_kwargs_uris: ty.Collection[str],
-    memo_uri: str,
-    result_and_itype: ResultAndInvocationType,
-) -> ty.Any:  # the result value
-    result = result_and_itype.value_or_error
-    metadata = None
-    value_t = None
-    try:
-        if isinstance(result, memo.results.Success):
-            metadata, value_t = get_meta_and_result("value", result.value_uri)
-            return value_t
-        else:
-            assert isinstance(result, memo.results.Error), "Must be Error or Success"
-            metadata, exc = get_meta_and_result("EXCEPTION", result.exception_uri)
-            raise exc
-    finally:
-        run_summary.log_function_execution(
-            *(run_directory, memo_uri, result_and_itype.invoc_type),
-            metadata=metadata,
-            runner_prefix=runner_prefix,
-            was_error=not isinstance(result, memo.results.Success),
-            return_value=value_t,
-            args_kwargs_uris=args_kwargs_uris,
-        )
-
-
-_AFTER_INVOCATION_SEMAPHORE = threading.BoundedSemaphore(int(max_concurrent_network_ops()) * 3)
-# _IN prioritizes retrieving the result of a Shim that has completed.
-logger = log.getLogger(__name__)
-T = ty.TypeVar("T")
-
-
-@dataclass
-class PostShimResultGetter(ty.Generic[T]):
-    """Must be serializable on its own, so we can pass it across process boundaries
-    to serve as a foundation for a cross-process Future.
-
-    Happily, this should not be terribly difficult, as the 'state' of a mops function
-    is predicted entirely on the memo URI, which is a string.
-    """
-
-    memo_uri: str
-    partially_applied_unwrap_value_or_error: ty.Callable[[str, ResultAndInvocationType], T]
-    release_lock: ty.Optional[ty.Callable[[], None]] = None
-
-    def __call__(self, _shim_result: ty.Any) -> T:
-        """Check if the result exists, and return it if it does.
-
-        This is the future 'translator' that allows us to chain a shim future to be a result future.
-        """
-        memo_uri = self.memo_uri
-
-        try:
-            with _AFTER_INVOCATION_SEMAPHORE:
-                value_or_error = memo.results.check_if_result_exists(memo_uri, check_for_exception=True)
-                if not value_or_error:
-                    raise NoResultAfterShimSuccess(
-                        f"The shim for {memo_uri} exited cleanly, but no result or exception was found."
-                    )
-                return self.partially_applied_unwrap_value_or_error(
-                    memo_uri, ResultAndInvocationType(value_or_error, "invoked")
-                )
-        finally:
-            if self.release_lock is not None:
-                try:
-                    self.release_lock()
-                except Exception:
-                    logger.exception("Failed to release lock after shim result retrieval.")
-
-
-def lock_maintaining_future(
-    lock_acquired: lock.LockAcquired,
-    post_shim_result_getter: PostShimResultGetter[futures.R1],
-    inner_future: futures.PFuture[futures.R],
-) -> concurrent.futures.Future[futures.R1]:
-    """Create a Future that will be used to retrieve the result of a shim invocation.
-
-    This Future will be used to retrieve the result of a shim invocation, and will
-    maintain the lock while it is being retrieved.
-    """
-    post_shim_result_getter.release_lock = lock.maintain_to_release(lock_acquired)
-    return futures.chain_futures(inner_future, concurrent.futures.Future(), post_shim_result_getter)
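
The chaining move at the end of this module, turning a "shim finished" future into a "memoized result" future, can be sketched with concurrent.futures alone. In the sketch below, chain stands in for thds.core.futures.chain_futures (whose real signature is not shown in this diff), and the translate callback plays the role of PostShimResultGetter; the fetched string stands in for the blob-store lookup keyed by memo URI:

import typing as ty
from concurrent.futures import Future

T = ty.TypeVar("T")


def chain(inner: Future, outer: "Future[T]", translate: ty.Callable[[ty.Any], T]) -> "Future[T]":
    def on_done(fut: Future) -> None:
        try:
            outer.set_result(translate(fut.result()))  # shim success -> fetch the real result
        except Exception as exc:
            outer.set_exception(exc)  # shim failure or missing result propagates

    inner.add_done_callback(on_done)
    return outer


shim_future: Future = Future()
result_future = chain(shim_future, Future(), lambda _: "value fetched from memo URI")
shim_future.set_result(True)   # the remote shim exited cleanly
print(result_future.result())  # -> "value fetched from memo URI"
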
{thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/WHEEL
RENAMED
File without changes

{thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/entry_points.txt
RENAMED
File without changes

{thds_mops-3.9.20250722150738.dist-info → thds_mops-3.9.20250722163657.dist-info}/top_level.txt
RENAMED
File without changes