thds.mops 3.9.20250721231027__py3-none-any.whl → 3.9.20250722163657__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. thds/mops/impure/runner.py +1 -1
  2. thds/mops/k8s/__init__.py +1 -3
  3. thds/mops/k8s/config.py +1 -1
  4. thds/mops/k8s/jobs.py +0 -4
  5. thds/mops/k8s/{_launch.py → launch.py} +57 -56
  6. thds/mops/k8s/logging.py +5 -37
  7. thds/mops/k8s/watch.py +62 -120
  8. thds/mops/pure/__init__.py +1 -2
  9. thds/mops/pure/_magic/sauce.py +3 -11
  10. thds/mops/pure/_magic/shims.py +2 -2
  11. thds/mops/pure/core/deferred_work.py +12 -15
  12. thds/mops/pure/core/entry/runner_registry.py +10 -1
  13. thds/mops/pure/core/lock/__init__.py +0 -1
  14. thds/mops/pure/core/lock/_acquire.py +2 -2
  15. thds/mops/pure/core/lock/maintain.py +3 -22
  16. thds/mops/pure/core/lock/write.py +19 -19
  17. thds/mops/pure/core/memo/__init__.py +1 -1
  18. thds/mops/pure/core/memo/results.py +4 -5
  19. thds/mops/pure/core/use_runner.py +7 -21
  20. thds/mops/pure/pickling/mprunner.py +14 -21
  21. thds/mops/pure/pickling/pickles.py +8 -19
  22. thds/mops/pure/pickling/remote.py +1 -3
  23. thds/mops/pure/runner/local.py +87 -58
  24. thds/mops/pure/runner/shim_builder.py +7 -7
  25. thds/mops/pure/runner/simple_shims.py +0 -7
  26. thds/mops/pure/runner/types.py +4 -15
  27. thds/mops/pure/tools/summarize/run_summary.py +8 -9
  28. {thds_mops-3.9.20250721231027.dist-info → thds_mops-3.9.20250722163657.dist-info}/METADATA +1 -1
  29. {thds_mops-3.9.20250721231027.dist-info → thds_mops-3.9.20250722163657.dist-info}/RECORD +32 -37
  30. thds/mops/k8s/batching.py +0 -198
  31. thds/mops/k8s/counts.py +0 -28
  32. thds/mops/k8s/job_future.py +0 -109
  33. thds/mops/k8s/uncertain_future.py +0 -160
  34. thds/mops/pure/runner/get_results.py +0 -106
  35. {thds_mops-3.9.20250721231027.dist-info → thds_mops-3.9.20250722163657.dist-info}/WHEEL +0 -0
  36. {thds_mops-3.9.20250721231027.dist-info → thds_mops-3.9.20250722163657.dist-info}/entry_points.txt +0 -0
  37. {thds_mops-3.9.20250721231027.dist-info → thds_mops-3.9.20250722163657.dist-info}/top_level.txt +0 -0
thds/mops/impure/runner.py CHANGED
@@ -67,7 +67,7 @@ class KeyedLocalRunner(MemoizingPicklingRunner):
             redirect=lambda _f, _args, _kwargs: _perform_original_invocation,
         )
 
-    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:  # type: ignore[valid-type]
+    def __call__(self, raw_func: ty.Callable[P, R], raw_args: P.args, raw_kwargs: P.kwargs) -> R:
         actual_function_to_call = self._pre_pickle_redirect(raw_func, raw_args, raw_kwargs)
         with _ORIGINAL_F_ARGS_KWARGS.set((actual_function_to_call, raw_args, raw_kwargs)):
             return super().__call__(*self._impure_keyfunc(raw_func, raw_args, raw_kwargs))
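Both here and in thds/mops/pure/_magic/sauce.py below, a `# type: ignore[valid-type]` is dropped from a ParamSpec-annotated signature, which suggests these annotations now type-check cleanly. For reference only, this is the standard ParamSpec forwarding pattern these signatures use (a generic sketch, not code from this package):

    import typing as ty

    from typing_extensions import ParamSpec

    P = ParamSpec("P")
    R = ty.TypeVar("R")

    def call_through(func: ty.Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R:
        # forwards positional and keyword arguments with their types preserved
        return func(*args, **kwargs)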
thds/mops/k8s/__init__.py CHANGED
@@ -7,10 +7,8 @@ except ModuleNotFoundError as mnf:
         "Please install mops with the `k8s` extra to use `thds.mops.k8s`."
     ) from mnf
 
-from . import batching, counts, job_future  # noqa: F401
-from ._launch import launch, shim  # noqa
 from .container_registry import autocr  # noqa: F401
-from .job_future import K8sJobFailedError  # noqa: F401
+from .launch import K8sJobFailedError, launch, shim  # noqa
 from .node_selection import (  # noqa
     NodeNarrowing,
     ResourceDefinition,
thds/mops/k8s/config.py CHANGED
@@ -10,7 +10,7 @@ k8s_namespace_env_var_key = config.item("mops.k8s.namespace_env_var_key", "MOPS_
 # environment variable. it will not affect how your namespace is selected in the first
 # place.
 
-k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 5 * 60, parse=int)
+k8s_watch_object_stale_seconds = config.item("mops.k8s.watch.object_stale_seconds", 30 * 60, parse=int)
 k8s_acr_url = config.item("mops.k8s.acr.url", "")
 k8s_job_retry_count = config.item("mops.k8s.job.retry_count", 6, parse=int)
 k8s_job_cleanup_ttl_seconds_after_completion = config.item(
thds/mops/k8s/jobs.py CHANGED
@@ -25,10 +25,6 @@ _JOB_SOURCE = WatchingObjectSource(
 )
 
 
-def job_source() -> WatchingObjectSource[client.models.V1Job]:
-    return _JOB_SOURCE
-
-
 def get_job(job_name: str, namespace: str = "") -> ty.Optional[client.models.V1Job]:
     return _JOB_SOURCE.get(job_name, namespace=namespace)
 
thds/mops/k8s/{_launch.py → launch.py} RENAMED
@@ -4,22 +4,41 @@ import os
 import threading
 import typing as ty
 import uuid
-from functools import partial
 
 from kubernetes import client
 
-from thds import core
+from thds.core import scope
+from thds.core.log import logger_context
 from thds.mops.pure.runner.simple_shims import samethread_shim
 from thds.termtool.colorize import colorized
 
-from . import config, counts, job_future, logging
+from . import config
 from ._shared import logger
 from .auth import load_config, upsert_namespace
+from .logging import JobLogWatcher
 from .node_selection import NodeNarrowing, ResourceDefinition
 from .retry import k8s_sdk_retry
 from .thds_std import embed_thds_auth
+from .wait_job import wait_for_job
 
 LAUNCHED = colorized(fg="white", bg="green")
+COMPLETE = colorized(fg="white", bg="blue")
+FAILED = colorized(fg="white", bg="red")
+
+
+class K8sJobFailedError(Exception):
+    """Raised by `launch` when a Job is seen to terminate in a Failed state."""
+
+
+class Counter:
+    def __init__(self) -> None:
+        self.value = 0
+        self._lock = threading.Lock()
+
+    def inc(self) -> int:
+        with self._lock:
+            self.value += 1
+            return self.value
 
 
 def sanitize_str(name: str) -> str:
@@ -30,7 +49,7 @@ def sanitize_str(name: str) -> str:
 
 def construct_job_name(user_prefix: str, job_num: str) -> str:
     # we want some consistency here, but also some randomness in case the prefixes don't exist or aren't unique.
-    mops_name_part = "-".join([sanitize_str(job_num), str(os.getpid()), str(uuid.uuid4())[:8]])
+    mops_name_part = "-".join([str(os.getpid()), sanitize_str(job_num), str(uuid.uuid4())[:8]])
     if len(mops_name_part) > 63:
         # this should be _impossible_, because having a job num longer than even 20 digits would be an impossibly large
         # number of jobs. but just in case, we'll truncate it to the last 63 characters.
@@ -46,11 +65,12 @@ def construct_job_name(user_prefix: str, job_num: str) -> str:
     return name
 
 
+_LAUNCH_COUNT = Counter()
+_FINISH_COUNT = Counter()
 _SIMULTANEOUS_LAUNCHES = threading.BoundedSemaphore(20)
-JOB_NAME = core.stack_context.StackContext("job_name", "")
 
 
-@core.scope.bound
+@scope.bound
 def launch(
     container_image: str,
     args: ty.Sequence[str],
@@ -61,46 +81,38 @@ def launch(
     # arguments below are for launching; arguments above are for
     # building. these should get separated in a future change.
     name_prefix: str = "",
-    full_name: str = "",
     dry_run: bool = False,
+    fire_and_forget: bool = False,
     suppress_logs: bool = False,
     transform_job: ty.Callable[[client.models.V1Job], client.models.V1Job] = embed_thds_auth,
     # this is a default for now. later if we share this code we'll need to have a wrapper interface
     service_account_name: str = "",
-) -> core.futures.LazyFuture[bool]:
+) -> None:
    """Launch a Kubernetes job.
 
     Required parameters are the container_image and the arguments to
     that image, just as if you were running this directly with Docker.
 
-    Returns a Future that will resolve to True when the Job completes successfully, or
-    raise K8sJobFailedError if the Job fails.
+    Unless fire_and_forget=True, will poll until Job completes and
+    will raise K8sJobFailedError if the Job fails. None is returned
+    if the Job succeeds.
 
     `name_prefix` is an optional parameter for debugging/developer
     convenience. A generated suffix will be added to it.
+
     """
     if not container_image:
         raise ValueError("container_image (the fully qualified Docker tag) must not be empty.")
-
-    full_name = full_name or JOB_NAME()
-    # in certain cases, it may be necessary to set the job name
-    # via a StackContext, so we check that here, and prefer it over name_prefix.
-
-    if full_name and name_prefix:
-        raise ValueError("You cannot specify both full_name and name_prefix; use one or the other.")
-
-    if not full_name:
-        name = construct_job_name(name_prefix, counts.to_name(counts.inc(counts.LAUNCH_COUNT)))
-    else:
-        name = full_name
-
-    core.scope.enter(core.log.logger_context(job=name))
+    job_num = f"{_LAUNCH_COUNT.inc():0>3}"
+    name = construct_job_name(name_prefix, job_num)
+    scope.enter(logger_context(job=name))
     node_narrowing = node_narrowing or dict()
 
     # TODO move this entire function out to be separately callable
     @k8s_sdk_retry()
     def assemble_base_job() -> client.models.V1Job:
         logger.debug(f"Assembling job named `{name}` on image `{container_image}`")
+        logger.debug("Fire and forget: %s", fire_and_forget)
         logger.debug("Loading kube configs ...")
         load_config()
         logger.debug("Populating job object ...")
@@ -173,7 +185,7 @@ def launch(
     if dry_run:
         job_with_all_transforms()
         logger.info("Dry run assembly successful; not launching...")
-        return core.futures.LazyFuture(partial(core.futures.ResolvedFuture, True))
+        return
 
     @k8s_sdk_retry()
     def launch_job() -> client.models.V1Job:
@@ -186,41 +198,32 @@ def launch(
         )
 
     job = launch_job()
-    logger.info(LAUNCHED(f"Job {name} launched!") + f" on {container_image}")
-    return core.futures.make_lazy(_launch_logs_and_create_future)(  # see below for implementation
-        job.metadata.name,
-        num_pods_expected=len(job.spec.template.spec.containers),
-        namespace=config.k8s_namespace(),
-        suppress_logs=suppress_logs,
-    )
-
-
-# this function has to be a top level def because it will sometimes be transferred across process boundaries,
-# and Python/pickle in its infinite wisdom does not allow nested functions to be pickled.
-def _launch_logs_and_create_future(
-    job_name: str, *, num_pods_expected: int, namespace: str, suppress_logs: bool
-) -> core.futures.PFuture[bool]:
+    logger.info(LAUNCHED(f"Job {job_num} launched!") + f" on {container_image}")
     if not suppress_logs:
-        logging.maybe_start_job_thread(job_name, num_pods_expected)
-    return job_future.make_job_completion_future(job_name, namespace=namespace)
+        threading.Thread(  # fire and forget a log watching thread
+            target=JobLogWatcher(job.metadata.name, len(job.spec.template.spec.containers)).start,
+            daemon=True,
+        ).start()
+
+    if not fire_and_forget:
 
+        def counts() -> str:
+            launched = _LAUNCH_COUNT.value
+            return f"- ({launched - _FINISH_COUNT.inc()} unfinished of {launched})"
 
-def create_lazy_job_logging_future(
-    job_name: str, *, namespace: str = "", num_pods_expected: int = 1
-) -> core.futures.LazyFuture[bool]:
-    return core.futures.make_lazy(_launch_logs_and_create_future)(
-        job_name,
-        num_pods_expected=num_pods_expected,
-        namespace=namespace or config.k8s_namespace(),
-        suppress_logs=False,
-    )
+        job_name = job.metadata.name
+        del job  # trying to save memory here while we wait...
+        if not wait_for_job(job_name, short_name=job_num):
+            logger.error(FAILED(f"Job {job_num} Failed! {counts()}"))
+            raise K8sJobFailedError(f"Job {job_name} failed.")
+        logger.info(COMPLETE(f"Job {job_num} Complete! {counts()}"))
 
 
 def shim(
     container_image: ty.Union[str, ty.Callable[[], str]],
     disable_remote: ty.Callable[[], bool] = lambda: False,
     **outer_kwargs: ty.Any,
-) -> ty.Callable[[ty.Sequence[str]], core.futures.LazyFuture[bool]]:
+) -> ty.Callable[[ty.Sequence[str]], None]:
     """Return a closure that can launch the given configuration and run a mops pure function.
 
     Now supports callables that return a container image name; the
@@ -237,18 +240,16 @@ def shim(
     ), "Passing 'args' as a keyword argument will cause conflicts with the closure."
 
     if disable_remote():
-        return samethread_shim  # type: ignore[return-value]
+        return samethread_shim
 
     if isinstance(container_image, str):
         get_container_image: ty.Callable[[], str] = lambda: container_image  # noqa: E731
     else:
         get_container_image = container_image
 
-    def launch_container_on_k8s_with_args(
-        args: ty.Sequence[str], **inner_kwargs: ty.Any
-    ) -> core.futures.LazyFuture[bool]:
+    def launch_container_on_k8s_with_args(args: ty.Sequence[str], **inner_kwargs: ty.Any) -> None:
         assert "args" not in inner_kwargs
-        return launch(
+        launch(
             get_container_image(),
             ["python", "-m", "thds.mops.pure.core.entry.main", *args],
             **{**outer_kwargs, **inner_kwargs},
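Taken together, the changes to this file replace the old future-returning API (core.futures.LazyFuture[bool]) with a call that blocks until the Job finishes, raising K8sJobFailedError on failure, or that returns immediately when fire_and_forget=True. A minimal sketch of the new calling convention (the image tag and command are hypothetical):

    from thds.mops import k8s

    # Blocks until the Job completes; raises k8s.K8sJobFailedError if it fails.
    k8s.launch(
        "registry.example.com/my-image:latest",
        ["python", "-m", "my_pkg.task"],
        name_prefix="demo",
    )

    # Submit and return without waiting for completion:
    k8s.launch(
        "registry.example.com/my-image:latest",
        ["python", "-m", "my_pkg.task"],
        fire_and_forget=True,
    )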
thds/mops/k8s/logging.py CHANGED
@@ -32,17 +32,6 @@ BOINK = colorized(fg="white", bg="magenta")
 # string in this and it'll stand out.
 
 
-def should_log(job_name: str) -> bool:
-    if NO_K8S_LOGS():
-        return False
-
-    if random.random() > K8S_LOG_POD_FRACTION():
-        logger.info(f"Skipping log watcher for {job_name} due to fraction.")
-        return False
-
-    return True
-
-
 class JobLogWatcher:
     """Will spawn one or more daemon threads.
 
@@ -70,7 +59,11 @@ class JobLogWatcher:
     @core.scope.bound
     def start(self, failed_pod_name: str = "") -> None:
         """Call this one time - it will spawn threads as needed."""
-        if not should_log(self.job_name):
+        if NO_K8S_LOGS():
+            return
+
+        if random.random() > K8S_LOG_POD_FRACTION():
+            logger.info(f"Skipping log watcher for {self.job_name} due to fraction.")
             return
 
         core.scope.enter(self.job_pods_discovery_lock)
@@ -252,28 +245,3 @@ def _scrape_pod_logs(
         logger.exception(BOINK("Pod log scraping failed utterly. Pod may have died?"))
         # at least let the caller know something went horribly wrong
         failure_callback(pod_name)
-
-
-_JOB_LOG_THREADS: set[str] = set()
-_JOB_LOG_THREAD_COUNT: int = 0
-_JOB_LOG_THREADS_LOCK = threading.Lock()
-
-
-def maybe_start_job_thread(job_name: str, num_pods_expected: int = 1) -> bool:
-    """Starts a thread to watch the logs of a job. Makes sure we only start one thread per
-    job even if there are multiple calls to this function.
-    """
-    if job_name not in _JOB_LOG_THREADS:
-        with _JOB_LOG_THREADS_LOCK:
-            if job_name not in _JOB_LOG_THREADS:
-                # double-checked locking to avoid creating multiple threads for the same job
-                _JOB_LOG_THREADS.add(job_name)
-                if should_log(job_name):
-                    global _JOB_LOG_THREAD_COUNT
-                    _JOB_LOG_THREAD_COUNT += 1
-                    logger.info(f"Starting log watcher {_JOB_LOG_THREAD_COUNT} for job {job_name}")
-                    threading.Thread(
-                        target=JobLogWatcher(job_name, num_pods_expected).start, daemon=True
-                    ).start()
-                return True
-    return False
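With should_log and maybe_start_job_thread removed, the one-watcher-per-job guarantee no longer needs double-checked locking: launch starts at most one daemon watcher thread for the Job it just created, and the NO_K8S_LOGS / K8S_LOG_POD_FRACTION checks now live inside JobLogWatcher.start itself. A sketch of the equivalent caller-side pattern (the job name is hypothetical):

    import threading

    from thds.mops.k8s.logging import JobLogWatcher

    # One daemon thread per launched Job; start() returns early if logging
    # is disabled or this job loses the log-sampling coin flip.
    threading.Thread(
        target=JobLogWatcher("demo-job-abc123", 1).start,  # job name, pods expected
        daemon=True,
    ).start()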
thds/mops/k8s/watch.py CHANGED
@@ -12,20 +12,17 @@ import urllib3
 from kubernetes import client
 from kubernetes import watch as k8s_watch
 
-from thds.core import futures, scope
+from thds.core import scope
 from thds.core.log import getLogger, logger_context
 from thds.termtool.colorize import colorized
 
 from . import config
 from .auth import load_config
 from .too_old_resource_version import parse_too_old_resource_version
-from .uncertain_future import FutureInterpreter, UncertainFuturesTracker
 
 logger = getLogger(__name__)
 
 T = ty.TypeVar("T")
-K = ty.TypeVar("K")
-R = ty.TypeVar("R")
 
 
 class V1List(ty.Protocol[T]):
@@ -118,6 +115,10 @@ def callback_events(
             break
 
 
+def _make_name(namespace: str, name: str) -> str:
+    return f"{namespace}/{name}"
+
+
 def _default_get_name(obj: ty.Any) -> str:
     return obj.metadata.name
 
@@ -147,15 +148,9 @@ class OneShotLimiter:
         self._names.add(name)
 
 
-def _watch_timer() -> float:
-    # in this context, monotonicity (actual timing) is most useful because we don't need sentinels.
-    return time.monotonic()
-
-
 def is_stale(api_last_update_time: float, obj_last_seen_time: float) -> bool:
-    now = _watch_timer()
+    now = time.monotonic()
     allowed_stale_seconds = config.k8s_watch_object_stale_seconds()
-    # about 5 minutes by default as of 2025-07-15.
     if (time_since_api_update := now - api_last_update_time) > allowed_stale_seconds:  # noqa: F841
         # we haven't heard anything from the API in a while; probably
         # the API is down. Ignore object staleness to avoid false positives.
@@ -228,146 +223,93 @@ def watch_forever(
             break
 
 
-class _SeenObjectContainer(ty.Generic[K, T]):
-    """Splits some of the logic for 'get' out of WatchingObjectSource
-    so that we can have it be a simpler container for both this and the UncertainFuturesTracker.
+class WatchingObjectSource(ty.Generic[T]):
+    """Efficiently 'get' objects by reliably watching for changes to all such objects in a given namespace.
+
+    This is network-efficient for observing many different objects,
+    but not memory efficient if you really only need to fetch details
+    for a few objects.
     """
 
     def __init__(
         self,
-        backup_fetch: ty.Optional[ty.Callable[[K], ty.Optional[T]]] = None,
+        get_list_method: GetListMethod[T],
+        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
+            ty.Callable[[T], str], _default_get_name
+        ),
+        backup_fetch: ty.Optional[ty.Callable[[str, str], T]] = None,
+        typename: str = "object",
+        starting: ty.Callable[[str], str] = STARTING,
     ) -> None:
-        self._objs: ty.Dict[K, T] = dict()
+        self.get_list_method = get_list_method
+        self.get_name = get_name
+        self.backup_fetch = backup_fetch
+        self.typename = typename
+        self._objs_by_name: ty.Dict[str, T] = dict()
         # ^ is a possibly big/expensive local cache of the most recent
         # state for all of the event type in the namespace. Don't use
         # this class if you can't afford the memory overhead of
         # observing everything in your namespace and keeping the last
         # known copy of everything forever.
-        self._last_seen_times: ty.Dict[K, float] = dict()
+        self._last_seen_time_by_name: ty.Dict[str, float] = dict()
         self._last_api_update_time = 0.0
-        self.backup_fetch = backup_fetch
+        self._limiter = OneShotLimiter()
 
-    def set_object(self, key: K, obj: T) -> None:
-        """Set an object in the cache, updating the last seen time."""
-        now = _watch_timer()
-        self._last_api_update_time = now
-        self._last_seen_times[key] = now
-        self._objs[key] = obj
+    def _start_thread(self, namespace: str) -> None:
+        create_watch_thread(
+            self.get_list_method, self._add_object, namespace, typename=self.typename
+        ).start()
+
+    def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
+        """This is where we receive updates from the k8s API."""
+        self._last_api_update_time = time.monotonic()
+
+        if not obj:
+            logger.warning(f"Received null/empty {self.typename}")
+            return
 
-    def _is_stale(self, key: K) -> bool:
-        return is_stale(self._last_api_update_time, self._last_seen_times.get(key) or 0)
+        name = _make_name(namespace, self.get_name(obj))
+        logger.debug(f"{self.typename} {name} updated")
+        self._last_seen_time_by_name[name] = time.monotonic()
+        self._objs_by_name[name] = obj
+
+    def _is_stale(self, name: str) -> bool:
+        return is_stale(self._last_api_update_time, self._last_seen_time_by_name.get(name) or 0)
+
+    @scope.bound
+    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
+        namespace = namespace or config.k8s_namespace()
+        name = _make_name(namespace, obj_name)
+        scope.enter(logger_context(name=obj_name, namespace=namespace))
 
-    def get(self, key: K) -> ty.Optional[T]:
         # first try is looking in our local cache
-        if (obj := self._objs.get(key)) and not self._is_stale(key):
+        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
            return obj
 
         # second try is making sure the namespace watcher is running, sleeping, and then looking in the cache again.
         # This is much more efficient than a manual fetch.
+        self._limiter(namespace, self._start_thread)
         time.sleep(config.k8s_monitor_delay())
-        if (obj := self._objs.get(key)) and not self._is_stale(key):
+        if (obj := self._objs_by_name.get(name)) and not self._is_stale(name):
             return obj
 
         # if that doesn't work, try a manual fetch.
         if self.backup_fetch:
-            logger.warning(f"Manually fetching {key}...")
+            logger.warning(f"Manually fetching {self.typename}...")
             # doing a lot of manual fetches may indicate that the k8s API is having trouble keeping up...
             try:
-                if obj := self.backup_fetch(key):
-                    self.set_object(key, obj)  # updates last seen, too
+                if obj := self.backup_fetch(namespace, obj_name):
+                    self._add_object(namespace, obj, "FETCH")  # updates last seen, too
                     return obj
 
             except Exception:
-                logger.exception(f"Unexpected error during manual fetch of {key}.")
+                logger.exception(f"Unexpected error during manual fetch of {self.typename}.")
 
-        if self._is_stale(key):
+        if self._is_stale(name):
             logger.warning(
-                f"Could not refresh {key}, and our record of it is stale - dropping stale object!"
+                f"Could not refresh {name}, and our record of it is stale - dropping stale object!"
             )
-            self._objs.pop(key, None)
-            self._last_seen_times.pop(key, None)
+            self._objs_by_name.pop(name, None)
+            self._last_seen_time_by_name.pop(name, None)
 
         return None
-
-
-class WatchingObjectSource(ty.Generic[T]):
-    """Efficiently 'get' objects by launching a single thread to
-    watch for changes to all such objects in a given namespace.
-
-    Also provide a way to create a future that will be resolved according to the logic
-    provided by the caller whenever an object is updated, or if the object has not been
-    updated in a while.
-
-    Importantly, the Futures are only prevented from deadlocking (never awakening their
-    condition variable) by the fact that we very occasionally will go through the list
-    of seen objects and raise Exceptions for objects that have not been updated in a while.
-    This is vaguely akin to garbage collection, in that it will occasionally
-    cause a 'pause' in the watcher thread as it tries to collect stale objects.
-
-    This is network-efficient for observing many different objects,
-    but not memory efficient if you really only need to fetch details
-    for a few objects, because we retain the last known state for every observed object indefinitely.
-    """
-
-    def __init__(
-        self,
-        get_list_method: GetListMethod[T],
-        get_name: ty.Callable[[T], str] = ty.cast(  # noqa: B008
-            ty.Callable[[T], str], _default_get_name
-        ),
-        backup_fetch: ty.Optional[ty.Callable[[str, str], ty.Optional[T]]] = None,
-        typename: str = "object",
-    ) -> None:
-        self.get_list_method = get_list_method
-        self.get_name = get_name
-        self.typename = typename
-        self._limiter = OneShotLimiter()
-        self._uncertain_futures = UncertainFuturesTracker[tuple[str, str], T](
-            config.k8s_watch_object_stale_seconds()
-        )
-        self._seen_objects = _SeenObjectContainer[tuple[str, str], T](
-            lambda namespace_and_name: backup_fetch(*namespace_and_name) if backup_fetch else None
-        )
-
-    def _add_object(self, namespace: str, obj: T, _event_type: EventType) -> None:
-        """This is where we receive updates from the k8s API."""
-        if not obj:
-            logger.warning(f"Received null/empty {self.typename}")
-            return
-
-        key = (namespace, self.get_name(obj))
-        self._seen_objects.set_object(key, obj)
-        self._uncertain_futures.update(key, obj)
-        logger.debug("%s %s updated", self.typename, key)
-
-    def _start_namespace_watcher_thread(self, namespace: str) -> None:
-        create_watch_thread(
-            self.get_list_method, self._add_object, namespace, typename=self.typename
-        ).start()
-
-    @scope.bound
-    def get(self, obj_name: str, namespace: str = "") -> ty.Optional[T]:
-        """May block for a little while if a manual fetch is required."""
-        namespace = namespace or config.k8s_namespace()
-        scope.enter(logger_context(name=obj_name, namespace=namespace))
-        self._limiter(namespace, self._start_namespace_watcher_thread)
-        return self._seen_objects.get((namespace, obj_name))
-
-    def create_future(
-        self,
-        interpreter: FutureInterpreter[T, R],
-        obj_name: str,
-        *,
-        namespace: str = "",
-    ) -> futures.PFuture[R]:
-        """Create a future that will be resolved when the object is available according to
-        the interpreter.
-
-        The FutureInterpreter must:
-        - raise an exception if it wishes the future to raise.
-        - return a Done with the result if it wishes the future to resolve successfully.
-        - return None if the status is still in progress.
-        """
-        namespace = namespace or config.k8s_namespace()
-        self._limiter(namespace, self._start_namespace_watcher_thread)
-        return self._uncertain_futures.create((namespace, obj_name), interpreter)
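After this refactor, _SeenObjectContainer is folded back into WatchingObjectSource, objects are keyed by "{namespace}/{name}" strings, and get is the only public entry point (the create_future/FutureInterpreter machinery is gone along with the deleted uncertain_future.py). Consumers such as thds/mops/k8s/jobs.py use it roughly like this (the job name is hypothetical):

    from thds.mops.k8s.jobs import get_job

    # The first call in a namespace starts the shared watcher thread and then
    # waits briefly; returns None if the Job is unknown or its record is stale.
    job = get_job("demo-job-abc123")
    if job is not None and job.status is not None:
        print(job.status.succeeded, job.status.failed)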
thds/mops/pure/__init__.py CHANGED
@@ -7,7 +7,6 @@
 from . import adls  # noqa
 from ._magic.api import magic  # noqa
 from .core.entry import register_entry_handler
-from .core.lock.maintain import no_maintain as no_maintain_locks  # noqa: F401
 from .core.memo import results  # noqa
 from .core.memo.function_memospace import (  # noqa
     add_pipeline_memospace_handlers,
@@ -22,7 +21,7 @@ from .core.use_runner import use_runner  # noqa
 from .pickling.memoize_only import memoize_in  # noqa
 from .pickling.mprunner import MemoizingPicklingRunner  # noqa
 from .runner.simple_shims import samethread_shim, subprocess_shim  # noqa
-from .runner.types import FutureShim, Shim, ShimBuilder  # noqa
+from .runner.types import Shim, ShimBuilder  # noqa
 
 
 def _register_things() -> None:
thds/mops/pure/_magic/sauce.py CHANGED
@@ -6,7 +6,7 @@ import typing as ty
 
 from typing_extensions import ParamSpec
 
-from thds.core import futures, stack_context
+from thds.core import stack_context
 from thds.mops._utils import config_tree
 
 from ..core import file_blob_store, pipeline_id, pipeline_id_mask, uris
@@ -109,7 +109,7 @@ class Magic(ty.Generic[P, R]):
     def _is_off(self) -> bool:
         return self._shim_builder_or_off is None
 
-    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:  # type: ignore[valid-type]
+    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
         # this can be set using a stack-local context, or set globally as specifically
         # or generally as the user needs. We prefer stack local over everything else.
         sb = self._shim_builder_or_off
@@ -123,16 +123,8 @@ class Magic(ty.Generic[P, R]):
     def _pipeline_id(self) -> str:
         return self.config.pipeline_id.getv(self._func_config_path)
 
-    def submit(self, *args: P.args, **kwargs: P.kwargs) -> futures.PFuture[R]:
-        """A futures-based interface that doesn't block on the result of the wrapped
-        function call, but returns a PFuture once either a result has been found or a
-        new invocation has been started.
-        """
-        with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
-            return self.runner.submit(self.__wrapped__, *args, **kwargs)
-
     def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R:
-        """This is the wrapped function - call this as though it were the function itself."""
+        """This is the wrapped function."""
         with pipeline_id.set_pipeline_id_for_stack(self._pipeline_id):
             return self._func(*args, **kwargs)
 
thds/mops/pure/_magic/shims.py CHANGED
@@ -4,14 +4,14 @@ from thds import core
 
 from ..runner.shim_builder import make_builder
 from ..runner.simple_shims import samethread_shim, subprocess_shim
-from ..runner.types import FutureShim, Shim, ShimBuilder
+from ..runner.types import Shim, ShimBuilder
 
 ShimName = ty.Literal[
     "samethread",  # memoization and coordination, but run in the same thread as the caller.
     "subprocess",  # memoization and coordination, but transfer to a subprocess rather than remote.
     "off",  # equivalent to None - disables use of mops.
 ]
-ShimOrBuilder = ty.Union[ShimBuilder, Shim, FutureShim]
+ShimOrBuilder = ty.Union[ShimBuilder, Shim]
 logger = core.log.getLogger(__name__)
 