thds.mops 3.6.20250219172032 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.mops might be problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/core/uris.py
@@ -0,0 +1,81 @@
+ import importlib.metadata
+ import io
+ import typing as ty
+ from pathlib import Path
+ from typing import Callable, Union
+
+ from thds.adls import AdlsFqn, AdlsRoot
+ from thds.core.stack_context import StackContext
+
+ from ..adls.blob_store import get_adls_blob_store
+ from .file_blob_store import get_file_blob_store
+ from .types import BlobStore
+
+ GetBlobStoreForUri = ty.Callable[[str], ty.Optional[BlobStore]]
+
+
+ # we add the ADLS blob store and FileBlobStore here because they are the 'blessed'
+ # implementations that our internal users rely on.
+ # Others can be registered via entry-points.
+ _REGISTERED_BLOB_STORES: ty.List[GetBlobStoreForUri] = [
+     get_file_blob_store,
+     get_adls_blob_store,
+ ]
+
+
+ def register_blob_store(get_store: GetBlobStoreForUri) -> None:
+     """Dynamically register a BlobStore implementation."""
+     _REGISTERED_BLOB_STORES.append(get_store)
+
+
+ def load_plugin_blobstores() -> None:
+     for entry_point in importlib.metadata.entry_points().get("thds.mops.pure.blob_stores", []):
+         try:
+             register_blob_store(entry_point.load())
+         except Exception as e:
+             print(f"Error loading entry point {entry_point.name}: {e}")
+
+
+ def lookup_blob_store(uri: str) -> BlobStore:
+     for get_store in _REGISTERED_BLOB_STORES[::-1]:
+         if store := get_store(uri):
+             return store
+     raise ValueError(f"Unsupported URI: {uri}")
+
+
+ def get_root(uri: str) -> str:
+     blob_store = lookup_blob_store(uri)
+     return blob_store.control_root(uri)
+
+
+ UriIsh = Union[AdlsRoot, AdlsFqn, str, Path]
+ UriResolvable = Union[UriIsh, Callable[[], UriIsh]]
+
+
+ def to_lazy_uri(resolvable: UriResolvable) -> Callable[[], str]:
+     if isinstance(resolvable, Path):
+         return lambda: str(resolvable.resolve())
+     if isinstance(resolvable, (str, AdlsRoot, AdlsFqn)):
+         return lambda: str(resolvable)
+     if callable(resolvable):
+         return lambda: str(resolvable())  # type: ignore
+     raise TypeError(type(resolvable))
+
+
+ def get_bytes(remote_uri: str, type_hint: str) -> bytes:
+     blob_store = lookup_blob_store(remote_uri)
+     with io.BytesIO() as tb:
+         blob_store.readbytesinto(remote_uri, tb, type_hint=type_hint)
+         tb.seek(0)
+         return tb.read()
+
+
+ # ACTIVE_STORAGE_ROOT is meant as a global, non-semantic URI prefix.
+ # In other words, it should have nothing to do with your application
+ ACTIVE_STORAGE_ROOT: StackContext[str] = StackContext("ACTIVE_STORAGE_ROOT", "")
+ # objects referencing this StackContext must be used in the same thread where they were created.
+
+
+ def active_storage_root() -> str:
+     assert ACTIVE_STORAGE_ROOT(), "ACTIVE_STORAGE_ROOT must be set before use."
+     return ACTIVE_STORAGE_ROOT()
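
For illustration only (not part of the packaged files): a minimal sketch of how a plugin might use the registry shown in uris.py above. `MyBlobStore` and the `mystore://` scheme are hypothetical; `register_blob_store` and `lookup_blob_store` are the functions from this hunk, and `lookup_blob_store` scans the registered getters newest-first.

    from thds.mops.pure.core.uris import lookup_blob_store, register_blob_store

    def get_my_blob_store(uri: str):
        # Return a store only for URIs this plugin understands; returning None
        # lets the other registered getters (file, ADLS) take their turn.
        if not uri.startswith("mystore://"):
            return None
        return MyBlobStore(uri)  # hypothetical BlobStore implementation

    register_blob_store(get_my_blob_store)
    store = lookup_blob_store("mystore://bucket/some/path")  # resolves via get_my_blob_store

The same getter could presumably also be exposed via the "thds.mops.pure.blob_stores" entry-point group that `load_plugin_blobstores` reads, instead of calling `register_blob_store` directly.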
thds/mops/pure/core/use_runner.py
@@ -0,0 +1,47 @@
+ """use_runner wrapper decorator factory lives here.
+
+ You can transfer control to a Runner without this, but decorators are a Pythonic approach.
+ """
+
+ import typing as ty
+ from functools import wraps
+
+ from thds.core import log, stack_context
+
+ from .entry.runner_registry import entry_count
+ from .types import Runner
+
+ logger = log.getLogger(__name__)
+ F = ty.TypeVar("F", bound=ty.Callable)
+ FUNCTION_UNWRAP_COUNT = stack_context.StackContext("function_unwrap_count", 0)
+
+
+ def _is_runner_entry() -> bool:
+     """Function is being called in the context of a Runner."""
+     return entry_count() > FUNCTION_UNWRAP_COUNT()
+
+
+ def use_runner(runner: Runner, skip: ty.Callable[[], bool] = lambda: False) -> ty.Callable[[F], F]:
+     """Wrap a function that is pure with respect to its arguments and result.
+
+     Run that function on the provided runner.
+
+     The arguments must be able to be transmitted by the runner to the
+     remote context and not refer to anything that will not be
+     accessible in that context.
+     """
+
+     def deco(f: F) -> F:
+         @wraps(f)
+         def __use_runner_wrapper(*args, **kwargs):  # type: ignore
+             if _is_runner_entry() or skip():
+                 logger.debug("Calling function %s directly...", f)
+                 with FUNCTION_UNWRAP_COUNT.set(FUNCTION_UNWRAP_COUNT() + 1):
+                     return f(*args, **kwargs)
+
+             logger.debug("Forwarding local function %s call to runner...", f)
+             return runner(f, args, kwargs)
+
+         return ty.cast(F, __use_runner_wrapper)
+
+     return deco
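
An illustrative usage sketch (not part of the release): combining `use_runner` with the `MemoizingPicklingRunner` and same-thread shim that appear later in this diff. The `file:///tmp/...` blob root is an assumption for local experimentation; any URI handled by a registered blob store should work.

    from thds.mops.pure.core.use_runner import use_runner
    from thds.mops.pure.pickling.mprunner import MemoizingPicklingRunner
    from thds.mops.pure.runner import simple_shims

    # same-thread shim: the function still runs locally, but the call goes through the
    # runner, so arguments and results are pickled and results can be memoized.
    runner = MemoizingPicklingRunner(simple_shims.samethread_shim, "file:///tmp/mops-blobs")

    @use_runner(runner)
    def add(a: int, b: int) -> int:
        return a + b

    add(2, 3)  # forwarded to the runner unless called from within a runner entry (or skip() is True)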
thds/mops/pure/joblib/__init__.py
@@ -0,0 +1 @@
+ from .backend import TrillMlParallelJoblibBackend  # noqa
thds/mops/pure/joblib/backend.py
@@ -0,0 +1,81 @@
+ """A mops.pure powered joblib backend.
+
+ Additionally, several hacks to work around either bugs or bad behavior in joblib.
+
+ mops itself does _not_ force a dependency on joblib.
+
+ If you want to use this supplied helper class, you must ensure that
+ Joblib is installed, or you will get an ImportError.
+ """
+
+ import typing as ty
+
+ from joblib._parallel_backends import LokyBackend, SequentialBackend, ThreadingBackend  # type: ignore
+
+ from thds.core import files, log
+
+ from ..core.types import Runner
+ from .batching import patch_joblib_parallel_batching  # noqa
+
+ logger = log.getLogger(__name__)
+
+
+ class NoMemmapLokyBackend(LokyBackend):
+     """Workaround for joblib/Cython bug exposed via SciKit-Learn.
+
+     https://github.com/scikit-learn/scikit-learn/issues/7981#issuecomment-341879166
+
+     Since this only runs on the remotes, and only then a few
+     CPUs/processes at a time, the extra memcopies being done are
+     no big deal.
+     """
+
+     def configure(self, *args: ty.Any, **kwargs: ty.Any) -> None:
+         kwargs["max_nbytes"] = None
+         return super().configure(*args, **kwargs)
+
+
+ class TrillMlParallelJoblibBackend(ThreadingBackend):
+     """A joblib backend that forwards to our MemoizingPicklingRunner.
+
+     Performs simple batching based on the parallelism and oversubscribe factors at construction time.
+
+     Note that you'll likely need to customize pre_dispatch, otherwise
+     there won't be enough created tasks to actually batch anything.
+     """
+
+     supports_sharedmem = False
+     uses_threads = False
+
+     def __init__(
+         self,
+         runner: Runner,
+         parallelism: int,
+         n_cores: int,
+         oversubscribe: int = 10,
+     ):
+         """number of cores should be the number of cores available on the remote system."""
+         files.bump_limits()
+         self.runner = runner
+         self.n_cores = n_cores
+         self._n_jobs = parallelism
+         self.oversubscribe = oversubscribe
+
+     def effective_n_jobs(self, _nj: int) -> int:
+         return self._n_jobs
+
+     def compute_batch_size(self) -> int:
+         return self.n_cores * self.oversubscribe
+
+     def apply_async(self, func: ty.Any, callback: ty.Any = None) -> ty.Any:
+         def call_in_runner() -> ty.Any:
+             return self.runner(func, (), dict())
+
+         return super().apply_async(call_in_runner, callback=callback)
+
+     def get_nested_backend(self) -> ty.Any:
+         nesting_level = getattr(self, "nesting_level", 0) + 1
+         if nesting_level > 1:
+             logger.warning("Using sequential backend")
+             return SequentialBackend(nesting_level=nesting_level), None
+         return NoMemmapLokyBackend(nesting_level=nesting_level), self.n_cores
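
A hedged sketch of how this backend might be wired into joblib (not part of the release). `register_parallel_backend` and `parallel_backend` are standard joblib APIs; the runner construction and blob root are assumptions. As the class docstring notes, `pre_dispatch` usually needs raising so enough tasks exist up front for batching to matter.

    import joblib
    from joblib import Parallel, delayed

    from thds.mops.pure.joblib.backend import TrillMlParallelJoblibBackend
    from thds.mops.pure.pickling.mprunner import MemoizingPicklingRunner
    from thds.mops.pure.runner import simple_shims

    runner = MemoizingPicklingRunner(simple_shims.samethread_shim, "file:///tmp/mops-blobs")
    joblib.register_parallel_backend(
        "mops", lambda: TrillMlParallelJoblibBackend(runner, parallelism=4, n_cores=8)
    )

    with joblib.parallel_backend("mops"):
        # pre_dispatch="all" so the backend's compute_batch_size() has tasks to batch
        results = Parallel(pre_dispatch="all")(delayed(pow)(i, 2) for i in range(100))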
thds/mops/pure/joblib/batching.py
@@ -0,0 +1,67 @@
+ """Fix joblib batching for use with many parallel tasks running remotely."""
+
+ import itertools
+ import queue
+ import typing as ty
+ import unittest.mock
+ from collections import defaultdict
+ from contextlib import contextmanager
+
+ import joblib  # type: ignore
+ from joblib.parallel import BatchedCalls  # type: ignore
+
+ from thds.core.log import getLogger
+
+ logger = getLogger(__name__)
+
+
+ def dispatch_one_batch(self: ty.Any, iterator: ty.Iterable[ty.Any]) -> bool:
+     """Joblib batching is truly horrible for running on a remote machine.
+
+     Various things conspire to essentially try to outsmart your
+     backend's batching instructions, and you end up with much smaller
+     batches than desired if you try to launch lots of runtimes in
+     parallel.
+
+     This is an ugly monkey patch, but it _works_.
+     """
+     if not hasattr(self, "__patch_stats"):
+         self.__patch_stats = defaultdict(int)  # type: ignore
+
+     batch_size = self._backend.compute_batch_size()
+     with self._lock:
+         try:
+             tasks = self._ready_batches.get(block=False)
+         except queue.Empty:
+             n_jobs = self._cached_effective_n_jobs
+             islice = list(itertools.islice(iterator, batch_size * n_jobs))
+             if len(islice) == 0:
+                 return False
+             self.__patch_stats["tasks"] += len(islice)
+             logger.info(
+                 f"Creating new tasks with patched batch size {batch_size}; "
+                 f"stats so far: {self.__patch_stats}"
+             )
+             for i in range(0, len(islice), batch_size):
+                 self._ready_batches.put(
+                     BatchedCalls(
+                         islice[i : i + batch_size],
+                         self._backend.get_nested_backend(),
+                         self._reducer_callback,
+                         self._pickle_cache,
+                     )
+                 )
+             # finally, get one task.
+             tasks = self._ready_batches.get(block=False)
+
+         if len(tasks) == 0:
+             return False
+         self.__patch_stats["batches"] += 1
+         self._dispatch(tasks)
+         return True
+
+
+ @contextmanager
+ def patch_joblib_parallel_batching() -> ty.Iterator[None]:
+     with unittest.mock.patch.object(joblib.Parallel, "dispatch_one_batch", dispatch_one_batch):
+         yield
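
An illustrative sketch based only on the signatures above (not a documented recipe): since the patch is exposed as a context manager, the intended use appears to be wrapping the `Parallel(...)` call, so that `dispatch_one_batch` honors the backend's `compute_batch_size()` instead of joblib's auto-batching.

    from joblib import Parallel, delayed

    from thds.mops.pure.joblib.batching import patch_joblib_parallel_batching

    with patch_joblib_parallel_batching():
        # while the patch is active, batches are built compute_batch_size() tasks at a time
        squares = Parallel(n_jobs=2, pre_dispatch="all")(delayed(pow)(i, 2) for i in range(10))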
thds/mops/pure/pickling/__init__.py
@@ -0,0 +1,3 @@
+ from . import mprunner, remote  # noqa: F401
+ from .memoize_only import memoize_in  # noqa: F401
+ from .mprunner import MemoizingPicklingRunner  # noqa: F401
thds/mops/pure/pickling/_pickle.py
@@ -0,0 +1,193 @@
+ """Utilities built around pickle for the purpose of transferring large amounts of on-disk
+ data and also functions."""
+
+ import inspect
+ import io
+ import pickle
+ import typing as ty
+ from functools import partial
+
+ # so we can pickle and re-raise exceptions with remote tracebacks
+ from tblib import pickling_support  # type: ignore
+
+ from thds.core import hashing, log, source
+
+ from ..core import memo, metadata
+ from ..core.source import prepare_source_argument, prepare_source_result
+ from ..core.types import Args, Deserializer, Kwargs, SerializerHandler
+ from ..core.uris import get_bytes
+ from .pickles import (
+     PicklableFunction,
+     UnpickleFunctionWithLogicKey,
+     UnpickleSourceHashrefArgument,
+     UnpickleSourceResult,
+     UnpickleSourceUriArgument,
+ )
+
+ logger = log.getLogger(__name__)
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+
+ def wrap_f(f: F) -> ty.Union[F, PicklableFunction]:
+     if hasattr(f, "__module__") and hasattr(f, "__name__"):
+         return PicklableFunction(f)
+     return f
+
+
+ class _CallbackPickler(pickle.Pickler):
+     def __init__(self, handlers: ty.Sequence[SerializerHandler], *args: ty.Any, **kwargs: ty.Any):
+         super().__init__(*args, **kwargs)
+         self.handlers = handlers
+
+     def persistent_id(self, obj: ty.Any) -> ty.Union[None, ty.Callable]:
+         if isinstance(obj, Exception):
+             pickling_support.install(obj)
+         for handler in self.handlers:
+             pid = handler(obj)
+             if pid is not None:
+                 return pid
+         return None
+
+
+ class CallableUnpickler(pickle.Unpickler):
+     """Present same interface as pickle.load but support unpickling callable PIDs generated by CallbackPickler."""
+
+     def persistent_load(self, pid: ty.Callable) -> ty.Any:
+         try:
+             return pid()
+         except TypeError as te:
+             # logger.exception("TypeError hit while debugging")
+             # this line should never get hit as long as nobody asks us to unpickle PIDs we don't know about
+             raise pickle.UnpicklingError(f"unsupported persistent object - {te}")  # pragma: no cover
+
+
+ class Dumper:
+     """Presents the same interface as pickle.dump but supports
+     arbitrary callback-based unpickling.
+     """
+
+     def __init__(self, *handlers: SerializerHandler):
+         self.handlers = handlers
+
+     def __call__(self, obj: object, file: ty.IO, *args: ty.Any, **kwargs: ty.Any) -> ty.Any:
+         _CallbackPickler(self.handlers, file, *args, **kwargs).dump(obj)
+
+
+ def gimme_bytes(pickle_dump: ty.Callable[[object, ty.IO], None], obj: object) -> bytes:
+     with io.BytesIO() as bio:
+         pickle_dump(obj, bio)
+         bio.seek(0)
+         return bio.read()
+
+
+ def read_partial_pickle(full_bytes: bytes) -> ty.Tuple[bytes, ty.Any]:
+     # in order to be forward-compatible with v3 of mops, we're introducing a new
+     # wrinkle in the read. Instead of assuming that the data at the URI
+     # _begins_ with a pickle, we are looking for the first possible pickle
+     # and beginning our read there. Mops 3 will be generating some human-readable,
+     # non-pickle metadata and embedding it at the beginning of the file.
+     first_pickle_pos = full_bytes.find(b"\x80")
+     if first_pickle_pos == -1:
+         raise ValueError("Unable to find a pickle in the bytes")
+     return (
+         full_bytes[:first_pickle_pos],
+         CallableUnpickler(io.BytesIO(full_bytes[first_pickle_pos:])).load(),
+     )
+
+
+ H = ty.TypeVar("H")
+
+
+ def make_read_header_and_object(
+     type_hint: str, xf_header: ty.Optional[ty.Callable[[bytes], H]] = None
+ ) -> ty.Callable[[str], ty.Tuple[H, ty.Any]]:
+     def read_object(uri: str) -> ty.Tuple[H, ty.Any]:
+         header, unpickled = read_partial_pickle(get_bytes(uri, type_hint=type_hint))
+         return (xf_header or (lambda h: h))(header), unpickled  # type: ignore
+
+     return read_object
+
+
+ def read_metadata_and_object(
+     type_hint: str, uri: str
+ ) -> ty.Tuple[ty.Optional[metadata.ResultMetadata], ty.Any]:
+     def _read_metadata_header(header_bytes: bytes) -> ty.Optional[metadata.ResultMetadata]:
+         if not header_bytes:
+             return None
+         return metadata.parse_result_metadata(header_bytes.decode("utf-8").split("\n"))
+
+     return make_read_header_and_object(type_hint, xf_header=_read_metadata_header)(uri)
+
+
+ def freeze_args_kwargs(dumper: Dumper, f: ty.Callable, args: Args, kwargs: Kwargs) -> bytes:
+     """Returns a pickled (args, kwargs) tuple, with pre-bound
+     arguments to normalize different call structures into a
+     canonical/deterministic binding.
+
+     Also binds default arguments, for maximum determinism/explicitness.
+     """
+     bound_arguments = inspect.signature(f).bind(*args, **kwargs)
+     bound_arguments.apply_defaults()
+     return gimme_bytes(dumper, (bound_arguments.args, bound_arguments.kwargs))
+
+
+ def unfreeze_args_kwargs(
+     args_kwargs_pickle: bytes, unpickler: ty.Type[pickle.Unpickler] = CallableUnpickler
+ ) -> ty.Tuple[Args, Kwargs]:
+     """Undoes a freeze_args_kwargs call."""
+     return unpickler(io.BytesIO(args_kwargs_pickle)).load()
+
+
+ # SerializerHandlers for Source objects:
+ _DeserSource = ty.Callable[[], source.Source]
+
+
+ class SourceArgumentPickler:
+     """Only for use on the orchestrator side, when serializing the arguments."""
+
+     def __call__(self, maybe_source: ty.Any) -> ty.Optional[_DeserSource]:
+         if isinstance(maybe_source, source.Source):
+             uri_or_hash = prepare_source_argument(maybe_source)
+             if isinstance(uri_or_hash, hashing.Hash):
+                 return ty.cast(_DeserSource, UnpickleSourceHashrefArgument(uri_or_hash))
+             return ty.cast(_DeserSource, UnpickleSourceUriArgument(uri_or_hash))
+         # I do not understand why these casts are necessary to avoid mypy errors.
+         # I think it has something to do with NamedTuples being the underlying
+         # object type that is expected to support __call__(self) -> Foo,
+         # but I haven't recently located a relevant Issue anywhere.
+         return None
+
+
+ class SourceResultPickler:
+     """Only for use on the remote side, when serializing the result."""
+
+     def __call__(self, maybe_source: ty.Any) -> ty.Optional[_DeserSource]:
+         if isinstance(maybe_source, source.Source):
+             return ty.cast(_DeserSource, UnpickleSourceResult(*prepare_source_result(maybe_source)))
+
+         return None
+
+
+ class NestedFunctionWithLogicKeyPickler:
+     def __call__(self, maybe_function_with_logic_key: ty.Any) -> ty.Optional[Deserializer]:
+         """Returns a pickle 'persistent id' which is a 'kind' of CallableUnpickler.
+
+         ...or None, which means pickle normally.
+         """
+         if not callable(maybe_function_with_logic_key):
+             return None
+
+         if isinstance(maybe_function_with_logic_key, partial):
+             # do not extract from the partial - only a raw function
+             # which will itself be included in the partial when it gets pickled
+             return None
+
+         function_logic_key = memo.extract_function_logic_key_from_docstr(maybe_function_with_logic_key)
+         if not function_logic_key:
+             return None
+
+         return UnpickleFunctionWithLogicKey(  # type: ignore
+             # we must then wrap the function itself so that this does not cause infinite recursion.
+             pickle.dumps(maybe_function_with_logic_key),
+             function_logic_key,
+         )
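
A small, hedged sketch of the argument-freezing round trip (not part of the release; `_pickle` is a private module). With no handlers, `Dumper` behaves like plain pickle, so this just shows the canonical binding `freeze_args_kwargs` produces: defaults are reified and positional/keyword placement is normalized.

    from thds.mops.pure.pickling._pickle import Dumper, freeze_args_kwargs, unfreeze_args_kwargs

    def f(a: int, b: int = 2, *, c: str = "x") -> str:
        return f"{a}-{b}-{c}"

    frozen = freeze_args_kwargs(Dumper(), f, (1,), {"c": "y"})
    args, kwargs = unfreeze_args_kwargs(frozen)
    assert args == (1, 2)        # b's default was bound explicitly
    assert kwargs == {"c": "y"}  # keyword-only arguments stay in kwargs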
thds/mops/pure/pickling/memoize_only.py
@@ -0,0 +1,22 @@
+ from typing import Callable
+
+ from ..core.types import F
+ from ..core.uris import UriResolvable
+ from ..core.use_runner import use_runner
+ from ..runner import simple_shims
+ from .mprunner import MemoizingPicklingRunner
+
+
+ # this may soon become deprecated in favor of mops.pure.magic(blob_root=...)
+ def memoize_in(uri_resolvable: UriResolvable) -> Callable[[F], F]:
+     """A decorator that makes a function globally-memoizable, but running in the current
+     thread.
+
+     This is a good thing to use when the computation derives no
+     advantage from not running locally (i.e. exhibits no parallelism)
+     but you still want memoization.
+
+     This enables nested memoized function calls, which is not (yet)
+     the default for `use_runner`.
+     """
+     return use_runner(MemoizingPicklingRunner(simple_shims.samethread_shim, uri_resolvable))
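
An illustrative sketch of `memoize_in` (not part of the release). A `Path` works because `to_lazy_uri` in uris.py accepts `Path`, `str`, `AdlsRoot`, and `AdlsFqn`; the local path here is an assumption for experimentation, with an ADLS root being the more typical target.

    from pathlib import Path

    from thds.mops.pure.pickling.memoize_only import memoize_in

    @memoize_in(Path("/tmp/mops-memo"))
    def expensive(n: int) -> int:
        return sum(i * i for i in range(n))

    expensive(10_000)  # computed in the current thread, result recorded under the memo URI
    expensive(10_000)  # same bound arguments -> the previously stored result can be reused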
thds/mops/pure/pickling/mprunner.py
@@ -0,0 +1,173 @@
+ """Provides concrete serialization/deserialization (via pickling) for the basic memoizing runner algorithm.
+
+ Contains default config, core 'state' and some rarely-used customization interfaces.
+
+ See runner.local.py for the core runner implementation.
+ """
+
+ import typing as ty
+ from functools import partial
+
+ from thds.core import cache, log
+ from thds.core.stack_context import StackContext
+
+ from ..._utils.once import Once
+ from ..core import memo, uris
+ from ..core.serialize_big_objs import ByIdRegistry, ByIdSerializer
+ from ..core.serialize_paths import CoordinatingPathSerializer
+ from ..core.types import Args, F, Kwargs, Serializer, T
+ from ..runner import local, shim_builder
+ from ..runner.types import Shim, ShimBuilder
+ from ..tools.summarize import run_summary
+ from . import _pickle, pickles, sha256_b64
+
+ RUNNER_NAME = "mops2-mpf"
+ Redirect = ty.Callable[[F, Args, Kwargs], F]
+ NO_REDIRECT = lambda f, _args, _kwargs: f  # noqa: E731
+ _ARGS_CONTEXT = StackContext[ty.Sequence]("args_kwargs", tuple())
+ _KWARGS_CONTEXT = StackContext[ty.Mapping]("args_kwargs", dict())
+ logger = log.getLogger(__name__)
+
+
+ def mp_shim(base_shim: Shim, shim_args: ty.Sequence[str]) -> ty.Any:
+     return base_shim((RUNNER_NAME, *shim_args))
+
+
+ def _runner_prefix_for_pickled_functions(storage_root: str) -> str:
+     return uris.lookup_blob_store(storage_root).join(storage_root, RUNNER_NAME)
+
+
+ class MemoizingPicklingRunner:
+     """
+     Runs callables in a process as defined by the Shim.
+     This is often a remote process, however a local shim may be provided.
+     """
+
+     def __init__(
+         self,
+         shim: ty.Union[ShimBuilder, Shim],
+         blob_storage_root: uris.UriResolvable,
+         *,
+         rerun_exceptions: bool = True,
+         serialization_registry: ByIdRegistry[ty.Any, Serializer] = ByIdRegistry(),  # noqa: B008
+         redirect: Redirect = NO_REDIRECT,
+     ):
+         """Construct a memoizing shim runner.
+
+         Transmitted Path resources will be content-hash-addressed
+         below the runner_prefix to save storage and increase chances
+         of memoization. Named objects will be treated
+         similarly. Function invocations will be pickled and stored
+         under the current pipeline id since we do not have a way of
+         inferring whether their associated code is safely content-addressable
+         across runs.
+
+         The Shim must forward control in the remote environment to a
+         wrapper that will pull the function and arguments from the URI(s).
+
+         A ShimBuilder will receive the original function and its
+         original arguments, which you can use to determine which
+         concrete Shim implementation to return for the given function
+         call.
+
+         `rerun_exceptions` will cause a pre-existing `exception`
+         result to be ignored, as though Exceptions in your function
+         are the result of transient errors and not an expected return
+         value of a (simulated) pure function. If you do not want this
+         behavior, turn it off.
+
+         `redirect` changes only the function that is actually invoked
+         on the remote side of the runner. It does not change the
+         computed memoization key, which is still based on the original
+         function and the args, kwargs pair passed in. A common use for
+         this would be allowing a contextually-aware function to be
+         invoked in the manner of initializer/initargs, without those
+         additional bits being part of the function invocation and
+         therefore the memoization key, especially where they're not
+         picklable at all.
+         """
+         self._shim_builder = shim_builder.make_builder(shim)
+         self._get_storage_root = uris.to_lazy_uri(blob_storage_root)
+         self._rerun_exceptions = rerun_exceptions
+         self._by_id_registry = serialization_registry
+         self._redirect = redirect
+
+         self._run_directory = run_summary.create_mops_run_directory()
+
+     def shared(self, *objs: ty.Any, **named_objs: ty.Any) -> None:
+         """Set up memoizing pickle serialization for these objects.
+
+         Provided names are used for debugging purposes only.
+         """
+         for obj in objs:
+             self._by_id_registry[obj] = sha256_b64.Sha256B64Pickler()
+         for name, obj in named_objs.items():
+             self._by_id_registry[obj] = sha256_b64.Sha256B64Pickler(name)
+
+     @cache.locking
+     def _get_stateful_dumper(self, _root: str) -> _pickle.Dumper:
+         """We want one of these per blob storage root, because the
+         invocation and result must exist on the same blob store as
+         any other automatically dumped objects, e.g. Paths or named
+         objects, such that the full invocation payload is
+         byte-for-byte identical, since its hash is our memoization
+         key.
+         """
+         return _pickle.Dumper(
+             ByIdSerializer(self._by_id_registry),
+             CoordinatingPathSerializer(sha256_b64.Sha256B64PathStream(), Once()),
+             _pickle.SourceArgumentPickler(),
+             _pickle.NestedFunctionWithLogicKeyPickler(),
+         )
+
+     def _serialize_args_kwargs(
+         self, storage_root: str, func: ty.Callable[..., T], args: Args, kwargs: Kwargs
+     ) -> bytes:
+         # Why do we need func in order to serialize args and kwargs? Because
+         # we use it to bind the arguments to the function first, which makes that part
+         # deterministic and also 'reifies' any default arguments, so we don't have any implicit state.
+         return _pickle.freeze_args_kwargs(self._get_stateful_dumper(storage_root), func, args, kwargs)
+
+     def _serialize_invocation(
+         self, storage_root: str, func: ty.Callable[..., T], args_kwargs: bytes
+     ) -> bytes:
+         return _pickle.gimme_bytes(
+             self._get_stateful_dumper(storage_root),
+             pickles.Invocation(
+                 _pickle.wrap_f(self._redirect(func, _ARGS_CONTEXT(), _KWARGS_CONTEXT())),
+                 args_kwargs,
+             ),
+         )
+
+     def _wrap_shim_builder(self, func: F, args: Args, kwargs: Kwargs) -> Shim:
+         base_shim = self._shim_builder(func, args, kwargs)
+         return partial(mp_shim, base_shim)
+
+     def __call__(self, func: ty.Callable[..., T], args: Args, kwargs: Kwargs) -> T:
+         """Return result of running this function remotely via the shim.
+
+         Passes data to shim process via pickles in a Blob Store.
+
+         May return cached (previously-computed) results found via the
+         derived function memo URI, which contains the deterministic
+         hashed bytes of all the function arguments, but also
+         additional namespacing including pipeline_id as documented
+         in memo.function_memospace.py.
+         """
+         logger.debug("Preparing to run function via remote shim")
+         with _ARGS_CONTEXT.set(args), _KWARGS_CONTEXT.set(kwargs):
+             return local.invoke_via_shim_or_return_memoized(
+                 self._serialize_args_kwargs,
+                 self._serialize_invocation,
+                 self._wrap_shim_builder,
+                 _pickle.read_metadata_and_object,
+                 self._run_directory,
+             )(
+                 self._rerun_exceptions,
+                 memo.make_function_memospace(
+                     _runner_prefix_for_pickled_functions(self._get_storage_root()), func
+                 ),
+                 func,
+                 args,
+                 kwargs,
+             )
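
Finally, an illustrative sketch of the runner's `shared()` hook (not part of the release; the same-thread shim and local blob root are assumptions). An object registered via `shared()` is serialized through `Sha256B64Pickler` and content-addressed once, rather than being re-pickled into every invocation payload whose arguments reference it.

    from thds.mops.pure.core.use_runner import use_runner
    from thds.mops.pure.pickling import MemoizingPicklingRunner
    from thds.mops.pure.runner import simple_shims

    lookup_table = {i: i * i for i in range(1_000_000)}

    runner = MemoizingPicklingRunner(simple_shims.samethread_shim, "file:///tmp/mops-blobs")
    runner.shared(lookup_table=lookup_table)  # the name is used for debugging only

    @use_runner(runner)
    def lookup(key: int, table: dict) -> int:
        return table[key]

    lookup(123, lookup_table)  # the shared table is hash-addressed, not inlined into the args pickle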