thds.mops 3.6.20250219172032__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0

thds/mops/pure/core/script_support.py
@@ -0,0 +1,25 @@
"""Support for transferring execution of a function
when the function is defined inside the __main__ module.

Only works if you 'transfer execution' to the same process.
"""
import typing as ty

_LOCAL_MAIN_FUNCTIONS: ty.Dict[str, ty.Callable] = dict()


def add_main_module_function(function_name: str, function: ty.Callable) -> None:
    """This only works if you end up running remotely in the same process."""
    _LOCAL_MAIN_FUNCTIONS[function_name] = function


def get_main_module_function(fname: str) -> ty.Callable:
    """This only works if you end up running 'remotely' in the same process."""
    try:
        return _LOCAL_MAIN_FUNCTIONS[fname]
    except KeyError:
        raise ValueError(
            f"Serialized function {fname} that was in the __main__ module"
            " and attempted to transfer control to a different process."
            " Please move your function to a module that is not __main__."
        ) from KeyError
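
The module above is an in-process registry: functions defined in __main__ are stored by name and can only be recovered inside the same process. A minimal usage sketch, not part of the package, with the import path assumed from the file list above:

    from thds.mops.pure.core.script_support import (  # module path assumed
        add_main_module_function,
        get_main_module_function,
    )

    def fit_model() -> str:  # imagine this is defined in your __main__ script
        return "fitted"

    add_main_module_function("fit_model", fit_model)
    assert get_main_module_function("fit_model") is fit_model  # same process: lookup works
    # In a genuinely separate process the registry starts empty, and
    # get_main_module_function("fit_model") raises the ValueError shown above.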

thds/mops/pure/core/serialize_big_objs.py
@@ -0,0 +1,73 @@
"""Bring your own serialization."""

import typing as ty
from weakref import WeakValueDictionary

from thds.core.log import getLogger

from ..._utils.once import Once
from .types import Deserializer, Serializer, T

V = ty.TypeVar("V")

logger = getLogger(__name__)


class NeedsToBeWeakReferenceable(TypeError):
    pass


class ByIdRegistry(ty.Generic[T, V]):
    """When you want to use something as the key for a runtime-only
    dictionary, but the thing doesn't support being hashed.
    """

    def __init__(self) -> None:
        self._objects: ty.Dict[int, T] = WeakValueDictionary()  # type: ignore
        self._values: ty.Dict[int, V] = dict()

    def __setitem__(self, obj: T, value: V) -> None:
        try:
            self._objects[id(obj)] = obj
            self._values[id(obj)] = value
        except TypeError as te:
            raise NeedsToBeWeakReferenceable(f"{obj} needs to be weak-referenceable") from te

    def __contains__(self, obj: T) -> bool:
        return id(obj) in self._objects and self._objects[id(obj)] is obj

    def __getitem__(self, obj: T) -> V:
        if obj not in self:
            raise KeyError(str(obj))
        return self._values[id(obj)]


class ByIdSerializer:
    """Proxies id()-based memoizing serialization for large in-memory objects.

    For use with something like CallablePickler, which will allow this
    object to recognize registered objects and provide their
    serialization.

    Thread-safe at the time of (deferred) serialization, but all calls
    to `register` should be done prior to beginning concurrent serialization.

    The Deserializer returned by the Serializer should ideally not
    occupy much memory, as it will be cached.
    """

    def __init__(self, registry: ByIdRegistry[ty.Any, Serializer]) -> None:
        self._registry = registry
        self._desers: ty.Dict[int, Deserializer] = dict()
        self._once = Once()

    def __call__(self, obj: ty.Any) -> ty.Union[None, Deserializer]:
        if obj in self._registry:

            def serialize_and_cache() -> None:
                logger.info(f"Serializing object {type(obj)} {id(obj)}")
                self._desers[id(obj)] = self._registry[obj](obj)

            self._once.run_once(id(obj), serialize_and_cache)
            return self._desers[id(obj)]
        return None
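
ByIdRegistry keys entries on object identity (id()) while holding only a weak reference to the key object itself. A small sketch, not from the package, of how that behaves (import path assumed from the file list):

    from thds.mops.pure.core.serialize_big_objs import (  # module path assumed
        ByIdRegistry,
        NeedsToBeWeakReferenceable,
    )

    class BigModel:  # stand-in for a large in-memory object
        pass

    registry = ByIdRegistry()
    model = BigModel()
    registry[model] = "value-for-this-exact-object"

    assert model in registry and registry[model] == "value-for-this-exact-object"
    assert BigModel() not in registry  # identity-based: an equal-looking object misses

    try:
        registry[[1, 2, 3]] = "nope"  # lists cannot be weakly referenced
    except NeedsToBeWeakReferenceable:
        pass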

thds/mops/pure/core/serialize_paths.py
@@ -0,0 +1,149 @@
import hashlib
import typing as ty
from functools import partial
from pathlib import Path
from tempfile import NamedTemporaryFile

from thds import humenc
from thds.core.hash_cache import hash_file
from thds.core.log import getLogger

from ..._utils import once
from . import deferred_work

Downloader = ty.Callable[[], Path]
logger = getLogger(__name__)
_1_MB = 2**20


def human_sha256b64_file_at_paths(path: Path) -> str:
    """Return a human-readable hash of the file at the given path."""
    assert path.exists(), path
    return humenc.encode(hash_file(path, hashlib.sha256()))


class _ProcessLockingPathContentAddresser:
    """Hashes the data at a path, but only once per unique resolved
    Path seen, because hashing a large file is expensive and such
    Paths are often shared across many invocations.

    In general, you will want only one instance of this per
    application/process, to take advantage of the caching behavior.

    This does imply that each use of a Path is, as documented in the
    README, a reference to an immutable, write-at-most-once file, at
    least during the lifetime of the process hosting this
    object. Passing the same Path multiple times with different
    contents, and expecting it to get hashed and uploaded each time,
    will not work.
    """

    def __init__(self, once: once.Once):
        self.once = once
        self.paths_to_keys: ty.Dict[str, str] = dict()

    def __call__(self, path: Path) -> str:
        """Return a remote key (sha256 hash in human-base64) for a path."""
        resolved = str(path.resolve())
        # we now put all paths at the hash of their own contents which
        # allows us to avoid uploading duplicated data even from two
        # different file paths that happen to share the same contents.
        #
        # This _also_ allows us to be more confident that memoization
        # bugs arising from reuse of Paths pointing to different
        # underlying file contents across separate process lifetimes
        # cannot happen - a given Path will be represented inside the
        # pickle by something that represents the (immutable) file
        # contents itself, rather than by a mutable reference (the
        # path).

        def _hash_and_remember_path() -> None:
            self.paths_to_keys[resolved] = human_sha256b64_file_at_paths(path)

        self.once.run_once(resolved, _hash_and_remember_path)
        return self.paths_to_keys[resolved]


class PathStream(ty.Protocol):
    def local_to_remote(self, __path: Path, __key: str) -> None:
        ...  # pragma: no cover

    def get_downloader(self, __key: str) -> Downloader:
        ...  # pragma: no cover


class NotAFileError(ValueError):
    """We err on the side of caution in Mops 2.0 by never allowing
    Paths that are not actual files to be serialized on either side.

    This error is not intended to be caught; it is intended to inform
    the developer that they have made a coding mistake by passing an
    incorrect Path to a function that is supposed to be transferring
    execution via a Runner.

    In the future we might add support for directories if it is desired.
    """


def _serialize_file_path_as_upload(
    once: once.Once, path_keyer: _ProcessLockingPathContentAddresser, stream: PathStream, local_src: Path
) -> ty.Optional[Downloader]:
    if not local_src.exists():
        raise NotAFileError(f"You asked mops to upload the path {local_src}, but it does not exist.")
    if not local_src.is_file():
        raise NotAFileError(f"You asked mops to upload the Path {local_src}, but it is not a file.")

    remote_root = path_keyer(local_src)
    # I am creating a root 'directory' so that we can put debug info
    # side-by-side with the actual bytes, without interfering in any
    # way with the determinism of the hashed bytes themselves.
    remote_key = remote_root + "/_bytes"

    def upload() -> None:
        size = local_src.stat().st_size
        formatted_size = f"{size / _1_MB:,.2f} MB"
        log = logger.info if size > 10 * _1_MB else logger.debug
        log(
            f"Uploading Path {local_src} of size {formatted_size} to {remote_key} - "
            "its contents will get 'unpickled' on the other side"
            " as a Path pointing to a local, read-only file."
        )
        stream.local_to_remote(local_src, remote_key)
        with NamedTemporaryFile("w") as tmp:
            tmp.write(str(local_src))
            tmp.flush()
            stream.local_to_remote(  # purely debug info
                Path(tmp.name),
                f"{remote_root}/pathname-_{str(local_src).replace('/', '_')}",
            )

    logger.debug("Adding deferred upload of %s", remote_key)
    deferred_work.add(
        __name__,
        remote_key,
        partial(once.run_once, remote_key, upload),
    )
    return stream.get_downloader(remote_key)


class CoordinatingPathSerializer:
    """Allow local file Paths to be serialized as streaming objects and then
    deserialized remotely by downloading them from a stream and then
    returning a Path object pointing to the downloaded file.
    """

    def __init__(self, stream: PathStream, once: once.Once):
        self.stream = stream
        self.once = once
        self.path_addresser = _ProcessLockingPathContentAddresser(once)

    def __call__(self, maybe_path: ty.Any) -> ty.Optional[ty.Callable[[], Path]]:
        """Returns a persistent ID compatible with CallableUnpickler for any real file Path.

        The Persistent ID will actually be a thunk that is self-unpickling.
        """
        if isinstance(maybe_path, Path):
            return _serialize_file_path_as_upload(
                self.once, self.path_addresser, self.stream, maybe_path
            )
        return None
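
CoordinatingPathSerializer only needs a PathStream: something that can push a local file to a key and hand back a thunk that re-materializes it as a Path. A hedged sketch of that protocol against a plain local directory (the package's real streams presumably target blob storage instead):

    import shutil
    from pathlib import Path

    class LocalDirPathStream:
        """Illustrative PathStream: 'remote' storage is just a local directory."""

        def __init__(self, root: Path):
            self.root = root

        def local_to_remote(self, path: Path, key: str) -> None:
            dest = self.root / key  # keys like '<hash>/_bytes' become subpaths
            dest.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(path, dest)

        def get_downloader(self, key: str):
            # the returned thunk is what survives serialization and is called
            # on the receiving side to materialize a local Path
            return lambda: self.root / key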

thds/mops/pure/core/source.py
@@ -0,0 +1,291 @@
"""Bidirectional, context-sensitive translation: Source <--> (Hashref | URI).

Hashrefs - passing data blobs of many kinds into remote functions by their Hash where
possible, then using a separate lookup file per hash to tell us where the actual data is
stored.

- local file source containing a Hash - can be optimized with hashref
- remote file source containing a Hash - can be optimized with hashref
- remote file source only having URI - cannot be optimized - passed as a raw URI.

Decoupling hashref creation from potential upload is important because it lets us avoid
upload in cases where the Shim turns out to be a local machine shim.

We create hashrefs for Sources on the local machine in a shared location. Since this
data is immutable and content-addressed, there should be no serious concurrency objections
to this approach.

Then, if we cross a boundary into a Shim that will start execution on a different
machine, we serialize the local Path to content-addressed storage in the current active
storage root, and we then create a hashref in the active storage root (again, these
should be effectively immutable on the shared store even if they will most likely get
rewritten multiple times).

On the remote side, we will first check the local hashref location. It may very well not
exist at all. If it does, we should attempt to follow it, but the referent may not
exist (for whatever reason) and in all cases we are able to fall back to looking for a
remote hashref and following its reference.

We are keeping the core business logic completely separate from pickling. All
serialization methods will have to choose how to represent the information returned by
this module, but it should be able to call back into this module with that same state to
have a Source object returned to it while it performs low-level deserialization.
"""

import io
import typing as ty
from functools import partial
from pathlib import Path

from thds import humenc
from thds.core import hashing, log, source
from thds.core.files import is_file_uri, to_uri
from thds.core.source import Source
from thds.core.types import StrOrPath

from . import deferred_work
from .content_addressed import wordybin_content_addressed
from .output_naming import invocation_output_uri
from .uris import active_storage_root, lookup_blob_store

_REMOTE_HASHREF_PREFIX = "mops2-hashrefs"
_LOCAL_HASHREF_DIR = ".mops2-local-hashrefs"
logger = log.getLogger(__name__)


def _hash_to_str(hash: hashing.Hash) -> str:
    # i see no reason to not remain opinionated and "debug-friendly" with the user-visible
    # encoding of our hashes when they are being stored on a blob store/FS of some kind.
    return f"{hash.algo}-{humenc.encode(hash.bytes)}"


def _hashref_uri(hash: hashing.Hash, type: ty.Literal["local", "remote"]) -> str:
    # the .txt extensions are just for user-friendliness during debugging
    if type == "remote":
        base_uri = active_storage_root()
        return lookup_blob_store(base_uri).join(
            base_uri, _REMOTE_HASHREF_PREFIX, _hash_to_str(hash) + ".txt"
        )
    local_hashref = Path.home() / _LOCAL_HASHREF_DIR / f"{_hash_to_str(hash)}.txt"
    return to_uri(local_hashref)


def _read_hashref(hashref_uri: str) -> str:
    """Return URI represented by this hashref. Performs IO."""
    uri_bytes = io.BytesIO()
    lookup_blob_store(hashref_uri).readbytesinto(hashref_uri, uri_bytes)
    uri = uri_bytes.getvalue().decode()
    assert uri, f"Hashref from {hashref_uri} is empty"
    return uri


def _write_hashref(hashref_uri: str, uri: str) -> None:
    """Write URI to this hashref. Performs IO."""
    assert uri, f"Should never encode hashref ({hashref_uri}) pointing to empty URI"
    lookup_blob_store(hashref_uri).putbytes(hashref_uri, uri.encode(), type_hint="text/plain")


def source_from_hashref(hash: hashing.Hash) -> Source:
    """Re-create a Source from a Hash by looking up one of two Hashrefs and finding a
    valid Source for the data."""
    local_file_hashref_uri = _hashref_uri(hash, "local")
    remote_hashref_uri = _hashref_uri(hash, "remote")

    def remote_uri(allow_blob_not_found: bool = True) -> str:
        try:
            return _read_hashref(remote_hashref_uri)
        except Exception as e:
            if not allow_blob_not_found or not lookup_blob_store(
                remote_hashref_uri,
            ).is_blob_not_found(e):
                # 'remote' blob not found is sometimes fine, but anything else is weird
                # and we should raise.
                raise
            return ""

    try:
        # we might be on the same machine where this was originally invoked.
        # therefore, there may be a local path we can use directly.
        # Then, there's no need to bother grabbing the remote_uri
        # - but for debugging's sake, it's quite nice to actually
        # have the full remote URI as well even if we're ultimately going to use the local copy.
        return source.from_file(_read_hashref(local_file_hashref_uri), hash=hash, uri=remote_uri())
    except FileNotFoundError:
        # we are not on the same machine as the local ref. assume we need the remote URI.
        pass
    except Exception as e:
        if not lookup_blob_store(local_file_hashref_uri).is_blob_not_found(e):
            # 'local' blob not found is fine, but anything else is weird and we should raise.
            raise

    # no local file, so we assume there must be a remote URI.
    return source.from_uri(remote_uri(False), hash=hash)


def _upload_and_create_remote_hashref(local_path: Path, remote_uri: str, hash: hashing.Hash) -> None:
    # exists only to provide a local (non-serializable) closure around local_path and remote_uri.
    lookup_blob_store(remote_uri).putfile(local_path, remote_uri)
    # make sure we never overwrite a hashref until it's actually going to be valid.
    _write_hashref(_hashref_uri(hash, "remote"), remote_uri)


def _auto_remote_uri(hash: hashing.Hash) -> str:
    """Pick a remote URI for a file/source that has the given hash.

    The underlying implementation is shared with the content-addressing that is used
    throughout mops.
    """
    return wordybin_content_addressed(hash).bytes_uri


def prepare_source_argument(source_: Source) -> ty.Union[str, hashing.Hash]:
    """For use on the orchestrator side, during serialization of the invocation.

    You either end up with a Hashref created under the current HASHREF_ROOT, or you end up
    with just a URI, which is not amenable to hashref optimization.
    """
    if not source_.hash:
        # we cannot optimize this one for memoization - just return the URI.
        return source_.uri

    local_path = source_.cached_path
    if local_path and local_path.exists():
        # register creation of local hashref...
        deferred_work.add(
            __name__ + "-localhashref",
            source_.hash,
            partial(_write_hashref, _hashref_uri(source_.hash, "local"), str(local_path)),
        )
        # then also register pending upload - if the URI is a local file, we need to determine a
        # remote URI for this thing automagically; otherwise, use whatever was already
        # specified by the Source itself.
        remote_uri = source_.uri if not is_file_uri(source_.uri) else _auto_remote_uri(source_.hash)
        deferred_work.add(
            __name__ + "-remotehashref",
            source_.hash,
            partial(_upload_and_create_remote_hashref, local_path, remote_uri, source_.hash),
        )
    else:
        # prepare to (later, if necessary) create a remote hashref, because this Source
        # represents a non-local resource.
        deferred_work.add(
            __name__,
            source_.hash,
            partial(_write_hashref, _hashref_uri(source_.hash, "remote"), source_.uri),
        )

    return source_.hash


def perform_source_uploads() -> None:  # has been replaced by a general work-deferring mechanism.
    deferred_work.perform_all()


# RETURNING FROM REMOTE
#
# when returning a Source from a remote, we cannot avoid the upload. this is because the
# uploaded data is part of the memoized result, and memoization by definition is available
# to all callers, even those on other machines/environments.
#
# A good example of where this is necessary is memoizing Person API test data in CI. the
# code runs locally, but the goal is to create an output file that can be reused next time
# it runs (locally or in CI). And for that to be possible, the output _must_ be uploaded.
#
# This does not mean that the Source itself must be uploaded immediately upon creation;
# just that mops must detect Sources in the return value and must force an upload on them.
# In essence, this creates a bifurcated code path for Sources during serialization; if
# we're "on the way out", we avoid uploading until it is clear that the data will be used
# in a remote environment. Whereas "on the way back", we must always upload, and nothing
# can or should be deferred; upload should happen at the time of serialization.
#
# Nevertheless, a local caller should still be able to short-circuit the _download_ by
# using a locally-created File, if on the same machine where the local file was created.


class SourceResult(ty.NamedTuple):
    """Contains the fully-specified local URI and remote URI, plus (probably) a Hash.

    Everything is defined right here. No need for any kind of dynamic lookup, and
    optimization buys us nothing, since memoization only operates on arguments.
    """

    remote_uri: str
    hash: ty.Optional[hashing.Hash]
    file_uri: str


def prepare_source_result(source_: Source) -> SourceResult:
    """Call from within the remote side of an invocation, while serializing the function return value.

    Forces the Source to be present at a remote URI which will be available once
    returned to the orchestrator.

    The full output URI is auto-generated if one is not already provided, because we're
    guaranteed to be in a remote context, which provides an invocation output root URI
    where we can safely place any named output.
    """
    if not is_file_uri(source_.uri):
        if source_.cached_path and Path(source_.cached_path).exists():
            # it exists locally - an upload may be necessary.
            file_uri = to_uri(source_.cached_path)
            lookup_blob_store(source_.uri).putfile(source_.cached_path, source_.uri)
            logger.info("Uploading Source to %s", source_.uri)
        else:
            file_uri = ""
            logger.debug("Creating a SourceResult for a URI that is presumed to already be uploaded.")
        return SourceResult(source_.uri, source_.hash, file_uri)

    # by definition, if this is a file URI, it now needs to be uploaded, because we could
    # be transferring back to an orchestrator on a different machine, but also because a
    # future caller on a different machine could try to use this memoized result.
    local_path = source.path_from_uri(source_.uri)
    assert local_path.exists(), f"{local_path} does not exist"
    logger.debug("Automatically selecting a remote URI for a Source being returned.")
    remote_uri = invocation_output_uri(name=local_path.name)
    # the line above is a bit of opinionated magic. it uses the 'end' of the filename
    # to automagically assign a meaningful name to the output remote URI.
    #
    # If users do not like this automatically assigned remote URI name, they can construct
    # the Source themselves and provide a remote URI (as well as, optionally, a
    # local_path), and we will use their remote URI.
    lookup_blob_store(remote_uri).putfile(local_path, remote_uri)
    # upload must _always_ happen on remotely-returned Sources, as detailed above.
    # There is no advantage to waiting to upload past this point.
    return SourceResult(remote_uri, source_.hash, source_.uri)


def source_from_source_result(remote_uri: str, hash: ty.Optional[hashing.Hash], file_uri: str) -> Source:
    """Call when deserializing a remote function return value on the orchestrator side, to
    replace all SourceResults with the intended Source object.
    """
    if not file_uri:
        return source.from_uri(remote_uri, hash=hash)

    local_path = source.path_from_uri(file_uri)
    if local_path.exists():
        try:
            # since there's a remote URI, it's possible a specific consumer might want to
            # get access to that directly, even though the default data access would still
            # be to use the local file.
            return source.from_file(local_path, hash=hash, uri=remote_uri)
        except Exception as e:
            logger.warning(
                f"Unable to reuse destination local path {local_path} when constructing Source {remote_uri}: {e}"
            )
    return source.from_uri(remote_uri, hash=hash)


def create_source_at_uri(filename: StrOrPath, destination_uri: str) -> Source:
    """Public API for creating a Source with a manually-specified remote URI
    within a remote function invocation. Not generally recommended.

    Use this if you want to provide a specific URI destination for a file that exists
    locally, rather than using the automagic naming behavior provided by creating a Source
    with `from_file`, which is standard.

    _Only_ use this if you are willing to immediately upload your data.
    """
    source_ = source.from_file(filename, uri=destination_uri)
    lookup_blob_store(destination_uri).putfile(Path(filename), destination_uri)
    return source_
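
The hashref scheme above boils down to a tiny text file, named for a content hash, whose body is the URI where the data actually lives. An illustrative sketch, not package code, using the local hashref directory named above and an invented digest and URI:

    from pathlib import Path

    # local hashrefs live under ~/.mops2-local-hashrefs, per _LOCAL_HASHREF_DIR above;
    # the digest and URI here are made up for illustration.
    hashref = Path.home() / ".mops2-local-hashrefs" / "sha256-SomeHumanEncodedDigest.txt"
    hashref.parent.mkdir(exist_ok=True)

    # what _write_hashref stores: the file body is nothing but the data's URI
    hashref.write_text("file:///home/me/data/my-blob")

    # what _read_hashref recovers: follow the hashref back to the data's URI
    assert hashref.read_text() == "file:///home/me/data/my-blob"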

thds/mops/pure/core/types.py
@@ -0,0 +1,142 @@
"""Core abstractions for the remote runner system."""

import typing as ty
from pathlib import Path

from typing_extensions import Protocol

from thds.core import config

T = ty.TypeVar("T")
F = ty.TypeVar("F", bound=ty.Callable)

Deserializer = ty.Callable[[], T]
Serializer = ty.Callable[[T], Deserializer]
SerializerHandler = ty.Callable[[T], ty.Union[None, Deserializer]]
# returns None if the object should be serialized normally.
# Otherwise returns a Deserializing Callable that will itself return the deserialized object when called.


class Runner(Protocol):
    """A Runner copies a function, its arguments, and discoverable
    context to a location that can be picked up from a future remote
    process, executes that process remotely, and later pulls the
    result of that remote process back to the local caller process.

    It is essentially the same abstraction as
    `concurrent.futures.Executor.submit`, or
    `multiprocessing.Pool.apply`.

    `use_runner` uses this abstraction to provide a way of wrapping a
    function and calling it elsewhere.
    """

    def __call__(
        self,
        __f: ty.Callable[..., T],
        __args: ty.Sequence,
        __kwargs: ty.Mapping[str, ty.Any],
    ) -> T:
        ...  # pragma: no cover


class NoResultAfterInvocationError(Exception):  # TODO remove in v4.
    """Runners should raise this if the remotely-invoked function does not provide any result."""


class NoResultAfterShimSuccess(NoResultAfterInvocationError):
    """Raised if the shim returns with no error, but no result is found in the blob store.

    A better name for NoResultAfterInvocationError.
    """


class NotARunnerContext(Exception):
    """Mops may raise this if some code intended to be run under a
    Runner context is invoked outside that context.
    """


AnyStrSrc = ty.Union[ty.AnyStr, ty.Iterable[ty.AnyStr], ty.IO[ty.AnyStr], Path]


DISABLE_CONTROL_CACHE = config.item(
    "thds.mops.pure.disable_control_cache", default=False, parse=config.tobool
)
# set the above to True in order to specifically opt out of read-path caching of
# mops-created files. This can apply to a local (stack) context, or can
# apply globally to the process. The former may be used selectively within mops
# for issues of known correctness, e.g. locks, whereas the latter will be useful
# for debugging any cases where files have been remotely deleted.


class BlobStore(Protocol):
    """A minimal interface that can be supported by almost any type of key-value store
    that has some basic concept of hierarchical pathing (as implemented by join and
    split).

    getfile and putfile are pathways intended for large files that are passed as arguments
    to, or returned as results from, mops-wrapped functions. Implementations may wish to
    make sure they can perform streaming reads and writes. However, mops does not
    generate such files itself, so your use case may not benefit from supporting
    memory-efficient reads and writes if your application does not deal with large files
    via pathlib.Path or thds.core.Source objects.

    In the methods below, `type_hint` is a parameter that must be _accepted_ as a keyword
    argument by the implementation, but is intended for use mainly as a hint to loggers
    and other debugging setups. It does not need to affect the implementation in any way.
    """

    def control_root(self, __remote_uri: str) -> str:
        """Return the mops-specific root of the blob store for this URI.

        Essentially, define a place for mops to store its control files under its own internal prefix.
        """

    def readbytesinto(
        self, __remote_uri: str, __stream_or_file: ty.IO[bytes], *, type_hint: str = "bytes"
    ) -> None:
        """Allows reading into any stream, including a stream-to-disk.

        May optimize reads by returning a cached version of the file if it has been seen before.
        """

    def getfile(self, __remote_uri: str) -> Path:
        """Read a remote uri directly into a Path controlled by the implementation.
        Optimizations involving caches for remotes may be applied.
        The returned file is by definition read-only.
        """

    def putbytes(self, __remote_uri: str, __data: AnyStrSrc, *, type_hint: str = "bytes") -> None:
        """Upload bytes from any stream."""

    def putfile(self, __path: Path, __remote_uri: str) -> None:
        """Upload a file that exists on the local
        filesystem. Optimizations including softlinking into caches may be
        applied.
        """

    def exists(self, __remote_uri: str) -> bool:
        """Check if a file exists. May optimize by assuming that files previously seen
        have not been deleted - since this is intended only for mops control files,
        and mops never deletes any control files.
        """

    def join(self, *parts: str) -> str:
        """Join multiple parts of a URI into one. In actual use, the first part will always
        be a storage root, e.g.:

        join('adls://foo/bar', 'baz', 'beans') -> 'adls://foo/bar/baz/beans'
        """

    def split(self, uri: str) -> ty.List[str]:
        """Must return the storage root as a single string,
        followed by the path component split along the same lines that join would concatenate.
        """

    def is_blob_not_found(self, __exc: Exception) -> bool:
        ...


Args = ty.Sequence
Kwargs = ty.Mapping[str, ty.Any]