thds.mops-3.6.20250219172032-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.mops might be problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/core/script_support.py
@@ -0,0 +1,25 @@
+ """Support for transferring execution of a function
+ when the function is defined inside the __main__ module.
+
+ Only works if you 'transfer execution' to the same process.
+ """
+ import typing as ty
+
+ _LOCAL_MAIN_FUNCTIONS: ty.Dict[str, ty.Callable] = dict()
+
+
+ def add_main_module_function(function_name: str, function: ty.Callable) -> None:
+     """This only works if you end up running remotely in the same process."""
+     _LOCAL_MAIN_FUNCTIONS[function_name] = function
+
+
+ def get_main_module_function(fname: str) -> ty.Callable:
+     """This only works if you end up running 'remotely' in the same process."""
+     try:
+         return _LOCAL_MAIN_FUNCTIONS[fname]
+     except KeyError:
+         raise ValueError(
+             f"Serialized function {fname} that was in the __main__ module"
+             " and attempted to transfer control to a different process."
+             " Please move your function to a module that is not __main__."
+         ) from KeyError
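
For context: a minimal same-process round trip through this registry might look like the sketch below. The `step` function is hypothetical; only `add_main_module_function` and `get_main_module_function` come from the file above.

def step(x: int) -> int:  # imagine this is defined in __main__
    return x + 1

add_main_module_function("step", step)
# ...later, when a shim 'transfers execution' within this same process:
assert get_main_module_function("step") is step
# In a genuinely separate process the registry starts out empty, so the
# lookup would raise the ValueError defined above.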
thds/mops/pure/core/serialize_big_objs.py
@@ -0,0 +1,73 @@
+ """Bring your own serialization."""
+
+ import typing as ty
+ from weakref import WeakValueDictionary
+
+ from thds.core.log import getLogger
+
+ from ..._utils.once import Once
+ from .types import Deserializer, Serializer, T
+
+ V = ty.TypeVar("V")
+
+ logger = getLogger(__name__)
+
+
+ class NeedsToBeWeakReferenceable(TypeError):
+     pass
+
+
+ class ByIdRegistry(ty.Generic[T, V]):
+     """When you want to use something as the key for a runtime-only
+     dictionary, but the thing doesn't support being hashed.
+     """
+
+     def __init__(self) -> None:
+         self._objects: ty.Dict[int, T] = WeakValueDictionary()  # type: ignore
+         self._values: ty.Dict[int, V] = dict()
+
+     def __setitem__(self, obj: T, value: V) -> None:
+         try:
+             self._objects[id(obj)] = obj
+             self._values[id(obj)] = value
+         except TypeError as te:
+             raise NeedsToBeWeakReferenceable(f"{obj} needs to be weak-referenceable") from te
+
+     def __contains__(self, obj: T) -> bool:
+         return id(obj) in self._objects and self._objects[id(obj)] is obj
+
+     def __getitem__(self, obj: T) -> V:
+         if obj not in self:
+             raise KeyError(str(obj))
+         return self._values[id(obj)]
+
+
+ class ByIdSerializer:
+     """Proxies id()-based memoizing serialization for large in-memory objects.
+
+     For use with something like CallablePickler, which will allow this
+     object to recognize registered objects and provide their
+     serialization.
+
+     Thread-safe at the time of (deferred) serialization, but all calls
+     to `register` should be done prior to beginning concurrent serialization.
+
+     The Deserializer returned by the Serializer should ideally not
+     occupy much memory, as it will be cached.
+     """
+
+     def __init__(self, registry: ByIdRegistry[ty.Any, Serializer]) -> None:
+         self._registry = registry
+         self._desers: ty.Dict[int, Deserializer] = dict()
+         self._once = Once()
+
+     def __call__(self, obj: ty.Any) -> ty.Union[None, Deserializer]:
+         if obj in self._registry:
+
+             def serialize_and_cache() -> None:
+                 logger.info(f"Serializing object {type(obj)} {id(obj)}")
+                 self._desers[id(obj)] = self._registry[obj](obj)
+
+             self._once.run_once(id(obj), serialize_and_cache)
+             return self._desers[id(obj)]
+         return None
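
To make the id()-keyed behavior concrete, here is a hypothetical usage sketch. `LargeModel` is an assumed stand-in; ordinary class instances are weak-referenceable, which the registry requires, while built-in lists are not.

class LargeModel:
    """Hypothetical stand-in for a big in-memory object."""
    __hash__ = None  # deliberately unhashable, like many such objects

model = LargeModel()
registry: ByIdRegistry = ByIdRegistry()
registry[model] = "serializer-for-this-object"
assert model in registry
assert registry[model] == "serializer-for-this-object"
# registry[[1, 2, 3]] = "nope"  # would raise NeedsToBeWeakReferenceable

Because the registry holds only weak references to its keys, entries vanish once the keyed object is garbage-collected, so the registry never prolongs the life of the large objects it indexes.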
thds/mops/pure/core/serialize_paths.py
@@ -0,0 +1,149 @@
+ import hashlib
+ import typing as ty
+ from functools import partial
+ from pathlib import Path
+ from tempfile import NamedTemporaryFile
+
+ from thds import humenc
+ from thds.core.hash_cache import hash_file
+ from thds.core.log import getLogger
+
+ from ..._utils import once
+ from . import deferred_work
+
+ Downloader = ty.Callable[[], Path]
+ logger = getLogger(__name__)
+ _1_MB = 2**20
+
+
+ def human_sha256b64_file_at_paths(path: Path) -> str:
+     """Return a human-readable hash of the file at the given path."""
+     assert path.exists(), path
+     return humenc.encode(hash_file(path, hashlib.sha256()))
+
+
+ class _ProcessLockingPathContentAddresser:
+     """Hashes the data at a path, but only once per unique resolved
+     Path seen, because hashing a large file is expensive and such
+     Paths are often shared across many invocations.
+
+     In general, you will want only one instance of this per
+     application/process, to take advantage of the caching behavior.
+
+     This does imply that each use of a Path is, as documented in the
+     README, a reference to an immutable, write-at-most-once file, at
+     least during the lifetime of the process hosting this
+     object. Passing the same Path multiple times with different
+     contents, and expecting it to get hashed and uploaded each time,
+     will not work.
+     """
+
+     def __init__(self, once: once.Once):
+         self.once = once
+         self.paths_to_keys: ty.Dict[str, str] = dict()
+
+     def __call__(self, path: Path) -> str:
+         """Return a remote key (sha256 hash in human-base64) for a path."""
+         resolved = str(path.resolve())
+         # we now put all paths at the hash of their own contents which
+         # allows us to avoid uploading duplicated data even from two
+         # different file paths that happen to share the same contents.
+         #
+         # This _also_ allows us to be more confident that memoization
+         # bugs arising from reuse of Paths pointing to different
+         # underlying file contents across separate process lifetimes
+         # cannot happen - a given Path will be represented inside the
+         # pickle by something that represents the (immutable) file
+         # contents itself, rather than by a mutable reference (the
+         # path).
+
+         def _hash_and_remember_path() -> None:
+             self.paths_to_keys[resolved] = human_sha256b64_file_at_paths(path)
+
+         self.once.run_once(resolved, _hash_and_remember_path)
+         return self.paths_to_keys[resolved]
+
+
+ class PathStream(ty.Protocol):
+     def local_to_remote(self, __path: Path, __key: str) -> None:
+         ...  # pragma: no cover
+
+     def get_downloader(self, __key: str) -> Downloader:
+         ...  # pragma: no cover
+
+
+ class NotAFileError(ValueError):
+     """We err on the side of caution in Mops 2.0 by never allowing
+     Paths that are not actual files to be serialized on either side.
+
+     This error is not intended to be caught; it is intended to inform
+     the developer that they have made a coding mistake by passing an
+     incorrect Path to a function that is supposed to be transferring
+     execution via a Runner.
+
+     In the future we might add support for directories if it is desired.
+     """
+
+
+ def _serialize_file_path_as_upload(
+     once: once.Once, path_keyer: _ProcessLockingPathContentAddresser, stream: PathStream, local_src: Path
+ ) -> ty.Optional[Downloader]:
+     if not local_src.exists():
+         raise NotAFileError(f"You asked mops to upload the path {local_src}, but it does not exist.")
+     if not local_src.is_file():
+         raise NotAFileError(f"You asked mops to upload the Path {local_src}, but it is not a file.")
+
+     remote_root = path_keyer(local_src)
+     # I am creating a root 'directory' so that we can put debug info
+     # side-by-side with the actual bytes, without interfering in any
+     # way with the determinism of the hashed bytes themselves.
+     remote_key = remote_root + "/_bytes"
+
+     def upload() -> None:
+         size = local_src.stat().st_size
+         formatted_size = f"{size / _1_MB:,.2f} MB"
+         log = logger.info if size > 10 * _1_MB else logger.debug
+         log(
+             f"Uploading Path {local_src} of size {formatted_size} to {remote_key} - "
+             "its contents will get 'unpickled' on the other side"
+             " as a Path pointing to a local, read-only file."
+         )
+         stream.local_to_remote(local_src, remote_key)
+         with NamedTemporaryFile("w") as tmp:
+             tmp.write(str(local_src))
+             tmp.flush()
+             stream.local_to_remote(  # purely debug info
+                 Path(tmp.name),
+                 f"{remote_root}/pathname-_{str(local_src).replace('/', '_')}",
+             )
+
+     logger.debug("Adding deferred upload of %s", remote_key)
+     deferred_work.add(
+         __name__,
+         remote_key,
+         partial(once.run_once, remote_key, upload),
+     )
+     return stream.get_downloader(remote_key)
+
+
+ class CoordinatingPathSerializer:
+     """Allow local file Paths to be serialized as streaming objects and then
+     deserialized remotely by downloading them from a stream and then
+     returning a Path object pointing to the downloaded file.
+     """
+
+     def __init__(self, stream: PathStream, once: once.Once):
+         self.stream = stream
+         self.once = once
+         self.path_addresser = _ProcessLockingPathContentAddresser(once)
+
+     def __call__(self, maybe_path: ty.Any) -> ty.Optional[ty.Callable[[], Path]]:
+         """Returns a persistent ID compatible with CallableUnpickler for any real file Path.
+
+         The Persistent ID will actually be a thunk that is self-unpickling.
+         """
+         if isinstance(maybe_path, Path):
+             return _serialize_file_path_as_upload(
+                 self.once, self.path_addresser, self.stream, maybe_path
+             )
+         return None
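
For illustration, the PathStream protocol above could be satisfied by a plain local-directory copy. This sketch is an assumption for demonstration and is not the blob-store-backed implementation shipped in this wheel; `Downloader` is the alias defined at the top of the file.

import shutil
from pathlib import Path

class LocalDirPathStream:
    """Hypothetical PathStream: 'remote' keys are paths under a root dir."""

    def __init__(self, root: Path):
        self.root = root

    def local_to_remote(self, path: Path, key: str) -> None:
        dest = self.root / key
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, dest)

    def get_downloader(self, key: str) -> Downloader:
        dest = self.root / key
        return lambda: dest  # 'downloading' from a local root is a no-op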
thds/mops/pure/core/source.py
@@ -0,0 +1,291 @@
+ """Bidirectional, context-sensitive translation: Source <--> (Hashref | URI).
+
+ Hashrefs - passing data blobs of many kinds into remote functions by their Hash where
+ possible, then using a separate lookup file per hash to tell us where the actual data is
+ stored.
+
+ - local file source containing a Hash - can be optimized with hashref
+ - remote file source containing a Hash - can be optimized with hashref
+ - remote file source only having URI - cannot be optimized - passed as a raw URI.
+
+ Decoupling hashref creation from potential upload is important because it lets us avoid
+ upload in cases where the Shim turns out to be a local machine shim.
+
+ We create hashrefs for Sources on the local machine in a shared location. Since this
+ data is immutable and content-addressed, there should be no serious concurrency objections
+ to this approach.
+
+ Then, if we cross a boundary into a Shim that will start execution on a different
+ machine, we serialize the local Path to content-addressed storage in the current active
+ storage root, and we then create a hashref in the active storage root (again, these
+ should be effectively immutable on the shared store even if they will most likely get
+ rewritten multiple times).
+
+ On the remote side, we will first check the local hashref location. It may very well not
+ exist at all. If it does, we should attempt to follow it, but the referent may not
+ exist (for whatever reason) and in all cases we are able to fall back to looking for a
+ remote hashref and following its reference.
+
+ We are keeping the core business logic completely separate from pickling. All
+ serialization methods will have to choose how to represent the information returned by
+ this module, but each should be able to call back into this module with that same state to
+ have a Source object returned to it while it performs low-level deserialization.
+ """
+
+ import io
+ import typing as ty
+ from functools import partial
+ from pathlib import Path
+
+ from thds import humenc
+ from thds.core import hashing, log, source
+ from thds.core.files import is_file_uri, to_uri
+ from thds.core.source import Source
+ from thds.core.types import StrOrPath
+
+ from . import deferred_work
+ from .content_addressed import wordybin_content_addressed
+ from .output_naming import invocation_output_uri
+ from .uris import active_storage_root, lookup_blob_store
+
+ _REMOTE_HASHREF_PREFIX = "mops2-hashrefs"
+ _LOCAL_HASHREF_DIR = ".mops2-local-hashrefs"
+ logger = log.getLogger(__name__)
+
+
+ def _hash_to_str(hash: hashing.Hash) -> str:
+     # i see no reason to not remain opinionated and "debug-friendly" with the user-visible
+     # encoding of our hashes when they are being stored on a blob store/FS of some kind.
+     return f"{hash.algo}-{humenc.encode(hash.bytes)}"
+
+
+ def _hashref_uri(hash: hashing.Hash, type: ty.Literal["local", "remote"]) -> str:
+     # the .txt extensions are just for user-friendliness during debugging
+     if type == "remote":
+         base_uri = active_storage_root()
+         return lookup_blob_store(base_uri).join(
+             base_uri, _REMOTE_HASHREF_PREFIX, _hash_to_str(hash) + ".txt"
+         )
+     local_hashref = Path.home() / _LOCAL_HASHREF_DIR / f"{_hash_to_str(hash)}.txt"
+     return to_uri(local_hashref)
+
+
+ def _read_hashref(hashref_uri: str) -> str:
+     """Return URI represented by this hashref. Performs IO."""
+     uri_bytes = io.BytesIO()
+     lookup_blob_store(hashref_uri).readbytesinto(hashref_uri, uri_bytes)
+     uri = uri_bytes.getvalue().decode()
+     assert uri, f"Hashref from {hashref_uri} is empty"
+     return uri
+
+
+ def _write_hashref(hashref_uri: str, uri: str) -> None:
+     """Write URI to this hashref. Performs IO."""
+     assert uri, f"Should never encode hashref ({hashref_uri}) pointing to empty URI"
+     lookup_blob_store(hashref_uri).putbytes(hashref_uri, uri.encode(), type_hint="text/plain")
+
+
+ def source_from_hashref(hash: hashing.Hash) -> Source:
+     """Re-create a Source from a Hash by looking up one of two Hashrefs and finding a
+     valid Source for the data."""
+     local_file_hashref_uri = _hashref_uri(hash, "local")
+     remote_hashref_uri = _hashref_uri(hash, "remote")
+
+     def remote_uri(allow_blob_not_found: bool = True) -> str:
+         try:
+             return _read_hashref(remote_hashref_uri)
+         except Exception as e:
+             if not allow_blob_not_found or not lookup_blob_store(
+                 remote_hashref_uri,
+             ).is_blob_not_found(e):
+                 # 'remote' blob not found is sometimes fine, but anything else is weird
+                 # and we should raise.
+                 raise
+             return ""
+
+     try:
+         # we might be on the same machine where this was originally invoked.
+         # therefore, there may be a local path we can use directly.
+         # Then, there's no need to bother grabbing the remote_uri
+         # - but for debugging's sake, it's quite nice to actually
+         # have the full remote URI as well even if we're ultimately going to use the local copy.
+         return source.from_file(_read_hashref(local_file_hashref_uri), hash=hash, uri=remote_uri())
+     except FileNotFoundError:
+         # we are not on the same machine as the local ref. assume we need the remote URI.
+         pass
+     except Exception as e:
+         if not lookup_blob_store(local_file_hashref_uri).is_blob_not_found(e):
+             # 'local' blob not found is fine, but anything else is weird and we should raise.
+             raise
+
+     # no local file, so we assume there must be a remote URI.
+     return source.from_uri(remote_uri(False), hash=hash)
+
+
+ def _upload_and_create_remote_hashref(local_path: Path, remote_uri: str, hash: hashing.Hash) -> None:
+     # exists only to provide a local (non-serializable) closure around local_path and remote_uri.
+     lookup_blob_store(remote_uri).putfile(local_path, remote_uri)
+     # make sure we never overwrite a hashref until it's actually going to be valid.
+     _write_hashref(_hashref_uri(hash, "remote"), remote_uri)
+
+
+ def _auto_remote_uri(hash: hashing.Hash) -> str:
+     """Pick a remote URI for a file/source that has the given hash.
+
+     The underlying implementation is shared with the content-addressing that is used
+     throughout mops.
+     """
+     return wordybin_content_addressed(hash).bytes_uri
+
+
+ def prepare_source_argument(source_: Source) -> ty.Union[str, hashing.Hash]:
+     """For use on the orchestrator side, during serialization of the invocation.
+
+     You either end up with a Hashref created under the current HASHREF_ROOT, or you end up
+     with just a URI, which is not amenable to hashref optimization.
+     """
+     if not source_.hash:
+         # we cannot optimize this one for memoization - just return the URI.
+         return source_.uri
+
+     local_path = source_.cached_path
+     if local_path and local_path.exists():
+         # register creation of local hashref...
+         deferred_work.add(
+             __name__ + "-localhashref",
+             source_.hash,
+             partial(_write_hashref, _hashref_uri(source_.hash, "local"), str(local_path)),
+         )
+         # then also register pending upload - if the URI is a local file, we need to determine a
+         # remote URI for this thing automagically; otherwise, use whatever was already
+         # specified by the Source itself.
+         remote_uri = source_.uri if not is_file_uri(source_.uri) else _auto_remote_uri(source_.hash)
+         deferred_work.add(
+             __name__ + "-remotehashref",
+             source_.hash,
+             partial(_upload_and_create_remote_hashref, local_path, remote_uri, source_.hash),
+         )
+     else:
+         # prepare to (later, if necessary) create a remote hashref, because this Source
+         # represents a non-local resource.
+         deferred_work.add(
+             __name__,
+             source_.hash,
+             partial(_write_hashref, _hashref_uri(source_.hash, "remote"), source_.uri),
+         )
+
+     return source_.hash
+
+
+ def perform_source_uploads() -> None:  # has been replaced by a general work-deferring mechanism.
+     deferred_work.perform_all()
+
+
+ # RETURNING FROM REMOTE
+ #
+ # when returning a Source from a remote, we cannot avoid the upload. this is because the
+ # uploaded data is part of the memoized result, and memoization by definition is available
+ # to all callers, even those on other machines/environments.
+ #
+ # A good example of where this is necessary is memoizing Person API test data in CI. the
+ # code runs locally, but the goal is to create an output file that can be reused next time
+ # it runs (locally or in CI). And for that to be possible, the output _must_ be uploaded.
+ #
+ # This does not mean that the Source itself must be uploaded immediately upon creation;
+ # just that mops must detect Sources in the return value and must force an upload on them.
+ # In essence, this creates a bifurcated code path for Sources during serialization; if
+ # we're "on the way out", we avoid uploading until it is clear that the data will be used
+ # in a remote environment. Whereas "on the way back", we must always upload, and nothing
+ # can or should be deferred; upload should happen at the time of serialization.
+ #
+ # Nevertheless, a local caller should still be able to short-circuit the _download_ by
+ # using a locally-created File, if on the same machine where the local file was created.
+
+
+ class SourceResult(ty.NamedTuple):
+     """Contains the fully-specified local URI and remote URI, plus (probably) a Hash.
+
+     Everything is defined right here. No need for any kind of dynamic lookup, and
+     optimization buys us nothing, since memoization only operates on arguments.
+     """
+
+     remote_uri: str
+     hash: ty.Optional[hashing.Hash]
+     file_uri: str
+
+
+ def prepare_source_result(source_: Source) -> SourceResult:
+     """Call from within the remote side of an invocation, while serializing the function return value.
+
+     Forces the Source to be present at a remote URI which will be available once
+     returned to the orchestrator.
+
+     The full output URI is auto-generated if one is not already provided, because we're
+     guaranteed to be in a remote context, which provides an invocation output root URI
+     where we can safely place any named output.
+     """
+     if not is_file_uri(source_.uri):
+         if source_.cached_path and Path(source_.cached_path).exists():
+             # it exists locally - an upload may be necessary.
+             file_uri = to_uri(source_.cached_path)
+             lookup_blob_store(source_.uri).putfile(source_.cached_path, source_.uri)
+             logger.info("Uploading Source to %s", source_.uri)
+         else:
+             file_uri = ""
+             logger.debug("Creating a SourceResult for a URI that is presumed to already be uploaded.")
+         return SourceResult(source_.uri, source_.hash, file_uri)
+
+     # by definition, if this is a file URI, it now needs to be uploaded, because we could
+     # be transferring back to an orchestrator on a different machine, but also because a
+     # future caller on a different machine could try to use this memoized result.
+     local_path = source.path_from_uri(source_.uri)
+     assert local_path.exists(), f"{local_path} does not exist"
+     logger.debug("Automatically selecting a remote URI for a Source being returned.")
+     remote_uri = invocation_output_uri(name=local_path.name)
+     # the line above is a bit of opinionated magic. it uses the 'end' of the filename
+     # to automagically assign a meaningful name to the output remote URI.
+     #
+     # If users do not like this automatically assigned remote URI name, they can construct
+     # the Source themselves and provide a remote URI (as well as, optionally, a
+     # local_path), and we will use their remote URI.
+     lookup_blob_store(remote_uri).putfile(local_path, remote_uri)
+     # upload must _always_ happen on remotely-returned Sources, as detailed above.
+     # There is no advantage to waiting to upload past this point.
+     return SourceResult(remote_uri, source_.hash, source_.uri)
+
+
+ def source_from_source_result(remote_uri: str, hash: ty.Optional[hashing.Hash], file_uri: str) -> Source:
+     """Call when deserializing a remote function return value on the orchestrator side, to
+     replace all SourceResults with the intended Source object.
+     """
+     if not file_uri:
+         return source.from_uri(remote_uri, hash=hash)
+
+     local_path = source.path_from_uri(file_uri)
+     if local_path.exists():
+         try:
+             # since there's a remote URI, it's possible a specific consumer might want to
+             # get access to that directly, even though the default data access would still
+             # be to use the local file.
+             return source.from_file(local_path, hash=hash, uri=remote_uri)
+         except Exception as e:
+             logger.warning(
+                 f"Unable to reuse destination local path {local_path} when constructing Source {remote_uri}: {e}"
+             )
+     return source.from_uri(remote_uri, hash=hash)
+
+
+ def create_source_at_uri(filename: StrOrPath, destination_uri: str) -> Source:
+     """Public API for creating a Source with a manually-specified remote URI
+     within a remote function invocation. Not generally recommended.
+
+     Use this if you want to provide a specific URI destination for a file that exists
+     locally, rather than using the automagic naming behavior provided by creating a Source
+     with `from_file`, which is standard.
+
+     _Only_ use this if you are willing to immediately upload your data.
+     """
+     source_ = source.from_file(filename, uri=destination_uri)
+     lookup_blob_store(destination_uri).putfile(Path(filename), destination_uri)
+     return source_
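
In short, a hashref is a tiny text file whose name is derived from a content hash and whose body is the URI where the data lives. A rough sketch of the round trip follows; the Hash constructor arguments are assumed from the `hash.algo`/`hash.bytes` usage above, and all URIs and layouts shown in comments are illustrative.

from thds.core import hashing

h = hashing.Hash("sha256", b"\x00" * 32)  # assumed constructor; fake digest
ref_uri = _hashref_uri(h, "remote")
# -> "<active storage root>/mops2-hashrefs/sha256-<humenc digest>.txt" (assumed layout)

_write_hashref(ref_uri, "adls://container/data/blob")  # hashref -> data URI
assert _read_hashref(ref_uri) == "adls://container/data/blob"

# On the consuming side, source_from_hashref(h) tries the local hashref
# first and falls back to this remote one, as described above.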
thds/mops/pure/core/types.py
@@ -0,0 +1,142 @@
+ """Core abstractions for the remote runner system."""
+
+ import typing as ty
+ from pathlib import Path
+
+ from typing_extensions import Protocol
+
+ from thds.core import config
+
+ T = ty.TypeVar("T")
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+ Deserializer = ty.Callable[[], T]
+ Serializer = ty.Callable[[T], Deserializer]
+ SerializerHandler = ty.Callable[[T], ty.Union[None, Deserializer]]
+ # returns None if the object should be serialized normally.
+ # Otherwise returns a Deserializing Callable that will itself return the deserialized object when called.
+
+
+ class Runner(Protocol):
+     """A Runner copies a function, its arguments, and discoverable
+     context to a location that can be picked up from a future remote
+     process, executes that process remotely, and later pulls the
+     result of that remote process back to the local caller process.
+
+     It is essentially the same abstraction as
+     `concurrent.futures.Executor.submit`, or
+     `multiprocessing.Pool.apply`.
+
+     `use_runner` uses this abstraction to provide a way of wrapping a
+     function and calling it elsewhere.
+     """
+
+     def __call__(
+         self,
+         __f: ty.Callable[..., T],
+         __args: ty.Sequence,
+         __kwargs: ty.Mapping[str, ty.Any],
+     ) -> T:
+         ...  # pragma: no cover
+
+
+ class NoResultAfterInvocationError(Exception):  # TODO remove in v4.
+     """Runners should raise this if the remotely-invoked function does not provide any result."""
+
+
+ class NoResultAfterShimSuccess(NoResultAfterInvocationError):
+     """Raised if the shim returns with no error, but no result is found in the blob store.
+
+     A better name for NoResultAfterInvocationError.
+     """
+
+
+ class NotARunnerContext(Exception):
+     """Mops may raise this if some code intended to be run under a
+     Runner context is invoked outside that context.
+     """
+
+
+ AnyStrSrc = ty.Union[ty.AnyStr, ty.Iterable[ty.AnyStr], ty.IO[ty.AnyStr], Path]
+
+
+ DISABLE_CONTROL_CACHE = config.item(
+     "thds.mops.pure.disable_control_cache", default=False, parse=config.tobool
+ )
+ # set the above to True in order to specifically opt out of read-path caching of
+ # mops-created files. This can apply to a local (stack) context, or can
+ # apply globally to the process. The former may be used selectively within mops
+ # for issues of known correctness, e.g. locks, whereas the latter will be useful
+ # for debugging any cases where files have been remotely deleted.
+
+
+ class BlobStore(Protocol):
+     """A minimal interface that can be supported by almost any type of key-value store
+     that has some basic concept of hierarchical pathing (as implemented by join and
+     split).
+
+     getfile and putfile are pathways intended for large files that are passed as arguments
+     to, or returned as results from, mops-wrapped functions. Implementations may wish to
+     make sure they can perform streaming reads and writes. However, mops itself does not
+     generate such files, so your use case may not benefit from supporting
+     memory-efficient reads and writes if your application does not deal with large files
+     via pathlib.Path or thds.core.Source objects.
+
+     In the methods below, `type_hint` is a parameter that must be _accepted_ as a keyword
+     argument by the implementation, but is intended for use mainly as a hint to loggers
+     and other debugging setups. It does not need to affect the implementation in any way.
+     """
+
+     def control_root(self, __remote_uri: str) -> str:
+         """Return the mops-specific root of the blob store for this URI.
+
+         Essentially, define a place for mops to store its control files under its own internal prefix.
+         """
+
+     def readbytesinto(
+         self, __remote_uri: str, __stream_or_file: ty.IO[bytes], *, type_hint: str = "bytes"
+     ) -> None:
+         """Allows reading into any stream, including a stream-to-disk.
+
+         May optimize reads by returning a cached version of the file if it has been seen before.
+         """
+
+     def getfile(self, __remote_uri: str) -> Path:
+         """Read a remote URI directly into a Path controlled by the implementation.
+         Optimizations involving caches for remotes may be applied.
+         The returned file is by definition read-only.
+         """
+
+     def putbytes(self, __remote_uri: str, __data: AnyStrSrc, *, type_hint: str = "bytes") -> None:
+         """Upload bytes from any stream."""
+
+     def putfile(self, __path: Path, __remote_uri: str) -> None:
+         """Upload a file that exists on the local
+         filesystem. Optimizations including softlinking into caches may be
+         applied.
+         """
+
+     def exists(self, __remote_uri: str) -> bool:
+         """Check if a file exists. May optimize by assuming that files previously seen
+         have not been deleted - since this is intended only for mops control files,
+         and mops never deletes any control files.
+         """
+
+     def join(self, *parts: str) -> str:
+         """Join multiple parts of a URI into one. In actual use, the first part will always
+         be a storage root, e.g.:
+
+         join('adls://foo/bar', 'baz', 'beans') -> 'adls://foo/bar/baz/beans'
+         """
+
+     def split(self, uri: str) -> ty.List[str]:
+         """Must return the storage root as a single string,
+         followed by the path components split along the same lines that join would concatenate.
+         """
+
+     def is_blob_not_found(self, __exc: Exception) -> bool:
+         ...
+
+
+ Args = ty.Sequence
+ Kwargs = ty.Mapping[str, ty.Any]
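
As a rough illustration of the BlobStore protocol (not the FileBlobStore shipped in thds/mops/pure/core/file_blob_store.py), a naive file:// implementation might look like the sketch below. The URI handling and control-root layout are assumptions, and putbytes is simplified to accept only bytes rather than the full AnyStrSrc union.

import shutil
import typing as ty
from pathlib import Path

class NaiveLocalBlobStore:
    """Hypothetical BlobStore over file:// URIs, for illustration only."""

    _P = "file://"

    def _path(self, uri: str) -> Path:
        return Path(uri[len(self._P):])

    def control_root(self, remote_uri: str) -> str:
        return self.join(self.split(remote_uri)[0], "mops-control")

    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], *, type_hint: str = "bytes") -> None:
        stream.write(self._path(remote_uri).read_bytes())

    def getfile(self, remote_uri: str) -> Path:
        return self._path(remote_uri)  # already local; nothing to cache

    def putbytes(self, remote_uri: str, data: bytes, *, type_hint: str = "bytes") -> None:
        dest = self._path(remote_uri)
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(data)

    def putfile(self, path: Path, remote_uri: str) -> None:
        dest = self._path(remote_uri)
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(path, dest)

    def exists(self, remote_uri: str) -> bool:
        return self._path(remote_uri).exists()

    def join(self, *parts: str) -> str:
        return "/".join(p.rstrip("/") for p in parts)

    def split(self, uri: str) -> ty.List[str]:
        root, *rest = uri[len(self._P):].split("/")
        return [self._P + root, *rest]

    def is_blob_not_found(self, exc: Exception) -> bool:
        return isinstance(exc, FileNotFoundError)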