thds.mops 3.6.20250219172032__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.mops might be problematic. Click here for more details.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/parallel.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import concurrent
|
|
2
|
+
import typing as ty
|
|
3
|
+
|
|
4
|
+
from thds.core import parallel
|
|
5
|
+
from thds.core.parallel import ( # noqa: F401; for backward-compatibility, since these came from here originally.
|
|
6
|
+
IterableWithLen,
|
|
7
|
+
IteratorWithLen,
|
|
8
|
+
)
|
|
9
|
+
from thds.core.thunks import ( # noqa: F401; for backward-compatibility, since these came from here originally.
|
|
10
|
+
Thunk,
|
|
11
|
+
thunking,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from ._utils.colorize import colorized
|
|
15
|
+
|
|
16
|
+
ERROR = colorized(fg="white", bg="red")
DONE = colorized(fg="white", bg="blue")
R = ty.TypeVar("R")


def parallel_yield_results(
    thunks: ty.Iterable[ty.Callable[[], R]],
    *,
    executor_cm: ty.Optional[ty.ContextManager[concurrent.futures.Executor]] = None,
    named: str = "",
) -> ty.Iterator[R]:
    """Run the zero-argument thunks in parallel, yielding results as they complete.

    Thin wrapper over thds.core.parallel.yield_results that applies mops'
    colorized error/success log formatting.
    """
    result_iter = parallel.yield_results(
        thunks,
        executor_cm=executor_cm,
        error_fmt=ERROR,
        success_fmt=DONE,
        named=named,
    )
    yield from result_iter


# shorter alias for the same callable.
yield_results = parallel_yield_results
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# This module is the supported interface and everything not exported here is subject to
|
|
2
|
+
# backward-incompatible change without notice.
|
|
3
|
+
#
|
|
4
|
+
# The single exception is the joblib module, which is not exported by default
|
|
5
|
+
# to avoid requiring the additional dependency.
|
|
6
|
+
|
|
7
|
+
from . import adls # noqa
|
|
8
|
+
from ._magic.api import magic # noqa
|
|
9
|
+
from .core.entry import register_entry_handler
|
|
10
|
+
from .core.memo import results # noqa
|
|
11
|
+
from .core.memo.function_memospace import ( # noqa
|
|
12
|
+
add_pipeline_memospace_handlers,
|
|
13
|
+
matching_mask_pipeline_id,
|
|
14
|
+
)
|
|
15
|
+
from .core.pipeline_id import get_pipeline_id, set_pipeline_id # noqa
|
|
16
|
+
from .core.pipeline_id_mask import pipeline_id_mask # noqa
|
|
17
|
+
from .core.source import create_source_at_uri # noqa
|
|
18
|
+
from .core.types import Args, BlobStore, Kwargs, Runner # noqa
|
|
19
|
+
from .core.uris import UriIsh, UriResolvable, register_blob_store # noqa
|
|
20
|
+
from .core.use_runner import use_runner # noqa
|
|
21
|
+
from .pickling.memoize_only import memoize_in # noqa
|
|
22
|
+
from .pickling.mprunner import MemoizingPicklingRunner # noqa
|
|
23
|
+
from .runner.simple_shims import samethread_shim, subprocess_shim # noqa
|
|
24
|
+
from .runner.types import Shim, ShimBuilder # noqa
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _register_things() -> None:
    """Perform import-time wiring for thds.mops.pure.

    Registers the pickling runner's entry handler (so remote invocations can be
    routed back to run_pickled_invocation) and loads any blob-store plugins.
    """
    # imported locally to avoid import cycles / keep these names out of the public API.
    from . import pickling
    from .core.uris import load_plugin_blobstores

    register_entry_handler(
        pickling.mprunner.RUNNER_NAME,
        pickling.remote.run_pickled_invocation,  # type: ignore
    )

    load_plugin_blobstores()


# runs as a side effect of importing this package.
_register_things()


Shell = Shim  # deprecated alias
ShellBuilder = ShimBuilder  # deprecated alias
|
|
File without changes
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Magic is an attempt at a new interface for mops designed to make it even less sticky
|
|
2
|
+
and easier to get things done with.
|
|
3
|
+
|
|
4
|
+
It's designed to combine the most common workflows into a single wrapper function
|
|
5
|
+
requiring an absolute minimum of boilerplate/config.
|
|
6
|
+
|
|
7
|
+
Unlike the more open-ended interface of use_runner plus BYO Runner, this one assumes
|
|
8
|
+
MemoizingPicklingRunner, and the most likely non-default config will be a runtime Shim or
|
|
9
|
+
ShimBuilder. If you don't supply one, it will default to the same-thread shim.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import typing as ty
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from thds import core
|
|
16
|
+
from thds.mops import config
|
|
17
|
+
from thds.mops._utils import config_tree
|
|
18
|
+
|
|
19
|
+
from ..core import uris
|
|
20
|
+
from ..runner.types import ShimBuilder
|
|
21
|
+
from . import sauce
|
|
22
|
+
from .sauce import P, R
|
|
23
|
+
from .shims import ShimName, ShimOrBuilder, to_shim_builder
|
|
24
|
+
|
|
25
|
+
# the single process-global magic config used by the _MagicApi methods below.
_MAGIC_CONFIG: ty.Final = sauce.new_config()


def _get_config() -> sauce._MagicConfig:  # for testing
    # indirection point: the "# for testing" marker suggests tests swap this
    # accessor rather than the module-level constant itself.
    return _MAGIC_CONFIG
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class _MagicApi:
    """The public API for pure.magic.

    Each of these methods makes a global change to your application, so they're designed
    to be used at import time or in other situations where no functions have been called.

    If you want to apply a shim, blob_root, or pipeline_id to a single function, prefer
    the @pure.magic(shim, blob_root=your_blob_root, pipeline_id='lazing/sunday') decorator
    approach rather than configuring them after the fact, to keep the definition as close
    as possible to the site of use.
    """

    # NOTE: a @staticmethod __call__ relies on staticmethod objects being directly
    # callable (Python 3.10+) — confirm against this package's minimum Python version.
    @staticmethod
    def __call__(
        shim_or_builder: ty.Union[ShimName, ShimOrBuilder, None] = None,
        *,
        blob_root: uris.UriResolvable = "",
        pipeline_id: str = "",
    ) -> ty.Callable[[ty.Callable[P, R]], sauce.Magic[P, R]]:
        # returns a decorator that wraps a callable in a Magic object.
        return sauce.make_magic(_get_config(), shim_or_builder, blob_root, pipeline_id)

    @staticmethod
    def blob_root(
        blob_root_uri: uris.UriResolvable, pathable: config_tree.Pathable = None, *, mask: bool = False
    ) -> core.config.ConfigItem[ty.Callable[[], str]]:
        """Sets the root URI for the blob store and control files for a specific module or function."""
        return _get_config().blob_root.setv(uris.to_lazy_uri(blob_root_uri), pathable, mask=mask)

    @staticmethod
    def shim(
        shim: ty.Union[ShimName, ShimOrBuilder],
        pathable: config_tree.Pathable = None,
        *,
        mask: bool = False,
    ) -> core.config.ConfigItem[ty.Optional[ShimBuilder]]:
        """Use the provided shim for everything matching the pathable,
        unless there's a more specific path that matches.

        e.g.:
        - magic.shim('samethread') would turn off mops for everything within
          or below the current module.
        - magic.shim('subprocess', 'foo.bar.baz') would use the subprocess shim for
          everything within or below the foo.bar.baz module.
        - magic.shim(my_shim_builder, my_func) would use my_shim_builder for just my_func.

        To instead _mask_ everything at this level and below regardless of more specific
        config, pass mask=True.
        """
        return _get_config().shim_bld.setv(to_shim_builder(shim), pathable, mask=mask)

    @staticmethod
    def off(pathable: config_tree.Pathable = None, *, mask: bool = False) -> None:
        """Turn off mops for everything matching the pathable.

        A shortcut for shim(None).
        """
        # "off" maps to a None shim builder (see shims.to_shim_builder).
        _MagicApi.shim("off", pathable, mask=mask)

    @staticmethod
    def pipeline_id(
        pipeline_id: str, pathable: config_tree.Pathable = None, *, mask: bool = False
    ) -> core.config.ConfigItem[str]:
        """Sets the pipeline_id for a specific module or function."""
        return _get_config().pipeline_id.setv(pipeline_id, pathable, mask=mask)

    @staticmethod
    def load_config_file(magic_config: ty.Optional[Path] = None) -> None:
        """Call this to load pure.magic config from the nearest .mops.toml file upward,
        or the path you provide.

        Should be called only once, in the `__main__` block of your program,
        and after all imports are resolved.
        """
        all_config = config.load(magic_config or config.first_found_config_file(), name="pure.magic")
        m_config = _get_config()
        # apply the parsed file config to each of the three config trees.
        m_config.shim_bld.load_config(all_config)
        m_config.blob_root.load_config(all_config)
        m_config.pipeline_id.load_config(all_config)


magic: ty.Final = _MagicApi()
# we only instantiate this so we can have a call to magic() that is not __init__.
# there is no state whatsoever in this object.
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""The magic sauce for most of what pure.magic does."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import functools
|
|
5
|
+
import typing as ty
|
|
6
|
+
|
|
7
|
+
from typing_extensions import ParamSpec
|
|
8
|
+
|
|
9
|
+
from thds.core import stack_context
|
|
10
|
+
from thds.mops._utils import config_tree
|
|
11
|
+
|
|
12
|
+
from ..core import file_blob_store, pipeline_id_mask, uris
|
|
13
|
+
from ..core.memo.unique_name_for_function import full_name_and_callable
|
|
14
|
+
from ..core.use_runner import use_runner
|
|
15
|
+
from ..pickling.mprunner import MemoizingPicklingRunner
|
|
16
|
+
from ..runner.shim_builder import make_builder
|
|
17
|
+
from ..runner.simple_shims import samethread_shim
|
|
18
|
+
from ..runner.types import Shim, ShimBuilder
|
|
19
|
+
from .shims import ShimName, ShimOrBuilder, to_shim_builder
|
|
20
|
+
|
|
21
|
+
def _local_root() -> str:
    """Default blob root: a file:// URI at the local MOPS_ROOT.

    Kept as a callable (rather than computed at import) so MOPS_ROOT() is
    read lazily, at the moment the root is actually needed.
    """
    # was: a lambda assigned to a name (PEP 8 E731, silenced with noqa);
    # a def gives a real __name__ and a return annotation with identical behavior.
    return f"file://{file_blob_store.MOPS_ROOT()}"


P = ParamSpec("P")
R = ty.TypeVar("R")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _MagicConfig:
    """Bundle of the three path-keyed ConfigTrees that drive pure.magic.

    Each tree resolves values by the fully-qualified path of a wrapped
    callable (e.g. foo.bar.baz.my_func); the empty path holds the defaults.
    """

    def __init__(self) -> None:
        self.shim_bld = config_tree.ConfigTree[ty.Optional[ShimBuilder]](
            "mops.pure.magic.shim", parse=to_shim_builder  # type: ignore
        )
        self.blob_root = config_tree.ConfigTree[ty.Callable[[], str]](
            "mops.pure.magic.blob_root", parse=uris.to_lazy_uri
        )
        self.pipeline_id = config_tree.ConfigTree[str]("mops.pure.magic.pipeline_id")
        # register the defaults at the root (empty) path of each tree:
        self.shim_bld[""] = make_builder(samethread_shim)
        self.blob_root[""] = _local_root
        self.pipeline_id[""] = "magic"

    def __repr__(self) -> str:
        return f"MagicConfig(shim_bld={self.shim_bld}, blob_root={self.blob_root}, pipeline_id={self.pipeline_id})"


def new_config() -> _MagicConfig:
    """Factory for a fresh, default-initialized _MagicConfig."""
    return _MagicConfig()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Magic(ty.Generic[P, R]):
    """Magic adds mops' powers (memoization, coordination, remote execution) to a callable.

    If you want to _change_ which runtime shim the function is using, that can be set globally
    to the program with pure.magic.shim(other_shim, my_magic_func), and it can also be set
    as a stack-local variable in a context manager provided by this object:

    with my_magic_func.shim("subprocess"):
        my_magic_func(1, 2, 3)

    You can completely disable mops magic for a function in the same ways, either with a contextmanager
    or globally, using `off()`, like so:

    with my_magic_func.off():
        ...
        my_magic_func(1, 2, 3)
    """

    def __init__(
        self,
        func: ty.Callable[P, R],
        config: _MagicConfig,
    ):
        # adopt the wrapped function's metadata (__name__, __module__, etc.)
        functools.update_wrapper(self, func)
        # config-tree key for this function, e.g. foo.bar.baz.my_func
        self._func_config_path = full_name_and_callable(func)[0].replace("--", ".")

        self.config = config
        if p_id := pipeline_id_mask.extract_from_docstr(func, require=False):
            # this allows the docstring pipeline id to become 'the most specific' config.
            self.config.pipeline_id.setv(p_id, self._func_config_path)
        # stack-local shim override; consulted before the global config tree.
        self._shim = stack_context.StackContext[ty.Union[None, ShimName, ShimOrBuilder]](
            str(func) + "_SHIM", None  # none means nothing has been set stack-local
        )
        self.runner = MemoizingPicklingRunner(self._shimbuilder, self._get_blob_root)
        # use_runner consults self._is_off at call time, so "off" can be toggled later.
        self._func = use_runner(self.runner, self._is_off)(func)
        self.__doc__ = f"{func.__doc__}\n\nMagic class info:\n{self.__class__.__doc__}"
        self.__wrapped__ = func

    @contextlib.contextmanager
    def shim(self, shim_or_builder: ty.Union[None, ShimName, ShimOrBuilder]) -> ty.Iterator[None]:
        """If None is passed, no change will be made."""
        # falling back to self._shim() re-sets the current value, i.e. a no-op change.
        with self._shim.set(shim_or_builder or self._shim()):
            yield

    @contextlib.contextmanager
    def off(self) -> ty.Iterator[None]:
        """off is an API for setting the shim to None,
        effectively turning off mops for the wrapped function.
        """
        with self.shim("off"):
            yield

    @property
    def _shim_builder_or_off(self) -> ty.Optional[ShimBuilder]:
        # stack-local override wins; otherwise fall back to the config tree.
        if stack_local_shim := self._shim():
            return to_shim_builder(stack_local_shim)
        return self.config.shim_bld.getv(self._func_config_path)

    def _is_off(self) -> bool:
        # None builder means mops is disabled for this function.
        return self._shim_builder_or_off is None

    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
        # this can be set using a stack-local context, or set globally as specifically
        # or generally as the user needs. We prefer stack local over everything else.
        sb = self._shim_builder_or_off
        assert sb is not None, "This should have been handled by use_runner(self._off)"
        return sb(f, args, kwargs)

    def _get_blob_root(self) -> str:
        # the config tree stores a lazy callable; invoke it to get the URI string.
        return self.config.blob_root.getv(self._func_config_path)()

    @property
    def _pipeline_id(self) -> str:
        return self.config.pipeline_id.getv(self._func_config_path)

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R:
        """This is the wrapped function."""
        with pipeline_id_mask.pipeline_id_mask(self._pipeline_id):
            return self._func(*args, **kwargs)

    def __repr__(self) -> str:
        return (
            f"Magic('{self._func_config_path}', shim={self._shim_builder_or_off},"
            f" blob_root='{self._get_blob_root()}', pipeline_id='{self._pipeline_id}')"
        )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def make_magic(
    config: _MagicConfig,
    shim_or_builder: ty.Union[ShimName, ShimOrBuilder, None],
    blob_root: uris.UriResolvable,
    pipeline_id: str,
) -> ty.Callable[[ty.Callable[P, R]], Magic[P, R]]:
    """Build a decorator that wraps a callable in Magic, recording any per-function
    shim/blob_root/pipeline_id overrides in the shared config trees first.
    """

    def deco(func: ty.Callable[P, R]) -> Magic[P, R]:
        fq_name = full_name_and_callable(func)[0].replace("--", ".")
        # empty/None arguments mean "no override" — leave the tree untouched.
        if shim_or_builder is not None:
            config.shim_bld[fq_name] = to_shim_builder(shim_or_builder)
        if blob_root:
            config.blob_root[fq_name] = uris.to_lazy_uri(blob_root)
        if pipeline_id:
            config.pipeline_id[fq_name] = pipeline_id
        return Magic(func, config)

    return deco
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
|
|
3
|
+
from thds import core
|
|
4
|
+
|
|
5
|
+
from ..runner.shim_builder import make_builder
|
|
6
|
+
from ..runner.simple_shims import samethread_shim, subprocess_shim
|
|
7
|
+
from ..runner.types import Shim, ShimBuilder
|
|
8
|
+
|
|
9
|
+
ShimName = ty.Literal[
    "samethread",  # memoization and coordination, but run in the same thread as the caller.
    "subprocess",  # memoization and coordination, but transfer to a subprocess rather than remote.
    "off",  # equivalent to None - disables use of mops.
]
# a shim may be supplied either ready-made (Shim) or as a factory (ShimBuilder).
ShimOrBuilder = ty.Union[ShimBuilder, Shim]
logger = core.log.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _shim_name_to_builder(shim_name: ShimName) -> ty.Optional[ShimBuilder]:
    """Map a well-known shim name to a builder; "off" and unknown names yield None."""
    if shim_name == "off":
        return None
    by_name = {"samethread": samethread_shim, "subprocess": subprocess_shim}
    named_shim = by_name.get(shim_name)
    if named_shim is not None:
        return make_builder(named_shim)
    # fail open (mops disabled) rather than raising on a bad name.
    logger.warning("Unrecognized shim name: %s; mops will be turned off.", shim_name)
    return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def to_shim_builder(shim: ty.Union[None, ShimName, ShimOrBuilder]) -> ty.Optional[ShimBuilder]:
    """Normalize any accepted shim spec (None, name, Shim, or builder) to an optional builder."""
    if shim is None:
        return None
    return _shim_name_to_builder(shim) if isinstance(shim, str) else make_builder(shim)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .output_fqn import invocation_output_fqn # noqa
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
|
|
3
|
+
import azure.core.exceptions
|
|
4
|
+
from azure.storage.filedatalake import FileSystemClient
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def yield_files(fsc: FileSystemClient, adls_root: str) -> ty.Iterable[ty.Any]:
    """Yield files (including directories) from the root.

    A missing root (404) is treated as an empty listing rather than an error.
    """
    with fsc as client:
        try:
            for azure_path in client.get_paths(adls_root):
                yield azure_path
        except azure.core.exceptions.ResourceNotFoundError as rnfe:
            if rnfe.response and rnfe.response.status_code == 404:
                return  # no paths
            raise
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def yield_filenames(fsc: FileSystemClient, adls_root: str) -> ty.Iterable[str]:
    """Yield only real file (not directory) names recursively from the root."""
    for entry in yield_files(fsc, adls_root):
        if entry.get("is_directory"):
            continue
        yield entry["name"]
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""This abstraction matches what is required by the BlobStore abstraction in pure.core.uris"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import typing as ty
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from azure.core.exceptions import HttpResponseError
|
|
8
|
+
from azure.storage.filedatalake import DataLakeFileClient
|
|
9
|
+
|
|
10
|
+
from thds.adls import ADLS_SCHEME, AdlsFqn, download, join, resource, ro_cache
|
|
11
|
+
from thds.adls.cached_up_down import download_to_cache, upload_through_cache
|
|
12
|
+
from thds.adls.errors import blob_not_found_translation, is_blob_not_found
|
|
13
|
+
from thds.adls.global_client import get_global_fs_client
|
|
14
|
+
from thds.core import config, fretry, home, link, log, scope
|
|
15
|
+
|
|
16
|
+
from ..._utils.on_slow import LogSlow, on_slow
|
|
17
|
+
from ..core.types import DISABLE_CONTROL_CACHE, AnyStrSrc, BlobStore
|
|
18
|
+
|
|
19
|
+
T = ty.TypeVar("T")
# (value, sink) -> anything: writes a value into a binary sink.
ToBytes = ty.Callable[[T, ty.BinaryIO], ty.Any]
# reads a value back out of a binary stream.
FromBytes = ty.Callable[[ty.BinaryIO], T]
# size threshold used by _selective_upload_path to decide whether to cache uploads.
_5_MB = 5 * 2**20


# suppress very noisy INFO logs in azure library.
# This mirrors thds.adls.
log.getLogger("azure.core").setLevel(logging.WARNING)
logger = log.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _selective_upload_path(path: Path, fqn: AdlsFqn) -> None:
    """Upload a local file, routing large files (> 5 MB) through the upload cache."""
    is_large = path.stat().st_size > _5_MB
    if is_large:
        upload_through_cache(fqn, path)
    else:
        resource.upload(fqn, path)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def is_creds_failure(exc: Exception) -> bool:
    # any HTTP error other than a plain missing-blob is treated as a
    # (possibly transient) credential/service failure worth retrying.
    return isinstance(exc, HttpResponseError) and not is_blob_not_found(exc)


# exponential backoff: up to 9 retries starting at 1s.
_azure_creds_retry = fretry.retry_sleep(is_creds_failure, fretry.expo(retries=9, delay=1.0))
# sometimes Azure Cli credentials expire but would succeed if retried
# and the azure library does not seem to retry these on its own.
+
|
|
46
|
+
|
|
47
|
+
class AdlsBlobStore(BlobStore):
    """BlobStore implementation backed by Azure Data Lake Storage (adls:// URIs)."""

    def control_root(self, uri: str) -> str:
        # the root (storage account + container) under which control files live.
        return str(AdlsFqn.parse(uri).root())

    def _client(self, fqn: AdlsFqn) -> DataLakeFileClient:
        return get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path)

    @_azure_creds_retry
    @scope.bound
    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
        """Download the blob at remote_uri directly into the given binary stream."""
        fqn = AdlsFqn.parse(remote_uri)
        scope.enter(log.logger_context(download=fqn))
        logger.debug(f"<----- downloading {type_hint}")
        with blob_not_found_translation(fqn):
            # emit a LogSlow warning when the download takes noticeably long.
            on_slow(
                lambda elapsed_s: LogSlow(f"Took {int(elapsed_s)}s to download {type_hint}"),
            )(lambda: self._client(fqn).download_file().readinto(stream))()

    @_azure_creds_retry
    @scope.bound
    def getfile(self, remote_uri: str) -> Path:
        """Download the blob through the shared local cache and return the cached Path."""
        scope.enter(log.logger_context(download="mops-getfile"))
        return download_to_cache(AdlsFqn.parse(remote_uri))

    @_azure_creds_retry
    @scope.bound
    def putbytes(
        self, remote_uri: str, data: AnyStrSrc, type_hint: str = "application/octet-stream"
    ) -> None:
        """Upload data to a remote path."""
        resource.upload(AdlsFqn.parse(remote_uri), data, content_type=type_hint)

    @_azure_creds_retry
    @scope.bound
    def putfile(self, path: Path, remote_uri: str) -> None:
        """Upload a local file; larger files go through the upload cache."""
        scope.enter(log.logger_context(upload="mops-putfile"))
        _selective_upload_path(path, AdlsFqn.parse(remote_uri))

    @_azure_creds_retry
    @scope.bound
    def exists(self, remote_uri: str) -> bool:
        """Return True if the remote blob exists; warns when the check is slow."""
        fqn = AdlsFqn.parse(remote_uri)
        scope.enter(log.logger_context(exists=fqn))
        return on_slow(
            lambda secs: LogSlow(f"Took {int(secs)}s to check if file exists."),
            slow_seconds=1.2,
        )(lambda: self._client(fqn).exists())()

    def join(self, *parts: str) -> str:
        # normalized join with no trailing slash.
        return join(*parts).rstrip("/")

    def split(self, uri: str) -> ty.List[str]:
        # first element is the account/container root, then each path segment.
        fqn = AdlsFqn.parse(uri)
        return [str(fqn.root()), *fqn.path.split("/")]

    def is_blob_not_found(self, exc: Exception) -> bool:
        # delegates to the module-level helper of the same name.
        return is_blob_not_found(exc)

    def list(self, uri: str) -> ty.List[str]:
        """List the immediate (non-recursive) children of the given URI."""
        fqn = AdlsFqn.parse(uri)
        return [
            str(AdlsFqn(fqn.sa, fqn.container, path.name))
            for path in get_global_fs_client(fqn.sa, fqn.container).get_paths(fqn.path, recursive=False)
        ]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class DangerouslyCachingStore(AdlsBlobStore):
    """This BlobStore will cache _everything_ locally
    and anything it finds locally it will return without question.

    This maximally avoids network operations if for some reason you feel the need
    to do that, but it will 100% lead to false positive cache hits, because it is no longer
    checking the hash of the locally-cached file against what ADLS itself advertises.

    It is now believed that this is not as dangerous as originally thought, because mops
    control files are not intended to be mutable, and if mutated in some kind of
    distributed systems context (e.g. two parallel runs of the same thing), the results
    are intended to be at least 'equivalent' even when they're not byte-identical.

    Therefore, this is now enabled by default and considered generally safe to use in any
    environment, including automated ones.
    """

    def __init__(self, root: Path):
        # ("ref", "hard") presumably selects cheap-copy strategies (reflink, then
        # hard link) — confirm against thds.adls.ro_cache.Cache.
        self._cache = ro_cache.Cache(root.resolve(), ("ref", "hard"))

    def exists(self, remote_uri: str) -> bool:
        # a local cache hit is trusted without consulting ADLS at all.
        cache_path = self._cache.path(AdlsFqn.parse(remote_uri))
        if cache_path.exists():
            return True
        return super().exists(remote_uri)

    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
        # readbytesinto is used for _almost_ everything in mops - but almost everything is a 'control file'
        # of some sort. We use a completely separate cache for all of these things, because
        # in previous implementations, none of these things would have been cached at all.
        # (see comment on getfile below...)
        fqn = AdlsFqn.parse(remote_uri)
        cache_path = self._cache.path(fqn)
        if not cache_path.exists():
            download.download_or_use_verified(
                get_global_fs_client(fqn.sa, fqn.container), fqn.path, cache_path, cache=self._cache
            )
        with cache_path.open("rb") as f:
            stream.write(f.read())

    def getfile(self, remote_uri: str) -> Path:
        # (continued from comment on readbytesinto...)
        #
        # whereas, for getfile, it is really only used for optimizations on larger file
        # downloads (e.g. Paths), and those were previously subject to long-term caching.
        # So for getfile, our primary source will be the parent implementation of getfile,
        # including any caching it already did.
        #
        # We still dangerously short-circuit the hash check, and we make a 'cheap copy'
        # (a link, usually) to our separate cache directory so that it's possible to
        # completely empty this particular mops cache (and all its 'dangerous' behavior)
        # simply by deleting that one cache directory.
        cache_path = self._cache.path(AdlsFqn.parse(remote_uri))
        if cache_path.exists():
            return cache_path
        outpath = super().getfile(remote_uri)
        link.link(outpath, cache_path)
        return outpath
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# on-disk root for the 'dangerous' control-file cache; overridable via the
# thds.mops.pure.adls.control_cache_root config key.
_DEFAULT_CONTROL_CACHE = config.item(
    "thds.mops.pure.adls.control_cache_root", default=home.HOMEDIR() / ".mops-adls-control-cache"
)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def get_adls_blob_store(uri: str) -> ty.Optional[AdlsBlobStore]:
    """Return a blob store for adls:// URIs, or None when the scheme doesn't match.

    Uses the locally-caching store unless control caching is disabled or the
    cache root is unset.
    """
    if not uri.startswith(ADLS_SCHEME):
        return None

    cache_root = None if DISABLE_CONTROL_CACHE() else _DEFAULT_CONTROL_CACHE()
    if not cache_root:
        return AdlsBlobStore()
    return DangerouslyCachingStore(cache_root)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from thds.adls import AdlsFqn
|
|
2
|
+
|
|
3
|
+
from ..core import uris
|
|
4
|
+
from ..core.output_naming import invocation_output_uri
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def invocation_output_fqn(storage_root: uris.UriIsh = "", name: str = "") -> AdlsFqn:
    """Return the ADLS FQN for an invocation's output blob.

    If your function only outputs a single blob to ADLS, you can safely
    use this without providing a name. However, if you have multiple outputs
    from the same invocation, you must provide a meaningful name for each one.

    As an example:

    <pipeline> <function mod/name > <your name > <args,kwargs hash >
    nppes/2023/thds.nppes.intake:run/<name goes here>/CoastOilAsset.IVZ9KplQKlNgxQHav0jIMUS9p4Kbn3N481e0Uvs
    """
    output_uri = invocation_output_uri(storage_root, name=name)
    return AdlsFqn.parse(output_uri)
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import typing as ty
|
|
2
|
+
|
|
3
|
+
from thds import humenc
|
|
4
|
+
from thds.core.hashing import Hash
|
|
5
|
+
|
|
6
|
+
from .uris import active_storage_root
|
|
7
|
+
|
|
8
|
+
B64_ADDRESSED = "{algo}-b64-addressed"
# we can save on storage and simplify lots of internals if we just
# hash all blobs and upload them to a key that is their hash.


def storage_content_addressed(hash_str: str, algo: str, storage_root: str = "") -> str:
    """Build the content-addressed URI: <root>/<algo>-b64-addressed/<hash_str>.

    Falls back to the active storage root when no root is supplied.
    """
    root = storage_root or active_storage_root()
    namespace = B64_ADDRESSED.format(algo=algo)
    return "/".join((root, namespace, hash_str))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ContentAddressed(ty.NamedTuple):
    """URI pair for a content-addressed blob."""

    # where the raw bytes live (content-addressed; ends in '/_bytes')
    bytes_uri: str
    # optional human-readable companion URI; empty when no debug name was given
    debug_uri: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def wordybin_content_addressed(
    hash: Hash, storage_root: str = "", debug_name: str = ""
) -> ContentAddressed:
    """This should be used any time you have access to the raw bytes, so that we can stick
    with the Human Base 64 format.
    """
    encoded = humenc.encode(hash.bytes)
    base_uri = storage_content_addressed(encoded, hash.algo, storage_root)
    debug_uri = f"{base_uri}/{debug_name}" if debug_name else ""
    # '_bytes' corresponds with the key used in `serialize_paths.py`
    return ContentAddressed(bytes_uri=f"{base_uri}/_bytes", debug_uri=debug_uri)
|