thds.mops 3.6.20250219172032__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.mops might be problematic; review the details below.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/parallel.py ADDED
@@ -0,0 +1,36 @@
1
import concurrent.futures  # explicit submodule import: `import concurrent` alone does not bind `.futures`
import typing as ty

from thds.core import parallel
from thds.core.parallel import (  # noqa: F401; for backward-compatibility, since these came from here originally.
    IterableWithLen,
    IteratorWithLen,
)
from thds.core.thunks import (  # noqa: F401; for backward-compatibility, since these came from here originally.
    Thunk,
    thunking,
)

from ._utils.colorize import colorized

ERROR = colorized(fg="white", bg="red")  # formatting for failed thunks
DONE = colorized(fg="white", bg="blue")  # formatting for completed thunks
R = ty.TypeVar("R")


def parallel_yield_results(
    thunks: ty.Iterable[ty.Callable[[], R]],
    *,
    executor_cm: ty.Optional[ty.ContextManager[concurrent.futures.Executor]] = None,
    named: str = "",
) -> ty.Iterator[R]:
    """Run the given thunks in parallel, yielding results as they complete.

    Delegates to thds.core.parallel.yield_results, supplying mops' colorized
    ERROR/DONE formatters for status output.

    :param thunks: zero-argument callables to execute.
    :param executor_cm: optional context manager yielding the Executor to use;
        when None, thds.core.parallel chooses its default.
    :param named: optional label used in status output.
    """
    yield from parallel.yield_results(
        thunks,
        executor_cm=executor_cm,
        error_fmt=ERROR,
        success_fmt=DONE,
        named=named,
    )


yield_results = parallel_yield_results  # backward-compatible alias
@@ -0,0 +1,43 @@
1
+ # This module is the supported interface and everything not exported here is subject to
2
+ # backward-incompatible change without notice.
3
+ #
4
+ # The single exception is the joblib module, which is not exported by default
5
+ # to avoid requiring the additional dependency.
6
+
7
+ from . import adls # noqa
8
+ from ._magic.api import magic # noqa
9
+ from .core.entry import register_entry_handler
10
+ from .core.memo import results # noqa
11
+ from .core.memo.function_memospace import ( # noqa
12
+ add_pipeline_memospace_handlers,
13
+ matching_mask_pipeline_id,
14
+ )
15
+ from .core.pipeline_id import get_pipeline_id, set_pipeline_id # noqa
16
+ from .core.pipeline_id_mask import pipeline_id_mask # noqa
17
+ from .core.source import create_source_at_uri # noqa
18
+ from .core.types import Args, BlobStore, Kwargs, Runner # noqa
19
+ from .core.uris import UriIsh, UriResolvable, register_blob_store # noqa
20
+ from .core.use_runner import use_runner # noqa
21
+ from .pickling.memoize_only import memoize_in # noqa
22
+ from .pickling.mprunner import MemoizingPicklingRunner # noqa
23
+ from .runner.simple_shims import samethread_shim, subprocess_shim # noqa
24
+ from .runner.types import Shim, ShimBuilder # noqa
25
+
26
+
27
def _register_things() -> None:
    """Register the pickling runner's entry handler and load plugin blob stores.

    Runs at import time (below) so these registrations exist before any Runner is used.
    Imports are deferred into the function body to avoid import cycles at module load.
    """
    from . import pickling
    from .core.uris import load_plugin_blobstores

    register_entry_handler(
        pickling.mprunner.RUNNER_NAME,
        pickling.remote.run_pickled_invocation,  # type: ignore
    )

    load_plugin_blobstores()


_register_things()


Shell = Shim  # deprecated alias
ShellBuilder = ShimBuilder  # deprecated alias
File without changes
@@ -0,0 +1,114 @@
1
+ """Magic is an attempt at a new interface for mops designed to make it even less sticky
2
+ and easier to get things done with.
3
+
4
+ It's designed to combine the most common workflows into a single wrapper function
5
+ requiring an absolute minimum of boilerplate/config.
6
+
7
+ Unlike the more open-ended interface of use_runner plus BYO Runner, this one assumes
8
+ MemoizingPicklingRunner, and the most likely non-default config will be a runtime Shim or
9
+ ShimBuilder. If you don't supply one, it will default to the same-thread shim.
10
+ """
11
+
12
+ import typing as ty
13
+ from pathlib import Path
14
+
15
+ from thds import core
16
+ from thds.mops import config
17
+ from thds.mops._utils import config_tree
18
+
19
+ from ..core import uris
20
+ from ..runner.types import ShimBuilder
21
+ from . import sauce
22
+ from .sauce import P, R
23
+ from .shims import ShimName, ShimOrBuilder, to_shim_builder
24
+
25
# The single, process-wide configuration backing the pure.magic API.
_MAGIC_CONFIG: ty.Final = sauce.new_config()


def _get_config() -> sauce._MagicConfig:  # for testing
    # Indirection point so tests can patch this accessor instead of the module global.
    return _MAGIC_CONFIG
30
+
31
+
32
class _MagicApi:
    """The public API for pure.magic.

    Each of these methods makes a global change to your application, so they're designed
    to be used at import time or in other situations where no functions have been called.

    If you want to apply a shim, blob_root, or pipeline_id to a single function, prefer
    the @pure.magic(shim, blob_root=your_blob_root, pipeline_id='lazing/sunday') decorator
    approach rather than configuring them after the fact, to keep the definition as close
    as possible to the site of use.
    """

    @staticmethod
    def __call__(
        shim_or_builder: ty.Union[ShimName, ShimOrBuilder, None] = None,
        *,
        blob_root: uris.UriResolvable = "",
        pipeline_id: str = "",
    ) -> ty.Callable[[ty.Callable[P, R]], sauce.Magic[P, R]]:
        # The decorator factory: `@pure.magic(...)` wraps a function in a sauce.Magic object.
        return sauce.make_magic(_get_config(), shim_or_builder, blob_root, pipeline_id)

    @staticmethod
    def blob_root(
        blob_root_uri: uris.UriResolvable, pathable: config_tree.Pathable = None, *, mask: bool = False
    ) -> core.config.ConfigItem[ty.Callable[[], str]]:
        """Sets the root URI for the blob store and control files for a specific module or function."""
        return _get_config().blob_root.setv(uris.to_lazy_uri(blob_root_uri), pathable, mask=mask)

    @staticmethod
    def shim(
        shim: ty.Union[ShimName, ShimOrBuilder],
        pathable: config_tree.Pathable = None,
        *,
        mask: bool = False,
    ) -> core.config.ConfigItem[ty.Optional[ShimBuilder]]:
        """Use the provided shim for everything matching the pathable,
        unless there's a more specific path that matches.

        e.g.:
        - magic.shim('samethread') would turn off mops for everything within
          or below the current module.
        - magic.shim('subprocess', 'foo.bar.baz') would use the subprocess shim for
          everything within or below the foo.bar.baz module.
        - magic.shim(my_shim_builder, my_func) would use my_shim_builder for just my_func.

        To instead _mask_ everything at this level and below regardless of more specific
        config, pass mask=True.
        """
        return _get_config().shim_bld.setv(to_shim_builder(shim), pathable, mask=mask)

    @staticmethod
    def off(pathable: config_tree.Pathable = None, *, mask: bool = False) -> None:
        """Turn off mops for everything matching the pathable.

        A shortcut for shim(None).
        """
        _MagicApi.shim("off", pathable, mask=mask)

    @staticmethod
    def pipeline_id(
        pipeline_id: str, pathable: config_tree.Pathable = None, *, mask: bool = False
    ) -> core.config.ConfigItem[str]:
        """Sets the pipeline_id for a specific module or function."""
        return _get_config().pipeline_id.setv(pipeline_id, pathable, mask=mask)

    @staticmethod
    def load_config_file(magic_config: ty.Optional[Path] = None) -> None:
        """Call this to load pure.magic config from the nearest .mops.toml file upward,
        or the path you provide.

        Should be called only once, in the `__main__` block of your program,
        and after all imports are resolved.
        """
        all_config = config.load(magic_config or config.first_found_config_file(), name="pure.magic")
        m_config = _get_config()
        # each ConfigTree pulls its own section out of the loaded config mapping.
        m_config.shim_bld.load_config(all_config)
        m_config.blob_root.load_config(all_config)
        m_config.pipeline_id.load_config(all_config)


magic: ty.Final = _MagicApi()
# we only instantiate this so we can have a call to magic() that is not __init__.
# there is no state whatsoever in this object.
@@ -0,0 +1,152 @@
1
+ """The magic sauce for most of what pure.magic does."""
2
+
3
+ import contextlib
4
+ import functools
5
+ import typing as ty
6
+
7
+ from typing_extensions import ParamSpec
8
+
9
+ from thds.core import stack_context
10
+ from thds.mops._utils import config_tree
11
+
12
+ from ..core import file_blob_store, pipeline_id_mask, uris
13
+ from ..core.memo.unique_name_for_function import full_name_and_callable
14
+ from ..core.use_runner import use_runner
15
+ from ..pickling.mprunner import MemoizingPicklingRunner
16
+ from ..runner.shim_builder import make_builder
17
+ from ..runner.simple_shims import samethread_shim
18
+ from ..runner.types import Shim, ShimBuilder
19
+ from .shims import ShimName, ShimOrBuilder, to_shim_builder
20
+
21
+ _local_root = lambda: f"file://{file_blob_store.MOPS_ROOT()}" # noqa: E731
22
+ P = ParamSpec("P")
23
+ R = ty.TypeVar("R")
24
+
25
+
26
class _MagicConfig:
    """Container for the three ConfigTrees that drive pure.magic behavior."""

    def __init__(self) -> None:
        # these ConfigTree objects apply configuration to callables wrapped with pure.magic
        # based on the fully-qualified path to the callable, e.g. foo.bar.baz.my_func
        self.shim_bld = config_tree.ConfigTree[ty.Optional[ShimBuilder]](
            "mops.pure.magic.shim", parse=to_shim_builder  # type: ignore
        )
        self.blob_root = config_tree.ConfigTree[ty.Callable[[], str]](
            "mops.pure.magic.blob_root", parse=uris.to_lazy_uri
        )
        self.pipeline_id = config_tree.ConfigTree[str]("mops.pure.magic.pipeline_id")
        # the empty-string key is the tree root, i.e. the fallback for everything.
        self.blob_root[""] = _local_root  # default Blob Store
        self.shim_bld[""] = make_builder(samethread_shim)  # default Shim
        self.pipeline_id[""] = "magic"  # default pipeline_id

    def __repr__(self) -> str:
        return f"MagicConfig(shim_bld={self.shim_bld}, blob_root={self.blob_root}, pipeline_id={self.pipeline_id})"


def new_config() -> _MagicConfig:
    """Construct a fresh, default-initialized magic configuration."""
    return _MagicConfig()
47
+
48
+
49
class Magic(ty.Generic[P, R]):
    """Magic adds mops' powers (memoization, coordination, remote execution) to a callable.

    If you want to _change_ which runtime shim the function is using, that can be set globally
    to the program with pure.magic.shim(other_shim, my_magic_func), and it can also be set
    as a stack-local variable in a context manager provided by this object:

        with my_magic_func.shim("subprocess"):
            my_magic_func(1, 2, 3)

    You can completely disable mops magic for a function in the same ways, either with a contextmanager
    or globally, using `off()`, like so:

        with my_magic_func.off():
            ...
            my_magic_func(1, 2, 3)
    """

    def __init__(
        self,
        func: ty.Callable[P, R],
        config: _MagicConfig,
    ):
        functools.update_wrapper(self, func)
        # the unique function name uses '--' separators; config-tree paths use '.'.
        self._func_config_path = full_name_and_callable(func)[0].replace("--", ".")

        self.config = config
        if p_id := pipeline_id_mask.extract_from_docstr(func, require=False):
            # this allows the docstring pipeline id to become 'the most specific' config.
            self.config.pipeline_id.setv(p_id, self._func_config_path)
        self._shim = stack_context.StackContext[ty.Union[None, ShimName, ShimOrBuilder]](
            str(func) + "_SHIM", None  # none means nothing has been set stack-local
        )
        self.runner = MemoizingPicklingRunner(self._shimbuilder, self._get_blob_root)
        self._func = use_runner(self.runner, self._is_off)(func)
        self.__doc__ = f"{func.__doc__}\n\nMagic class info:\n{self.__class__.__doc__}"
        self.__wrapped__ = func

    @contextlib.contextmanager
    def shim(self, shim_or_builder: ty.Union[None, ShimName, ShimOrBuilder]) -> ty.Iterator[None]:
        """If None is passed, no change will be made."""
        # `or self._shim()` re-asserts the current stack-local value when None is given.
        with self._shim.set(shim_or_builder or self._shim()):
            yield

    @contextlib.contextmanager
    def off(self) -> ty.Iterator[None]:
        """off is an API for setting the shim to None,
        effectively turning off mops for the wrapped function.
        """
        with self.shim("off"):
            yield

    @property
    def _shim_builder_or_off(self) -> ty.Optional[ShimBuilder]:
        # a stack-local shim (if set) wins over the global config tree.
        if stack_local_shim := self._shim():
            return to_shim_builder(stack_local_shim)
        return self.config.shim_bld.getv(self._func_config_path)

    def _is_off(self) -> bool:
        # mops is 'off' for this function when no shim builder resolves.
        return self._shim_builder_or_off is None

    def _shimbuilder(self, f: ty.Callable[P, R], args: P.args, kwargs: P.kwargs) -> Shim:
        # this can be set using a stack-local context, or set globally as specifically
        # or generally as the user needs. We prefer stack local over everything else.
        sb = self._shim_builder_or_off
        assert sb is not None, "This should have been handled by use_runner(self._off)"
        return sb(f, args, kwargs)

    def _get_blob_root(self) -> str:
        # resolved lazily per call: the configured value is a zero-arg callable.
        return self.config.blob_root.getv(self._func_config_path)()

    @property
    def _pipeline_id(self) -> str:
        return self.config.pipeline_id.getv(self._func_config_path)

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> R:
        """This is the wrapped function."""
        with pipeline_id_mask.pipeline_id_mask(self._pipeline_id):
            return self._func(*args, **kwargs)

    def __repr__(self) -> str:
        return (
            f"Magic('{self._func_config_path}', shim={self._shim_builder_or_off},"
            f" blob_root='{self._get_blob_root()}', pipeline_id='{self._pipeline_id}')"
        )
134
+
135
+
136
def make_magic(
    config: _MagicConfig,
    shim_or_builder: ty.Union[ShimName, ShimOrBuilder, None],
    blob_root: uris.UriResolvable,
    pipeline_id: str,
) -> ty.Callable[[ty.Callable[P, R]], Magic[P, R]]:
    """Build a decorator that wraps a function in Magic.

    Any non-empty per-function settings are recorded in the config trees under
    the function's fully-qualified name before the Magic wrapper is created.
    """

    def deco(func: ty.Callable[P, R]) -> Magic[P, R]:
        fq_name = full_name_and_callable(func)[0].replace("--", ".")
        if shim_or_builder is not None:
            config.shim_bld[fq_name] = to_shim_builder(shim_or_builder)
        if blob_root:  # empty string means 'not provided'
            config.blob_root[fq_name] = uris.to_lazy_uri(blob_root)
        if pipeline_id:  # empty string means 'not provided'
            config.pipeline_id[fq_name] = pipeline_id
        return Magic(func, config)

    return deco
@@ -0,0 +1,34 @@
1
+ import typing as ty
2
+
3
+ from thds import core
4
+
5
+ from ..runner.shim_builder import make_builder
6
+ from ..runner.simple_shims import samethread_shim, subprocess_shim
7
+ from ..runner.types import Shim, ShimBuilder
8
+
9
+ ShimName = ty.Literal[
10
+ "samethread", # memoization and coordination, but run in the same thread as the caller.
11
+ "subprocess", # memoization and coordination, but transfer to a subprocess rather than remote.
12
+ "off", # equivalent to None - disables use of mops.
13
+ ]
14
+ ShimOrBuilder = ty.Union[ShimBuilder, Shim]
15
+ logger = core.log.getLogger(__name__)
16
+
17
+
18
def _shim_name_to_builder(shim_name: ShimName) -> ty.Optional[ShimBuilder]:
    """Resolve a well-known shim name to a ShimBuilder.

    'off' (and any unrecognized name, with a warning) resolves to None, which disables mops.
    """
    if shim_name == "off":
        return None
    known_shims = {"samethread": samethread_shim, "subprocess": subprocess_shim}
    if shim_name in known_shims:
        return make_builder(known_shims[shim_name])
    logger.warning("Unrecognized shim name: %s; mops will be turned off.", shim_name)
    return None
27
+
28
+
29
def to_shim_builder(shim: ty.Union[None, ShimName, ShimOrBuilder]) -> ty.Optional[ShimBuilder]:
    """Normalize any accepted shim specification into a ShimBuilder (or None)."""
    if shim is None:
        return None
    # strings are looked up by name; anything else is a Shim/ShimBuilder to wrap.
    return _shim_name_to_builder(shim) if isinstance(shim, str) else make_builder(shim)
@@ -0,0 +1 @@
1
+ from .output_fqn import invocation_output_fqn # noqa
@@ -0,0 +1,22 @@
1
+ import typing as ty
2
+
3
+ import azure.core.exceptions
4
+ from azure.storage.filedatalake import FileSystemClient
5
+
6
+
7
def yield_files(fsc: FileSystemClient, adls_root: str) -> ty.Iterable[ty.Any]:
    """Yield files (including directories) from the root.

    A missing root (404) is treated as empty rather than raising; any other
    ResourceNotFoundError is re-raised. The client is closed when the
    generator is exhausted or closed, via the `with` block.
    """
    with fsc as client:
        try:
            yield from client.get_paths(adls_root)
        except azure.core.exceptions.ResourceNotFoundError as rnfe:
            if rnfe.response and rnfe.response.status_code == 404:
                return  # no paths
            raise
16
+
17
+
18
def yield_filenames(fsc: FileSystemClient, adls_root: str) -> ty.Iterable[str]:
    """Yield only real file (not directory) names recursively from the root."""
    non_directories = (
        entry for entry in yield_files(fsc, adls_root) if not entry.get("is_directory")
    )
    for entry in non_directories:
        yield entry["name"]
@@ -0,0 +1,185 @@
1
+ """This abstraction matches what is required by the BlobStore abstraction in pure.core.uris"""
2
+
3
+ import logging
4
+ import typing as ty
5
+ from pathlib import Path
6
+
7
+ from azure.core.exceptions import HttpResponseError
8
+ from azure.storage.filedatalake import DataLakeFileClient
9
+
10
+ from thds.adls import ADLS_SCHEME, AdlsFqn, download, join, resource, ro_cache
11
+ from thds.adls.cached_up_down import download_to_cache, upload_through_cache
12
+ from thds.adls.errors import blob_not_found_translation, is_blob_not_found
13
+ from thds.adls.global_client import get_global_fs_client
14
+ from thds.core import config, fretry, home, link, log, scope
15
+
16
+ from ..._utils.on_slow import LogSlow, on_slow
17
+ from ..core.types import DISABLE_CONTROL_CACHE, AnyStrSrc, BlobStore
18
+
19
+ T = ty.TypeVar("T")
20
+ ToBytes = ty.Callable[[T, ty.BinaryIO], ty.Any]
21
+ FromBytes = ty.Callable[[ty.BinaryIO], T]
22
+ _5_MB = 5 * 2**20
23
+
24
+
25
+ # suppress very noisy INFO logs in azure library.
26
+ # This mirrors thds.adls.
27
+ log.getLogger("azure.core").setLevel(logging.WARNING)
28
+ logger = log.getLogger(__name__)
29
+
30
+
31
def _selective_upload_path(path: Path, fqn: AdlsFqn) -> None:
    """Upload a local file: small files go direct, large files go through the upload cache."""
    file_size = path.stat().st_size
    if file_size <= _5_MB:
        resource.upload(fqn, path)
    else:
        upload_through_cache(fqn, path)
36
+
37
+
38
def is_creds_failure(exc: Exception) -> bool:
    """Heuristic: treat any HTTP error that is not blob-not-found as possibly credential-related."""
    return isinstance(exc, HttpResponseError) and not is_blob_not_found(exc)


# Retry with exponential backoff (up to 9 retries, starting at 1s) on suspected credential failures.
_azure_creds_retry = fretry.retry_sleep(is_creds_failure, fretry.expo(retries=9, delay=1.0))
# sometimes Azure Cli credentials expire but would succeed if retried
# and the azure library does not seem to retry these on its own.
45
+
46
+
47
class AdlsBlobStore(BlobStore):
    """BlobStore implementation backed by Azure Data Lake Storage (ADLS)."""

    def control_root(self, uri: str) -> str:
        # control files live under the storage-account/container root of the URI.
        return str(AdlsFqn.parse(uri).root())

    def _client(self, fqn: AdlsFqn) -> DataLakeFileClient:
        return get_global_fs_client(fqn.sa, fqn.container).get_file_client(fqn.path)

    @_azure_creds_retry
    @scope.bound
    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
        """Download the blob at remote_uri directly into the given binary stream."""
        fqn = AdlsFqn.parse(remote_uri)
        scope.enter(log.logger_context(download=fqn))
        logger.debug(f"<----- downloading {type_hint}")
        with blob_not_found_translation(fqn):
            # log if the download takes unusually long; default threshold of on_slow applies.
            on_slow(
                lambda elapsed_s: LogSlow(f"Took {int(elapsed_s)}s to download {type_hint}"),
            )(lambda: self._client(fqn).download_file().readinto(stream))()

    @_azure_creds_retry
    @scope.bound
    def getfile(self, remote_uri: str) -> Path:
        """Download the blob into the shared local cache and return its Path."""
        scope.enter(log.logger_context(download="mops-getfile"))
        return download_to_cache(AdlsFqn.parse(remote_uri))

    @_azure_creds_retry
    @scope.bound
    def putbytes(
        self, remote_uri: str, data: AnyStrSrc, type_hint: str = "application/octet-stream"
    ) -> None:
        """Upload data to a remote path."""
        resource.upload(AdlsFqn.parse(remote_uri), data, content_type=type_hint)

    @_azure_creds_retry
    @scope.bound
    def putfile(self, path: Path, remote_uri: str) -> None:
        """Upload a local file; files over 5 MB are routed through the upload cache."""
        scope.enter(log.logger_context(upload="mops-putfile"))
        _selective_upload_path(path, AdlsFqn.parse(remote_uri))

    @_azure_creds_retry
    @scope.bound
    def exists(self, remote_uri: str) -> bool:
        """Return True if the remote blob exists, logging if the check exceeds 1.2s."""
        fqn = AdlsFqn.parse(remote_uri)
        scope.enter(log.logger_context(exists=fqn))
        return on_slow(
            lambda secs: LogSlow(f"Took {int(secs)}s to check if file exists."),
            slow_seconds=1.2,
        )(lambda: self._client(fqn).exists())()

    def join(self, *parts: str) -> str:
        return join(*parts).rstrip("/")

    def split(self, uri: str) -> ty.List[str]:
        # first element is the sa/container root; remaining elements are path segments.
        fqn = AdlsFqn.parse(uri)
        return [str(fqn.root()), *fqn.path.split("/")]

    def is_blob_not_found(self, exc: Exception) -> bool:
        return is_blob_not_found(exc)

    def list(self, uri: str) -> ty.List[str]:
        """List the immediate (non-recursive) children of the given URI."""
        fqn = AdlsFqn.parse(uri)
        return [
            str(AdlsFqn(fqn.sa, fqn.container, path.name))
            for path in get_global_fs_client(fqn.sa, fqn.container).get_paths(fqn.path, recursive=False)
        ]
111
+
112
+
113
class DangerouslyCachingStore(AdlsBlobStore):
    """This BlobStore will cache _everything_ locally
    and anything it finds locally it will return without question.

    This maximally avoids network operations if for some reason you feel the need
    to do that, but it will 100% lead to false positive cache hits, because it is no longer
    checking the hash of the locally-cached file against what ADLS itself advertises.

    It is now believed that this is not as dangerous as originally thought, because mops
    control files are not intended to be mutable, and if mutated in some kind of
    distributed systems context (e.g. two parallel runs of the same thing), the results
    are intended to be at least 'equivalent' even when they're not byte-identical.

    Therefore, this is now enabled by default and considered generally safe to use in any
    environment, including automated ones.
    """

    def __init__(self, root: Path):
        # ("ref", "hard") selects the cache's copy strategies — presumably reflink
        # then hardlink fallback; confirm in thds.adls.ro_cache.
        self._cache = ro_cache.Cache(root.resolve(), ("ref", "hard"))

    def exists(self, remote_uri: str) -> bool:
        # a local cache hit is trusted unconditionally; only a miss consults ADLS.
        cache_path = self._cache.path(AdlsFqn.parse(remote_uri))
        if cache_path.exists():
            return True
        return super().exists(remote_uri)

    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
        # readbytesinto is used for _almost_ everything in mops - but almost everything is a 'control file'
        # of some sort. We use a completely separate cache for all of these things, because
        # in previous implementations, none of these things would have been cached at all.
        # (see comment on getfile below...)
        fqn = AdlsFqn.parse(remote_uri)
        cache_path = self._cache.path(fqn)
        if not cache_path.exists():
            download.download_or_use_verified(
                get_global_fs_client(fqn.sa, fqn.container), fqn.path, cache_path, cache=self._cache
            )
        # serve the bytes from the (now-populated) local cache file.
        with cache_path.open("rb") as f:
            stream.write(f.read())

    def getfile(self, remote_uri: str) -> Path:
        # (continued from comment on readbytesinto...)
        #
        # whereas, for getfile, it is really only used for optimizations on larger file
        # downloads (e.g. Paths), and those were previously subject to long-term caching.
        # So for getfile, our primary source will be the parent implementation of getfile,
        # including any caching it already did.
        #
        # We still dangerously short-circuit the hash check, and we make a 'cheap copy'
        # (a link, usually) to our separate cache directory so that it's possible to
        # completely empty this particular mops cache (and all its 'dangerous' behavior)
        # simply by deleting that one cache directory.
        cache_path = self._cache.path(AdlsFqn.parse(remote_uri))
        if cache_path.exists():
            return cache_path
        outpath = super().getfile(remote_uri)
        link.link(outpath, cache_path)
        return outpath
171
+
172
+
173
# Root directory of the local control-file cache; falsy/empty disables the caching store.
_DEFAULT_CONTROL_CACHE = config.item(
    "thds.mops.pure.adls.control_cache_root", default=home.HOMEDIR() / ".mops-adls-control-cache"
)


def get_adls_blob_store(uri: str) -> ty.Optional[AdlsBlobStore]:
    """Return a blob store for ADLS URIs, or None for any other scheme.

    Uses the locally-caching store unless control caching is disabled or no
    cache root is configured.
    """
    if not uri.startswith(ADLS_SCHEME):
        return None

    cache_root = _DEFAULT_CONTROL_CACHE()
    if cache_root and not DISABLE_CONTROL_CACHE():
        return DangerouslyCachingStore(cache_root)
    return AdlsBlobStore()
@@ -0,0 +1,17 @@
1
+ from thds.adls import AdlsFqn
2
+
3
+ from ..core import uris
4
+ from ..core.output_naming import invocation_output_uri
5
+
6
+
7
def invocation_output_fqn(storage_root: uris.UriIsh = "", name: str = "") -> AdlsFqn:
    """If your function only outputs a single blob to ADLS, you can safely
    use this without providing a name. However, if you have multiple outputs
    from the same invocation, you must provide a meaningful name for each one.

    As an example:

    <pipeline> <function mod/name > <your name > <args,kwargs hash >
    nppes/2023/thds.nppes.intake:run/<name goes here>/CoastOilAsset.IVZ9KplQKlNgxQHav0jIMUS9p4Kbn3N481e0Uvs
    """
    # delegate URI construction to core.output_naming, then parse into an AdlsFqn.
    return AdlsFqn.parse(invocation_output_uri(storage_root, name=name))
File without changes
@@ -0,0 +1,31 @@
1
+ import typing as ty
2
+
3
+ from thds import humenc
4
+ from thds.core.hashing import Hash
5
+
6
+ from .uris import active_storage_root
7
+
8
B64_ADDRESSED = "{algo}-b64-addressed"
# we can save on storage and simplify lots of internals if we just
# hash all blobs and upload them to a key that is their hash.


def storage_content_addressed(hash_str: str, algo: str, storage_root: str = "") -> str:
    """Build the content-addressed URI for a blob hash.

    Falls back to the active storage root when no explicit root is given.
    """
    root = storage_root if storage_root else active_storage_root()
    return "/".join((root, B64_ADDRESSED.format(algo=algo), hash_str))
16
+
17
+
18
class ContentAddressed(ty.NamedTuple):
    # bytes_uri: where the raw bytes live; debug_uri: optional human-readable
    # companion URI ("" when no debug_name was supplied).
    bytes_uri: str
    debug_uri: str


def wordybin_content_addressed(
    hash: Hash, storage_root: str = "", debug_name: str = ""
) -> ContentAddressed:
    """This should be used any time you have access to the raw bytes, so that we can stick
    with the Human Base 64 format.
    """
    base_uri = storage_content_addressed(humenc.encode(hash.bytes), hash.algo, storage_root)
    # corresponds with '_bytes' as used in `serialize_paths.py`
    debug_uri = f"{base_uri}/{debug_name}" if debug_name else ""
    return ContentAddressed(f"{base_uri}/_bytes", debug_uri)