thds.mops 3.6.20250219172032__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.mops might be problematic. Click here for more details.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Register expensive "only if we actually need to invoke the function" work to be performed later
|
|
2
|
+
#
|
|
3
|
+
# this could be _any_ kind of work, but is only uploads as of initial abstraction.
|
|
4
|
+
# this basic idea was stolen from `pure.core.source` as a form of optimization for
|
|
5
|
+
# uploading Sources and their hashrefs.
|
|
6
|
+
import typing as ty
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
|
|
9
|
+
from thds import core
|
|
10
|
+
from thds.core.stack_context import StackContext
|
|
11
|
+
|
|
12
|
+
_DEFERRED_INVOCATION_WORK: StackContext[
|
|
13
|
+
ty.Optional[ty.Dict[ty.Hashable, ty.Callable[[], ty.Any]]]
|
|
14
|
+
] = StackContext("DEFERRED_INVOCATION_WORK", None)
|
|
15
|
+
logger = core.log.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@contextmanager
def open_context() -> ty.Iterator[None]:
    """Enter this context before you begin serializing your invocation. When perform_all()
    is later called, any deferred work will be evaluated. The context should not be
    closed until after return from the Shim.

    The idea is that you'd call perform_all() inside your Shim which transfers
    execution to a remote environment, but _not_ call it if you're transferring execution
    to a local environment, as the upload will not be needed.

    This is not re-entrant. If this is called while the dictionary is non-empty, an
    exception will be raised. This is only because I can think of no reason why anyone
    would want it to be re-entrant, so it seems better to raise an error. If for some
    reason re-entrancy were desired, we could just silently pass if the dictionary already
    has deferred work.
    """
    existing_work = _DEFERRED_INVOCATION_WORK()
    if existing_work is not None:
        # Raise explicitly rather than via `assert` so the re-entrancy guard
        # still fires when Python is run with -O (which strips asserts).
        raise AssertionError(f"deferred work context is not re-entrant! {existing_work}")
    with _DEFERRED_INVOCATION_WORK.set(dict()):
        logger.debug("Opening deferred work context")
        yield
        # NOTE: the close-time checks below only run when the body exits without
        # an exception; the StackContext itself is always popped by the `with`.
        logger.debug("Closing deferred work context")
        work_unperformed = _DEFERRED_INVOCATION_WORK()
        if work_unperformed:
            logger.debug(
                "some deferred work was not performed before context close: %s", work_unperformed
            )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@contextmanager
def push_non_context() -> ty.Iterator[None]:
    """Temporarily mask any open deferred-work context with None.

    While inside, add() performs work immediately instead of deferring it.
    Used by the remote entry side (see route_result) to force eager execution.
    """
    with _DEFERRED_INVOCATION_WORK.set(None):
        yield
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def add(work_owner: str, work_id: ty.Hashable, work: ty.Callable[[], ty.Any]) -> None:
    """Add some work to an open context. The work will be performed when perform_all() is
    called. If there is no open context, perform the work immediately.

    The work_owner should usually be the module __name__, but if multiple things
    in a module need to add different types of tasks, then it can be anything
    that would further disambiguate.

    The work_id should be a unique id within the work_owner 'namespace'.
    """
    registry = _DEFERRED_INVOCATION_WORK()
    if registry is None:
        # No context is open - run the work eagerly right now.
        logger.debug("No open context - performing work %s immediately", work)
        work()
        return
    logger.debug("Adding work %s to deferred work %s", (work_owner, work_id), id(registry))
    registry[(work_owner, work_id)] = work
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def perform_all() -> None:
    """execute all the deferred work that has been added to the current context."""
    # May be None (no open context) or an empty dict - both are falsy and skipped.
    work_items = _DEFERRED_INVOCATION_WORK()
    if work_items:
        logger.info("Performing %s items of deferred work", len(work_items))
        # Iterate over a _copy_ (dict(work_items).items()) so we can pop finished
        # entries from the live dict without mutating the collection we iterate.
        # yield_all runs the callables and yields (key, result) pairs as they
        # complete (execution semantics owned by thds.core.parallel).
        for key, _result in core.parallel.yield_all(dict(work_items).items()):
            # consume iterator but don't keep results in memory.
            logger.debug("Popping deferred work %s from %s", key, id(work_items))
            work_items.pop(key)

        logger.debug("Done performing deferred work on %s", id(work_items))
        # Anything left means work was added concurrently or an item was skipped -
        # a programming error either way.
        assert not work_items, f"Some deferred work was not performed! {work_items}"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""It's critical to keep this main function separate from everything
|
|
2
|
+
in `.core`, because `.core` needs to be fully imported _before_ the
|
|
3
|
+
remote function starts to run, otherwise you get infinite remote
|
|
4
|
+
recursion without ever returning a result, since the core module ends
|
|
5
|
+
up halfway imported and the effect of the IS_INSIDE_RUNNER_ENTRY.set(True) line
|
|
6
|
+
gets erased when the final function module inevitably re-imports
|
|
7
|
+
`.core` when being dynamically looked up, and core is not yet
|
|
8
|
+
registered in sys.modules because it's still running `main`.
|
|
9
|
+
|
|
10
|
+
Ask me how long it took to figure out what was going on there...
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import os
|
|
15
|
+
import time
|
|
16
|
+
from timeit import default_timer
|
|
17
|
+
|
|
18
|
+
from thds.core.log import getLogger
|
|
19
|
+
|
|
20
|
+
from ....__about__ import __version__
|
|
21
|
+
from .. import metadata
|
|
22
|
+
from .runner_registry import run_named_entry_handler
|
|
23
|
+
|
|
24
|
+
logger = getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def main() -> None:
    """Routes the top level remote function call in a new process."""
    elapsed_start = default_timer()
    wall_clock_start = time.time()
    logger.info(f"Entering remote process {os.getpid()} with installed mops version {__version__}")

    arg_parser = argparse.ArgumentParser(
        description="Unknown arguments will be passed to the named runner."
    )
    arg_parser.add_argument(
        "runner_name",
        help="Name of a known remote runner that can handle the rest of the arguments",
    )
    # TODO potentially allow things like logger context to be passed in as -- arguments
    known, passthrough = arg_parser.parse_known_args()

    run_named_entry_handler(known.runner_name, *passthrough)

    logger.info(
        f"Exiting remote process {os.getpid()} after {(default_timer() - elapsed_start)/60:.2f} minutes"
        + metadata.format_end_of_run_times(wall_clock_start, passthrough)
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == "__main__":
|
|
47
|
+
main() # pragma: no cover
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Utilities that may be useful for a remote entry implementation for your Runner.
|
|
2
|
+
|
|
3
|
+
None of them are required and may not be suitable for a given Runner implementation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import typing as ty
|
|
7
|
+
|
|
8
|
+
from thds.core import log, scope
|
|
9
|
+
|
|
10
|
+
from .. import deferred_work
|
|
11
|
+
from ..output_naming import FunctionArgumentsHashUniqueKey, PipelineFunctionUniqueKey
|
|
12
|
+
|
|
13
|
+
T_contra = ty.TypeVar("T_contra", contravariant=True)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ResultChannel(ty.Protocol[T_contra]):
    """After remote invocation, respond with result.

    A remote invocation can succeed with a result or fail with an exception;
    an implementation transmits exactly one of the two back to the invoker.
    """

    def return_value(self, __return_value: T_contra) -> None:
        """Transmit the successful return value of the invoked function."""
        ...  # pragma: no cover

    def exception(self, __ex: Exception) -> None:
        """Transmit the exception raised by the invoked function."""
        ...  # pragma: no cover
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
logger = log.getLogger(__name__)
|
|
30
|
+
_routing_scope = scope.Scope()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@_routing_scope.bound
def route_return_value_or_exception(
    channel: ResultChannel[T_contra],
    do_work_return_value: ty.Callable[[], T_contra],
    pipeline_id: str = "",
    pipeline_function_and_arguments_unique_key: ty.Optional[ty.Tuple[str, str]] = None,
) -> None:
    """The remote side of your runner implementation doesn't have to use this, but it's a reasonable approach.

    Runs `do_work_return_value` and sends exactly one of its return value or
    its raised exception through `channel`.

    :param channel: receives the result (return_value) or the failure (exception).
    :param do_work_return_value: zero-argument callable wrapping the user's function.
    :param pipeline_id: attached to the logger context under the key 'remote'.
    :param pipeline_function_and_arguments_unique_key: optional (function key,
        arguments-hash key) pair published via StackContexts for output naming.
    """
    _routing_scope.enter(deferred_work.push_non_context())
    # deferred work can be requested during result serialization, but because we don't want
    # to leave a 'broken' result payload (one that refers to unperformed deferred work,
    # maybe because of network or other failure), we simply don't open a deferred work
    # context on the remote side, which forces all the work to be performed as it is
    # added for deferral instead of actually being deferred.
    #
    # pushing this non-context is only necessary in the case of a thread-local
    # 'remote' invocation - in all true remote invocations, there will be no context open.

    _routing_scope.enter(log.logger_context(remote=pipeline_id))
    if pipeline_function_and_arguments_unique_key:
        pf_key, args_key = pipeline_function_and_arguments_unique_key
        _routing_scope.enter(PipelineFunctionUniqueKey.set(pf_key))
        _routing_scope.enter(FunctionArgumentsHashUniqueKey.set(args_key))
    try:
        # i want to _only_ run the user's function inside this try-catch.
        # If mops itself has a bug, we should not be recording that as
        # though it were an exception in the user's code.
        return_value = do_work_return_value()
    except Exception as ex:
        logger.exception("Failure to run remote function. Transmitting exception...")
        channel.exception(ex)
    else:
        logger.debug("Success running function remotely. Transmitting return value...")
        channel.return_value(return_value)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""In theory, our core concept supports multiple different Runner 'types' being registered and used at the time of remote entry.
|
|
2
|
+
|
|
3
|
+
In practice we only have a single Runner type registered, the MemoizingPicklingRunner.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import typing as ty
|
|
7
|
+
|
|
8
|
+
from thds.core import stack_context
|
|
9
|
+
|
|
10
|
+
RUNNER_ENTRY_COUNT = stack_context.StackContext("runner_entry_count", 0)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def entry_count() -> int:
    """Return the current nesting depth of runner entries (0 when not inside any)."""
    return RUNNER_ENTRY_COUNT()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EntryHandler(ty.Protocol):
    """Callable that consumes the remaining CLI arguments for a named runner."""

    def __call__(self, *__args: str) -> ty.Any:
        ...  # pragma: nocover
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
ENTRY_HANDLERS: ty.Dict[str, EntryHandler] = dict()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def register_entry_handler(name: str, mh: EntryHandler) -> None:
    """Register `mh` under `name` so run_named_entry_handler can dispatch to it."""
    ENTRY_HANDLERS[name] = mh
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def run_named_entry_handler(name: str, *args: str) -> None:
    """Look up the entry handler registered under `name` and invoke it with `args`.

    Raises KeyError if no handler has been registered under that name.
    """
    try:
        handler = ENTRY_HANDLERS[name]
    except KeyError:
        # Preserve the exception type but make the failure self-describing -
        # a bare KeyError('somename') is opaque when it surfaces in remote logs.
        raise KeyError(
            f"No entry handler registered under {name!r}; known handlers: {sorted(ENTRY_HANDLERS)}"
        ) from None
    # Track nesting depth so code can detect it is running inside a runner entry
    # (see entry_count()).
    with RUNNER_ENTRY_COUNT.set(RUNNER_ENTRY_COUNT() + 1):
        handler(*args)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
import typing as ty
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from thds.core import config, log
|
|
8
|
+
from thds.core.files import FILE_SCHEME, atomic_write_path, path_from_uri, remove_file_scheme, to_uri
|
|
9
|
+
from thds.core.link import link
|
|
10
|
+
|
|
11
|
+
from ..core.types import AnyStrSrc, BlobStore
|
|
12
|
+
|
|
13
|
+
MOPS_ROOT = config.item("control_root", default=Path.home() / ".mops")
|
|
14
|
+
logger = log.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@contextmanager
def atomic_writable(desturi: str, mode: str = "wb") -> ty.Iterator[ty.IO[bytes]]:
    """Open a writable handle whose contents appear at `desturi` atomically on success.

    Writes go to the temporary path provided by thds.core.files.atomic_write_path,
    which moves the file into place when the context exits cleanly.

    NOTE(review): the return annotation says IO[bytes], but callers in this module
    also pass mode="w" (text) - the actual handle type follows `mode`; confirm
    whether the annotation should be widened.
    """
    with atomic_write_path(desturi) as temppath:
        with open(temppath, mode) as f:
            yield f
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _link(path: Path, remote_uri: str) -> None:
    """Link `path` to the local path named by `remote_uri` (link semantics owned
    by thds.core.link), creating parent directories as needed."""
    target = path_from_uri(remote_uri)
    target.parent.mkdir(parents=True, exist_ok=True)
    linked = link(path, target)
    assert linked, f"Link {path} to {remote_uri} failed!"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _put_bytes_to_file_uri(remote_uri: str, data: AnyStrSrc) -> None:
    """Write data to a local path. It is very hard to support all the same inputs that ADLS does. :(

    Accepts a Path (linked into place), a str naming an existing file (also linked),
    a raw str/bytes payload (written atomically), or - as a fallback - any iterable
    of bytes blocks.
    """
    path = None
    if isinstance(data, str):
        try:
            path = Path(data)
            if not path.exists():  # wasn't _actually_ a Path
                path = None
        except (OSError, ValueError):
            # A str payload that is not even path-like (too long for the OS,
            # embedded NUL bytes, ...) makes Path()/exists() raise instead of
            # returning False - treat it as content, not a filename.
            path = None
    elif isinstance(data, Path):
        path = data
    if path:
        _link(path, remote_uri)
    elif isinstance(data, bytes):
        with atomic_writable(remote_uri, "wb") as f:
            f.write(data)
    elif isinstance(data, str):
        with atomic_writable(remote_uri, "w") as f:
            f.write(data)  # type: ignore
    else:
        # if this fallback case fails, we may need to admit defeat for now,
        # and follow up by analyzing the failure and adding support for the input data type.
        with atomic_writable(remote_uri, "wb") as f:
            for block in data:  # type: ignore
                f.write(block)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class FileBlobStore(BlobStore):
    """A BlobStore implementation backed by the local filesystem (file:// URIs).

    URIs are resolved to plain paths; uploads are performed by linking or
    atomic writes.
    """

    def control_root(self, uri: str) -> str:
        """Return the local mops control root as a URI. `uri` is not consulted here."""
        local_root = MOPS_ROOT()
        local_root.mkdir(exist_ok=True)
        return to_uri(local_root)

    def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
        """Copy the bytes of the blob at `remote_uri` into the writable stream."""
        with path_from_uri(remote_uri).open("rb") as source:
            shutil.copyfileobj(source, stream)  # type: ignore

    def getfile(self, remote_uri: str) -> Path:
        """Resolve `remote_uri` to a local Path; raise FileNotFoundError if absent."""
        local = path_from_uri(remote_uri)
        if local.exists():
            return local
        # Log some directory context to aid debugging the missing blob.
        logger.error(f"{remote_uri} does not exist. Parent = {local.parent}")
        try:
            logger.error(list(local.parent.glob("*")))
        except FileNotFoundError:
            logger.error(f"{local.parent} does not exist either!")
        raise FileNotFoundError(f"{remote_uri} does not exist")

    def putbytes(self, remote_uri: str, data: AnyStrSrc, type_hint: str = "bytes") -> None:
        """Upload data to a remote path."""
        logger.debug(f"Writing {type_hint} to {remote_uri}")
        _put_bytes_to_file_uri(remote_uri, data)

    def putfile(self, path: Path, remote_uri: str) -> None:
        """'Upload' a local file by linking it into the destination path."""
        _link(path, remote_uri)

    def exists(self, remote_uri: str) -> bool:
        """True if a file exists at the path named by `remote_uri`."""
        return path_from_uri(remote_uri).exists()

    def join(self, *parts: str) -> str:
        """Join URI parts using local path semantics."""
        return os.path.join(*parts)

    def split(self, uri: str) -> ty.List[str]:
        """Splits a given URI into its constituent parts"""
        bare_path = remove_file_scheme(uri)
        # normalize the path to handle redundant slashes, then drop any empty
        # parts that might be created due to leading slashes.
        segments = [seg for seg in os.path.normpath(bare_path).split(os.sep) if seg]
        return [f"{FILE_SCHEME}/"] + segments

    def is_blob_not_found(self, exc: Exception) -> bool:
        """Missing local blobs surface as FileNotFoundError."""
        return isinstance(exc, FileNotFoundError)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
_STATELESS_BLOB_STORE = FileBlobStore()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_file_blob_store(uri: str) -> ty.Optional[FileBlobStore]:
    """Return the shared FileBlobStore for `uri`.

    file:// URIs are an exact match, but any other URI is also accepted as a
    special case for things where somebody forgot the file:// scheme. This is
    the 'first' registered blob store, so it is the last one to be asked, and
    matching everything here shouldn't cause a significant performance penalty
    since everything else with a scheme will get picked up first.
    """
    del uri  # every URI maps to the same stateless store - see docstring.
    return _STATELESS_BLOB_STORE
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""The intent of this module is to provide a best-effort "lock"
|
|
2
|
+
that can be built on top of just `getbytes` and `putbytes` operations.
|
|
3
|
+
|
|
4
|
+
It is important to note that this lock, while it should work under nearly all
|
|
5
|
+
circumstances, is not actually a true lock - it is _possible_ for multiple holders of the
|
|
6
|
+
lock to believe that they hold it exclusively, under degenerate conditions involving very
|
|
7
|
+
slow networks. Therefore, it should only be used as a performance optimization, and not in
|
|
8
|
+
cases where absolute application correctness depend upon an exclusive lock with full
|
|
9
|
+
guarantees.
|
|
10
|
+
|
|
11
|
+
While it is possible that a lock may be acquired multiple times, the _more likely_ failure
|
|
12
|
+
scenario is contention for the lock. There is a built in safety margin to reduce cases of
|
|
13
|
+
multiple lock acquirers, and with a very large number of lockers, it is possible that no
|
|
14
|
+
acquirer will ever get to see its own write 'persist' long enough to determine that it has
|
|
15
|
+
the lock.
|
|
16
|
+
|
|
17
|
+
Again, this algorithm is _not_ designed to be a perfect lock - only to make it relatively
|
|
18
|
+
efficient for a single caller to acquire a lock and maintain it for a period of time while
|
|
19
|
+
other potential acquirers instead determine that they ought to wait for the lock to be
|
|
20
|
+
released.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import time
|
|
24
|
+
import timeit
|
|
25
|
+
import typing as ty
|
|
26
|
+
from datetime import datetime, timedelta
|
|
27
|
+
from uuid import uuid4
|
|
28
|
+
|
|
29
|
+
from thds import humenc
|
|
30
|
+
from thds.core import log
|
|
31
|
+
|
|
32
|
+
from . import _funcs
|
|
33
|
+
from .read import get_writer_id, make_read_lockfile
|
|
34
|
+
from .types import LockAcquired, LockContents
|
|
35
|
+
from .write import LockfileWriter, make_lock_contents
|
|
36
|
+
|
|
37
|
+
logger = log.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def acquire(  # noqa: C901
    lock_dir_uri: str,
    *,
    expire: timedelta = timedelta(seconds=30),
    acquire_margin: timedelta = timedelta(seconds=0.0),
    debug: bool = True,
    block: ty.Optional[timedelta] = timedelta(seconds=0),
) -> ty.Optional[LockAcquired]:
    """Attempt to acquire an expiring lock.

    Return a callable suitable for 'maintaining the lock as active' if the lock was
    acquired, plus a Callable suitable for releasing the lock - otherwise, return None to
    indicate that the lock is not owned.

    The lock_dir_uri must be identical across multiple processes.

    It is strongly recommended that expire and acquire_margin also be identical
    across all processes attempting to acquire the same lock.

    It is up to the caller to call `lock.maintain` at regular intervals less than
    `expire`. It is polite and more efficient for other acquirers for you to call
    `lock.release` when the lock is no longer needed.

    A lock that has not been updated in 'expire' seconds is considered 'released' and may
    be acquired by any other process attempting to acquire it. If you do not `.maintain()`
    the lock, you will lose it.

    `acquire_margin` is the minimum amount of time that will be waited after attempting to
    acquire the lock, to confirm that no other writer has also attempted to acquire
    it. This should be scaled to be longer than the longest delay you expect _any_
    candidate process to experience between checking the lock uri, finding it acquirable,
    and successfully writing back to the lock uri itself. If the default value (0) is
    provided, then the acquire_margin will be determined automatically to be twice the
    amount of time elapsed between the beginning of the check and the end of the write. If
    you have acquirers accessing this from very different environments, it may be safer to
    specify a higher acquire_margin that will be closer to the largest latency you expect
    any of your clients to experience.

    `block` is the _minimum_ amount of time to wait before returning None if the lock
    cannot be required. A zero length block will cause acquire to return None after the
    first unsuccessful attempt. Passing block=None will block until first acquisition.

    If you fail to acquire the lock and want to try again, it is recommended that you call
    this at spaced intervals, not in a tight loop, in order to avoid performance issues.

    """
    if acquire_margin * 2 > expire:
        # You should not be waiting nearly as much time as it would take for the lock to
        # become expire to decide that you have acquired the lock.
        #
        # If network or other delays are encountered, other candidate acquirers will end
        # up convinced that the lock has gone expire, right about the time you decide you
        # have acquired it.
        raise ValueError(
            f"Acquire margin ({acquire_margin.total_seconds()})"
            f" must be less than half the expire time ({expire.total_seconds()})."
        )

    acquire_margin_s = acquire_margin.total_seconds()
    if acquire_margin_s < 0:
        raise ValueError(f"Acquire margin may not be negative: {acquire_margin_s}")

    start = _funcs.utc_now()

    # A globally-unique id for this acquisition attempt; seeing it echoed back
    # from the lockfile is how we know our write 'won'.
    my_writer_id = humenc.encode(uuid4().bytes)

    lockfile_writer = LockfileWriter(
        my_writer_id,
        lock_dir_uri,
        make_lock_contents(my_writer_id, expire),
        expire.total_seconds(),
        debug=debug,
    )

    read_lockfile = make_read_lockfile(_funcs.make_lock_uri(lock_dir_uri))

    def is_released(lock_contents: LockContents) -> bool:
        # A lock explicitly marked released is acquirable immediately.
        return bool(lock_contents.get("released_at"))

    def is_fresh(lock_contents: LockContents) -> bool:
        # 'fresh' == written recently enough (per `expire`) that the holder
        # should be presumed alive.
        written_at_str = lock_contents.get("written_at")
        if not written_at_str:
            # this likely won't happen in practice b/c we check released first.
            return False  # pragma: no cover
        lock_expire_s = lock_contents["expire_s"]
        if round(lock_expire_s, 4) != round(expire.total_seconds(), 4):
            logger.warning(
                f"Remote lock {lock_dir_uri} has expire duration {lock_expire_s},"
                f" which is different than the local configuration {expire}."
                " This may lead to multiple simultaneous acquirers on the lock."
            )
        return datetime.fromisoformat(written_at_str) + expire >= _funcs.utc_now()

    acquire_delay = 0.0

    def determine_acquire_delay(before_read: float) -> float:
        # decide how long we're going to wait.
        read_write_delay = timeit.default_timer() - before_read
        if acquire_margin_s and read_write_delay > acquire_margin_s:
            logger.warning(
                f"It took longer ({read_write_delay}) than the acquire margin"
                " between the lock check and completing the lock write."
                " There is danger that another process may think it has acquired the lock."
                " You should make the acquire_margin longer to reduce the chances of this happening."
            )
        auto_acquire_delay = read_write_delay * 2
        # pick the larger of the two, because if we're encountering bad latency, we should
        # be waiting longer to make sure that we don't 'think we won' because of latency.
        return max(acquire_margin_s, auto_acquire_delay)

    # Protocol: read the lockfile; if acquirable, write our id, wait out the
    # margin, and read again - we hold the lock only if our id survived.
    while True:
        before_read = timeit.default_timer()
        maybe_lock_contents = read_lockfile()
        if maybe_lock_contents:
            lock = maybe_lock_contents
            if is_released(lock):
                logger.debug("Lock %s was released - attempting to lock", lock_dir_uri)
            elif not is_fresh(lock):
                logger.debug("Lock %s has expired - will attempt to steal it!", lock_dir_uri)
            elif get_writer_id(lock) == my_writer_id:
                # LOCK ACQUIRED!
                lockfile_writer.mark_acquired()
                # You still need to maintain it by calling .maintain() periodically!
                return lockfile_writer

            else:
                # lock is fresh and held by another acquirer - failed to acquire!
                if acquire_delay:
                    logger.info(f"Lost race for lock {lock_dir_uri}")
                    # this is info (not debug) because we expect it to be rare.
                acquire_delay = 0.0
                if block is not None and _funcs.utc_now() > start + block:
                    return None

                time.sleep(0.2)
                # just a short sleep before we try again - this probably doesn't need to
                # be configurable, since anyone wanting different behavior can just pass
                # block=0.0 and then do the polling themselves.
                continue
        else:
            logger.debug("Lock %s does not exist - will attempt to lock it.", lock_dir_uri)

        # lock has expired or does not exist - attempt to acquire it by writing!
        lockfile_writer.write()

        # wait for a long enough time that we feel confident we were the last writer and
        # not just the fastest write-then-reader.
        acquire_delay = determine_acquire_delay(before_read)
        logger.debug(
            "Waiting %s seconds before checking lock to see if we acquired it...", acquire_delay
        )
        time.sleep(acquire_delay)
        # go back to the beginning of the loop, and see if we managed to acquire the lock!
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as ty
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
|
|
5
|
+
from thds.core import log
|
|
6
|
+
|
|
7
|
+
from ..types import BlobStore
|
|
8
|
+
from ..uris import lookup_blob_store
|
|
9
|
+
|
|
10
|
+
logger = log.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def utc_now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def write(blob_store: BlobStore, lock_uri: str, lock_bytes: bytes) -> None:
    """Persist the serialized lock at `lock_uri` via the blob store.

    Any failure is logged (for diagnosability) and then re-raised unchanged.
    """
    try:
        blob_store.putbytes(lock_uri, lock_bytes, type_hint="application/mops-lock")
    except Exception:
        logger.error(f"Failed to write lock at {lock_uri}")
        raise
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def json_dumpb(contents: ty.Mapping) -> bytes:
    """Serialize a mapping as pretty-printed (indent=2) JSON, encoded to bytes."""
    text = json.dumps(contents, indent=2)
    return text.encode()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def store_and_lock_uri(lock_dir_uri: str) -> ty.Tuple[BlobStore, str]:
    """Resolve the blob store for `lock_dir_uri` and the URI of its lock.json."""
    store = lookup_blob_store(lock_dir_uri)
    return store, store.join(lock_dir_uri, "lock.json")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def make_lock_uri(lock_dir_uri: str) -> str:
    """Return the full URI of the lock.json inside `lock_dir_uri`."""
    return store_and_lock_uri(lock_dir_uri)[1]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import time
|
|
3
|
+
import typing as ty
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from . import _acquire
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _writer(out_times_path: Path) -> ty.Callable[[float, float], None]:
    """Build an appender that records (acquired, released) time pairs, one CSV line each."""
    _acquire.logger.info(f"Will write Lock Times to {out_times_path}")
    out_times_path.parent.mkdir(parents=True, exist_ok=True)

    def write_times(after_acquired: float, before_released: float) -> None:
        _acquire.logger.info(f"..........appending {after_acquired},{before_released}")
        with out_times_path.open("a") as times_file:
            times_file.write(f"{after_acquired},{before_released}\n")

    return write_times
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def acquire_and_hold_once(
    lock_uri: str, hold_once_acquired_s: float, out_times_path: ty.Optional[Path]
) -> None:
    """Block until the lock is acquired, hold (and maintain) it for the given
    number of seconds, then release it - optionally appending the held interval
    to `out_times_path`.
    """
    if out_times_path:
        write_times = _writer(out_times_path)
    else:
        write_times = lambda x, y: None  # noqa: E731

    # we want more verbose logging when using the CLI.
    # NOTE(review): this monkey-patches the module logger's debug method for the
    # whole process, affecting any other user of _acquire.logger.
    _acquire.logger.debug = _acquire.logger.info  # type: ignore

    _acquire.logger.info(f"Beginning lock acquisition on {lock_uri}")
    # block=None means: keep retrying until the lock is acquired.
    lock_owned = _acquire.acquire(lock_uri, block=None)
    assert lock_owned
    when_lock_acquired = time.time()
    # we're using time, not timeit.default_timer, because we care about time
    # differences between multiple processes on the same system, so we can compare
    # them afterward.
    time_until_release = when_lock_acquired + hold_once_acquired_s - time.time()
    while time_until_release > 0:
        lock_owned.maintain()
        time.sleep(min(time_until_release, 4))
        # don't wake up to maintain a lot - every 4 seconds is enough for the default 30s expiry.
        time_until_release = when_lock_acquired + hold_once_acquired_s - time.time()

    write_times(when_lock_acquired, time.time())
    lock_owned.release()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def main() -> None:
    """CLI entry point: parse arguments, then acquire and hold the lock once."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("lock_uri", help="URI to the lockfile")
    parser.add_argument(
        "--hold-once-acquired-s",
        "-t",
        type=float,
        default=20.0,
        help="Time in seconds to hold the lock once acquired.",
    )
    parser.add_argument(
        "--out-times",
        type=Path,
        default=None,
        help="Write out the periods of time the lock was fully held (after acquire, before release) to this file.",
    )
    cli_args = parser.parse_args()
    acquire_and_hold_once(cli_args.lock_uri, cli_args.hold_once_acquired_s, cli_args.out_times)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
if __name__ == "__main__":
|
|
73
|
+
main()
|