thds.mops-3.6.20250219172032-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/core/deferred_work.py
@@ -0,0 +1,83 @@
+ # Register expensive "only if we actually need to invoke the function" work to be performed later.
+ #
+ # This could be _any_ kind of work, though as of this initial abstraction it is only uploads.
+ # The basic idea was stolen from `pure.core.source` as a form of optimization for
+ # uploading Sources and their hashrefs.
+ import typing as ty
+ from contextlib import contextmanager
+
+ from thds import core
+ from thds.core.stack_context import StackContext
+
+ _DEFERRED_INVOCATION_WORK: StackContext[
+     ty.Optional[ty.Dict[ty.Hashable, ty.Callable[[], ty.Any]]]
+ ] = StackContext("DEFERRED_INVOCATION_WORK", None)
+ logger = core.log.getLogger(__name__)
+
+
+ @contextmanager
+ def open_context() -> ty.Iterator[None]:
+     """Enter this context before you begin serializing your invocation. When perform_all()
+     is later called, any deferred work will be evaluated. The context should not be
+     closed until after return from the Shim.
+
+     The idea is that you'd call perform_all() inside a Shim that transfers
+     execution to a remote environment, but _not_ call it if you're transferring execution
+     to a local environment, where the upload will not be needed.
+
+     This is not re-entrant. If this is called while the dictionary is non-empty, an
+     exception will be raised. This is only because I can think of no reason why anyone
+     would want it to be re-entrant, so it seems better to raise an error. If for some
+     reason re-entrancy were desired, we could just silently pass if the dictionary already
+     has deferred work.
+     """
+     existing_work = _DEFERRED_INVOCATION_WORK()
+     assert existing_work is None, f"deferred work context is not re-entrant! {existing_work}"
+     with _DEFERRED_INVOCATION_WORK.set(dict()):
+         logger.debug("Opening deferred work context")
+         yield
+         logger.debug("Closing deferred work context")
+         work_unperformed = _DEFERRED_INVOCATION_WORK()
+         if work_unperformed:
+             logger.debug(
+                 "some deferred work was not performed before context close: %s", work_unperformed
+             )
+
+
+ @contextmanager
+ def push_non_context() -> ty.Iterator[None]:
+     with _DEFERRED_INVOCATION_WORK.set(None):
+         yield
+
+
+ def add(work_owner: str, work_id: ty.Hashable, work: ty.Callable[[], ty.Any]) -> None:
+     """Add some work to an open context. The work will be performed when perform_all() is
+     called. If there is no open context, perform the work immediately.
+
+     The work_owner should usually be the module __name__, but if multiple things
+     in a module need to add different types of tasks, then it can be anything
+     that would further disambiguate.
+
+     The work_id should be a unique id within the work_owner 'namespace'.
+     """
+     deferred_work = _DEFERRED_INVOCATION_WORK()
+     if deferred_work is None:
+         logger.debug("No open context - performing work %s immediately", work)
+         work()
+     else:
+         logger.debug("Adding work %s to deferred work %s", (work_owner, work_id), id(deferred_work))
+         deferred_work[(work_owner, work_id)] = work
+
+
+ def perform_all() -> None:
+     """Execute all the deferred work that has been added to the current context."""
+     work_items = _DEFERRED_INVOCATION_WORK()
+     if work_items:
+         logger.info("Performing %s items of deferred work", len(work_items))
+         for key, _result in core.parallel.yield_all(dict(work_items).items()):
+             # consume the iterator, but don't keep results in memory.
+             logger.debug("Popping deferred work %s from %s", key, id(work_items))
+             work_items.pop(key)
+
+         logger.debug("Done performing deferred work on %s", id(work_items))
+         assert not work_items, f"Some deferred work was not performed! {work_items}"
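
A minimal usage sketch of the module above (the upload helper and URIs are hypothetical; open_context, add, and perform_all are the real API): work added inside an open context waits for perform_all(), while work added with no context open runs immediately.

    from thds.mops.pure.core import deferred_work

    def upload(uri: str) -> None:  # hypothetical stand-in for a real upload
        print(f"uploading {uri}")

    with deferred_work.open_context():
        # Register uploads during serialization instead of performing them now.
        # Re-adding the same (owner, id) key just overwrites the dict entry,
        # so this unit of work runs at most once.
        deferred_work.add(__name__, "blob-1", lambda: upload("file:///tmp/blob-1"))
        deferred_work.add(__name__, "blob-1", lambda: upload("file:///tmp/blob-1"))
        deferred_work.perform_all()  # e.g. called inside a Shim that goes remote

    # No context is open here, so this runs immediately.
    deferred_work.add(__name__, "blob-2", lambda: upload("file:///tmp/blob-2"))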
thds/mops/pure/core/entry/__init__.py
@@ -0,0 +1,2 @@
+ from .route_result import route_return_value_or_exception  # noqa
+ from .runner_registry import register_entry_handler, run_named_entry_handler  # noqa
thds/mops/pure/core/entry/main.py
@@ -0,0 +1,47 @@
+ """It's critical to keep this main function separate from everything
+ in `.core`, because `.core` needs to be fully imported _before_ the
+ remote function starts to run. Otherwise you get infinite remote
+ recursion without ever returning a result: the core module ends up
+ halfway imported, and the effect of the IS_INSIDE_RUNNER_ENTRY.set(True) line
+ gets erased when the final function module inevitably re-imports
+ `.core` while being dynamically looked up, since core is not yet
+ registered in sys.modules because it is still running `main`.
+
+ Ask me how long it took to figure out what was going on there...
+ """
+
+ import argparse
+ import os
+ import time
+ from timeit import default_timer
+
+ from thds.core.log import getLogger
+
+ from ....__about__ import __version__
+ from .. import metadata
+ from .runner_registry import run_named_entry_handler
+
+ logger = getLogger(__name__)
+
+
+ def main() -> None:
+     """Routes the top-level remote function call in a new process."""
+     start = default_timer()
+     start_timestamp = time.time()
+     logger.info(f"Entering remote process {os.getpid()} with installed mops version {__version__}")
+     parser = argparse.ArgumentParser(description="Unknown arguments will be passed to the named runner.")
+     parser.add_argument(
+         "runner_name",
+         help="Name of a known remote runner that can handle the rest of the arguments",
+     )
+     # TODO: potentially allow things like logger context to be passed in as -- arguments
+     args, unknown = parser.parse_known_args()
+     run_named_entry_handler(args.runner_name, *unknown)
+     logger.info(
+         f"Exiting remote process {os.getpid()} after {(default_timer() - start) / 60:.2f} minutes"
+         + metadata.format_end_of_run_times(start_timestamp, unknown)
+     )
+
+
+ if __name__ == "__main__":
+     main()  # pragma: no cover
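
Because the module guards main() behind `if __name__ == "__main__"`, one plausible way for a shim to start the remote side is a fresh interpreter running this module. A sketch (the runner name and the forwarded argument are hypothetical; the module path comes from the file list above):

    import subprocess
    import sys

    # The first positional argument selects a registered entry handler;
    # any unknown arguments are forwarded verbatim to that handler.
    subprocess.run(
        [
            sys.executable,
            "-m",
            "thds.mops.pure.core.entry.main",
            "my-runner",  # hypothetical handler registered via register_entry_handler
            "file:///tmp/mops/invocation",  # hypothetical argument for the handler
        ],
        check=True,
    )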
thds/mops/pure/core/entry/route_result.py
@@ -0,0 +1,66 @@
+ """Utilities that may be useful for a remote entry implementation for your Runner.
+
+ None of them are required, and they may not be suitable for every Runner implementation.
+ """
+
+ import typing as ty
+
+ from thds.core import log, scope
+
+ from .. import deferred_work
+ from ..output_naming import FunctionArgumentsHashUniqueKey, PipelineFunctionUniqueKey
+
+ T_contra = ty.TypeVar("T_contra", contravariant=True)
+
+
+ class ResultChannel(ty.Protocol[T_contra]):
+     """After remote invocation, respond with the result.
+
+     A remote invocation can succeed with a result or fail with an exception.
+     """
+
+     def return_value(self, __return_value: T_contra) -> None:
+         ...  # pragma: no cover
+
+     def exception(self, __ex: Exception) -> None:
+         ...  # pragma: no cover
+
+
+ logger = log.getLogger(__name__)
+ _routing_scope = scope.Scope()
+
+
+ @_routing_scope.bound
+ def route_return_value_or_exception(
+     channel: ResultChannel[T_contra],
+     do_work_return_value: ty.Callable[[], T_contra],
+     pipeline_id: str = "",
+     pipeline_function_and_arguments_unique_key: ty.Optional[ty.Tuple[str, str]] = None,
+ ) -> None:
+     """The remote side of your runner implementation doesn't have to use this, but it's a reasonable approach."""
+     _routing_scope.enter(deferred_work.push_non_context())
+     # Deferred work can be requested during result serialization, but because we don't want
+     # to leave a 'broken' result payload (one that refers to unperformed deferred work,
+     # maybe because of network or other failure), we simply don't open a deferred work
+     # context on the remote side, which forces all the work to be performed as it is
+     # added for deferral instead of actually being deferred.
+     #
+     # Pushing this non-context is only necessary in the case of a thread-local
+     # 'remote' invocation - in all true remote invocations, there will be no context open.
+
+     _routing_scope.enter(log.logger_context(remote=pipeline_id))
+     if pipeline_function_and_arguments_unique_key:
+         pf_key, args_key = pipeline_function_and_arguments_unique_key
+         _routing_scope.enter(PipelineFunctionUniqueKey.set(pf_key))
+         _routing_scope.enter(FunctionArgumentsHashUniqueKey.set(args_key))
+     try:
+         # I want to run _only_ the user's function inside this try-except.
+         # If mops itself has a bug, we should not be recording that as
+         # though it were an exception in the user's code.
+         return_value = do_work_return_value()
+     except Exception as ex:
+         logger.exception("Failure to run remote function. Transmitting exception...")
+         channel.exception(ex)
+     else:
+         logger.debug("Success running function remotely. Transmitting return value...")
+         channel.return_value(return_value)
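
Since ResultChannel is a Protocol, any object with matching return_value and exception methods will do. A toy sketch (a real Runner would serialize and upload the payload rather than print it):

    import typing as ty

    from thds.mops.pure.core.entry import route_return_value_or_exception

    class PrintChannel:
        """Toy ResultChannel: structural typing means no explicit inheritance."""

        def return_value(self, return_value: ty.Any) -> None:
            print("result:", return_value)

        def exception(self, ex: Exception) -> None:
            print("exception:", ex)

    def do_work() -> int:
        return 21 * 2

    # Prints "result: 42"; if do_work raised, the exception would be
    # logged and routed through channel.exception instead.
    route_return_value_or_exception(PrintChannel(), do_work, pipeline_id="demo")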
thds/mops/pure/core/entry/runner_registry.py
@@ -0,0 +1,31 @@
+ """In theory, our core concept supports multiple different Runner 'types' being registered and used at the time of remote entry.
+
+ In practice we only have a single Runner type registered: the MemoizingPicklingRunner.
+ """
+
+ import typing as ty
+
+ from thds.core import stack_context
+
+ RUNNER_ENTRY_COUNT = stack_context.StackContext("runner_entry_count", 0)
+
+
+ def entry_count() -> int:
+     return RUNNER_ENTRY_COUNT()
+
+
+ class EntryHandler(ty.Protocol):
+     def __call__(self, *__args: str) -> ty.Any:
+         ...  # pragma: nocover
+
+
+ ENTRY_HANDLERS: ty.Dict[str, EntryHandler] = dict()
+
+
+ def register_entry_handler(name: str, mh: EntryHandler) -> None:
+     ENTRY_HANDLERS[name] = mh
+
+
+ def run_named_entry_handler(name: str, *args: str) -> None:
+     with RUNNER_ENTRY_COUNT.set(RUNNER_ENTRY_COUNT() + 1):
+         ENTRY_HANDLERS[name](*args)
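
Registration is a plain dict write, and dispatch wraps the handler call in an incremented entry count. A small sketch using only the API above (the handler itself is hypothetical):

    from thds.mops.pure.core.entry import register_entry_handler, run_named_entry_handler
    from thds.mops.pure.core.entry.runner_registry import entry_count

    def echo_handler(*args: str) -> None:
        # entry_count() is 1 here, or higher if handlers invoke one another.
        print(f"depth={entry_count()} args={args}")

    register_entry_handler("echo", echo_handler)
    run_named_entry_handler("echo", "a", "b")  # prints: depth=1 args=('a', 'b')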
thds/mops/pure/core/file_blob_store.py
@@ -0,0 +1,120 @@
+ import os
+ import shutil
+ import typing as ty
+ from contextlib import contextmanager
+ from pathlib import Path
+
+ from thds.core import config, log
+ from thds.core.files import FILE_SCHEME, atomic_write_path, path_from_uri, remove_file_scheme, to_uri
+ from thds.core.link import link
+
+ from ..core.types import AnyStrSrc, BlobStore
+
+ MOPS_ROOT = config.item("control_root", default=Path.home() / ".mops")
+ logger = log.getLogger(__name__)
+
+
+ @contextmanager
+ def atomic_writable(desturi: str, mode: str = "wb") -> ty.Iterator[ty.IO[bytes]]:
+     with atomic_write_path(desturi) as temppath:
+         with open(temppath, mode) as f:
+             yield f
+
+
+ def _link(path: Path, remote_uri: str) -> None:
+     dest = path_from_uri(remote_uri)
+     dest.parent.mkdir(parents=True, exist_ok=True)
+     assert link(path, dest), f"Link {path} to {remote_uri} failed!"
+
+
+ def _put_bytes_to_file_uri(remote_uri: str, data: AnyStrSrc) -> None:
+     """Write data to a local path. It is very hard to support all the same inputs that ADLS does. :("""
+     path = None
+     if isinstance(data, str):
+         path = Path(data)
+         if not path.exists():  # wasn't _actually_ a Path
+             path = None
+     elif isinstance(data, Path):
+         path = data
+     if path:
+         _link(path, remote_uri)
+     elif isinstance(data, bytes):
+         with atomic_writable(remote_uri, "wb") as f:
+             f.write(data)
+     elif isinstance(data, str):
+         with atomic_writable(remote_uri, "w") as f:
+             f.write(data)  # type: ignore
+     else:
+         # if this fallback case fails, we may need to admit defeat for now,
+         # and follow up by analyzing the failure and adding support for the input data type.
+         with atomic_writable(remote_uri, "wb") as f:
+             for block in data:  # type: ignore
+                 f.write(block)
+
+
+ class FileBlobStore(BlobStore):
+     def control_root(self, uri: str) -> str:
+         local_root = MOPS_ROOT()
+         local_root.mkdir(exist_ok=True)
+         return to_uri(local_root)
+
+     def readbytesinto(self, remote_uri: str, stream: ty.IO[bytes], type_hint: str = "bytes") -> None:
+         with path_from_uri(remote_uri).open("rb") as f:
+             shutil.copyfileobj(f, stream)  # type: ignore
+
+     def getfile(self, remote_uri: str) -> Path:
+         p = path_from_uri(remote_uri)
+         if not p.exists():
+             logger.error(f"{remote_uri} does not exist. Parent = {p.parent}")
+             try:
+                 logger.error(list(p.parent.glob("*")))
+             except FileNotFoundError:
+                 logger.error(f"{p.parent} does not exist either!")
+             raise FileNotFoundError(f"{remote_uri} does not exist")
+         return p
+
+     def putbytes(self, remote_uri: str, data: AnyStrSrc, type_hint: str = "bytes") -> None:
+         """Upload data to a remote path."""
+         logger.debug(f"Writing {type_hint} to {remote_uri}")
+         _put_bytes_to_file_uri(remote_uri, data)
+
+     def putfile(self, path: Path, remote_uri: str) -> None:
+         _link(path, remote_uri)
+
+     def exists(self, remote_uri: str) -> bool:
+         return path_from_uri(remote_uri).exists()
+
+     def join(self, *parts: str) -> str:
+         return os.path.join(*parts)
+
+     def split(self, uri: str) -> ty.List[str]:
+         """Splits a given URI into its constituent parts."""
+         path = remove_file_scheme(uri)
+         # normalize the path to handle redundant slashes
+         normalized_path = os.path.normpath(path)
+
+         parts = normalized_path.split(os.sep)
+
+         # remove any empty parts that might be created due to leading slashes
+         parts = [part for part in parts if part]
+
+         parts = [f"{FILE_SCHEME}/"] + parts
+
+         return parts
+
+     def is_blob_not_found(self, exc: Exception) -> bool:
+         return isinstance(exc, FileNotFoundError)
+
+
+ _STATELESS_BLOB_STORE = FileBlobStore()
+
+
+ def get_file_blob_store(uri: str) -> ty.Optional[FileBlobStore]:
+     if uri.startswith(FILE_SCHEME):
+         return _STATELESS_BLOB_STORE
+
+     # special case for URIs where somebody forgot the file:// scheme.
+     # we're the 'first' registered blob store, so we're the last one to be asked,
+     # and this shouldn't cause a significant performance penalty since everything else
+     # with a scheme will get picked up first.
+     return _STATELESS_BLOB_STORE
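
A usage sketch, assuming the destination directory already exists (paths are hypothetical; to_uri is the thds.core.files helper imported above, used here the same way control_root uses it):

    from pathlib import Path

    from thds.core.files import to_uri
    from thds.mops.pure.core.file_blob_store import get_file_blob_store

    Path("/tmp/mops-demo").mkdir(parents=True, exist_ok=True)
    uri = to_uri(Path("/tmp/mops-demo/blob.bin"))

    store = get_file_blob_store(uri)
    assert store is not None  # a FileBlobStore is returned even for scheme-less URIs

    store.putbytes(uri, b"hello")            # atomic write via a temp path
    print(store.getfile(uri).read_bytes())   # b'hello'
    print(store.split(uri))                  # scheme marker first, then path parts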
thds/mops/pure/core/lock/__init__.py
@@ -0,0 +1,7 @@
+ from ._acquire import acquire  # noqa: F401
+ from .maintain import (  # noqa: F401
+     CannotMaintainLock,
+     launch_daemon_lock_maintainer,
+     remote_lock_maintain,
+ )
+ from .types import LockAcquired  # noqa: F401
thds/mops/pure/core/lock/_acquire.py
@@ -0,0 +1,192 @@
+ """The intent of this module is to provide a best-effort "lock"
+ that can be built on top of just `getbytes` and `putbytes` operations.
+
+ It is important to note that this lock, while it should work under nearly all
+ circumstances, is not actually a true lock - it is _possible_ for multiple holders of the
+ lock to believe that they hold it exclusively, under degenerate conditions involving very
+ slow networks. Therefore, it should only be used as a performance optimization, and not in
+ cases where absolute application correctness depends upon an exclusive lock with full
+ guarantees.
+
+ While it is possible that a lock may be acquired multiple times, the _more likely_ failure
+ scenario is contention for the lock. There is a built-in safety margin to reduce cases of
+ multiple lock acquirers, and with a very large number of lockers, it is possible that no
+ acquirer will ever see its own write 'persist' long enough to determine that it has
+ the lock.
+
+ Again, this algorithm is _not_ designed to be a perfect lock - only to make it relatively
+ efficient for a single caller to acquire a lock and maintain it for a period of time while
+ other potential acquirers instead determine that they ought to wait for the lock to be
+ released.
+ """
+
+ import time
+ import timeit
+ import typing as ty
+ from datetime import datetime, timedelta
+ from uuid import uuid4
+
+ from thds import humenc
+ from thds.core import log
+
+ from . import _funcs
+ from .read import get_writer_id, make_read_lockfile
+ from .types import LockAcquired, LockContents
+ from .write import LockfileWriter, make_lock_contents
+
+ logger = log.getLogger(__name__)
+
+
+ def acquire(  # noqa: C901
+     lock_dir_uri: str,
+     *,
+     expire: timedelta = timedelta(seconds=30),
+     acquire_margin: timedelta = timedelta(seconds=0.0),
+     debug: bool = True,
+     block: ty.Optional[timedelta] = timedelta(seconds=0),
+ ) -> ty.Optional[LockAcquired]:
+     """Attempt to acquire an expiring lock.
+
+     If the lock was acquired, return a LockAcquired suitable for maintaining the lock
+     as active and for releasing it - otherwise, return None to indicate that the lock
+     is not owned.
+
+     The lock_dir_uri must be identical across multiple processes.
+
+     It is strongly recommended that expire and acquire_margin also be identical
+     across all processes attempting to acquire the same lock.
+
+     It is up to the caller to call `lock.maintain` at regular intervals shorter than
+     `expire`. It is polite, and more efficient for other acquirers, to call
+     `lock.release` when the lock is no longer needed.
+
+     A lock that has not been updated in 'expire' seconds is considered 'released' and may
+     be acquired by any other process attempting to acquire it. If you do not `.maintain()`
+     the lock, you will lose it.
+
+     `acquire_margin` is the minimum amount of time that will be waited after attempting to
+     acquire the lock, to confirm that no other writer has also attempted to acquire
+     it. This should be scaled to be longer than the longest delay you expect _any_
+     candidate process to experience between checking the lock uri, finding it acquirable,
+     and successfully writing back to the lock uri itself. If the default value (0) is
+     provided, then the acquire_margin will be determined automatically as twice the
+     amount of time elapsed between the beginning of the check and the end of the write. If
+     you have acquirers accessing this from very different environments, it may be safer to
+     specify a higher acquire_margin, closer to the largest latency you expect any of your
+     clients to experience.
+
+     `block` is the _minimum_ amount of time to wait before returning None if the lock
+     cannot be acquired. A zero-length block will cause acquire to return None after the
+     first unsuccessful attempt. Passing block=None will block until the lock is acquired.
+
+     If you fail to acquire the lock and want to try again, it is recommended that you call
+     this at spaced intervals, not in a tight loop, in order to avoid performance issues.
+     """
+     if acquire_margin * 2 > expire:
+         # You should not be waiting nearly as much time as it would take for the lock to
+         # expire before deciding that you have acquired the lock.
+         #
+         # If network or other delays are encountered, other candidate acquirers will end
+         # up convinced that the lock has expired, right about the time you decide you
+         # have acquired it.
+         raise ValueError(
+             f"Acquire margin ({acquire_margin.total_seconds()})"
+             f" must be less than half the expire time ({expire.total_seconds()})."
+         )
+
+     acquire_margin_s = acquire_margin.total_seconds()
+     if acquire_margin_s < 0:
+         raise ValueError(f"Acquire margin may not be negative: {acquire_margin_s}")
+
+     start = _funcs.utc_now()
+
+     my_writer_id = humenc.encode(uuid4().bytes)
+
+     lockfile_writer = LockfileWriter(
+         my_writer_id,
+         lock_dir_uri,
+         make_lock_contents(my_writer_id, expire),
+         expire.total_seconds(),
+         debug=debug,
+     )
+
+     read_lockfile = make_read_lockfile(_funcs.make_lock_uri(lock_dir_uri))
+
+     def is_released(lock_contents: LockContents) -> bool:
+         return bool(lock_contents.get("released_at"))
+
+     def is_fresh(lock_contents: LockContents) -> bool:
+         written_at_str = lock_contents.get("written_at")
+         if not written_at_str:
+             # this likely won't happen in practice b/c we check released first.
+             return False  # pragma: no cover
+         lock_expire_s = lock_contents["expire_s"]
+         if round(lock_expire_s, 4) != round(expire.total_seconds(), 4):
+             logger.warning(
+                 f"Remote lock {lock_dir_uri} has expire duration {lock_expire_s},"
+                 f" which is different than the local configuration {expire}."
+                 " This may lead to multiple simultaneous acquirers on the lock."
+             )
+         return datetime.fromisoformat(written_at_str) + expire >= _funcs.utc_now()
+
+     acquire_delay = 0.0
+
+     def determine_acquire_delay(before_read: float) -> float:
+         # decide how long we're going to wait.
+         read_write_delay = timeit.default_timer() - before_read
+         if acquire_margin_s and read_write_delay > acquire_margin_s:
+             logger.warning(
+                 f"It took longer ({read_write_delay}) than the acquire margin"
+                 " between the lock check and completing the lock write."
+                 " There is danger that another process may think it has acquired the lock."
+                 " You should make the acquire_margin longer to reduce the chances of this happening."
+             )
+         auto_acquire_delay = read_write_delay * 2
+         # pick the larger of the two, because if we're encountering bad latency, we should
+         # be waiting longer to make sure that we don't 'think we won' because of latency.
+         return max(acquire_margin_s, auto_acquire_delay)
+
+     while True:
+         before_read = timeit.default_timer()
+         maybe_lock_contents = read_lockfile()
+         if maybe_lock_contents:
+             lock = maybe_lock_contents
+             if is_released(lock):
+                 logger.debug("Lock %s was released - attempting to lock", lock_dir_uri)
+             elif not is_fresh(lock):
+                 logger.debug("Lock %s has expired - will attempt to steal it!", lock_dir_uri)
+             elif get_writer_id(lock) == my_writer_id:
+                 # LOCK ACQUIRED!
+                 lockfile_writer.mark_acquired()
+                 # You still need to maintain it by calling .maintain() periodically!
+                 return lockfile_writer
+
+             else:
+                 # lock is fresh and held by another acquirer - failed to acquire!
+                 if acquire_delay:
+                     logger.info(f"Lost race for lock {lock_dir_uri}")
+                     # this is info (not debug) because we expect it to be rare.
+                     acquire_delay = 0.0
+                 if block is not None and _funcs.utc_now() > start + block:
+                     return None
+
+                 time.sleep(0.2)
+                 # just a short sleep before we try again - this probably doesn't need to
+                 # be configurable, since anyone wanting different behavior can just pass
+                 # block=0.0 and then do the polling themselves.
+                 continue
+         else:
+             logger.debug("Lock %s does not exist - will attempt to lock it.", lock_dir_uri)
+
+         # lock has expired or does not exist - attempt to acquire it by writing!
+         lockfile_writer.write()
+
+         # wait for a long enough time that we feel confident we were the last writer and
+         # not just the fastest write-then-reader.
+         acquire_delay = determine_acquire_delay(before_read)
+         logger.debug(
+             "Waiting %s seconds before checking lock to see if we acquired it...", acquire_delay
+         )
+         time.sleep(acquire_delay)
+         # go back to the beginning of the loop, and see if we managed to acquire the lock!
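
Typical usage mirrors what lock/cli.py below does: acquire with block=None to wait indefinitely, call maintain() at intervals well under `expire`, and release() when done. A sketch (the lock URI is hypothetical; maintain and release are the LockAcquired methods exercised by the CLI):

    import time

    from thds.mops.pure.core import lock

    acquired = lock.acquire("file:///tmp/mops-demo/my-lock", block=None)
    assert acquired is not None  # block=None blocks until we own the lock
    try:
        for _ in range(3):        # ... do some exclusive work in here ...
            time.sleep(4)         # stay well inside the default 30s expiry
            acquired.maintain()   # refresh the lockfile so we keep ownership
    finally:
        acquired.release()        # polite: lets waiting acquirers proceed immediately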
thds/mops/pure/core/lock/_funcs.py
@@ -0,0 +1,37 @@
+ import json
+ import typing as ty
+ from datetime import datetime, timezone
+
+ from thds.core import log
+
+ from ..types import BlobStore
+ from ..uris import lookup_blob_store
+
+ logger = log.getLogger(__name__)
+
+
+ def utc_now() -> datetime:
+     return datetime.now(tz=timezone.utc)
+
+
+ def write(blob_store: BlobStore, lock_uri: str, lock_bytes: bytes) -> None:
+     try:
+         blob_store.putbytes(lock_uri, lock_bytes, type_hint="application/mops-lock")
+     except Exception:
+         logger.error(f"Failed to write lock at {lock_uri}")
+         raise
+
+
+ def json_dumpb(contents: ty.Mapping) -> bytes:
+     return json.dumps(contents, indent=2).encode()
+
+
+ def store_and_lock_uri(lock_dir_uri: str) -> ty.Tuple[BlobStore, str]:
+     blob_store = lookup_blob_store(lock_dir_uri)
+     lock_uri = blob_store.join(lock_dir_uri, "lock.json")
+     return blob_store, lock_uri
+
+
+ def make_lock_uri(lock_dir_uri: str) -> str:
+     _, lock_uri = store_and_lock_uri(lock_dir_uri)
+     return lock_uri
thds/mops/pure/core/lock/cli.py
@@ -0,0 +1,73 @@
+ import argparse
+ import time
+ import typing as ty
+ from pathlib import Path
+
+ from . import _acquire
+
+
+ def _writer(out_times_path: Path) -> ty.Callable[[float, float], None]:
+     _acquire.logger.info(f"Will write Lock Times to {out_times_path}")
+     out_times_path.parent.mkdir(parents=True, exist_ok=True)
+
+     def write_times(after_acquired: float, before_released: float) -> None:
+         _acquire.logger.info(f"..........appending {after_acquired},{before_released}")
+         with out_times_path.open("a") as f:
+             f.write(f"{after_acquired},{before_released}\n")
+
+     return write_times
+
+
+ def acquire_and_hold_once(
+     lock_uri: str, hold_once_acquired_s: float, out_times_path: ty.Optional[Path]
+ ) -> None:
+     if out_times_path:
+         write_times = _writer(out_times_path)
+     else:
+         write_times = lambda x, y: None  # noqa: E731
+
+     # we want more verbose logging when using the CLI.
+     _acquire.logger.debug = _acquire.logger.info  # type: ignore
+
+     _acquire.logger.info(f"Beginning lock acquisition on {lock_uri}")
+     lock_owned = _acquire.acquire(lock_uri, block=None)
+     assert lock_owned
+     when_lock_acquired = time.time()
+     # we're using time, not timeit.default_timer, because we care about time
+     # differences between multiple processes on the same system, so we can compare
+     # them afterward.
+     time_until_release = when_lock_acquired + hold_once_acquired_s - time.time()
+     while time_until_release > 0:
+         lock_owned.maintain()
+         time.sleep(min(time_until_release, 4))
+         # don't wake up to maintain a lot - every 4 seconds is enough for the default 30s expiry.
+         time_until_release = when_lock_acquired + hold_once_acquired_s - time.time()
+
+     write_times(when_lock_acquired, time.time())
+     lock_owned.release()
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description=__doc__)
+     parser.add_argument("lock_uri", help="URI to the lockfile")
+     parser.add_argument(
+         "--hold-once-acquired-s",
+         "-t",
+         type=float,
+         default=20.0,
+         help="Time in seconds to hold the lock once acquired.",
+     )
+     parser.add_argument(
+         "--out-times",
+         type=Path,
+         default=None,
+         help="Write out the periods of time the lock was fully held (after acquire, before release) to this file.",
+     )
+
+     args = parser.parse_args()
+
+     acquire_and_hold_once(args.lock_uri, args.hold_once_acquired_s, args.out_times)
+
+
+ if __name__ == "__main__":
+     main()