thds.mops-3.6.20250219172032-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.mops might be problematic.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/core/lock/maintain.py
@@ -0,0 +1,150 @@
+"""Part of the design of our lock is that a remote process can take over 'maintenance' of
+the lock if (and especially when) the orchestrator process dies.
+
+This allows a killed orchestrator process to be restarted, as long as all of its remote
+processes have already started working.
+
+The remote process lock maintainers never _acquire_ the lock; they simply read what's in
+it when they get started, and from then on keep the `written_at` timestamp up to date.
+
+"""
+
+import time
+import typing as ty
+from datetime import datetime, timedelta
+from functools import partial
+from threading import Thread
+
+from thds.core import log
+
+from ._funcs import make_lock_uri
+from .read import get_writer_id, make_read_lockfile
+from .types import LockAcquired
+from .write import LockfileWriter, make_lock_contents
+
+logger = log.getLogger(__name__)
+
+
+class _MaintainOnly(ty.NamedTuple):
+    """Matches the LockAcquired interface except that release() will do nothing."""
+
+    maintain: ty.Callable[[], None]
+    expire_s: float
+    release: ty.Callable[[], None]
+
+
+class _MaintainForever(ty.Protocol):
+    def __call__(self) -> None:
+        ...  # pragma: no cover
+
+
+def _maintain_forever(
+    maintain: ty.Callable[[], ty.Any], expire_s: float, should_exit: ty.Callable[[], bool]
+) -> None:
+    while True:
+        # maintain the lock twice as often as necessary, to be safe
+        time.sleep(expire_s / 2)
+        if should_exit():
+            return
+        maintain()
+
+
+class CannotMaintainLock(ValueError):
+    pass  # pragma: no cover
+
+
+class LockWasStolenError(ValueError):
+    pass  # pragma: no cover
+
+
+def remote_lock_maintain(lock_dir_uri: str, expected_writer_id: str = "") -> LockAcquired:
+    """Only for use by the remote side - does not _acquire_ the lock,
+    but merely maintains it as unexpired. Does not allow for releasing,
+    as it is not the responsibility of the remote side to release the lock.
+
+    Will raise a CannotMaintainLock exception if the lock does not exist or has no
+    expiration time.
+
+    Will raise a LockWasStolenError if a provided expected_writer_id (which is the
+    writer_id of the lock as provided to the remote side by the original writer) does not
+    match the lock's actual current writer_id - in other words, if some other writer has
+    acquired the lock before the remote side has been able to start running.
+
+    The return value is intended to be launched as the target of a Thread or Process.
+    """
+
+    lock_uri = make_lock_uri(lock_dir_uri)
+    lock_contents = None
+    try:
+        lock_contents = make_read_lockfile(lock_uri)()
+    except Exception:
+        logger.exception(f"Could not read lockfile: {lock_uri}")
+
+    if not lock_contents:
+        raise CannotMaintainLock(f"Lock does not exist: {lock_uri}")
+
+    expire_s = lock_contents["expire_s"]
+    if not expire_s or expire_s < 0:
+        raise CannotMaintainLock(f"Lock is missing an expiry time: {lock_contents}")
+
+    first_acquired_at_s = lock_contents["first_acquired_at"]
+    if not first_acquired_at_s:
+        raise CannotMaintainLock(f"Lock was never acquired: {lock_contents}")
+
+    current_writer_id = lock_contents["writer_id"]
+    if expected_writer_id and expected_writer_id != current_writer_id:
+        raise LockWasStolenError(
+            "Refusing to maintain lock that was created by a different writer:"
+            f" expected `{expected_writer_id}`, got `{current_writer_id}`."
+            " This probably means you just need to kill and restart your orchestrator,"
+            " and it will begin awaiting the results of the new owner of the lock."
+        )
+
+    lockfile_writer = LockfileWriter(
+        current_writer_id,
+        lock_dir_uri,
+        make_lock_contents(get_writer_id(lock_contents), timedelta(seconds=expire_s)),
+        expire_s,
+        writer_name="remote",
+    )
+    lockfile_writer.first_acquired_at = datetime.fromisoformat(first_acquired_at_s)
+    # disable releasing from remote
+    lockfile_writer.release = lambda: None  # type: ignore # noqa: E731
+    return lockfile_writer
+
+
+def launch_daemon_lock_maintainer(lock_acq: LockAcquired) -> ty.Callable[[], None]:
+    """Run lock maintenance until the process exits, or until the returned callable gets
+    called.
+
+    Return a 'release wrapper' that stops maintenance of the lock and releases it.
+
+    A whole thread for this seems expensive, but the simplest alternative is having too
+    many lock maintainers trying to share time slices within some global lock maintainer,
+    and that runs a definite risk of overrunning the expiry time(s) for those locks.
+
+    If we were async all the way down, we could more plausibly make a bunch of async
+    network/filesystem calls here without taking into consideration how long they actually
+    take to execute.
+    """
+    should_exit = False
+
+    def should_stop_maintaining() -> bool:
+        return should_exit
+
+    Thread(
+        target=partial(
+            _maintain_forever,
+            lock_acq.maintain,
+            lock_acq.expire_s,
+            should_stop_maintaining,
+        ),
+        daemon=True,
+    ).start()
+
+    def stop_maintaining() -> None:
+        nonlocal should_exit
+        should_exit = True
+        lock_acq.release()
+
+    return stop_maintaining
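A hedged sketch of how the remote side might wire these two functions together; the URI, writer id, and `remote_main` entry point below are invented for illustration:

    from thds.mops.pure.core.lock.maintain import (
        launch_daemon_lock_maintainer,
        remote_lock_maintain,
    )

    def remote_main(lock_dir_uri: str, orchestrator_writer_id: str) -> None:
        # Never acquires: reads the existing lock and returns a maintainer
        # whose release() has been disabled for the remote side.
        maintainer = remote_lock_maintain(lock_dir_uri, orchestrator_writer_id)
        stop = launch_daemon_lock_maintainer(maintainer)  # daemon thread keeps written_at fresh
        try:
            ...  # the actual remote work happens here
        finally:
            stop()  # stop refreshing (release() is a no-op for remote maintainers)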
thds/mops/pure/core/lock/read.py
@@ -0,0 +1,39 @@
+import io
+import json
+import typing as ty
+
+from thds.core import log
+
+from ..types import DISABLE_CONTROL_CACHE
+from ..uris import lookup_blob_store
+from .types import LockContents
+
+logger = log.getLogger(__name__)
+
+
+def get_writer_id(lock_contents: LockContents) -> str:
+    return lock_contents["writer_id"]
+
+
+def make_read_lockfile(lock_uri: str) -> ty.Callable[[], ty.Optional[LockContents]]:
+    def read_lockfile() -> ty.Optional[LockContents]:
+        with DISABLE_CONTROL_CACHE.set_local(True):
+            blob_store = lookup_blob_store(lock_uri)
+
+            while True:
+                lockfile_bio = io.BytesIO()
+                try:
+                    # NO OPTIMIZE: this read must never be optimized in any way.
+                    blob_store.readbytesinto(lock_uri, lockfile_bio, type_hint="lock")
+                except Exception as e:
+                    if blob_store.is_blob_not_found(e):
+                        return None
+                    logger.error(f"Failed on {lock_uri}: {e}")
+                    raise
+
+                if lockfile_bio.tell() == 0:  # nothing was written
+                    logger.debug("Lockfile %s was empty - retrying read.", lock_uri)
+                    continue
+                return json.loads(lockfile_bio.getvalue().decode())
+
+    return read_lockfile
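Read access is factored as a closure over the lock URI. A small usage sketch; the URI is invented, and it assumes a blob store is registered for the file:// scheme:

    from thds.mops.pure.core.lock.read import make_read_lockfile

    read_lock = make_read_lockfile("file:///tmp/locks/job-1/lock.json")
    contents = read_lock()  # None if the blob does not exist; retries empty reads
    if contents:
        print(contents["writer_id"], contents["expire_s"])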
thds/mops/pure/core/lock/types.py
@@ -0,0 +1,37 @@
+import typing as ty
+
+
+class LockContents(ty.TypedDict):
+    """Only writer_id, written_at, and expire_s are technically required for the algorithm
+    - everything else is debugging info.
+
+    In fact, expire_s would be 'optional' as well (this can be acquirer-only state), but
+    it is advantageous to embed this explicitly, partly so that we can have remote
+    'maintainers' that do not need to have any information other than the lock uri passed
+    to them in order to maintain the lock.
+    """
+
+    writer_id: str
+    written_at: str  # ISO 8601 string with timezone in UTC
+    expire_s: float  # seconds after written_at to expire
+
+    # just for debugging
+    hostname: str
+    pid: str
+    write_count: int
+    first_written_at: str
+    first_acquired_at: str
+    released_at: str
+
+
+class LockAcquired(ty.Protocol):
+
+    writer_id: str
+
+    def maintain(self) -> None:
+        ...  # pragma: no cover
+
+    def release(self) -> None:
+        ...  # pragma: no cover
+
+    expire_s: float
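For illustration, a lockfile conforming to LockContents might look like the following; every value below is invented:

    example_lock_contents = {
        "writer_id": "writer-2b9f",  # hypothetical id; must contain no slash
        "written_at": "2025-02-19T17:20:32+00:00",
        "expire_s": 30.0,
        # debugging fields:
        "hostname": "orchestrator-01",
        "pid": "4242",
        "write_count": 7,
        "first_written_at": "2025-02-19T17:15:01+00:00",
        "first_acquired_at": "2025-02-19T17:15:03+00:00",
        "released_at": "",  # still held
    }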
thds/mops/pure/core/lock/write.py
@@ -0,0 +1,136 @@
+import os
+import typing as ty
+from datetime import datetime, timedelta
+
+from thds.core import hostname, log
+
+from . import _funcs
+from .types import LockContents
+
+logger = log.getLogger(__name__)
+
+
+def make_lock_contents(
+    writer_id: str, expire: timedelta
+) -> ty.Callable[[ty.Optional[datetime]], LockContents]:
+    """Impure - resets written_at to 'right now' to keep the lock 'live'."""
+    write_count = 0
+    first_written_at = ""
+
+    assert (
+        "/" not in writer_id
+    ), f"{writer_id} should not contain a slash - maybe you passed a URI instead?"
+
+    def lock_contents(first_acquired_at: ty.Optional[datetime]) -> LockContents:
+        nonlocal write_count, first_written_at
+        write_count += 1
+        now = _funcs.utc_now().isoformat()
+        first_written_at = first_written_at or now
+
+        return {
+            "writer_id": writer_id,
+            "written_at": now,
+            "expire_s": expire.total_seconds(),
+            # debug stuff:
+            "write_count": write_count,
+            "hostname": hostname.friendly(),
+            "pid": str(os.getpid()),
+            "first_written_at": first_written_at,
+            "first_acquired_at": first_acquired_at.isoformat() if first_acquired_at else "",
+            "released_at": "",
+        }
+
+    return lock_contents
+
+
+class LockfileWriter:
+    """The core purpose of this class is to allow setting of first_acquired_at immediately
+    after the first time that it is confirmed that we have acquired the lock.
+
+    Everything else could have been done as a (simpler) closure.
+    """
+
+    def __init__(
+        self,
+        lock_writer_id: str,
+        lock_dir_uri: str,
+        generate_lock: ty.Callable[[ty.Optional[datetime]], LockContents],
+        expire_s: float,
+        *,
+        debug: bool = True,
+        writer_name: str = "",
+    ) -> None:
+        self.writer_id = lock_writer_id
+        self.lock_dir_uri = lock_dir_uri
+        self.blob_store, self.lock_uri = _funcs.store_and_lock_uri(lock_dir_uri)
+        self.generate_lock = generate_lock
+        self.expire_s = expire_s
+        self.debug = debug
+        self.writer_name = writer_name
+        self.first_acquired_at: ty.Optional[datetime] = None
+
+    def mark_acquired(self) -> None:
+        assert not self.first_acquired_at
+        self.first_acquired_at = _funcs.utc_now()
+        logger.debug("Acquired lock %s", self.lock_uri)
+        self.write()  # record the first_acquired_at value for posterity
+
+    def write(self) -> None:
+        lock_contents = self.generate_lock(self.first_acquired_at)
+        if self.writer_name:
+            lock_contents["writer_name"] = self.writer_name  # type: ignore
+        assert "/" not in lock_contents["writer_id"], lock_contents
+        assert self.writer_id == lock_contents["writer_id"], (self.writer_id, lock_contents)
+        lock_bytes = _funcs.json_dumpb(lock_contents)
+        assert lock_bytes
+        # technically, writing these bytes may cause an overwrite of someone else's lock.
+        # the only way we get to 'decide' who acquired the lock is by waiting an
+        # appropriate period of time (agreed upon by all acquirers, and sufficient to be
+        # certain that everyone who tried is going to actually wait long enough to see the
+        # results) - and then we see who wrote it last. Whoever wrote it last 'won',
+        # and should continue as though they acquired the lock. Everyone else should 'fail'
+        # to acquire the lock.
+        _funcs.write(self.blob_store, self.lock_uri, lock_bytes)
+        self._maybe_write_debug(lock_contents)
+
+    def maintain(self) -> None:
+        """It is valid to call this method multiple times as necessary once the lock has been acquired."""
+        self.write()
+
+    def release(self) -> None:
+        assert self.first_acquired_at
+        lock_contents = self.generate_lock(self.first_acquired_at)
+        lock_contents["released_at"] = lock_contents["written_at"]
+        lock_contents["written_at"] = ""
+        logger.debug(
+            "Releasing lock %s after %s", self.lock_uri, _funcs.utc_now() - self.first_acquired_at
+        )
+        _funcs.write(self.blob_store, self.lock_uri, _funcs.json_dumpb(lock_contents))
+        self._maybe_write_debug(lock_contents)
+
+    def _maybe_write_debug(self, lock_contents: LockContents) -> None:
+        """Only do this if the lock was actually acquired."""
+        # this debug bit serves to help us understand when clients actually believed
+        # that they had acquired the lock. Because we only do this after our first
+        # 'successful' write, it will not impose extra latency during the
+        # latency-critical section.
+        if self.debug and self.first_acquired_at:
+            name = (";_" + self.writer_name) if self.writer_name else ""
+            first_written_at = lock_contents["first_written_at"]
+            hostname = lock_contents["hostname"]
+            pid = lock_contents["pid"]
+            acq_uuid = lock_contents["writer_id"]
+            assert "/" not in acq_uuid, lock_contents
+            debug_uri = self.blob_store.join(
+                self.lock_dir_uri,
+                "writers-debug",
+                f"firstwrite={first_written_at};_uuid={acq_uuid};_host={hostname};_pid={pid}{name}.json",
+            )
+            try:
+                self.blob_store.putbytes(
+                    debug_uri,
+                    _funcs.json_dumpb(lock_contents),
+                    type_hint="application/mops-lock-breadcrumb",
+                )
+            except Exception:
+                logger.warning(f"Problem writing debug lock {debug_uri}")
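The comment in write() describes a last-writer-wins protocol; the actual acquisition loop lives in lock/_acquire.py, which is not shown in this diff. A simplified, hypothetical sketch of the idea (the helper names and wait_s parameter are invented):

    import time

    def try_acquire(writer, read_lock, wait_s: float) -> bool:
        # Hypothetical: every contender writes, waits an agreed-upon period,
        # then re-reads. Whoever's bytes survived the window 'won' the lock.
        writer.write()
        time.sleep(wait_s)
        contents = read_lock()
        if contents and contents["writer_id"] == writer.writer_id:
            writer.mark_acquired()  # records first_acquired_at and re-writes
            return True
        return False  # someone else wrote last; treat as failure to acquire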
thds/mops/pure/core/memo/function_memospace.py
@@ -0,0 +1,267 @@
+"""A big part of what mops offers is automatic memoization.
+
+It's built on the principle that if we need to be able to transfer
+execution from one system/environment to another, then by definition
+your computation must be a pure function, otherwise the result is not
+reliable. And, because it _is_ a pure function, by definition we can
+memoize your calls to it. More than that, we already _have_ memoized
+them, because in order to transfer the invocation to the worker
+environment, and then the worker's results back to your orchestrator,
+we needed to serialize them somewhere, and that serialized invocation
+and result will (in theory) be there the next time we look for them.
+
+In a perfect world with pure functions, this memoization would be
+omnipresent and completely transparent to the user. However, we don't
+live in a perfect world. There are at least two common ways in which
+always-on memoization could lead to incorrect behavior:
+
+1. Your code changes between calls to the same function.
+
+   We can't reliably detect this, because we're not actually able to
+   serialize or otherwise derive a key from the full code,
+   recursively, of your function and everything it
+   references/calls.
+
+   Therefore, we allow you to notify us of these changes in one of
+   several ways, but the most common is by using mops without
+   explicitly setting a `pipeline_id` for your application's run.
+
+   If you don't set a `pipeline_id`, then one will be
+   non-deterministically generated for you at every application start;
+   essentially, you'll get no memoization of any kind, because you
+   haven't confirmed (via pipeline_id) that your code has not
+   changed. But if you do set the same pipeline_id consistently when
+   running your function, you'll be able to take advantage of the
+   memoization that is already occurring under the hood.
+
+2. Your function writes its true results as side effects to some other
+   storage location, and the returned result from the function merely
+   _references_ the true result, which is stored in that external
+   system.
+
+   In other words, your function is not truly pure.
+
+   In this case, the actual source of erroneous behavior would be if
+   the external storage system is mutable. If it is not mutable, or
+   if, by convention, the storage can reliably be treated as
+   representing immutable, persistent data, then aside from network
+   errors or other sources of retryable non-determinism, your
+   application can be expected to reliably reuse memoized results from
+   this technically impure but pure-in-practice function.
+
+   In general, this source of non-determinism is probably the easier
+   to deal with, as it requires only the one convention - namely, that
+   certain ADLS storage accounts/containers should never have new and
+   different data written over top of existing data.
+
+
+The code that follows helps address point #1 above. Code changes are
+endemic to software development and data science, and it cannot be
+expected that memoization will only be used after code is "set in
+stone".
+
+The approach taken here is that it should be possible to run a given
+process, with a known or even an auto-generated pipeline id, and then
+simply record that pipeline id for later, such that a future caller of
+the function can opt into the memoized results of that 'known run'
+simply by calling the function.
+
+The implementation detail is that this will be done out of band -
+instead of modifying the code (either the called code or the call
+site), we will allow this to be 'injected' via configuration, on a
+per-function (rather than per-application, or per-function-call)
+basis.
+
+- per-application is rejected because it's what pipeline_id already
+  does - if you simply want to opt in to an entire 'universe' of
+  memoized results, you can reuse the pipeline_id corresponding to
+  that universe. We're trying to solve for a case where multiple
+  'universes' need to be stitched together in a later re-use of
+  memoized results.
+
+- per-function-call is rejected because there are no
+  currently-anticipated use cases for it - as an implementation
+  detail this would not be particularly hard to achieve, but it also
+  seems likely to be more 'developer overhead' than anybody would
+  really want to use in practice.
+
+The memoization/cache key for `use_runner` (mops) function calls is made up of three parts or levels:
+
+- The top level is the global storage config, including the storage
+  account (SA), container, and a version-specific base path provided
+  by the `mops` runner. This level is not semantically derived from
+  the function call itself; it's present purely as a technical reality.
+
+  In the configuration and in the code, the configurable part of this
+  is referred to as the storage_root. Once a mops runner adds its own
+  base path, it becomes the runner prefix.
+
+- The middle level is the 'code' memoization, which provides users granular ways of
+  invalidating caches across runs sharing a runner prefix by changing one of:
+
+  - pipeline_id
+  - name of function being memoized
+  - cache key in docstring for function being memoized
+
+  to indicate that something about the _code being run_ has changed.
+
+- The bottom level is the 'arguments' memoization,
+  whereby we serialize and then hash the full set of arguments to the function,
+  such that different calls to the same function will memoize differently as expected.
+
+Of the three levels, our per-function memoization config should only need to 'deal' with the top two levels.
+
+- A previous call to the function in question might have used a
+  different storage root than is configured by the application for the
+  default case, so it must be possible to specify where we want to
+  look for memoized results.
+
+- The pipeline_id used for a known result may be different for the
+  various functions that we intend to call.
+
+- If a codebase has undergone refactoring, such that a function lives
+  in a different module than it previously did, but you wish to reuse
+  memoized results, it should be possible to provide a translation
+  layer for the name itself.
+
+- In rare cases, the (optional) value of a function's
+  function-logic-key (embedded in the docstring) may have changed
+  compared to the version we're able to import, but we may still wish
+  to pick up the result of a different configuration.
+
+Notably, we do _not_ propose to allow configuration of the hashed
+args/kwargs itself, which would amount to a full redirect of the
+function call to a known result. It's not that there might not be some
+use case for this functionality; we simply don't foresee what it
+would be, and we decline to implement such functionality prematurely.
+"""
+
+import hashlib
+import re
+import typing as ty
+
+from thds import humenc
+from thds.core import config
+
+from ..pipeline_id_mask import (
+    extract_from_docstr,
+    get_pipeline_id,
+    get_pipeline_id_mask,
+    pipeline_id_mask,
+)
+from ..uris import lookup_blob_store
+from .unique_name_for_function import make_unique_name_including_docstring_key, parse_unique_name
+
+
+class _PipelineMemospaceHandler(ty.Protocol):
+    def __call__(self, __callable_name: str, __runner_prefix: str) -> ty.Optional[str]:
+        ...
+
+
+_PIPELINE_MEMOSPACE_HANDLERS: ty.List[_PipelineMemospaceHandler] = list()
+
+
+def add_pipeline_memospace_handlers(*handlers: _PipelineMemospaceHandler) -> None:
+    """Add one or more handlers that will be tested in order to determine whether an
+    application wishes to override all or part of the "pipeline memospace" (the runner
+    prefix plus the pipeline id) for a given fully-qualified function name.
+
+    Does _not_ provide access to the invocation-specific `function_id` information; this
+    capability is not offered by mops.
+    """
+    _PIPELINE_MEMOSPACE_HANDLERS.extend(handlers)
+
+
+def matching_mask_pipeline_id(pipeline_id: str, callable_regex: str) -> _PipelineMemospaceHandler:
+    """Set the function memospace to be:
+
+    the current runner prefix
+    + the supplied pipeline_id, OR the set pipeline_id (not the
+      pipeline_id_mask!) if the supplied pipeline_id is empty
+      (thus allowing for this to fall back to an auto-generated pipeline_id if you want to force a run)
+    + the callable name (including docstring key).
+
+    Note this uses re.match, which means your regex must match the _beginning_ of the
+    callable name. If you want fullmatch, write your own. :)
+    """
+
+    def _handler(callable_name: str, runner_prefix: str) -> ty.Optional[str]:
+        if re.match(callable_regex, callable_name):
+            return lookup_blob_store(runner_prefix).join(
+                runner_prefix, pipeline_id or get_pipeline_id(), callable_name
+            )
+        return None
+
+    return _handler
+
+
+def _lookup_pipeline_memospace(runner_prefix: str, callable_name: str) -> ty.Optional[str]:
+    """The pipeline memospace is everything up until but not including the hash of the (args, kwargs) tuple."""
+    try:
+        config_memospace = config.config_by_name(f"mops.memo.{callable_name}.memospace")()
+    except KeyError:
+        config_memospace = ""
+    if config_memospace:
+        return config_memospace
+    for handler in _PIPELINE_MEMOSPACE_HANDLERS:
+        pipeline_memospace = handler(callable_name, runner_prefix)
+        if pipeline_memospace:
+            return pipeline_memospace
+    return None
+
+
+def make_function_memospace(runner_prefix: str, f: ty.Callable) -> str:
+    callable_name = make_unique_name_including_docstring_key(f)
+    # always default to the function docstring if no other mask is currently provided.
+    with pipeline_id_mask(extract_from_docstr(f, require=False)):
+        return _lookup_pipeline_memospace(runner_prefix, callable_name) or lookup_blob_store(
+            runner_prefix
+        ).join(
+            runner_prefix,
+            get_pipeline_id_mask(),
+            callable_name,
+        )
+
+
+class MemoUriComponents(ty.NamedTuple):
+    runner_prefix: str
+    pipeline_id: str
+    function_module: str
+    function_name: str
+    function_logic_key: str
+    args_hash: str
+
+
+def parse_memo_uri(
+    memo_uri: str,
+    runner_prefix: str = "",  # the part up to but not including the pipeline_id
+    separator: str = "/",
+    backward_compat_split: str = "mops2-mpf",
+) -> MemoUriComponents:
+    if not runner_prefix:
+        # this is in order to help with backward compatibility for mops summaries that
+        # didn't store any of this. providing memospace is a more precise way to handle this.
+        if backward_compat_split not in memo_uri:
+            raise ValueError("Cannot determine the components of a memo URI with no memospace")
+        parts = memo_uri.split(backward_compat_split, 1)
+        assert len(parts) > 1, parts
+        runner_prefix = separator.join((parts[0].rstrip(separator), backward_compat_split))
+
+    runner_prefix = runner_prefix.rstrip(separator)
+    rest, args_hash = memo_uri.rsplit(separator, 1)  # args hash is last component
+    rest, full_function_name = rest.rsplit(separator, 1)
+    pipeline_id = rest[len(runner_prefix) :]
+    pipeline_id = pipeline_id.strip(separator)
+
+    function_parts = parse_unique_name(full_function_name)
+
+    return MemoUriComponents(
+        runner_prefix,
+        pipeline_id,
+        function_parts.module,
+        function_parts.name,
+        function_parts.function_logic_key,
+        args_hash,
+    )
+
+
+def args_kwargs_content_address(args_kwargs_bytes: bytes) -> str:
+    return humenc.encode(hashlib.sha256(args_kwargs_bytes).digest())
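A sketch of how an application might pin a group of functions to a previously recorded run. The module path, pipeline id, and URI below are invented, and it assumes the unique callable name produced by make_unique_name_including_docstring_key begins with the dotted module path, with an 'adls://' blob store registered:

    from thds.mops.pure.core.memo.function_memospace import (
        add_pipeline_memospace_handlers,
        matching_mask_pipeline_id,
    )

    # Any callable whose unique name starts with 'my_pkg.features.' will have
    # its memospace pinned to the 'known-run-2025-02-01' pipeline id:
    add_pipeline_memospace_handlers(
        matching_mask_pipeline_id("known-run-2025-02-01", r"my_pkg\.features\.")
    )
    # From here on, make_function_memospace('adls://sa/container/mops2-mpf', f)
    # returns '<runner prefix>/known-run-2025-02-01/<unique name>' for matching
    # f, instead of using the application's current pipeline_id mask.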
thds/mops/pure/core/memo/keyfunc.py
@@ -0,0 +1,53 @@
+"""Definitions of basic keyfuncs."""
+import inspect
+import typing as ty
+
+from ..types import Args, Kwargs
+
+
+class Keyfunc(ty.Protocol):
+    """A function which, when called with (c, args, kwargs),
+    returns either the same or a different callable, and the same or
+    different args and kwargs, such that the returned three-tuple is
+    what will get used to construct the full memoization key.
+
+    The args, kwargs returned _must_ be bindable to the parameters of
+    the callable returned. However, the callable will not actually be
+    invoked, so it is not important that they bind in a semantically
+    meaningful way - if you're just trying to drop certain arguments
+    that can't be pickled, your best bet will be to return a `None`
+    placeholder for those.
+
+    The identity function (lambda c, a, k: (c, a, k)) is equivalent to
+    the unchanged default behavior from MemoizingPicklingRunner.
+    """
+
+    def __call__(
+        self, c: ty.Callable, __args: Args, __kwargs: Kwargs
+    ) -> ty.Tuple[ty.Callable, Args, Kwargs]:
+        ...  # pragma: nocover
+
+
+ArgsOnlyKeyfunc = ty.Callable[..., ty.Tuple[Args, Kwargs]]
+
+
+def args_only(keyfunc: ty.Union[ArgsOnlyKeyfunc, Keyfunc]) -> Keyfunc:
+    def funcpassthrough_keyfunc(
+        c: ty.Callable, args: Args, kwargs: Kwargs
+    ) -> ty.Tuple[ty.Callable, Args, Kwargs]:
+        return c, *keyfunc(*args, **kwargs)  # type: ignore
+
+    return funcpassthrough_keyfunc
+
+
+def autowrap_args_only_keyfunc(keyfunc: ty.Union[ArgsOnlyKeyfunc, Keyfunc]) -> Keyfunc:
+    """This exists only to 'sweeten' the API, so that in most cases a
+    'normal-looking' function can be passed in that does not have
+    access to the `func` parameter and gets Pythonic access to the
+    splatted args and kwargs, rather than a tuple and a dictionary.
+    """
+    keyfunc_params = inspect.signature(keyfunc).parameters
+    is_full_keyfunc = len(keyfunc_params) == 3 and next(iter(keyfunc_params.values())).name == "c"
+    if is_full_keyfunc:
+        return ty.cast(Keyfunc, keyfunc)
+    return args_only(keyfunc)
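As a usage sketch following the docstring's own suggestion (the function and its unpicklable `db_conn` argument below are invented):

    def drop_db_conn(table: str, db_conn: object, *, limit: int = 100):
        # Replace the unpicklable connection with a None placeholder; the
        # returned args/kwargs still bind to the original signature.
        return (table, None), {"limit": limit}

    keyfunc = autowrap_args_only_keyfunc(drop_db_conn)
    # keyfunc(query_fn, ("users", live_conn), {"limit": 10})
    #   -> (query_fn, ("users", None), {"limit": 10})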