thds.mops 3.6.20250219172032 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of thds.mops has been flagged as a potentially problematic release.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
@@ -0,0 +1,150 @@
+ """Part of the design of our lock is that a remote process can take over 'maintenance' of
+ the lock if (and especially if) the orchestrator process dies.
+
+ This allows a killed orchestrator process to be restarted, as long as all of its remote
+ processes have already started working.
+
+ The remote process lock maintainers never _acquire_ the lock; they simply read what's in
+ it when they get started, and from then on keep the `written_at` timestamp up to date.
+
+ """
+
+ import time
+ import typing as ty
+ from datetime import datetime, timedelta
+ from functools import partial
+ from threading import Thread
+
+ from thds.core import log
+
+ from ._funcs import make_lock_uri
+ from .read import get_writer_id, make_read_lockfile
+ from .types import LockAcquired
+ from .write import LockfileWriter, make_lock_contents
+
+ logger = log.getLogger(__name__)
+
+
+ class _MaintainOnly(ty.NamedTuple):
+     """Matches the LockAcquired interface except that release() will do nothing."""
+
+     maintain: ty.Callable[[], None]
+     expire_s: float
+     release: ty.Callable[[], None]
+
+
+ class _MaintainForever(ty.Protocol):
+     def __call__(self) -> None:
+         ... # pragma: no cover
+
+
+ def _maintain_forever(
+     maintain: ty.Callable[[], ty.Any], expire_s: float, should_exit: ty.Callable[[], bool]
+ ) -> None:
+     while True:
+         # maintain the lock twice as often as necessary, to be safe
+         time.sleep(expire_s / 2)
+         if should_exit():
+             return
+         maintain()
+
+
+ class CannotMaintainLock(ValueError):
+     pass # pragma: no cover
+
+
+ class LockWasStolenError(ValueError):
+     pass # pragma: no cover
+
+
+ def remote_lock_maintain(lock_dir_uri: str, expected_writer_id: str = "") -> LockAcquired:
+     """Only for use by the remote side - does not _acquire_ the lock,
+     but merely maintains it as unexpired. Does not allow for releasing,
+     as it is not the responsibility of the remote side to release the lock.
+
+     Will raise a CannotMaintainLock exception if the lock does not exist or has no
+     expiration time.
+
+     Will raise a LockWasStolenError if a provided expected_writer_id (which is the
+     writer_id of the lock as provided to the remote side by the original writer) does not
+     match the lock's actual current writer_id - in other words, if some other writer has
+     acquired the lock before the remote side has been able to start running.
+
+     The return value is intended to be launched as the target of a Thread or Process.
+     """
+
+     lock_uri = make_lock_uri(lock_dir_uri)
+     lock_contents = None  # stays None if the read below fails
+     try:
+         read_lockfile = make_read_lockfile(lock_uri)
+         lock_contents = read_lockfile()
+     except Exception:
+         logger.exception(f"Could not read lockfile: {lock_uri}")
+
+     if not lock_contents:
+         raise CannotMaintainLock(f"Lock does not exist: {lock_uri}")
+
+     expire_s = lock_contents["expire_s"]
+     if not expire_s or expire_s < 0:
+         raise CannotMaintainLock(f"Lock is missing an expiry time: {lock_contents}")
+
+     first_acquired_at_s = lock_contents["first_acquired_at"]
+     if not first_acquired_at_s:
+         raise CannotMaintainLock(f"Lock was never acquired: {lock_contents}")
+
+     current_writer_id = lock_contents["writer_id"]
+     if expected_writer_id and expected_writer_id != current_writer_id:
+         raise LockWasStolenError(
+             "Refusing to maintain lock that was created by a different writer:"
+             f" expected `{expected_writer_id}`, got `{current_writer_id}`."
+             " This probably means you just need to kill and restart your orchestrator"
+             " and it will begin awaiting the results of the new owner of the lock."
+         )
+
+     lockfile_writer = LockfileWriter(
+         current_writer_id,
+         lock_dir_uri,
+         make_lock_contents(get_writer_id(lock_contents), timedelta(seconds=expire_s)),
+         expire_s,
+         writer_name="remote",
+     )
+     lockfile_writer.first_acquired_at = datetime.fromisoformat(first_acquired_at_s)
+     # disable releasing from remote
+     lockfile_writer.release = lambda: None  # type: ignore # noqa: E731
+     return lockfile_writer
+
+
+ def launch_daemon_lock_maintainer(lock_acq: LockAcquired) -> ty.Callable[[], None]:
+     """Run lock maintenance until the process exits, or until the returned callable is
+     invoked.
+
+     Return a 'release wrapper' that stops maintenance of the lock and releases it.
+
+     A whole thread for this seems expensive, but the simplest alternative is having too
+     many lock maintainers trying to share time slices within some global lock maintainer,
+     and that runs a definite risk of overrunning the expiry time(s) for those locks.
+
+     If we were async all the way down, we could more plausibly make a bunch of async
+     network/filesystem calls here without taking into consideration how long they actually
+     take to execute.
+     """
+     should_exit = False
+
+     def should_stop_maintaining() -> bool:
+         return should_exit
+
+     Thread(
+         target=partial(
+             _maintain_forever,
+             lock_acq.maintain,
+             lock_acq.expire_s,
+             should_stop_maintaining,
+         ),
+         daemon=True,
+     ).start()
+
+     def stop_maintaining() -> None:
+         nonlocal should_exit
+         should_exit = True
+         lock_acq.release()
+
+     return stop_maintaining
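Taken together, remote_lock_maintain and launch_daemon_lock_maintainer suggest a remote-side usage pattern roughly like the sketch below. This is illustrative only: the lock_dir_uri and writer id values are hypothetical, and the real wiring into the mops remote entry point lives elsewhere in the package.

    # Remote side: keep the orchestrator's lock alive while this worker runs.
    from thds.mops.pure.core.lock.maintain import (
        launch_daemon_lock_maintainer,
        remote_lock_maintain,
    )

    lock_dir_uri = "adls://some-account/some-container/locks/train-model"  # hypothetical
    expected_writer_id = "writer-uuid-from-orchestrator"  # hypothetical

    maintainer = remote_lock_maintain(lock_dir_uri, expected_writer_id)
    stop = launch_daemon_lock_maintainer(maintainer)  # refreshes written_at on a daemon thread
    try:
        run_the_actual_work()  # hypothetical workload
    finally:
        stop()  # stop refreshing; release() is a no-op for the remote-side maintainer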
@@ -0,0 +1,39 @@
+ import io
+ import json
+ import typing as ty
+
+ from thds.core import log
+
+ from ..types import DISABLE_CONTROL_CACHE
+ from ..uris import lookup_blob_store
+ from .types import LockContents
+
+ logger = log.getLogger(__name__)
+
+
+ def get_writer_id(lock_contents: LockContents) -> str:
+     return lock_contents["writer_id"]
+
+
+ def make_read_lockfile(lock_uri: str) -> ty.Callable[[], ty.Optional[LockContents]]:
+     def read_lockfile() -> ty.Optional[LockContents]:
+         with DISABLE_CONTROL_CACHE.set_local(True):
+             blob_store = lookup_blob_store(lock_uri)
+
+             while True:
+                 lockfile_bio = io.BytesIO()
+                 try:
+                     # NO OPTIMIZE: this read must never be optimized in any way.
+                     blob_store.readbytesinto(lock_uri, lockfile_bio, type_hint="lock")
+                 except Exception as e:
+                     if blob_store.is_blob_not_found(e):
+                         return None
+                     logger.error(f"Failed on {lock_uri}: {e}")
+                     raise
+
+                 if lockfile_bio.tell() == 0: # nothing was written
+                     logger.debug("Lockfile %s was empty - retrying read.", lock_uri)
+                     continue
+                 return json.loads(lockfile_bio.getvalue().decode())
+
+     return read_lockfile
@@ -0,0 +1,37 @@
+ import typing as ty
+
+
+ class LockContents(ty.TypedDict):
+     """Only writer_id, written_at, and expire_s are technically required for the algorithm
+     - everything else is debugging info.
+
+     In fact, expire_s would be 'optional' as well (this can be acquirer-only state), but
+     it is advantageous to embed this explicitly, partly so that we can have remote
+     'maintainers' that do not need to have any information other than the lock uri passed
+     to them in order to maintain the lock.
+     """
+
+     writer_id: str
+     written_at: str  # ISO8601 string with timezone in UTC
+     expire_s: float  # seconds after written_at to expire
+
+     # just for debugging
+     hostname: str
+     pid: str
+     write_count: int
+     first_written_at: str
+     first_acquired_at: str
+     released_at: str
+
+
+ class LockAcquired(ty.Protocol):
+
+     writer_id: str
+
+     def maintain(self) -> None:
+         ... # pragma: no cover
+
+     def release(self) -> None:
+         ... # pragma: no cover
+
+     expire_s: float
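For concreteness, a freshly written lockfile deserializes to something like the following. The values are hypothetical illustrations; only the keys, which mirror LockContents above, are meaningful.

    example_lock: LockContents = {
        "writer_id": "d2f9c0e4",  # hypothetical acquirer id
        "written_at": "2025-02-19T17:20:32+00:00",
        "expire_s": 60.0,  # expires 60s after written_at unless refreshed
        # debugging-only fields:
        "hostname": "orchestrator-host",
        "pid": "4242",
        "write_count": 1,
        "first_written_at": "2025-02-19T17:20:32+00:00",
        "first_acquired_at": "",  # empty until acquisition has been confirmed
        "released_at": "",
    }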
@@ -0,0 +1,136 @@
+ import os
+ import typing as ty
+ from datetime import datetime, timedelta
+
+ from thds.core import hostname, log
+
+ from . import _funcs
+ from .types import LockContents
+
+ logger = log.getLogger(__name__)
+
+
+ def make_lock_contents(
+     writer_id: str, expire: timedelta
+ ) -> ty.Callable[[ty.Optional[datetime]], LockContents]:
+     """Impure - Resets written_at to 'right now' to keep the lock 'live'."""
+     write_count = 0
+     first_written_at = ""
+
+     assert (
+         "/" not in writer_id
+     ), f"{writer_id} should not contain a slash - maybe you passed a URI instead?"
+
+     def lock_contents(first_acquired_at: ty.Optional[datetime]) -> LockContents:
+         nonlocal write_count, first_written_at
+         write_count += 1
+         now = _funcs.utc_now().isoformat()
+         first_written_at = first_written_at or now
+
+         return {
+             "writer_id": writer_id,
+             "written_at": now,
+             "expire_s": expire.total_seconds(),
+             # debug stuff:
+             "write_count": write_count,
+             "hostname": hostname.friendly(),
+             "pid": str(os.getpid()),
+             "first_written_at": first_written_at,
+             "first_acquired_at": first_acquired_at.isoformat() if first_acquired_at else "",
+             "released_at": "",
+         }
+
+     return lock_contents
+
+
+ class LockfileWriter:
+     """The core purpose of this class is to allow setting of first_acquired_at immediately
+     after the first time that it is confirmed that we have acquired the lock.
+
+     Everything else could have been done as a (simpler) closure.
+     """
+
+     def __init__(
+         self,
+         lock_writer_id: str,
+         lock_dir_uri: str,
+         generate_lock: ty.Callable[[ty.Optional[datetime]], LockContents],
+         expire_s: float,
+         *,
+         debug: bool = True,
+         writer_name: str = "",
+     ) -> None:
+         self.writer_id = lock_writer_id
+         self.lock_dir_uri = lock_dir_uri
+         self.blob_store, self.lock_uri = _funcs.store_and_lock_uri(lock_dir_uri)
+         self.generate_lock = generate_lock
+         self.expire_s = expire_s
+         self.debug = debug
+         self.writer_name = writer_name
+         self.first_acquired_at: ty.Optional[datetime] = None
+
+     def mark_acquired(self) -> None:
+         assert not self.first_acquired_at
+         self.first_acquired_at = _funcs.utc_now()
+         logger.debug("Acquired lock %s", self.lock_uri)
+         self.write()  # record the first_acquired_at value for posterity
+
+     def write(self) -> None:
+         lock_contents = self.generate_lock(self.first_acquired_at)
+         if self.writer_name:
+             lock_contents["writer_name"] = self.writer_name  # type: ignore
+         assert "/" not in lock_contents["writer_id"], lock_contents
+         assert self.writer_id == lock_contents["writer_id"], (self.writer_id, lock_contents)
+         lock_bytes = _funcs.json_dumpb(lock_contents)
+         assert lock_bytes
+         # technically, writing these bytes may cause an overwrite of someone else's lock.
+         # the only way we get to 'decide' who acquired the lock is by waiting an
+         # appropriate period of time (agreed upon by all acquirers, and sufficient to be
+         # certain that everyone who tried is going to actually wait long enough to see the
+         # results) - and then we see who wrote it last. Whoever wrote it last 'won',
+         # and should continue as though they acquired the lock. Everyone else should 'fail'
+         # to acquire the lock.
+         _funcs.write(self.blob_store, self.lock_uri, lock_bytes)
+         self._maybe_write_debug(lock_contents)
+
+     def maintain(self) -> None:
+         """It is valid to call this method multiple times as necessary once the lock has been acquired."""
+         self.write()
+
+     def release(self) -> None:
+         assert self.first_acquired_at
+         lock_contents = self.generate_lock(self.first_acquired_at)
+         lock_contents["released_at"] = lock_contents["written_at"]
+         lock_contents["written_at"] = ""
+         logger.debug(
+             "Releasing lock %s after %s", self.lock_uri, _funcs.utc_now() - self.first_acquired_at
+         )
+         _funcs.write(self.blob_store, self.lock_uri, _funcs.json_dumpb(lock_contents))
+         self._maybe_write_debug(lock_contents)
+
+     def _maybe_write_debug(self, lock_contents: LockContents) -> None:
+         """Only do this if the lock was actually acquired."""
+         # this debug bit serves to help us understand when clients actually believed
+         # that they had acquired the lock. Because we only do this after our first
+         # 'successful' write, it will not impose extra latency during the
+         # latency-critical section.
+         if self.debug and self.first_acquired_at:
+             name = (self.writer_name + ";_") if self.writer_name else ""
+             first_written_at = lock_contents["first_written_at"]
+             hostname = lock_contents["hostname"]
+             pid = lock_contents["pid"]
+             acq_uuid = lock_contents["writer_id"]
+             assert "/" not in acq_uuid, lock_contents
+             debug_uri = self.blob_store.join(
+                 self.lock_dir_uri,
+                 "writers-debug",
+                 f"firstwrite={first_written_at};_uuid={acq_uuid};_host={hostname};_pid={pid}{name}.json",
+             )
+             try:
+                 self.blob_store.putbytes(
+                     debug_uri,
+                     _funcs.json_dumpb(lock_contents),
+                     type_hint="application/mops-lock-breadcrumb",
+                 )
+             except Exception:
+                 logger.warning(f"Problem writing debug lock {debug_uri}")
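The acquisition algorithm that the comment in write() alludes to lives in _acquire.py, which is not part of this excerpt. Purely as an illustrative sketch of the 'last writer after an agreed-upon wait wins' idea described there - not the actual implementation - the flow looks roughly like this:

    import time

    def try_acquire_sketch(writer: "LockfileWriter", read_lockfile, wait_s: float) -> bool:
        """Illustrative only; the real logic is in _acquire.py (not shown)."""
        existing = read_lockfile()
        if existing and not looks_expired(existing):  # hypothetical expiry check
            return False  # someone else holds an unexpired lock
        writer.write()  # optimistically write our own lock contents
        time.sleep(wait_s)  # wait long enough for all contenders to finish writing
        current = read_lockfile()
        if current and current["writer_id"] == writer.writer_id:
            writer.mark_acquired()  # we were the last writer, so we 'won'
            return True
        return False  # someone else overwrote us, and they won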
@@ -0,0 +1,6 @@
+ from .function_memospace import (  # noqa
+     args_kwargs_content_address,
+     make_function_memospace,
+     parse_memo_uri,
+ )
+ from .unique_name_for_function import extract_function_logic_key_from_docstr  # noqa: F401
@@ -0,0 +1,267 @@
+ """A big part of what mops offers is automatic memoization.
+
+ It's built on the principle that if we need to be able to transfer
+ execution from one system/environment to another, then by definition
+ your computation must be a pure function, otherwise the result is not
+ reliable. And, because it _is_ a pure function, by definition we can
+ memoize your calls to it. More than that, we already _have_ memoized
+ them, because in order to transfer the invocation to the worker
+ environment, and then the worker's results back to your orchestrator,
+ we needed to serialize them somewhere, and that serialized invocation
+ and result will (in theory) be there the next time we look for them.
+
+ In a perfect world with pure functions, this memoization would be
+ omnipresent and completely transparent to the user. However, we don't
+ live in a perfect world. There are at least two common ways in which
+ always-on memoization could lead to incorrect behavior:
+
+ 1. Your code changes between calls to the same function.
+
+    We can't reliably detect this, because we're not actually able to
+    serialize or otherwise derive a key from the full code,
+    recursively, of your function and everything it
+    references/calls.
+
+    Therefore, we allow you to notify us of these changes in one of
+    several ways, but the most common is by using mops without
+    explicitly setting a `pipeline_id` for your application's run.
+
+    If you don't set a `pipeline_id`, then one will be
+    non-deterministically generated for you at every application start;
+    essentially, you'll get no memoization of any kind, because you
+    haven't confirmed (via pipeline_id) that your code has not
+    changed. But if you do set the same pipeline_id consistently when
+    running your function, you'll be able to take advantage of the
+    memoization that is already occurring under the hood.
+
+ 2. Your function writes its true results as side effects to some other
+    storage location, and the returned result from the function merely
+    _references_ the true result, which is stored in that external
+    system.
+
+    In other words, your function is not truly pure.
+
+    In this case, the actual source of erroneous behavior would be if
+    the external storage system is mutable. If it is not mutable, or
+    if, by convention, the storage can reliably be treated as
+    representing immutable, persistent data, then aside from network
+    errors or other sources of retryable non-determinism, your
+    application can be expected to reliably reuse memoized results from
+    this technically impure but pure-in-practice function.
+
+    In general, this source of non-determinism is probably the easier
+    to deal with, as it requires only the one convention - namely, that
+    certain ADLS storage accounts/containers should never have new and
+    different data written over top of existing data.
+
+
+ The code that follows helps address point #1 above. Code changes are
+ endemic to software development and data science, and it cannot be
+ expected that memoization will only be used after code is "set in
+ stone".
+
+ The approach taken here is that it should be possible to run a given
+ process, with a known or even an auto-generated pipeline id, and then
+ simply record that pipeline id for later, such that a future caller of
+ the function can opt into the memoized results of that 'known run'
+ simply by calling the function.
+
+ The implementation detail is that this will be done out of band -
+ instead of modifying the code (either the called code or the call
+ site), we will allow this to be 'injected' via configuration, on a
+ per-function (rather than per-application, or per-function-call)
+ basis.
+
+ - per-application is rejected because it's what pipeline_id already
+   does - if you simply want to opt in to an entire 'universe' of
+   memoized results, you can reuse the pipeline_id corresponding to
+   that universe. We're trying to solve for a case where multiple
+   'universes' need to be stitched together in a later re-use of
+   memoized results.
+
+ - per-function-call is rejected because there are
+   no currently-anticipated use cases for it - as an implementation
+   detail this would not be particularly hard to achieve, but it also
+   seems likely to be more 'developer overhead' than anybody would
+   really want to use in practice.
+
+ The memoization/cache key for `use_runner` (mops) function calls is made up of three parts or levels:
+
+ - The top level is the global storage config, including SA, container,
+   and a version-specific base path provided by the `mops` runner.
+   This level is not semantically derived from the function call
+   itself; it's present purely as a technical reality.
+
+   In the configuration and in the code, the configurable part of this
+   is referred to as the storage_root. Once a mops runner adds its own
+   base path, it becomes the runner prefix.
+
+ - The middle level is the 'code' memoization, which provides users granular ways of
+   invalidating caches across runs sharing a runner prefix by changing one of:
+   ---- pipeline_id
+   ---- name of function being memoized
+   ---- cache key in docstring for function being memoized
+   to indicate that something about the _code being run_ has changed.
+
+ - The bottom level is the 'arguments' memoization,
+   whereby we serialize and then hash the full set of arguments to the function,
+   such that different calls to the same function will memoize differently as expected.
+
+ Of the three levels, our per-function memoization config should only need to 'deal' with the top two levels.
+
+ - A previous call to the function in question might have used a
+   different storage root than is configured by the application for the
+   default case, so it must be possible to specify where we want to
+   look for memoized results.
+
+ - The pipeline_id used for a known result may be different for various
+   different functions that we intend to call.
+
+ - If a codebase has undergone refactoring, such that a function lives
+   in a different module than it previously did, but you wish to reuse
+   memoized results, it should be possible to provide a translation
+   layer for the name itself.
+
+ - In rare cases, the (optional) value of a function's
+   function-logic-key (embedded in the docstring) may have changed
+   compared to the version we're able to import, but we may still wish
+   to pick up the result of a different configuration.
+
+ Notably, we do _not_ propose to allow configuration of the hashed
+ args/kwargs itself, which would amount to a full redirect of the
+ function call to a known result. It's not that there might not be some
+ use case for this functionality; we simply don't foresee what that
+ would be and decline to prematurely implement such functionality.
+
+ """
+
+ import hashlib
+ import re
+ import typing as ty
+
+ from thds import humenc
+ from thds.core import config
+
+ from ..pipeline_id_mask import (
+     extract_from_docstr,
+     get_pipeline_id,
+     get_pipeline_id_mask,
+     pipeline_id_mask,
+ )
+ from ..uris import lookup_blob_store
+ from .unique_name_for_function import make_unique_name_including_docstring_key, parse_unique_name
+
+
+ class _PipelineMemospaceHandler(ty.Protocol):
+     def __call__(self, __callable_name: str, __runner_prefix: str) -> ty.Optional[str]:
+         ...
+
+
+ _PIPELINE_MEMOSPACE_HANDLERS: ty.List[_PipelineMemospaceHandler] = list()
+
+
+ def add_pipeline_memospace_handlers(*handlers: _PipelineMemospaceHandler) -> None:
+     """Add one or more handlers that will be tested in order to determine whether an
+     application wishes to override all or part of the "pipeline memospace" (the runner
+     prefix plus the pipeline id) for a given fully-qualified function name.
+
+     Does _not_ provide access to the invocation-specific `function_id` information; this
+     capability is not offered by mops.
+     """
+     _PIPELINE_MEMOSPACE_HANDLERS.extend(handlers)
+
+
+ def matching_mask_pipeline_id(pipeline_id: str, callable_regex: str) -> _PipelineMemospaceHandler:
+     """Set the function memospace to be:
+
+     the current runner prefix
+     + the supplied pipeline_id, OR the set pipeline_id (not the
+     pipeline_id_mask!) if the supplied pipeline_id is empty
+     (thus allowing for this to fall back to an auto-generated pipeline_id if you want to force a run)
+     + the callable name (including docstring key).
+
+     Note this uses re.match, which means your regex must match the _beginning_ of the
+     callable name. If you want fullmatch, write your own. :)
+
+     """
+
+     def _handler(callable_name: str, runner_prefix: str) -> ty.Optional[str]:
+         if re.match(callable_regex, callable_name):
+             return lookup_blob_store(runner_prefix).join(
+                 runner_prefix, pipeline_id or get_pipeline_id(), callable_name
+             )
+         return None
+
+     return _handler
+
+
+ def _lookup_pipeline_memospace(runner_prefix: str, callable_name: str) -> ty.Optional[str]:
+     """The pipeline memospace is everything up until but not including the hash of the (args, kwargs) tuple."""
+     try:
+         config_memospace = config.config_by_name(f"mops.memo.{callable_name}.memospace")()
+     except KeyError:
+         config_memospace = ""
+     if config_memospace:
+         return config_memospace
+     for handler in _PIPELINE_MEMOSPACE_HANDLERS:
+         pipeline_memospace = handler(callable_name, runner_prefix)
+         if pipeline_memospace:
+             return pipeline_memospace
+     return None
+
+
+ def make_function_memospace(runner_prefix: str, f: ty.Callable) -> str:
+     callable_name = make_unique_name_including_docstring_key(f)
+     # always default to the function docstring if no other mask is currently provided.
+     with pipeline_id_mask(extract_from_docstr(f, require=False)):
+         return _lookup_pipeline_memospace(runner_prefix, callable_name) or lookup_blob_store(
+             runner_prefix
+         ).join(
+             runner_prefix,
+             get_pipeline_id_mask(),
+             callable_name,
+         )
+
+
+ class MemoUriComponents(ty.NamedTuple):
+     runner_prefix: str
+     pipeline_id: str
+     function_module: str
+     function_name: str
+     function_logic_key: str
+     args_hash: str
+
+
+ def parse_memo_uri(
+     memo_uri: str,
+     runner_prefix: str = "",  # the part up to but not including the pipeline_id
+     separator: str = "/",
+     backward_compat_split: str = "mops2-mpf",
+ ) -> MemoUriComponents:
+     if not runner_prefix:
+         # this is in order to help with backward compatibility for mops summaries that
+         # didn't store any of this. providing memospace is a more precise way to handle this.
+         if backward_compat_split not in memo_uri:
+             raise ValueError("Cannot determine the components of a memo URI with no memospace")
+         parts = memo_uri.split(backward_compat_split, 1)
+         assert len(parts) > 1, parts
+         runner_prefix = separator.join((parts[0].rstrip(separator), backward_compat_split))
+
+     runner_prefix = runner_prefix.rstrip(separator)
+     rest, args_hash = memo_uri.rsplit(separator, 1)  # args hash is last component
+     rest, full_function_name = rest.rsplit(separator, 1)
+     pipeline_id = rest[len(runner_prefix) :]
+     pipeline_id = pipeline_id.strip(separator)
+
+     function_parts = parse_unique_name(full_function_name)
+
+     return MemoUriComponents(
+         runner_prefix,
+         pipeline_id,
+         function_parts.module,
+         function_parts.name,
+         function_parts.function_logic_key,
+         args_hash,
+     )
+
+
+ def args_kwargs_content_address(args_kwargs_bytes: bytes) -> str:
+     return humenc.encode(hashlib.sha256(args_kwargs_bytes).digest())
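To make the three levels described in the module docstring concrete, the sketch below shows how an application might redirect one function's memospace to a previously recorded pipeline_id via the handler mechanism defined in this file. The pipeline id and regex are hypothetical, and the exact 'unique callable name' format comes from unique_name_for_function (not shown in this excerpt).

    from thds.mops.pure.core.memo.function_memospace import (
        add_pipeline_memospace_handlers,
        matching_mask_pipeline_id,
    )

    # Hypothetical: any memoized callable whose unique name starts with "my_pkg."
    # should look up (and write) its memoized results under the pipeline_id that a
    # previous, known-good run used, instead of the application's current pipeline_id.
    add_pipeline_memospace_handlers(
        matching_mask_pipeline_id("known-good-run-2025-02-19", r"my_pkg\.")
    )

    # The resulting function memospace is then roughly:
    #   <runner_prefix>/known-good-run-2025-02-19/<unique-callable-name>
    # and the final memo URI appends the args/kwargs content hash as its last component.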
@@ -0,0 +1,53 @@
+ """Definitions of basic keyfuncs."""
+ import inspect
+ import typing as ty
+
+ from ..types import Args, Kwargs
+
+
+ class Keyfunc(ty.Protocol):
+     """A function which, when called with (c, args, kwargs),
+     returns either the same or a different callable, and the same or
+     different args and kwargs, such that the returned three-tuple is
+     what will get used to construct the full memoization key.
+
+     The args, kwargs returned _must_ be bindable to the parameters of
+     the callable returned. However, the callable will not actually be
+     invoked, so it is not important that they bind in a semantically
+     meaningful way - if you're just trying to drop certain arguments
+     that can't be pickled, your best bet will be to return a `None`
+     placeholder for those.
+
+     The identity function (lambda c, a, k: (c, a, k)) is equivalent to
+     the unchanged default behavior from MemoizingPicklingRunner.
+     """
+
+     def __call__(
+         self, c: ty.Callable, __args: Args, __kwargs: Kwargs
+     ) -> ty.Tuple[ty.Callable, Args, Kwargs]:
+         ... # pragma: nocover
+
+
+ ArgsOnlyKeyfunc = ty.Callable[..., ty.Tuple[Args, Kwargs]]
+
+
+ def args_only(keyfunc: ty.Union[ArgsOnlyKeyfunc, Keyfunc]) -> Keyfunc:
+     def funcpassthrough_keyfunc(
+         c: ty.Callable, args: Args, kwargs: Kwargs
+     ) -> ty.Tuple[ty.Callable, Args, Kwargs]:
+         return c, *keyfunc(*args, **kwargs)  # type: ignore
+
+     return funcpassthrough_keyfunc
+
+
+ def autowrap_args_only_keyfunc(keyfunc: ty.Union[ArgsOnlyKeyfunc, Keyfunc]) -> Keyfunc:
+     """This exists only to 'sweeten' the API, so that in most cases a
+     'normal-looking' function can be passed in that does not have
+     access to the `func` parameter and gets Pythonic access to the
+     splatted args and kwargs, rather than a tuple and a dictionary.
+     """
+     keyfunc_params = inspect.signature(keyfunc).parameters
+     is_full_keyfunc = len(keyfunc_params) == 3 and next(iter(keyfunc_params.values())).name == "c"
+     if is_full_keyfunc:
+         return ty.cast(Keyfunc, keyfunc)
+     return args_only(keyfunc)
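As a usage illustration (the function and argument names are hypothetical), an args-only keyfunc can replace an unpicklable or irrelevant argument with a None placeholder before the memoization key is computed:

    def my_keyfunc(db_client, table_name, limit=None):
        # Hypothetical: drop the unpicklable db_client from the memoization key,
        # keeping only the arguments that actually determine the result.
        return (None, table_name), {"limit": limit}

    keyfunc = autowrap_args_only_keyfunc(my_keyfunc)
    # `keyfunc` now has the (callable, args, kwargs) -> (callable, args, kwargs)
    # shape that the Keyfunc protocol above describes; the callable passes through unchanged.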