thds.mops 3.6.20250219172032 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of thds.mops has been flagged as potentially problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/tools/summarize/cli.py ADDED
@@ -0,0 +1,293 @@
+ import argparse
+ import json
+ import statistics
+ import sys
+ import typing as ty
+ from functools import reduce
+ from pathlib import Path
+ from typing import Dict, List, Literal, Optional, Set, TypedDict
+
+ from thds.mops.pure.core.memo.function_memospace import parse_memo_uri
+ from thds.mops.pure.tools.summarize import run_summary
+
+ SortOrder = Literal["name", "time"]
+
+
+ class FunctionSummary(TypedDict):
+     total_calls: int
+     cache_hits: int
+     executed: int
+     error_count: int
+     timestamps: List[str]
+     runner_prefixes: Set[str]
+     pipeline_ids: Set[str]
+     function_logic_keys: Set[str]
+     invoked_by: List[str]
+     invoker_code_version: List[str]
+     remote_code_version: List[str]
+     total_runtime_minutes: List[float]  # minutes
+     remote_runtime_minutes: List[float]  # minutes
+     uris_in_rvalue: List[str]
+
+
+ def _empty_summary() -> FunctionSummary:
+     return {
+         "total_calls": 0,
+         "cache_hits": 0,
+         "executed": 0,
+         "timestamps": [],
+         "runner_prefixes": set(),
+         "pipeline_ids": set(),
+         "function_logic_keys": set(),
+         "error_count": 0,
+         "invoked_by": list(),
+         "invoker_code_version": list(),
+         "remote_code_version": list(),
+         "total_runtime_minutes": list(),
+         "remote_runtime_minutes": list(),
+         "uris_in_rvalue": list(),
+     }
+
+
+ def _process_log_file(log_file: Path) -> Dict[str, FunctionSummary]:
+     """
+     Process a single JSON log file and return a partial summary.
+     :param log_file: Path to the log file
+     :return: A dictionary with the function names as keys and their execution summaries as values
+     """
+     partial_summary: Dict[str, FunctionSummary] = {}
+     with log_file.open("r") as f:
+         try:
+             log_entry: run_summary.LogEntry = json.load(f)
+         except json.JSONDecodeError:
+             print(f"Error reading log file '{log_file}'")
+             return dict()
+
+     function_name = log_entry["function_name"]
+     if function_name not in partial_summary:
+         partial_summary[function_name] = _empty_summary()
+
+     summary = partial_summary[function_name]
+
+     summary["total_calls"] += 1
+     if log_entry["status"] in ("memoized", "awaited"):
+         summary["cache_hits"] += 1
+     else:
+         summary["executed"] += 1
+     summary["error_count"] += int(log_entry.get("was_error") or 0)
+     summary["timestamps"].append(log_entry["timestamp"])
+     summary["uris_in_rvalue"].extend(log_entry.get("uris_in_rvalue") or tuple())
+
+     mu_parts = parse_memo_uri(
+         log_entry["memo_uri"], runner_prefix=log_entry.get("runner_prefix", "")
+     )
+
+     summary["runner_prefixes"].add(mu_parts.runner_prefix)
+     summary["pipeline_ids"].add(mu_parts.pipeline_id)
+     summary["function_logic_keys"].add(mu_parts.function_logic_key)
+
+     # new metadata stuff below:
+     def append_if_exists(key: str) -> None:
+         if key in log_entry:
+             summary[key].append(log_entry[key])  # type: ignore
+
+     for key in (
+         "invoked_by",
+         "invoker_code_version",
+         "remote_code_version",
+         "total_runtime_minutes",
+         "remote_runtime_minutes",
+     ):
+         append_if_exists(key)
+
+     return partial_summary
+
+
+ def _combine_summaries(
+     acc: Dict[str, FunctionSummary], partial: Dict[str, FunctionSummary]
+ ) -> Dict[str, FunctionSummary]:
+     """
+     Combine two summaries into one
+     :param acc: the accumulator summary
+     :param partial: A partial summary to be combined with the accumulator
+     :return: the combined summary
+     """
+     for function_name, data in partial.items():
+         if function_name not in acc:
+             acc[function_name] = _empty_summary()
+         acc[function_name]["total_calls"] += data["total_calls"]
+         acc[function_name]["cache_hits"] += data["cache_hits"]
+         acc[function_name]["executed"] += data["executed"]
+         acc[function_name]["error_count"] += data["error_count"]
+         acc[function_name]["timestamps"].extend(data["timestamps"])
+         acc[function_name]["runner_prefixes"].update(data["runner_prefixes"])
+         acc[function_name]["pipeline_ids"].update(data["pipeline_ids"])
+         acc[function_name]["function_logic_keys"].update(data["function_logic_keys"])
+         acc[function_name]["uris_in_rvalue"].extend(data["uris_in_rvalue"])
+
+         for key in (
+             "invoked_by",
+             "invoker_code_version",
+             "remote_code_version",
+             "total_runtime_minutes",
+             "remote_runtime_minutes",
+         ):
+             acc[function_name][key].extend(data[key])  # type: ignore
+
+     return acc
+
+
+ def _format_summary(summary: Dict[str, FunctionSummary], sort_by: SortOrder, uri_limit: int = 10) -> str:
+     """
+     Format a summary into a readable report
+     """
+     template = (
+         "Function '{function_name}':\n"
+         "  Total calls: {total_calls}\n"
+         "  Cache hits: {cache_hits}\n"
+         "  Executed: {executed}\n"
+         "  Error count: {error_count}\n"
+         "  Timestamps: {timestamps}\n"
+         "  Runner Prefixes: {runner_prefixes}\n"
+         "  Pipeline IDs: {pipeline_ids}\n"
+         "  Function Logic Keys: {function_logic_keys}\n"
+         "  Function Runtime minutes: {function_runtimes}\n"
+         "  Wall clock minutes: {wall_clock_runtimes}\n"
+         "  Invoked by: {invokers}\n"
+         "  Invoker code versions: {invoker_code_version}\n"
+         "  Remote code versions: {remote_code_version}\n"
+     )
+     report_lines = []
+
+     sorted_items = (
+         sorted(summary.items(), key=lambda item: item[0])
+         if sort_by == "name"
+         else sorted(summary.items(), key=lambda item: min(item[1]["timestamps"]))
+     )
+
+     for function_name, data in sorted_items:
+
+         def first_and_last_n(
+             obj_set: ty.Collection[str], n: int
+         ) -> ty.Tuple[ty.List[str], ty.List[str], int]:
+             """take the first n and the last n, unless they would overlap, in which case take the whole list"""
+             if len(obj_set) <= n * 2:
+                 return list(obj_set), list(), 0
+             obj_list = list(obj_set)
+             return obj_list[:n], obj_list[-n:], len(obj_set) - n * 2
+
+         def and_more(obj_set: ty.Collection[str], max_count: int = 4) -> str:
+             if max_count < 1:
+                 return ""
+             if max_count == 1:
+                 max_count = 2  # stupid, but keeps the code simpler.
+             the_first, the_last, remaining_count = first_and_last_n(obj_set, max_count // 2)
+             return ", ".join(
+                 [
+                     *the_first,
+                     *([f"...skipping {remaining_count} more..."] if remaining_count else list()),
+                     *the_last,
+                 ]
+             )
+
+         def describe(fs: FunctionSummary, key: str) -> str:
+             numlist: ty.List[float] = fs[key]  # type: ignore
+             if not numlist:
+                 return ""
+
+             avg = sum(numlist) / len(numlist)
+             maxi = max(numlist)
+             mini = min(numlist)
+             pstddev = statistics.pstdev(numlist)
+             return f"avg: {avg:.2f}, min: {mini:.2f}, max: {maxi:.2f}, pstdev: {pstddev:.2f}"
+
+         report_lines.append(
+             template.format(
+                 function_name=function_name,
+                 total_calls=data["total_calls"],
+                 cache_hits=data["cache_hits"],
+                 executed=data["executed"],
+                 error_count=data["error_count"],
+                 timestamps=and_more(sorted(data["timestamps"])),
+                 runner_prefixes=and_more(data["runner_prefixes"]),
+                 pipeline_ids=", ".join(data["pipeline_ids"]),
+                 function_logic_keys=", ".join(data["function_logic_keys"]),
+                 function_runtimes=describe(data, "remote_runtime_minutes"),
+                 wall_clock_runtimes=describe(data, "total_runtime_minutes"),
+                 invokers=", ".join(sorted(set(data["invoked_by"]))),
+                 invoker_code_version=", ".join(sorted(set(data["invoker_code_version"]))),
+                 remote_code_version=", ".join(sorted(set(data["remote_code_version"]))),
+             )
+         )
+         n_uris = and_more(
+             sorted(data["uris_in_rvalue"]),
+             max_count=uri_limit if uri_limit >= 0 else sys.maxsize,
+         ).replace(", ", "\n     ")
+         if n_uris:
+             report_lines.append(f"  URIs in return value(s):\n     {n_uris}\n")
+     return "\n".join(report_lines)
+
+
+ def _auto_find_run_directory() -> ty.Optional[Path]:
+     mops_root = run_summary.MOPS_SUMMARY_DIR()
+     if not mops_root.exists():
+         raise ValueError(f"No mops summary root directory found at {mops_root}.")
+     if not mops_root.is_dir():
+         raise RuntimeError(
+             "Mops summary root is not a directory! "
+             f"Delete {mops_root} to allow mops to recreate it on the next run."
+         )
+     for directory in sorted(mops_root.iterdir(), key=lambda x: x.name, reverse=True):
+         if directory.is_dir() and list(directory.glob("*.json")):
+             # needs to have some files for it to count for anything
+             return directory
+
+     print("No pipeline run directories found.")
+     return None
+
+
+ def summarize(
+     run_directory: Optional[str] = None, sort_by: SortOrder = "name", uri_limit: int = 10
+ ) -> None:
+     run_directory_path = Path(run_directory) if run_directory else _auto_find_run_directory()
+     if not run_directory_path:
+         return
+
+     print(f"Summarizing pipeline run '{run_directory_path}'\n")
+     log_files = list(run_directory_path.glob("*.json"))
+
+     partial_summaries = map(_process_log_file, log_files)
+
+     summary: Dict[str, FunctionSummary] = reduce(_combine_summaries, partial_summaries, {})
+
+     report = _format_summary(summary, sort_by, uri_limit)
+     print(report)
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Summarize mops pipeline run logs.")
+     parser.add_argument(
+         "run_directory",
+         nargs="?",
+         type=str,
+         default=None,
+         help="Path to the pipeline run directory. If not provided, the latest run directory will be used.",
+     )
+     parser.add_argument(
+         "--sort-by",
+         choices=["name", "time"],
+         default="time",
+         help="Sort the summary by function name or by the first call time",
+     )
+     parser.add_argument(
+         "--uri-limit",
+         type=int,
+         default=10,
+         help=(
+             "Limit the number of Source URIs printed in the summary for each function."
+             " Grep for lines beginning with 5 spaces to get only the URIs."
+             " Negative numbers (e.g. -1) mean no limit."
+         ),
+     )
+     args = parser.parse_args()
+     summarize(args.run_directory, args.sort_by, args.uri_limit)
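For quick reference, the `summarize` function added above can also be driven programmatically rather than through the `main()` argparse entry point. A minimal sketch, assuming the module path from the file list (`thds/mops/pure/tools/summarize/cli.py`); the explicit run-directory path below is a hypothetical example of a directory created by `run_summary`:

```python
# Sketch only: summarize() is defined in the diff above; the run-directory
# path is hypothetical.
from thds.mops.pure.tools.summarize.cli import summarize

# No directory given: auto-discovers the newest run directory under the
# configured summary root (.mops/summary by default), sorted by first call time.
summarize(None, sort_by="time")

# Explicit directory, sorted by function name, printing every URI
# (a negative uri_limit means no limit, per the --uri-limit help text).
summarize(".mops/summary/2025-02-19T17:20:32-pid1234-alice", sort_by="name", uri_limit=-1)
```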
thds/mops/pure/tools/summarize/run_summary.py ADDED
@@ -0,0 +1,143 @@
+ import datetime as dt
+ import json
+ import os
+ import pickle
+ import typing as ty
+ import uuid
+ from pathlib import Path
+
+ from thds.core import config, log, pickle_visit, source
+ from thds.mops.pure.core.memo import function_memospace
+ from thds.mops.pure.core.metadata import get_invoked_by
+ from thds.mops.pure.core.types import T
+
+ from ...core import metadata
+
+ MOPS_SUMMARY_DIR = config.item("thds.mops.summary.dir", default=Path(".mops/summary"), parse=Path)
+ RUN_NAME = config.item(
+     "thds.mops.summary.run_name",
+     default=f"{dt.datetime.utcnow().isoformat()}-pid{os.getpid()}-{get_invoked_by()}",
+ )
+
+ InvocationType = ty.Literal["memoized", "invoked", "awaited"]
+
+ logger = log.getLogger(__name__)
+
+
+ class LogEntryV1(ty.TypedDict):
+     function_name: str
+     memo_uri: str
+     timestamp: str  # more or less "when did this complete?"
+     status: InvocationType  # old name that we're retaining for compatibility
+
+
+ class LogEntry(LogEntryV1, total=False):
+     runner_prefix: str  # includes env and any prefixes like mops2-mpf
+     pipeline_id: str
+     function_logic_key: str
+     was_error: bool
+
+     total_runtime_minutes: float
+     remote_runtime_minutes: float
+     invoked_by: str
+     invoker_code_version: str
+     remote_code_version: str
+
+     uris_in_rvalue: ty.List[str]
+
+
+ def create_mops_run_directory() -> Path:
+     # Define the root directory for mops logs
+     mops_root = MOPS_SUMMARY_DIR()
+     # Use run name if set, otherwise fallback to orchestrator datetime
+     run_name = RUN_NAME()
+     # Create a subdirectory named with the orchestrator datetime and run identifier
+     run_directory = mops_root / run_name
+     try:
+         run_directory.mkdir(parents=True, exist_ok=True)
+     except Exception:
+         if mops_root.exists() and not mops_root.is_dir():
+             # this is going to cause errors later on!
+             logger.error(
+                 f"mops summary directory must be a directory: '{mops_root}'"
+                 " Please delete this file and allow mops to recreate it!"
+             )
+         else:
+             raise
+
+     return run_directory
+
+
+ def _generate_log_filename(run_directory: Path) -> Path:
+     """Generate a log filename using the current timestamp and a short UUID, ensuring uniqueness"""
+     timestamp = dt.datetime.utcnow().strftime("%Y%m%d%H%M%S")
+     short_uuid = str(uuid.uuid4())[:8]
+     filename = f"{timestamp}-{short_uuid}.json"
+     return run_directory / filename
+
+
+ def _extract_source_uris(result: ty.Any) -> ty.Set[str]:
+     sources: ty.List[source.Source] = list()
+
+     def extract_source(unknown: ty.Any) -> None:
+         if isinstance(unknown, source.Source):
+             sources.append(unknown)
+
+     try:
+         pickle_visit.recursive_visit(extract_source, result)
+     except pickle.PicklingError:
+         pass
+     except Exception as exc:
+         logger.warning(f'Unexpected error trying to extract source URIs from "%s"; {exc}', result)
+
+     return {source.uri for source in sources}
+
+
+ def log_function_execution(
+     run_directory: ty.Optional[Path],
+     func: ty.Callable[..., T],
+     memo_uri: str,
+     itype: InvocationType,
+     metadata: ty.Optional[metadata.ResultMetadata] = None,
+     runner_prefix: str = "",
+     was_error: bool = False,
+     return_value: ty.Any = None,
+ ) -> None:
+     if not run_directory:
+         logger.debug("Not writing function summary for %s", memo_uri)
+         return
+
+     log_file = _generate_log_filename(run_directory)
+     func_module = func.__module__
+     func_name = func.__name__
+     full_function_name = f"{func_module}:{func_name}"
+
+     parts = function_memospace.parse_memo_uri(memo_uri, runner_prefix)
+
+     log_entry: LogEntry = {
+         "function_name": full_function_name,
+         "memo_uri": memo_uri,
+         "runner_prefix": parts.runner_prefix,
+         "pipeline_id": parts.pipeline_id,
+         "function_logic_key": parts.function_logic_key,
+         "timestamp": dt.datetime.utcnow().isoformat(),
+         "status": itype,
+         "was_error": was_error,
+     }
+     if metadata:
+         log_entry["total_runtime_minutes"] = metadata.result_wall_minutes
+         log_entry["remote_runtime_minutes"] = metadata.remote_wall_minutes
+         log_entry["invoked_by"] = metadata.invoked_by
+         log_entry["invoker_code_version"] = metadata.invoker_code_version
+         log_entry["remote_code_version"] = metadata.remote_code_version
+         # we don't bother with invoked_at or remote_started_at because they can be
+         # inferred from the timestamp and the wall times
+     if source_uris := _extract_source_uris(return_value):
+         log_entry["uris_in_rvalue"] = sorted(source_uris)
+
+     try:
+         assert not log_file.exists(), f"Log file '{log_file}' should not already exist"
+         with log_file.open("w") as f:
+             json.dump(log_entry, f, indent=2)
+     except Exception:
+         logger.exception(f"Unable to write mops function invocation log file at '{log_file}'")
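To make the schema above concrete: each invocation produces one JSON file in the run directory, shaped like the `LogEntry` TypedDict. An illustrative example as a Python literal; every value here is made up, and the `memo_uri`/`runner_prefix` shapes are placeholders, since the real format is defined by `parse_memo_uri` elsewhere in the package:

```python
# Illustrative values only -- names, versions, and URI formats are invented.
example_log_entry = {
    "function_name": "my_pipeline.steps:train_model",  # module:qualname
    "memo_uri": "<runner-prefix>/<pipeline-id>/<function-logic-key>/...",  # placeholder shape
    "runner_prefix": "<runner-prefix>",
    "pipeline_id": "my-pipeline",
    "function_logic_key": "train_model-v2",
    "timestamp": "2025-02-19T17:25:01.123456",
    "status": "invoked",  # or "memoized" / "awaited", which count as cache hits
    "was_error": False,
    "total_runtime_minutes": 4.2,
    "remote_runtime_minutes": 3.7,
    "invoked_by": "alice",
    "invoker_code_version": "1.2.3",
    "remote_code_version": "1.2.3",
    "uris_in_rvalue": ["<blob-store-uri-of-an-output>"],
}
```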
thds/mops/py.typed ADDED
File without changes
thds/mops/testing/deferred_imports.py ADDED
@@ -0,0 +1,81 @@
+ import ast
+ import itertools
+ import re
+ import sys
+ import typing as ty
+ from contextlib import contextmanager
+
+ from thds.core.log import getLogger
+
+
+ def module_name_re(modules: ty.Collection[str]) -> ty.Pattern[str]:
+     name = "|".join(modules)
+     return re.compile(rf"^({name})(?:\.|$)")
+
+
+ def module_names_from_import_statement(import_stmt: str) -> ty.Set[str]:
+     statements = ast.parse(import_stmt).body
+
+     def _extract_imports(imp: ty.Any) -> ty.Iterable[str]:
+         names: ty.Iterable[ty.Optional[str]]
+         if isinstance(imp, ast.Import):
+             names = (n.name for n in imp.names)
+         elif isinstance(imp, ast.ImportFrom):
+             names = (imp.module,)
+         else:
+             names = ()
+         return filter(None, names)
+
+     def _extract_ancestors(module: str) -> ty.Iterable[str]:
+         parts = module.split(".")
+         return (".".join(parts[:i]) for i in range(1, len(parts) + 1))
+
+     imported_modules = itertools.chain.from_iterable(map(_extract_imports, statements))
+     all_imported_modules = itertools.chain.from_iterable(map(_extract_ancestors, imported_modules))
+     return set(all_imported_modules)
+
+
+ @contextmanager
+ def clear_and_restore_import_cache(module_name_filter: ty.Callable[[str], ty.Any]) -> ty.Iterator[None]:
+     already_imported = [name for name in sys.modules if module_name_filter(name)]
+     if already_imported:
+         getLogger(__name__).debug(
+             "Clearing the following from sys.modules matching %s:\n  %s",
+             module_name_filter,
+             "\n  ".join(already_imported),
+         )
+     to_restore = {name: sys.modules.pop(name) for name in already_imported}
+     try:
+         yield
+     finally:
+         sys.modules.update(to_restore)
+
+
+ def assert_dev_deps_not_imported(import_statement: str, forbidden_modules: ty.Collection[str]) -> None:
+     """One of the primary features of `mops` is to provide global memoization of pure function calls
+     using remote storage mechanisms. Sometimes, as a library author, you'd like to pre-compute the
+     result of such a function call, memoizing it and making it available to downstream users without
+     requiring them to perform the computation themselves. As such, it is useful to export a public
+     interface where such functions can be imported and called to achieve a cache hit and download the
+     result locally, _without_ requiring that all the dependencies needed to _compute_ the result be
+     present; only `mops` itself need be present to fetch the memoized result. This function can be used
+     in your test suite to assert that this condition is met for any import statements that a downstream
+     user might use to access your memoized functions.
+
+     :param import_statement: The import statement to test, as a string
+     :param forbidden_modules: Module names that should _not_ be imported in the course of executing
+       `import_statement`.
+     :raises AssertionError: When any of the `forbidden_modules` or their submodules were imported in the
+       course of executing `import_statement`
+     """
+     is_forbidden = module_name_re(forbidden_modules).match
+     # ensure that we clear the cache of the actually imported modules, lest we get a spurious pass
+     # due to the interpreter not evaluating them again!
+     imported_modules = module_names_from_import_statement(import_statement)
+     will_be_imported = module_name_re(imported_modules).match
+     with clear_and_restore_import_cache(lambda name: is_forbidden(name) or will_be_imported(name)):
+         exec(import_statement, {}, {})
+     mistakenly_imported = [name for name in sys.modules if is_forbidden(name)]
+     assert (
+         not mistakenly_imported
+     ), f"Modules {', '.join(mistakenly_imported)} were imported on execution of {import_statement!r}"
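A short usage sketch for the testing helper above. `my_pkg` and the forbidden dependency names are hypothetical stand-ins for a downstream library and its heavyweight compute-only dependencies; the helper functions themselves are exactly as defined in the diff:

```python
# Hypothetical example: my_pkg exposes memoized mops functions from my_pkg.api,
# and torch/pandas should only be needed to *compute* results, never to fetch them.
from thds.mops.testing.deferred_imports import (
    assert_dev_deps_not_imported,
    module_names_from_import_statement,
)

# Dotted imports are expanded to include every ancestor package:
assert module_names_from_import_statement("from my_pkg.api import precomputed") == {
    "my_pkg",
    "my_pkg.api",
}


def test_public_api_defers_heavy_imports() -> None:
    # Fails with an AssertionError naming the offenders if executing the import
    # statement transitively imports torch or pandas (or any of their submodules).
    assert_dev_deps_not_imported("from my_pkg.api import precomputed", ["torch", "pandas"])
```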
thds.mops-3.6.20250219172032.dist-info/METADATA ADDED
@@ -0,0 +1,42 @@
+ Metadata-Version: 2.2
+ Name: thds.mops
+ Version: 3.6.20250219172032
+ Summary: ML Ops tools for Trilliant Health
+ Author: Trilliant Health
+ Description-Content-Type: text/markdown
+ Requires-Dist: ansicolors
+ Requires-Dist: azure-core
+ Requires-Dist: azure-identity
+ Requires-Dist: azure-storage-file-datalake
+ Requires-Dist: cachetools
+ Requires-Dist: tblib<3.0.0,>=2.0.0
+ Requires-Dist: thds.adls>=3.1
+ Requires-Dist: thds.core>=1.32
+ Requires-Dist: thds.humenc>=1.0
+ Requires-Dist: tomli; python_version < "3.11"
+ Provides-Extra: k8s
+ Requires-Dist: kubernetes>=18.20; extra == "k8s" and extra == "k8s"
+
+ # `mops`
+
+ `mops` is a Python library for ML Operations.
+
+ `mops` solves for three core issues:
+
+ - Transfer of
+   [pure](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/docs/pure_functions.adoc)
+   function execution to
+   [remote](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/docs/remote.adoc)
+   execution environments with more &| different compute resources
+ - [Efficient](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/docs/optimizations.adoc)
+   transfer of large blob data to/from other environments.
+ - [Memoization](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/docs/memoization.adoc)
+   — i.e. _reproducibility and fault tolerance_ — for individual functions.
+
+ It is used by
+ [decorating or wrapping your pure function and then calling it](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/docs/basic_usage.adoc)
+ like a normal function.
+
+ ### read the docs
+
+ [Browse our full documentation here.](https://github.com/TrilliantHealth/trilliant-data-science/libs/mops/README.adoc)
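The "decorate or wrap, then call" workflow referenced in the README is specified in the linked basic_usage doc; the sketch below is a hypothetical illustration of that shape only, not the actual `thds.mops` API. The decorator name and its internals are invented here:

```python
# HYPOTHETICAL sketch -- not the real thds.mops API. It only illustrates the
# documented workflow: wrap a pure function, then call it like a normal one.
import typing as ty


def memoize_remotely(pure_fn: ty.Callable) -> ty.Callable:  # invented name
    def wrapper(*args: ty.Any, **kwargs: ty.Any) -> ty.Any:
        # A real runner would: hash the function logic + arguments into a memo
        # URI, return the memoized result on a cache hit, and otherwise ship
        # the call to a remote environment and memoize its result.
        return pure_fn(*args, **kwargs)  # local fallback in this sketch
    return wrapper


@memoize_remotely
def tokenize(docs: ty.List[str]) -> ty.List[ty.List[str]]:
    return [d.split() for d in docs]


tokens = tokenize(["a b c", "d e"])  # called like a normal function
```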