thds.mops 3.6.20250219172032 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of thds.mops has been flagged as potentially problematic.

Files changed (111)
  1. thds/mops/__about__.py +8 -0
  2. thds/mops/__init__.py +3 -0
  3. thds/mops/_compat.py +6 -0
  4. thds/mops/_utils/__init__.py +0 -0
  5. thds/mops/_utils/colorize.py +110 -0
  6. thds/mops/_utils/config_tree.py +167 -0
  7. thds/mops/_utils/exception.py +16 -0
  8. thds/mops/_utils/locked_cache.py +78 -0
  9. thds/mops/_utils/names.py +23 -0
  10. thds/mops/_utils/on_slow.py +28 -0
  11. thds/mops/_utils/once.py +30 -0
  12. thds/mops/_utils/temp.py +32 -0
  13. thds/mops/config.py +60 -0
  14. thds/mops/impure/__init__.py +2 -0
  15. thds/mops/impure/keyfunc.py +14 -0
  16. thds/mops/impure/runner.py +73 -0
  17. thds/mops/k8s/__init__.py +27 -0
  18. thds/mops/k8s/_shared.py +3 -0
  19. thds/mops/k8s/apply_yaml.py +22 -0
  20. thds/mops/k8s/auth.py +49 -0
  21. thds/mops/k8s/config.py +37 -0
  22. thds/mops/k8s/container_registry.py +14 -0
  23. thds/mops/k8s/jobs.py +57 -0
  24. thds/mops/k8s/launch.py +234 -0
  25. thds/mops/k8s/logging.py +239 -0
  26. thds/mops/k8s/namespace.py +17 -0
  27. thds/mops/k8s/node_selection.py +58 -0
  28. thds/mops/k8s/retry.py +75 -0
  29. thds/mops/k8s/too_old_resource_version.py +42 -0
  30. thds/mops/k8s/tools/krsync.py +50 -0
  31. thds/mops/k8s/tools/krsync.sh +22 -0
  32. thds/mops/k8s/wait_job.py +72 -0
  33. thds/mops/k8s/warn_image_backoff.py +63 -0
  34. thds/mops/k8s/watch.py +266 -0
  35. thds/mops/meta.json +8 -0
  36. thds/mops/parallel.py +36 -0
  37. thds/mops/pure/__init__.py +43 -0
  38. thds/mops/pure/_magic/__init__.py +0 -0
  39. thds/mops/pure/_magic/api.py +114 -0
  40. thds/mops/pure/_magic/sauce.py +152 -0
  41. thds/mops/pure/_magic/shims.py +34 -0
  42. thds/mops/pure/adls/__init__.py +1 -0
  43. thds/mops/pure/adls/_files.py +22 -0
  44. thds/mops/pure/adls/blob_store.py +185 -0
  45. thds/mops/pure/adls/output_fqn.py +17 -0
  46. thds/mops/pure/core/__init__.py +0 -0
  47. thds/mops/pure/core/content_addressed.py +31 -0
  48. thds/mops/pure/core/deferred_work.py +83 -0
  49. thds/mops/pure/core/entry/__init__.py +2 -0
  50. thds/mops/pure/core/entry/main.py +47 -0
  51. thds/mops/pure/core/entry/route_result.py +66 -0
  52. thds/mops/pure/core/entry/runner_registry.py +31 -0
  53. thds/mops/pure/core/file_blob_store.py +120 -0
  54. thds/mops/pure/core/lock/__init__.py +7 -0
  55. thds/mops/pure/core/lock/_acquire.py +192 -0
  56. thds/mops/pure/core/lock/_funcs.py +37 -0
  57. thds/mops/pure/core/lock/cli.py +73 -0
  58. thds/mops/pure/core/lock/maintain.py +150 -0
  59. thds/mops/pure/core/lock/read.py +39 -0
  60. thds/mops/pure/core/lock/types.py +37 -0
  61. thds/mops/pure/core/lock/write.py +136 -0
  62. thds/mops/pure/core/memo/__init__.py +6 -0
  63. thds/mops/pure/core/memo/function_memospace.py +267 -0
  64. thds/mops/pure/core/memo/keyfunc.py +53 -0
  65. thds/mops/pure/core/memo/overwrite_params.py +61 -0
  66. thds/mops/pure/core/memo/results.py +103 -0
  67. thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
  68. thds/mops/pure/core/metadata.py +230 -0
  69. thds/mops/pure/core/output_naming.py +52 -0
  70. thds/mops/pure/core/partial.py +15 -0
  71. thds/mops/pure/core/pipeline_id.py +62 -0
  72. thds/mops/pure/core/pipeline_id_mask.py +79 -0
  73. thds/mops/pure/core/script_support.py +25 -0
  74. thds/mops/pure/core/serialize_big_objs.py +73 -0
  75. thds/mops/pure/core/serialize_paths.py +149 -0
  76. thds/mops/pure/core/source.py +291 -0
  77. thds/mops/pure/core/types.py +142 -0
  78. thds/mops/pure/core/uris.py +81 -0
  79. thds/mops/pure/core/use_runner.py +47 -0
  80. thds/mops/pure/joblib/__init__.py +1 -0
  81. thds/mops/pure/joblib/backend.py +81 -0
  82. thds/mops/pure/joblib/batching.py +67 -0
  83. thds/mops/pure/pickling/__init__.py +3 -0
  84. thds/mops/pure/pickling/_pickle.py +193 -0
  85. thds/mops/pure/pickling/memoize_only.py +22 -0
  86. thds/mops/pure/pickling/mprunner.py +173 -0
  87. thds/mops/pure/pickling/pickles.py +149 -0
  88. thds/mops/pure/pickling/remote.py +145 -0
  89. thds/mops/pure/pickling/sha256_b64.py +71 -0
  90. thds/mops/pure/runner/__init__.py +0 -0
  91. thds/mops/pure/runner/local.py +239 -0
  92. thds/mops/pure/runner/shim_builder.py +25 -0
  93. thds/mops/pure/runner/simple_shims.py +21 -0
  94. thds/mops/pure/runner/strings.py +1 -0
  95. thds/mops/pure/runner/types.py +28 -0
  96. thds/mops/pure/tools/__init__.py +0 -0
  97. thds/mops/pure/tools/history.py +35 -0
  98. thds/mops/pure/tools/inspect.py +372 -0
  99. thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
  100. thds/mops/pure/tools/stress.py +63 -0
  101. thds/mops/pure/tools/summarize/__init__.py +4 -0
  102. thds/mops/pure/tools/summarize/cli.py +293 -0
  103. thds/mops/pure/tools/summarize/run_summary.py +143 -0
  104. thds/mops/py.typed +0 -0
  105. thds/mops/testing/__init__.py +0 -0
  106. thds/mops/testing/deferred_imports.py +81 -0
  107. thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
  108. thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
  109. thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
  110. thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
  111. thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
thds/mops/pure/tools/history.py
@@ -0,0 +1,35 @@
+ """Find out how long a run took by looking at outputs to ADLS."""
+
+ import typing as ty
+ from datetime import timezone
+
+ from thds.adls.global_client import get_global_fs_client
+
+ from ..adls._files import yield_files
+
+
+ def summarize(sa: str, container: str, pipeline_root_dir: str) -> ty.Dict[str, ty.Any]:
+     times = list()
+     durations = list()
+     total_functions = 0
+     for azure_file in yield_files(get_global_fs_client(sa, container), pipeline_root_dir):
+         if azure_file.name.endswith("invocation"):
+             total_functions += 1
+             times.append(azure_file.creation_time)
+             last_modified = azure_file.last_modified.replace(tzinfo=timezone.utc)
+             durations.append(last_modified - azure_file.creation_time)
+
+     durations = sorted(durations)
+     times = sorted(times)
+
+     start = times[0]
+     end = times[-1]
+
+     max_duration = durations[-1]
+     return dict(
+         start=start,
+         end=end,
+         duration=end - start,
+         slowest_file_upload=max_duration,
+         total_functions=total_functions,
+     )
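For orientation, here is a minimal sketch of how the `summarize` function above might be called. The storage account, container, and pipeline root names are hypothetical placeholders, and a real call needs ADLS credentials plus an actual mops output tree:

```python
# Hypothetical usage of summarize() from history.py above.
# "mystorageaccount", "data", and "mops-root/my-pipeline" are placeholders.
from thds.mops.pure.tools.history import summarize

stats = summarize("mystorageaccount", "data", "mops-root/my-pipeline")
print(
    f"ran {stats['total_functions']} functions from {stats['start']} to {stats['end']}"
    f" ({stats['duration']}); slowest file upload took {stats['slowest_file_upload']}"
)
```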
thds/mops/pure/tools/inspect.py
@@ -0,0 +1,372 @@
+ """Inspect mops control files and unpickle them for debugging.
+
+ Note that this really only works with ADLS-like Blob Stores, and
+ only with the MemoizingPicklingRunner, which is the only implementation
+ we have as of 2024-09-24, and will probably be the only implementation ever...
+ but if you're reading this in the distant future - those are its limitations.
+ """
+
+ import argparse
+ import functools
+ import os
+ import re
+ import subprocess
+ import typing as ty
+ from dataclasses import dataclass
+ from pathlib import Path
+ from pprint import pprint
+
+ from thds import adls
+ from thds.core import log, scope, tmp
+ from thds.mops.parallel import Thunk
+ from thds.mops.pure.core import uris
+ from thds.mops.pure.core.memo import results
+ from thds.mops.pure.pickling._pickle import (
+     CallableUnpickler,
+     read_metadata_and_object,
+     unfreeze_args_kwargs,
+ )
+ from thds.mops.pure.pickling.pickles import Invocation
+ from thds.mops.pure.runner import strings
+
+ logger = log.getLogger(__name__)
+
+
+ class _MopsInspectPrettyPartial(functools.partial):
+     def __repr__(self) -> str:
+         return f"partial({self.func.__name__}, {self.args}, {self.keywords})"
+
+     def __rich_repr__(self) -> ty.Iterable[ty.Tuple[str, ty.Any]]:
+         """I don't much like how partial does its repr. Especially with nested partials,
+         it becomes almost impossible to follow.
+         """
+         yield "function", self.func.__name__
+         yield "args", self.args
+         yield "keywords", self.keywords
+
+
+ class PartialViewingUnpickler(CallableUnpickler):
+     def find_class(self, module: str, name: str) -> ty.Any:
+         if module == "functools" and name == "partial":
+             return _MopsInspectPrettyPartial
+         return super().find_class(module, name)
+
+
+ def _unpickle_object_for_debugging(uri: str) -> ty.Any:
+     try:
+         if uri.endswith("/" + strings.INVOCATION):
+             _no_header, invoc_raw = read_metadata_and_object(strings.INVOCATION, uri)
+             invoc = ty.cast(Invocation, invoc_raw)
+             args, kwargs = unfreeze_args_kwargs(invoc.args_kwargs_pickle, PartialViewingUnpickler)
+             return Thunk(getattr(invoc, "f", None) or invoc.func, *args, **kwargs)
+         header, obj = read_metadata_and_object("output", uri)
+         return obj, header
+     except ImportError as ie:
+         logger.error(f"Could not import the module ({ie}) needed to unpickle the object.")
+         logger.error("Try re-running this tool in the environment where the above module is available.")
+         raise
+
+
+ def _resolved_uri(uri: str) -> str:
+     if not uri:
+         return ""
+     if fqn := adls.uri.resolve_uri(uri):
+         return str(fqn)
+     return uri
+
+
+ _KNOWN_CONTROL_FILES = [strings.INVOCATION, results.RESULT, results.EXCEPTION]
+
+ # prefix with forward-slash because these live in a blob store 'directory'
+
+
+ @dataclass
+ class IRE:
+     invocation: ty.Any
+     result: ty.Any  # a.k.a. return_value
+     exception: ty.Any
+
+
+ _NOTHING = object()
+
+
+ def _control_uri(uri: str) -> str:
+     for control_file in _KNOWN_CONTROL_FILES:
+         if uri.endswith("/" + control_file):
+             return control_file
+     return ""
+
+
+ @scope.bound
+ def get_control_file(uri: str) -> ty.Any:
+     """Returns _NOTHING if 'normal' errors occur."""
+     try:
+         uri = _resolved_uri(uri)
+     except Exception as e:
+         logger.error(f"Error while resolving {uri}: {e}")
+         return _NOTHING
+
+     if not _control_uri(uri):
+         fs = uris.lookup_blob_store(uri)
+         logger.debug(f"Attempting to fetch all control files for {uri}")
+         return IRE(**{cf: get_control_file(fs.join(uri, cf)) for cf in _KNOWN_CONTROL_FILES})
+
+     has_storage_root = bool(uris.ACTIVE_STORAGE_ROOT())
+     try:
+         scope.enter(uris.ACTIVE_STORAGE_ROOT.set(uris.get_root(uri)))
+         return _unpickle_object_for_debugging(uri)
+     except Exception as e:
+         if uris.lookup_blob_store(uri).is_blob_not_found(e):
+             if has_storage_root or uri not in str(e):
+                 logger.warning(str(e))
+             return None
+         logger.exception("Unexpected error while unpickling the object.")
+         raise
+
+
+ def _embed(o: object) -> None:
+     print('\nObject will be available as "o". Perform embedded URI fetches with "get_control_file"\n')
+     try:
+         __import__("IPython").embed()
+     except ImportError:
+         print("IPython not found, falling back to standard Python shell.")
+         import code
+
+         code.interact(local=locals())
+
+
+ def _pprint(obj: object, file: ty.Any = None, uri: str = "") -> None:
+     if uri:
+         print(uri, file=file)
+
+     try:
+         from rich import console, pretty  # type: ignore[import]
+
+         if file:
+             console.Console(file=file, color_system=None).print(
+                 pretty.Pretty(
+                     obj,  # highlighter=lambda x: x if file else None
+                 )
+             )
+         else:
+             pretty.pprint(obj)
+     except ModuleNotFoundError:
+         pprint(obj, indent=4, width=60, sort_dicts=False, stream=file)
+
+
+ def inspect(uri: str, embed: bool = False) -> ty.Any:
+     obj = get_control_file(uri)
+     if obj is _NOTHING:
+         return
+
+     if embed:
+         _embed(obj)
+     else:
+         print()
+         _pprint(obj)
+     return obj
+
+
+ def inspect_and_log(memo_uri: str) -> None:
+     inspect(memo_uri)
+     logger.error(
+         "A required result was not found."
+         " You can compare the above output with other invocations"
+         f" by running `mops-inspect {memo_uri}`"
+         " in your local Python environment."
+     )
+
+
+ @dataclass
+ class Ignores:
+     permanent_ignores_file: Path
+     known_ignores: ty.Set[str]
+
+     def __post_init__(self) -> None:
+         self.permanent_ignores_file.parent.mkdir(parents=True, exist_ok=True)
+         if not self.permanent_ignores_file.exists():
+             self.permanent_ignores_file.touch()
+         self.known_ignores = set(filter(None, open(self.permanent_ignores_file).read().splitlines()))
+
+     def ignore_uri(self, ignore_uri: str) -> None:
+         self.known_ignores.add(ignore_uri)
+         # possible race condition here if multiple runs of mops-inspect are happening
+         # in parallel?
+         with open(self.permanent_ignores_file, "a") as wf:
+             wf.write(ignore_uri + "\n")
+
+     def __contains__(self, uri: str) -> bool:
+         return uri in self.known_ignores
+
+
+ @dataclass
+ class Matches:
+     must_match: ty.List[str]
+     must_not_match: ty.List[str]
+
+     def add_regex(self, regex: str) -> ty.Literal["ignore", "match"]:
+         """These are not permanent"""
+         if regex.startswith("!"):
+             self.must_not_match.append(regex[1:])
+             return "ignore"
+
+         self.must_match.append(regex)
+         return "match"
+
+     def matches(self, ire_str: str) -> bool:
+         for regex in self.must_not_match:
+             if re.search(regex, ire_str):
+                 logger.debug('Ignoring because of regex: "%s"', regex)
+                 return False
+
+         if not self.must_match:
+             logger.debug("No regexes must match")
+             return True
+
+         all_match = all(re.search(regex, ire_str) for regex in self.must_match)
+         if all_match:
+             logger.debug("Matches all required regexes")
+             return True
+
+         logger.debug("Does not match all of the %d required regexes.", len(self.must_match))
+         return False
+
+
+ _IGNORES = Ignores(Path("~/.mops-inspect-ignores").expanduser(), set())
+ _MATCHES = Matches(list(), list())
+ DIFF_TOOL = os.environ.get("DIFF_TOOL") or "difft"  # nicer diffs by default
+
+
+ def _check_diff_tool() -> None:
+     global DIFF_TOOL
+     try:
+         subprocess.run([DIFF_TOOL, "--version"], check=True, capture_output=True)
+     except subprocess.CalledProcessError:
+         logger.warning("You may want to `brew install difft` for nicer diffs.")
+         DIFF_TOOL = "diff"
+
+
+ def _run_diff_tool(path_old: Path, path_new: Path) -> None:
+     subprocess.run([DIFF_TOOL, str(path_old), str(path_new)], check=True)
+
+
+ def _write_ire_to_path(ire: IRE, path: Path, uri: str) -> None:
+     with open(path, "w") as wf:
+         _pprint(ire, file=wf, uri=uri)
+
+
+ def _diff_memospace(uri: str, new_control: IRE) -> None:
+     """Diff all siblings in the memospace against the new invocation.
+
+     Ignore any that have been ignored previously.
+     """
+     # this code operates on the assumption that you've provided
+     # it with the 'new' invocation, and you're trying to figure out
+     # what is 'new' as compared to other 'existing' (old) invocations.
+     # Therefore, the 'green' highlighted text will be the 'new' invocation,
+     # and the red will be all the old ones that we loop over below.
+     fs = uris.lookup_blob_store(uri)
+
+     control_type = _control_uri(uri)
+     memospace_uri = fs.join(*fs.split(uri)[: -2 if control_type else -1])
+     # go up two levels to find the memospace if necessary.
+
+     path_new = scope.enter(tmp.temppath_same_fs())
+     _write_ire_to_path(new_control, path_new, uri)
+
+     logger.info(f"Diffing against all siblings in the memospace {memospace_uri}")
+
+     def sibling_menu(sibling_uri: str) -> None:
+         choice = input(
+             "Enter to continue, Ctrl-C to quit, `i` to permanently ignore this URI,"
+             " or type a regex to filter future results (prefix with ! to find non-matches, otherwise will find matches: "
+         )
+         if "i" == choice.lower():
+             _IGNORES.ignore_uri(sibling_uri)
+         elif choice:
+             regex = choice
+             type = _MATCHES.add_regex(regex)
+             logger.info(f"Added <{type}> regex /{regex}/")
+
+     sibling_uris = fs.list(memospace_uri)  # type: ignore
+     found_siblings = False
+
+     for sibling_uri in sibling_uris:
+         if uri.startswith(sibling_uri):
+             continue
+
+         found_siblings = True
+         sibling_uri = sibling_uri.rstrip("/")
+
+         if sibling_uri in _IGNORES:
+             continue
+
+         full_uri = fs.join(sibling_uri, control_type)
+         control_sibling = get_control_file(full_uri)
+         with tmp.temppath_same_fs() as path_sibling:
+             _write_ire_to_path(control_sibling, path_sibling, full_uri)
+             if not _MATCHES.matches(path_sibling.read_text()):
+                 continue
+
+             _run_diff_tool(path_sibling, path_new)
+
+         sibling_menu(sibling_uri)
+
+     if not found_siblings:
+         logger.warning(
+             f"No memospace siblings found for '{memospace_uri}'"
+             " - check your pipeline ID, function-logic-key (if any),"
+             " and whether you're running in prod or dev."
+         )
+
+
+ @scope.bound
+ def _inspect_uri(uri: str, diff_memospace: bool, embed: bool) -> None:
+     uri = _resolved_uri(uri)
+     ire_curr = inspect(uri, embed)  # print the main uri
+
+     if diff_memospace:
+         _diff_memospace(uri, ire_curr)
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description=__doc__)
+     parser.add_argument(
+         "uri",
+         type=str,
+         help="The URI of the first object to inspect. Can be adls:// or https:// or even abfss://",
+     )
+     parser.add_argument(
+         "--diff-memospace",
+         "-d",
+         action="store_true",
+         help=(
+             "Find the diff between the invocation at the provided URI,"
+             " and all other invocations that match the same function memospace."
+             " This will only work if your Blob Store is capable of listing files."
+             " It is highly recommended that you `brew install difftastic` to get more precise diffs."
+         ),
+     )
+     parser.add_argument(
+         "--loop",
+         action="store_true",
+         help="Keep prompting for URIs to inspect - basically just an embedded while loop.",
+     )
+     parser.add_argument("--embed", action="store_true", help="Embed an IPython shell after inspection.")
+     args = parser.parse_args()
+     args.uri = args.uri.rstrip("/")
+     if args.diff_memospace:
+         _check_diff_tool()
+
+     _inspect_uri(args.uri, args.diff_memospace, args.embed)
+
+     if args.loop:
+         prompt = "\nEnter another URI to inspect, or empty string to exit: "
+         uri = input(prompt)
+         while uri:
+             _inspect_uri(uri, args.diff_memospace, args.embed)
+             uri = input(prompt)
+
+
+ if __name__ == "__main__":
+     main()
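The `PartialViewingUnpickler` above relies on a standard `pickle.Unpickler` hook: overriding `find_class` lets the loader substitute a friendlier class whenever the stream references `functools.partial`. A self-contained sketch of that general pattern, with made-up names and no mops dependencies:

```python
import functools
import io
import pickle


class PrettyPartial(functools.partial):
    """Hypothetical stand-in with a readable repr, like _MopsInspectPrettyPartial."""

    def __repr__(self) -> str:
        return f"partial({self.func.__name__}, {self.args}, {self.keywords})"


class PartialSwappingUnpickler(pickle.Unpickler):
    def find_class(self, module: str, name: str):
        # Whenever the stream references functools.partial, hand back our subclass.
        if module == "functools" and name == "partial":
            return PrettyPartial
        return super().find_class(module, name)


payload = pickle.dumps(functools.partial(pow, 2, mod=7))
obj = PartialSwappingUnpickler(io.BytesIO(payload)).load()
print(repr(obj))  # partial(pow, (2,), {'mod': 7})
```

Because the substitution happens at load time, the pickled bytes are untouched; only the in-memory view of the object changes.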
thds/mops/pure/tools/sha256_b64_addressed.py
@@ -0,0 +1,40 @@
+ """Upload a file to the location under a given storage root where a
+ pathlib.Path would be put by the MemoizingPicklingFunctionRunner.
+ """
+
+ import argparse
+ from pathlib import Path
+
+ from thds.adls.defaults import mops_root
+
+ from ..._utils.once import Once
+ from ..core import uris
+ from ..core.serialize_paths import CoordinatingPathSerializer, human_sha256b64_file_at_paths
+ from ..pickling import sha256_b64
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description=__doc__)
+
+     parser.add_argument("file", help="Must be an actual file")
+     parser.add_argument(
+         "--upload-root-uri",
+         "-u",
+         help=f"Actually upload, using this URI as storage root. Example: {mops_root()}",
+     )
+
+     args = parser.parse_args()
+
+     the_path = Path(args.file)
+     human_hash = human_sha256b64_file_at_paths(the_path)
+
+     print(human_hash)
+
+     if args.upload_root_uri:
+         storage_root = args.upload_root_uri.rstrip("/") + "/"
+         with uris.ACTIVE_STORAGE_ROOT.set(storage_root):
+             CoordinatingPathSerializer(sha256_b64.Sha256B64PathStream(), Once())(the_path)
+
+
+ if __name__ == "__main__":
+     main()
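The tool above prints a "human" sha256-b64 name for a file. As a rough illustration of the underlying idea (not the package's exact encoding, which lives in `thds.mops.pure.pickling.sha256_b64` and may use a different alphabet or padding), a content address can be derived with nothing but the standard library:

```python
# Sketch of sha256-b64 content addressing using only the standard library.
# The exact encoding thds.mops uses may differ.
import base64
import hashlib
from pathlib import Path


def sha256_b64_of_file(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        # Hash in chunks so large files don't need to fit in memory.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    # URL-safe base64 so the digest can appear in a URI path segment.
    return base64.urlsafe_b64encode(h.digest()).decode("ascii")


# e.g. sha256_b64_of_file(Path("some-file.bin"))  # hypothetical file
```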
thds/mops/pure/tools/stress.py
@@ -0,0 +1,63 @@
+ import subprocess
+ import time
+ import typing as ty
+ from timeit import default_timer
+
+ from thds.adls import defaults
+ from thds.core.log import getLogger
+ from thds.mops._utils.colorize import colorized
+ from thds.mops.config import max_concurrent_network_ops
+ from thds.mops.parallel import Thunk, parallel_yield_results
+ from thds.mops.pure import MemoizingPicklingRunner, use_runner
+
+ BROWN = colorized(fg="brown", bg="black")
+
+ logger = getLogger(__name__)
+
+
+ def _subprocess_remote(args_list: ty.Sequence[str]) -> None:
+     logger.info(f"Invoking 'remote' runner with args {args_list}")
+     subprocess.run(args_list)
+     logger.info("Completed 'remote' runner")
+
+
+ runner = MemoizingPicklingRunner(_subprocess_remote, defaults.mops_root)
+ adls_shim = use_runner(runner)
+
+
+ @adls_shim
+ def run_and_sleep(i: int, data: ty.List[float], sleep: int) -> float:
+     """Runs 'remotely' - arguments are pickled and passed via ADLS; result is returned via ADLS."""
+     the_sum = sum(data)
+     print(BROWN(f"remote {i} - sum: {the_sum} - sleeping!"))
+     time.sleep(sleep)
+     return the_sum
+
+
+ def stress(max_clients: int, n: int, sleep: int) -> None:
+     """MemoizingPicklingRunner will perform 4 local ADLS operations (1 file
+     exists, 1 push, 1 file exists and 1 file pull) per task. The
+     remote runner will perform 2 more ADLS operations, which in this
+     case will also be occurring on the local machine, using a
+     different client per runner. This gives a total of 6 ADLS
+     operations for this test, whereas a properly remote worker would
+     allow those 2 remote operations to be offloaded.
+
+     The computation by definition takes N seconds, but can in theory
+     be perfectly parallelized, so this gives some idea of how the
+     overhead of launching and retrieving task results increases as the
+     length of the task decreases relative to the number of total tasks.
+     """
+     start = default_timer()
+     with max_concurrent_network_ops.set_local(max_clients):
+         tasks = [Thunk(run_and_sleep, i, list(range(i * n, (i + 1) * n)), sleep) for i in range(n)]
+
+         assert len(list(parallel_yield_results(tasks))) == n
+
+     total = default_timer() - start
+     print(
+         f"With max_clients {max_clients}; n {n}; sleep {sleep}, took {total:.1f} seconds,"
+         f" which is {total/n:.2f} seconds per task."
+         " Prior experiments have found this to stabilize with increasing N in the vicinity of 0.2 seconds"
+         " of overhead per task as long as the # of tasks dominates (>=20x) the length (in seconds) of the tasks."
+     )
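An illustrative way to drive the stress test above; the parameter values are arbitrary choices, not recommendations from the package, and a real run requires ADLS access since arguments and results round-trip through the configured blob store:

```python
# Hypothetical invocation of stress() from stress.py above:
# 100 tasks, each sleeping 1 second, with at most 16 concurrent network ops.
from thds.mops.pure.tools.stress import stress

stress(max_clients=16, n=100, sleep=1)
```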
thds/mops/pure/tools/summarize/__init__.py
@@ -0,0 +1,4 @@
+ from .cli import main
+
+ if __name__ == "__main__":
+     main()