thds.mops 3.6.20250219172032__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of thds.mops may be problematic; see the details below.
- thds/mops/__about__.py +8 -0
- thds/mops/__init__.py +3 -0
- thds/mops/_compat.py +6 -0
- thds/mops/_utils/__init__.py +0 -0
- thds/mops/_utils/colorize.py +110 -0
- thds/mops/_utils/config_tree.py +167 -0
- thds/mops/_utils/exception.py +16 -0
- thds/mops/_utils/locked_cache.py +78 -0
- thds/mops/_utils/names.py +23 -0
- thds/mops/_utils/on_slow.py +28 -0
- thds/mops/_utils/once.py +30 -0
- thds/mops/_utils/temp.py +32 -0
- thds/mops/config.py +60 -0
- thds/mops/impure/__init__.py +2 -0
- thds/mops/impure/keyfunc.py +14 -0
- thds/mops/impure/runner.py +73 -0
- thds/mops/k8s/__init__.py +27 -0
- thds/mops/k8s/_shared.py +3 -0
- thds/mops/k8s/apply_yaml.py +22 -0
- thds/mops/k8s/auth.py +49 -0
- thds/mops/k8s/config.py +37 -0
- thds/mops/k8s/container_registry.py +14 -0
- thds/mops/k8s/jobs.py +57 -0
- thds/mops/k8s/launch.py +234 -0
- thds/mops/k8s/logging.py +239 -0
- thds/mops/k8s/namespace.py +17 -0
- thds/mops/k8s/node_selection.py +58 -0
- thds/mops/k8s/retry.py +75 -0
- thds/mops/k8s/too_old_resource_version.py +42 -0
- thds/mops/k8s/tools/krsync.py +50 -0
- thds/mops/k8s/tools/krsync.sh +22 -0
- thds/mops/k8s/wait_job.py +72 -0
- thds/mops/k8s/warn_image_backoff.py +63 -0
- thds/mops/k8s/watch.py +266 -0
- thds/mops/meta.json +8 -0
- thds/mops/parallel.py +36 -0
- thds/mops/pure/__init__.py +43 -0
- thds/mops/pure/_magic/__init__.py +0 -0
- thds/mops/pure/_magic/api.py +114 -0
- thds/mops/pure/_magic/sauce.py +152 -0
- thds/mops/pure/_magic/shims.py +34 -0
- thds/mops/pure/adls/__init__.py +1 -0
- thds/mops/pure/adls/_files.py +22 -0
- thds/mops/pure/adls/blob_store.py +185 -0
- thds/mops/pure/adls/output_fqn.py +17 -0
- thds/mops/pure/core/__init__.py +0 -0
- thds/mops/pure/core/content_addressed.py +31 -0
- thds/mops/pure/core/deferred_work.py +83 -0
- thds/mops/pure/core/entry/__init__.py +2 -0
- thds/mops/pure/core/entry/main.py +47 -0
- thds/mops/pure/core/entry/route_result.py +66 -0
- thds/mops/pure/core/entry/runner_registry.py +31 -0
- thds/mops/pure/core/file_blob_store.py +120 -0
- thds/mops/pure/core/lock/__init__.py +7 -0
- thds/mops/pure/core/lock/_acquire.py +192 -0
- thds/mops/pure/core/lock/_funcs.py +37 -0
- thds/mops/pure/core/lock/cli.py +73 -0
- thds/mops/pure/core/lock/maintain.py +150 -0
- thds/mops/pure/core/lock/read.py +39 -0
- thds/mops/pure/core/lock/types.py +37 -0
- thds/mops/pure/core/lock/write.py +136 -0
- thds/mops/pure/core/memo/__init__.py +6 -0
- thds/mops/pure/core/memo/function_memospace.py +267 -0
- thds/mops/pure/core/memo/keyfunc.py +53 -0
- thds/mops/pure/core/memo/overwrite_params.py +61 -0
- thds/mops/pure/core/memo/results.py +103 -0
- thds/mops/pure/core/memo/unique_name_for_function.py +70 -0
- thds/mops/pure/core/metadata.py +230 -0
- thds/mops/pure/core/output_naming.py +52 -0
- thds/mops/pure/core/partial.py +15 -0
- thds/mops/pure/core/pipeline_id.py +62 -0
- thds/mops/pure/core/pipeline_id_mask.py +79 -0
- thds/mops/pure/core/script_support.py +25 -0
- thds/mops/pure/core/serialize_big_objs.py +73 -0
- thds/mops/pure/core/serialize_paths.py +149 -0
- thds/mops/pure/core/source.py +291 -0
- thds/mops/pure/core/types.py +142 -0
- thds/mops/pure/core/uris.py +81 -0
- thds/mops/pure/core/use_runner.py +47 -0
- thds/mops/pure/joblib/__init__.py +1 -0
- thds/mops/pure/joblib/backend.py +81 -0
- thds/mops/pure/joblib/batching.py +67 -0
- thds/mops/pure/pickling/__init__.py +3 -0
- thds/mops/pure/pickling/_pickle.py +193 -0
- thds/mops/pure/pickling/memoize_only.py +22 -0
- thds/mops/pure/pickling/mprunner.py +173 -0
- thds/mops/pure/pickling/pickles.py +149 -0
- thds/mops/pure/pickling/remote.py +145 -0
- thds/mops/pure/pickling/sha256_b64.py +71 -0
- thds/mops/pure/runner/__init__.py +0 -0
- thds/mops/pure/runner/local.py +239 -0
- thds/mops/pure/runner/shim_builder.py +25 -0
- thds/mops/pure/runner/simple_shims.py +21 -0
- thds/mops/pure/runner/strings.py +1 -0
- thds/mops/pure/runner/types.py +28 -0
- thds/mops/pure/tools/__init__.py +0 -0
- thds/mops/pure/tools/history.py +35 -0
- thds/mops/pure/tools/inspect.py +372 -0
- thds/mops/pure/tools/sha256_b64_addressed.py +40 -0
- thds/mops/pure/tools/stress.py +63 -0
- thds/mops/pure/tools/summarize/__init__.py +4 -0
- thds/mops/pure/tools/summarize/cli.py +293 -0
- thds/mops/pure/tools/summarize/run_summary.py +143 -0
- thds/mops/py.typed +0 -0
- thds/mops/testing/__init__.py +0 -0
- thds/mops/testing/deferred_imports.py +81 -0
- thds.mops-3.6.20250219172032.dist-info/METADATA +42 -0
- thds.mops-3.6.20250219172032.dist-info/RECORD +111 -0
- thds.mops-3.6.20250219172032.dist-info/WHEEL +5 -0
- thds.mops-3.6.20250219172032.dist-info/entry_points.txt +7 -0
- thds.mops-3.6.20250219172032.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import typing as ty
|
|
3
|
+
|
|
4
|
+
from ..types import Args, Kwargs
|
|
5
|
+
|
|
6
|
+
F = ty.TypeVar("F", bound=ty.Callable)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def argument_transformer(
    func: F,
    names_to_transforms: ty.Mapping[str, ty.Callable[[ty.Any], ty.Any]],
) -> ty.Callable[[Args, Kwargs], ty.Tuple[Args, Kwargs]]:
    """Overwrite bound arguments at call time with the value returned by each
    named transform; every name must correspond to a parameter of ``func``.

    Each transform receives the 'raw' argument value and may return it
    unchanged or substitute something else - a mapping of identity functions
    (``foo=lambda x: x``) is therefore functionally a no-op.

    This is not a decorator, but a decorator can easily be built on top of it.
    """
    param_names = list(inspect.signature(func).parameters)
    unknown = set(names_to_transforms) - set(param_names)
    if unknown:
        # fail fast rather than letting misspelled parameter names go unnoticed
        raise ValueError(f"The function {func} does not have parameters {unknown}")

    def xf_args_kwargs(args: Args, kwargs: Kwargs) -> ty.Tuple[Args, Kwargs]:
        def transform_positional(index: int, value: ty.Any) -> ty.Any:
            # map a positional index back to its parameter name, if any
            if index < len(param_names):
                transform = names_to_transforms.get(param_names[index])
                if transform is not None:
                    return transform(value)
            return value

        new_args = [transform_positional(i, v) for i, v in enumerate(args)]
        new_kwargs = {
            name: names_to_transforms[name](value) if name in names_to_transforms else value
            for name, value in kwargs.items()
        }
        return new_args, new_kwargs

    return xf_args_kwargs
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def parameter_overwriter(
    func: F, names_to_values: ty.Mapping[str, ty.Any]
) -> ty.Callable[[Args, Kwargs], ty.Tuple[Args, Kwargs]]:
    """Overwrite parameters without regard to the actual argument values."""
    # Bind each constant via a default argument so every transform closes over
    # its own value (avoids the classic late-binding-lambda pitfall).
    constant_transforms = {
        name: (lambda _arg, _v=value: _v) for name, value in names_to_values.items()
    }
    return argument_transformer(func, constant_transforms)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Sometimes you want to require that a memoized result exists.
|
|
2
|
+
|
|
3
|
+
A Runner should hook into this system to enforce that upon itself.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import typing as ty
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
|
|
10
|
+
from thds.core import config, log, stack_context
|
|
11
|
+
|
|
12
|
+
from ...._utils import colorize
|
|
13
|
+
from ..uris import lookup_blob_store
|
|
14
|
+
|
|
15
|
+
# Non-empty => memoized results are required; the string doubles as the
# user-facing reason shown when a required result is missing.
_REQUIRE_ALL_RESULTS = config.item("require_all_results", default="")
# Name of an environment variable that, when present in os.environ,
# disables the requirement for the current stack (see _should_require_result).
_UNLESS_ENV = stack_context.StackContext("results_unless_env", "")
# please do not set the above globally unless you really, truly know what you're doing.
logger = log.getLogger(__name__)
ORANGE = colorize.colorized("#FF8200")  # highlight color for memo URIs in error messages

# Sentinel meaning 'results are required, but no custom message was provided'.
_NO_MSG = "xxxx-REQUIRED-xxxx"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@contextmanager
def require_all(message: str = _NO_MSG, *, unless_env: str = "") -> ty.Iterator[None]:
    """Require memoized results for everything below this point on the stack.

    ``unless_env`` names an environment variable; when it is present in
    ``os.environ`` the requirement is skipped.  An empty ``message`` still
    forces results to be required (the sentinel is substituted for it).
    """
    effective_message = message or _NO_MSG
    with _REQUIRE_ALL_RESULTS.set_local(effective_message):
        with _UNLESS_ENV.set(unless_env):
            try:
                yield
            except RequiredResultNotFound as err:
                # re-raise so the traceback starts cleanly from the point
                # where the requirement was declared
                raise RequiredResultNotFound(err.args[0], err.uri) from err
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# _REQUIRED_FUNC_NAMES = set()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# def required(func: ty.Callable) -> None:
|
|
43
|
+
# pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _should_require_result(memo_uri: str = "") -> str:
    """Return the active requirement message, or '' when results are not required.

    ``memo_uri`` is currently unused but kept for interface stability.
    """
    msg = _REQUIRE_ALL_RESULTS()
    if not msg:
        return ""  # no requirement is active at all
    override_env = _UNLESS_ENV()
    if override_env and override_env in os.environ:
        return ""  # the escape-hatch env var is set; do not require results
    if override_env:
        return (
            f"{msg}; Note that you can set the environment variable"
            f" {override_env} to skip this check."
        )
    return msg
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class Success(ty.NamedTuple):
    """A memoized result exists; carries the URI of the stored value."""

    value_uri: str
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class Error(ty.NamedTuple):
    """A memoized exception exists; carries the URI of the stored exception."""

    exception_uri: str
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Blob names (joined onto a memo URI) under which a runner stores the outcome
# of an invocation: the successful value, or the raised exception.
RESULT = "result"
EXCEPTION = "exception"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class RequiredResultNotFound(Exception):
    """Raised when a result was required (see require_all) but is absent."""

    def __init__(self, message: str, uri: str):
        self.uri = uri  # the memo URI whose result was missing
        super().__init__(message)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def check_if_result_exists(
    memo_uri: str,
    rerun_excs: bool = False,
    before_raise: ty.Callable[[], ty.Any] = lambda: None,
) -> ty.Union[None, Success, Error]:
    """Look for a stored result (or stored exception) under ``memo_uri``.

    Returns Success/Error wrapping the blob URI, or None when nothing usable
    is stored.  When a requirement is active on the stack (see require_all),
    a missing result raises RequiredResultNotFound; ``before_raise`` is
    invoked just before raising.  ``rerun_excs`` skips the stored-exception
    check entirely so the caller re-runs instead of re-raising.
    """
    store = lookup_blob_store(memo_uri)

    success_uri = store.join(memo_uri, RESULT)
    if store.exists(success_uri):
        return Success(success_uri)

    # No stored result - this is fatal if results are currently required.
    requirement = _should_require_result(memo_uri)
    if requirement:  # might be custom or the default sentinel
        before_raise()
        msg = f"Required a result for {ORANGE(memo_uri)} but that result was not found"
        if requirement != _NO_MSG:
            # only append genuinely custom messages, not the sentinel
            msg = f"{msg}: {requirement}"
        raise RequiredResultNotFound(msg, memo_uri)

    if rerun_excs:
        return None  # caller prefers a re-run over a stored exception

    exc_uri = store.join(memo_uri, EXCEPTION)
    return Error(exc_uri) if store.exists(exc_uri) else None
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Provides for precise cache invalidation via a known key in the docstring.
|
|
2
|
+
|
|
3
|
+
`function-logic-key`:
|
|
4
|
+
|
|
5
|
+
By modifying this key in the docstring of a use_runner-decorated
|
|
6
|
+
function or callable class, you can indicate to mops that although the
|
|
7
|
+
function name has not changed (perhaps because of refactoring
|
|
8
|
+
concerns), and although the parameters may be the same and take the
|
|
9
|
+
same values as a previous run, nevertheless the (value of the)
|
|
10
|
+
internal logic has changed and therefore no previously memoized
|
|
11
|
+
results should be returned when running this function.
|
|
12
|
+
|
|
13
|
+
This step is entirely optional and is expected only to be used for
|
|
14
|
+
advanced use cases where optimal caching/memoization is required.
|
|
15
|
+
|
|
16
|
+
The keys may be any string without spaces, but ideally should be
|
|
17
|
+
semantically meaningful to another developer or yourself, i.e. a name
|
|
18
|
+
or description of some sort. Examples might be:
|
|
19
|
+
|
|
20
|
+
function-logic-key: v1
|
|
21
|
+
function-logic-key: 2023-03-31
|
|
22
|
+
function-logic-key: try-uint8-math
|
|
23
|
+
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import inspect
|
|
27
|
+
import re
|
|
28
|
+
import typing as ty
|
|
29
|
+
from functools import lru_cache
|
|
30
|
+
|
|
31
|
+
from thds.mops._utils.names import full_name_and_callable
|
|
32
|
+
|
|
33
|
+
# Matches 'function-logic-key: <key>' anywhere in a docstring; the key is any
# run of non-whitespace characters (DOTALL lets the leading .* span lines).
_DOCSTRING_VERSION_RE = re.compile(r".*function-logic-key:\s+(?P<version>[^\s]+)\b", re.DOTALL)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _parse_logic_key(doc: str) -> str:
    """Extract the function-logic-key value from *doc*, or '' when absent."""
    match = _DOCSTRING_VERSION_RE.match(doc)
    if match is None:
        return ""
    return match.group("version")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def extract_function_logic_key_from_docstr(obj: ty.Any) -> str:
    """Return the logic key embedded in *obj*'s docstring, or ''."""
    docstring = getattr(obj, "__doc__", None) or ""
    return _parse_logic_key(docstring)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Backward-compatible alias for the shorter historical name.
extract_logic_key_from_docstr = extract_function_logic_key_from_docstr
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@lru_cache(maxsize=None)
def make_unique_name_including_docstring_key(f: ty.Any) -> str:
    """Return '<module--name>@<logic-key>' for *f*.

    The logic key is parsed from the docstring of the underlying callable
    (see module docstring); when no key is present the trailing '@' is
    stripped, leaving just the module-qualified name.
    """
    module_and_name, underlying = full_name_and_callable(f)
    # Read __doc__ directly instead of scanning every member with
    # inspect.getmembers, which was O(members) and could evaluate properties;
    # the old loop also shadowed the builtin `callable`.
    version = _parse_logic_key(getattr(underlying, "__doc__", None) or "")
    return f"{module_and_name}@{version}".rstrip("@")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class FunctionComponents(ty.NamedTuple):
    """The parsed pieces of a unique function name: '<module>--<name>[@<key>]'."""

    module: str
    name: str
    function_logic_key: str


def parse_unique_name(full_function_name: str) -> FunctionComponents:
    """Inverse of make_unique_name_including_docstring_key.

    Splits only on the FIRST '--' and '@' (maxsplit=1): the original
    unbounded splits raised ValueError/TypeError whenever the name or logic
    key itself contained an extra separator.  Previously-valid inputs parse
    identically.
    """
    assert "--" in full_function_name, f"Expected '--' in {full_function_name}"
    module, name = full_function_name.split("--", 1)
    if "@" not in name:
        return FunctionComponents(module, name, "")
    bare_name, logic_key = name.split("@", 1)
    return FunctionComponents(module, bare_name, logic_key)
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""This is where we put implementation details having to do with the new metadata system in
|
|
2
|
+
v3 of mops.
|
|
3
|
+
|
|
4
|
+
Metadata is anything that is not critical to the core operation of mops but is useful for
|
|
5
|
+
debugging, monitoring, or other purposes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import getpass
|
|
10
|
+
import os
|
|
11
|
+
import typing as ty
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
|
|
15
|
+
from thds.core import calgitver, config, hostname
|
|
16
|
+
|
|
17
|
+
try:
    _CALGITVER = calgitver.calgitver()
except calgitver.git.NO_GIT:
    # not running inside a git checkout - fall back to empty and let the
    # config items / env vars below supply a version instead
    _CALGITVER = ""


# Code version reported by the invoking process; defaults to local CalGitVer.
INVOKER_CODE_VERSION = config.item("mops.metadata.local.invoker_code_version", _CALGITVER)
# Semantic identity of 'who' invoked; empty means derive user@host at call time.
INVOKED_BY = config.item("mops.metadata.local.invoked_by", "")
REMOTE_CODE_VERSION = config.item("mops.metadata.remote.code_version", "")
# set the remote code version inside your docker image or other environment.
|
|
28
|
+
|
|
29
|
+
def get_invoker_code_version() -> str:
    """The configured invoker code version, defaulting to 'unknown'."""
    configured = INVOKER_CODE_VERSION()
    if configured:
        return configured
    return "unknown"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_invoked_by() -> str:
    """The configured invoker identity, defaulting to '<user>@<friendly-host>'."""
    configured = INVOKED_BY()
    if configured:
        return configured
    return f"{getpass.getuser()}@{hostname.friendly()}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class InvocationMetadata:
    """Metadata describing one function invocation.

    Values may not contain spaces (enforced when formatting; see
    _format_metadata).
    """

    # When the invocation happened.
    invoked_at: datetime
    # A semantic identifier of 'who' called the function; passed recursively
    # to nested invocations.
    invoked_by: str
    # Version of the code doing the invoking (usually a CalGitVer string).
    invoker_code_version: str
    # ^ Collectively: the 'ABC's of the invocation metadata.

    # The writer_uuid from the lock.
    invoker_uuid: str

    # Not _just_ metadata (pipeline_id participates in memoization), but it
    # is convenient to carry alongside the debugging/monitoring values.
    pipeline_id: str

    @staticmethod
    def new(pipeline_id: str, invoked_at: datetime, invoker_uuid: str) -> "InvocationMetadata":
        """Build metadata for a fresh invocation, filling in local defaults."""
        return InvocationMetadata(
            invoked_at=invoked_at,
            invoked_by=get_invoked_by(),
            invoker_code_version=get_invoker_code_version(),
            invoker_uuid=invoker_uuid,
            pipeline_id=pipeline_id,
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_remote_code_version(invoker_code_version: str) -> str:
    """Best-available code version for the remote side.

    Preference order: explicit config, then the CALGITVER / THDS_APP_VERSION
    env vars (THDS-internal fallbacks), then whatever the invoker reported
    (useful in a local-run context).
    """
    configured = REMOTE_CODE_VERSION()
    if configured:
        return configured
    # these env var fallbacks are specifically for THDS internal use;
    # control is exposed via the official config item.
    for envvar in ("CALGITVER", "THDS_APP_VERSION"):
        value = os.getenv(envvar)
        if value:
            return value
    return invoker_code_version
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class ResultMetadata(InvocationMetadata):
    """InvocationMetadata plus what we learned once the remote run finished."""

    remote_code_version: str
    remote_started_at: datetime
    remote_ended_at: datetime
    # The wall-clock figures below are derivable from the datetimes above but
    # are precomputed in minutes (a more human-friendly unit) for convenience.
    remote_wall_minutes: float  # remote_started_at -> remote_ended_at
    result_wall_minutes: float  # invoked_at -> remote_ended_at

    @staticmethod
    def from_invocation(
        invocation_metadata: InvocationMetadata, started_at: datetime, ended_at: datetime
    ) -> "ResultMetadata":
        """Extend *invocation_metadata* with remote timing information."""
        remote_minutes = (ended_at - started_at).total_seconds() / 60
        total_minutes = (ended_at - invocation_metadata.invoked_at).total_seconds() / 60
        return ResultMetadata(
            **vars(invocation_metadata),
            remote_code_version=get_remote_code_version(invocation_metadata.invoker_code_version),
            remote_started_at=started_at,
            remote_ended_at=ended_at,
            remote_wall_minutes=remote_minutes,
            result_wall_minutes=total_minutes,
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def invocation_metadata_parser(
    parser: ty.Optional[argparse.ArgumentParser] = None,
) -> argparse.ArgumentParser:
    """Add the --invoked-* / --pipeline-id metadata options to *parser*.

    A fresh ArgumentParser is created when none is supplied.
    """
    if parser is None:
        parser = argparse.ArgumentParser()
    option_specs: ty.List[ty.Tuple[str, ty.Dict[str, ty.Any]]] = [
        (
            "--invoked-by",
            dict(
                help="Who invoked this function. Will be used recursively (for nested functions).",
                required=True,
            ),
        ),
        (
            "--invoker-code-version",
            dict(
                help="The version of the code that is running. Usually a CalGitVer, but can be any non-empty string.",
                required=True,
            ),
        ),
        (
            "--invoked-at",
            dict(
                help="The time at which this function was invoked. Should be an ISO8601 timestamp.",
                type=datetime.fromisoformat,
                required=True,
            ),
        ),
        (
            "--invoker-uuid",
            dict(help="The UUID of the invoker. This is generally the writer UUID from the lock."),
        ),
        ("--pipeline-id", dict(required=True)),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def result_metadata_parser() -> argparse.ArgumentParser:
    """An invocation_metadata_parser extended with the remote-result options."""
    parser = invocation_metadata_parser()
    remote_option_specs: ty.List[ty.Tuple[str, ty.Dict[str, ty.Any]]] = [
        (
            "--remote-code-version",
            dict(
                help="The version of the code that ran remotely. Usually a CalGitVer, but can be any non-empty string.",
            ),
        ),
        (
            "--remote-started-at",
            dict(
                help="The time at which this function started. Should be an ISO8601 timestamp.",
                type=datetime.fromisoformat,
                required=True,
            ),
        ),
        (
            "--remote-ended-at",
            dict(
                help="The time at which this function ended. Should be an ISO8601 timestamp.",
                type=datetime.fromisoformat,
                required=True,
            ),
        ),
        (
            "--remote-wall-minutes",
            dict(
                help="The computed wall time in minutes between the remote start and end times.",
                type=float,
            ),
        ),
        (
            "--result-wall-minutes",
            dict(
                help="The computed wall time in minutes between the remote end and the invocation time.",
                type=float,
            ),
        ),
    ]
    for flag, kwargs in remote_option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def parse_invocation_metadata_args(args: ty.Sequence[str]) -> InvocationMetadata:
    """Parse metadata args from the command line.

    Metadata args are of the form --key-name=value; unrecognized arguments
    are ignored (parse_known_args).
    """
    namespace, _unrecognized = invocation_metadata_parser().parse_known_args(args)
    return InvocationMetadata(**vars(namespace))
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def parse_result_metadata(metadata_keyvals: ty.Sequence[str]) -> ResultMetadata:
    """Parse metadata values from a result list.

    Each entry is 'key=value'; empty entries are skipped.  Usually you'll be
    splitting an initial text string on newline, but we don't do that for
    you here.
    """

    def as_cli_flag(keyval: str) -> str:
        # turn 'some_key=value' into '--some-key=value' for argparse
        key, value = keyval.split("=", 1)
        return f"--{key.replace('_', '-')}={value}"

    cli_args = [as_cli_flag(kv) for kv in metadata_keyvals if kv]
    parsed = result_metadata_parser().parse_args(cli_args)
    return ResultMetadata(**vars(parsed))
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _format_metadata(
    metadata: ty.Union[InvocationMetadata, ResultMetadata], prefix: str
) -> ty.List[str]:
    """Render metadata as '<prefix><key-with-dashes>=<value>' strings.

    Used both for CLI args (prefix='--') and for the header in a result
    payload (prefix='').  Datetimes become ISO8601; values containing a
    space raise ValueError; None/'' values are omitted entirely.
    """

    def render(value: ty.Any) -> str:
        rendered = value.isoformat() if isinstance(value, datetime) else str(value)
        if " " in rendered:
            raise ValueError(f"Metadata value {rendered} contains a space. This is illegal")
        return rendered

    lines = []
    for key, value in vars(metadata).items():
        if value is None or value == "":
            continue
        lines.append(f"{prefix}{key.replace('_', '-')}={render(value)}")
    return lines
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def format_invocation_cli_args(metadata: InvocationMetadata) -> ty.List[str]:
    """Render *metadata* as '--key-name=value' command-line arguments."""
    return _format_metadata(metadata, prefix="--")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def format_result_header(metadata: ResultMetadata) -> str:
    """Render *metadata* as 'key-name=value' lines for a result payload header.

    Includes separating newlines and a trailing newline.
    """
    return "\n".join(_format_metadata(metadata, prefix="")) + "\n"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def format_end_of_run_times(start_timestamp: float, maybe_metadata_args: ty.Sequence[str]) -> str:
    """Build a human-readable wait/total-time suffix for end-of-run logging.

    Best-effort: returns '' whenever the metadata args cannot be parsed or
    the times cannot be computed.
    """
    import time

    try:
        invocation = parse_invocation_metadata_args(maybe_metadata_args)
        invoked_ts = invocation.invoked_at.timestamp()
        waited = (start_timestamp - invoked_ts) / 60
        total = (time.time() - invoked_ts) / 60
        return (
            f" (waited {waited:.2f} minutes, total time {total:.2f} minutes)"
            f" - version: {invocation.invoker_code_version}"
        )
    except Exception:
        # purely informational output - never let it break the run
        return ""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
## utilities for providing remote context for naming things uniquely:
|
|
2
|
+
import typing as ty
|
|
3
|
+
|
|
4
|
+
from thds.core.stack_context import StackContext
|
|
5
|
+
|
|
6
|
+
from . import types, uris
|
|
7
|
+
|
|
8
|
+
# Supplied by a runner: a key unique across all _separate_ functions running
# within a given pipeline id (see pipeline_function_invocation_unique_key).
PipelineFunctionUniqueKey = StackContext("Mops2PipelineFunctionUniqueKey", default="")
# Supplied by a runner: a key unique for every unique invocation of that
# same function (an args/kwargs hash).
FunctionArgumentsHashUniqueKey = StackContext("Mops2FunctionArgumentsHashUniqueKey", default="")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def pipeline_function_invocation_unique_key() -> ty.Optional[ty.Tuple[str, str]]:
    """Return the (function key, invocation key) pair, or None outside a runner.

    A runner may provide values for the underlying components; when it does,
    the first string is unique across all _separate_ functions running within
    a given pipeline id, and the second is unique for every unique invocation
    of that same function.  If your code is not running inside a mops runner,
    or the runner does not supply these values, you get None.
    """
    function_key = PipelineFunctionUniqueKey()
    invocation_key = FunctionArgumentsHashUniqueKey()
    if not function_key or not invocation_key:
        # either element missing means we don't have anything usable
        return None
    return function_key, invocation_key
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def invocation_output_uri(storage_root: uris.UriIsh = "", name: str = "") -> str:
    """Derive a unique output URI for the current runner invocation.

    If your function only outputs a single blob, you can safely use this
    without providing a name; with multiple outputs from the same invocation,
    provide a meaningful name for each one.

    Resulting layout (example):
        <root>/<pipeline+function key>/<name>--<args,kwargs hash>/<name>
    """
    root = str(storage_root or uris.ACTIVE_STORAGE_ROOT())
    invocation_key = pipeline_function_invocation_unique_key()
    if invocation_key is None:
        raise types.NotARunnerContext(
            "`invocation_output_uri` must be used in a `thds.mops.pure` runner context."
        )
    function_part, args_hash_part = invocation_key
    leaf_dir = "--".join(filter(None, [name, args_hash_part]))
    # the name is repeated as the final path element so that the last part of
    # the path also carries a file extension
    return uris.lookup_blob_store(root).join(root, function_part, leaf_dir, name)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Sometimes you need to 'unwrap' a partial. This will let you do that.
|
|
2
|
+
import typing as ty
|
|
3
|
+
from functools import partial
|
|
4
|
+
|
|
5
|
+
from .types import Args, Kwargs, T
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def unwrap_partial(
    func: ty.Callable[..., T], args: Args, kwargs: Kwargs
) -> ty.Tuple[ty.Callable[..., T], Args, Kwargs]:
    """Peel away functools.partial layers, merging their bound arguments.

    Each layer's positional args are prepended and call-time kwargs win over
    the partial's keywords - matching how calling the partial would behave.
    """
    unwrapped = func
    while isinstance(unwrapped, partial):
        args = unwrapped.args + tuple(args)
        kwargs = {**unwrapped.keywords, **kwargs}
        unwrapped = unwrapped.func
    return unwrapped, args, kwargs
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# This file must not import anything else from `remote.core` - it is a 'leaf' of our tree
|
|
2
|
+
# because it is depended upon by so many other things.
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
from thds.core import hostname, log, meta
|
|
7
|
+
|
|
8
|
+
from ..._utils.colorize import colorized
|
|
9
|
+
|
|
10
|
+
# this is a global instead of a StackContext because we _do_ want it
# to spill over automatically into new threads.
_PIPELINE_ID = ""  # '' means 'not yet set/generated'
logger = log.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def __set_or_generate_pipeline_id_if_empty() -> None:
    """Populate the global pipeline id.

    Prefers '<repo-or-image-name>/<short-commit>' when the checkout is clean;
    otherwise generates '<friendly-hostname>/<timestamp>-p<pid>' and warns
    loudly so the run can be traced later.
    """
    unique_name = meta.get_repo_name() or os.getenv("THDS_DOCKER_IMAGE_NAME") or ""
    short_commit = meta.get_commit()[:7] if meta.is_clean() else ""
    if unique_name and short_commit:
        set_pipeline_id(f"{unique_name}/{short_commit}")
        return

    # host name can be a group/directory now
    timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    generated = f"{hostname.friendly()}/{timestamp}-p{os.getpid()}"
    logger.warning(
        colorized(fg="black", bg="yellow")(f"Generated pipeline id '{generated}' for this run")
    )
    set_pipeline_id(generated)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_pipeline_id() -> str:
    """Return the global pipeline id, generating one on first use.

    Once generated, the id is not regenerated, though it can be replaced via
    set_pipeline_id (and masked per-stack elsewhere).
    """
    if _PIPELINE_ID:
        return _PIPELINE_ID
    __set_or_generate_pipeline_id_if_empty()
    assert _PIPELINE_ID
    return _PIPELINE_ID
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def set_pipeline_id(new_pipeline_id: str) -> None:
    """Override the current global pipeline id.

    Empty strings are quietly ignored, since we always want a value here.
    """
    global _PIPELINE_ID
    if new_pipeline_id:
        _PIPELINE_ID = new_pipeline_id
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Public API for masking the mops pipeline id.
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
import typing as ty
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from functools import lru_cache
|
|
8
|
+
|
|
9
|
+
from thds.core.stack_context import StackContext
|
|
10
|
+
|
|
11
|
+
from .pipeline_id import get_pipeline_id
|
|
12
|
+
|
|
13
|
+
# Stack-local override for the pipeline id; '' means 'no mask active'.
_PIPELINE_ID_MASK = StackContext("PIPELINE_ID_MASK", "")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_pipeline_id_mask() -> str:
    """Returns the 'current' pipeline id, preferring a stack-local mask over
    the global (see pipeline_id.get_pipeline_id)."""
    return _PIPELINE_ID_MASK() or get_pipeline_id()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@contextmanager
def pipeline_id_mask(pipeline_id: str) -> ty.Iterator[bool]:
    """Set the pipeline id for the current stack - outermost mask wins.

    If a mask is already active, this call is a no-op and yields False;
    otherwise the mask is applied and True is yielded.  That outermost-wins
    pattern is very useful for libraries that want to define a default
    pipeline_id for their use_runner-decorated functions.

    Usable as a decorator as well as a context manager, thanks to the magic
    of @contextmanager. 🤯  Note that context-manager use does not propagate
    into threads launched inside the block; to cross thread boundaries,
    prefer decorating the actual function that runs in the thread.
    """
    already_masked = bool(_PIPELINE_ID_MASK())
    if already_masked:
        yield False
        return
    with _PIPELINE_ID_MASK.set(pipeline_id):
        yield True
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
F = ty.TypeVar("F", bound=ty.Callable)
# Matches 'pipeline-id: <value>' anywhere in a docstring; the value is any
# run of non-whitespace characters.
_DOCSTRING_MASK_RE = re.compile(r".*pipeline-id(?:-mask)?:\s*(?P<pipeline_id>[^\s]+)\b", re.DOTALL)
# for backward-compatibility, we support pipeline-id-mask, even though the clearer name is
# ultimately pipeline-id.
+
|
|
57
|
+
|
|
58
|
+
@lru_cache(maxsize=32)
def extract_from_docstr(func: F, require: bool = True) -> str:
    """Pull the pipeline-id declared in *func*'s docstring.

    Returns '' when *require* is False and no id is declared.  Otherwise a
    missing docstring, a missing declaration, or a declaration with an empty
    value raises ValueError.
    """
    docstring = func.__doc__
    if not docstring:
        if require:
            raise ValueError(
                f"Function {func} must have a non-empty docstring to extract pipeline-id"
            )
        return ""
    match = _DOCSTRING_MASK_RE.match(docstring)
    if not match:
        if "pipeline-id:" in docstring or "pipeline-id-mask:" in docstring:
            raise ValueError("pipeline-id is present but empty - this is probably an accident")
        if require:
            raise ValueError(f"Cannot extract pipeline-id from docstring for {func}")
        return ""
    pipeline_id = match.group("pipeline_id")
    assert pipeline_id, "pipeline-id should not have matched if it is empty"
    return pipeline_id
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@contextmanager
def including_function_docstr(f: F) -> ty.Iterator[str]:
    """Apply any pipeline-id declared in *f*'s docstring as a mask.

    The extraction is non-required: when *f* declares no pipeline-id, an
    empty mask is passed through.  Yields the effective pipeline id for
    this stack (see get_pipeline_id_mask).
    """
    with pipeline_id_mask(extract_from_docstr(f, require=False)):
        yield get_pipeline_id_mask()
|