furu 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +11 -1
- furu/adapters/submitit.py +23 -2
- furu/config.py +21 -3
- furu/core/__init__.py +2 -2
- furu/core/furu.py +708 -188
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/frontend/dist/assets/{index-CbdDfSOZ.css → index-BXAIKNNr.css} +1 -1
- furu/dashboard/frontend/dist/assets/{index-DDv_TYB_.js → index-DS3FsqcY.js} +3 -3
- furu/dashboard/frontend/dist/index.html +2 -2
- furu/dashboard/main.py +10 -3
- furu/errors.py +60 -5
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/migration.py +8 -4
- furu/runtime/logging.py +10 -10
- furu/serialization/serializer.py +40 -2
- furu/storage/metadata.py +17 -5
- furu/storage/state.py +78 -12
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/METADATA +83 -33
- furu-0.0.4.dist-info/RECORD +46 -0
- furu-0.0.2.dist-info/RECORD +0 -36
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Mapping, Protocol
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Scalar value accepted as a Slurm executor parameter.
SlurmSpecValue = str | int | float | bool
# A scalar, or an arbitrarily nested mapping of scalars (used by `SlurmSpec.extra`).
SlurmSpecExtraValue = SlurmSpecValue | Mapping[str, "SlurmSpecExtraValue"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class SlurmSpec:
    """Immutable resource request for a Slurm job.

    Fields are forwarded to a submitit ``AutoExecutor`` (``time_min`` as
    ``timeout_min``, ``cpus`` as ``cpus_per_task``, ``gpus`` as
    ``gpus_per_node``); ``None`` values are dropped before submission.
    """

    # Slurm partition to submit to; None lets the filtering step omit it
    # so the executor/cluster default applies.
    partition: str | None = None
    # GPUs per node; 0 means no gpus_per_node parameter is sent.
    gpus: int = 0
    # CPUs per task.
    cpus: int = 4
    # Memory request in gigabytes.
    mem_gb: int = 16
    # Wall-clock limit in minutes.
    time_min: int = 60
    # Extra executor parameters merged over the derived ones (last write wins).
    extra: Mapping[str, SlurmSpecExtraValue] | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class _SpecNode(Protocol):
    """Structural interface for nodes that can be resolved to a SlurmSpec."""

    # Stable content hash of the node; used in error messages.
    _furu_hash: str

    def _executor_spec_key(self) -> str:
        """Return the key used to look this node up in a spec mapping."""
        ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve_slurm_spec(specs: Mapping[str, SlurmSpec], node: _SpecNode) -> SlurmSpec:
    """Pick the SlurmSpec registered under *node*'s executor-spec key.

    Raises KeyError when *specs* lacks a "default" entry, or when it has
    no entry for the node's own key.
    """
    # NOTE(review): "default" is validated here but never used as a
    # fallback below — confirm that is intended.
    if "default" not in specs:
        raise KeyError("Missing slurm spec for key 'default'.")

    key = node._executor_spec_key()
    if key in specs:
        return specs[key]

    # No entry for this node's key — report which node was being resolved.
    raise KeyError(
        "Missing slurm spec for key "
        f"'{key}' for node {node.__class__.__name__} ({node._furu_hash})."
    )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from .paths import submitit_logs_dir
|
|
7
|
+
from .slurm_spec import SlurmSpec, SlurmSpecExtraValue
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import submitit
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_executor_for_spec(
    spec_key: str,
    spec: SlurmSpec,
    *,
    kind: str,
    submitit_root: Path | None,
    run_id: str | None = None,
) -> submitit.AutoExecutor:
    """Build a submitit AutoExecutor configured from *spec*.

    The executor logs into the per-kind/per-spec-key submitit folder
    (created if missing). Parameters whose value is None are dropped
    before being applied.
    """
    # Imported lazily so the module can be loaded without submitit installed.
    import submitit

    log_folder = submitit_logs_dir(
        kind,
        spec_key,
        override=submitit_root,
        run_id=run_id,
    )
    log_folder.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=str(log_folder))

    # Base parameters derived from the spec; gpus/extra are layered on top.
    requested: dict[str, SlurmSpecExtraValue | None] = {
        "timeout_min": spec.time_min,
        "slurm_partition": spec.partition,
        "cpus_per_task": spec.cpus,
        "mem_gb": spec.mem_gb,
    }
    if spec.gpus:
        requested["gpus_per_node"] = spec.gpus
    if spec.extra:
        requested.update(spec.extra)

    # Strip unset (None) entries before handing everything to submitit.
    effective = {name: value for name, value in requested.items() if value is not None}
    executor.update_parameters(**effective)
    return executor
|
furu/migration.py
CHANGED
|
@@ -507,8 +507,10 @@ def _apply_single_migration(
|
|
|
507
507
|
event: dict[str, str | int] = {
|
|
508
508
|
"type": "migrated",
|
|
509
509
|
"policy": policy,
|
|
510
|
-
"
|
|
511
|
-
"
|
|
510
|
+
"from_namespace": candidate.from_ref.namespace,
|
|
511
|
+
"from_hash": candidate.from_ref.furu_hash,
|
|
512
|
+
"to_namespace": candidate.to_ref.namespace,
|
|
513
|
+
"to_hash": candidate.to_ref.furu_hash,
|
|
512
514
|
}
|
|
513
515
|
if default_values is not None:
|
|
514
516
|
event["default_values"] = json.dumps(default_values, sort_keys=True)
|
|
@@ -519,8 +521,10 @@ def _apply_single_migration(
|
|
|
519
521
|
overwrite_event = {
|
|
520
522
|
"type": "migration_overwrite",
|
|
521
523
|
"policy": policy,
|
|
522
|
-
"
|
|
523
|
-
"
|
|
524
|
+
"from_namespace": candidate.from_ref.namespace,
|
|
525
|
+
"from_hash": candidate.from_ref.furu_hash,
|
|
526
|
+
"to_namespace": candidate.to_ref.namespace,
|
|
527
|
+
"to_hash": candidate.to_ref.furu_hash,
|
|
524
528
|
"reason": "force_overwrite",
|
|
525
529
|
}
|
|
526
530
|
StateManager.append_event(to_dir, overwrite_event)
|
furu/runtime/logging.py
CHANGED
|
@@ -28,16 +28,16 @@ _FURU_HOLDER_STACK: contextvars.ContextVar[tuple[HolderType, ...]] = (
|
|
|
28
28
|
_FURU_LOG_LOCK = threading.Lock()
|
|
29
29
|
_FURU_CONSOLE_LOCK = threading.Lock()
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
_GET_PREFIX = "get"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def
|
|
34
|
+
def _strip_get_decision_suffix(message: str) -> str:
|
|
35
35
|
"""
|
|
36
|
-
Strip a trailing `(<decision>)` suffix from `
|
|
36
|
+
Strip a trailing `(<decision>)` suffix from `get ...` console lines.
|
|
37
37
|
|
|
38
38
|
This keeps detailed decision info in file logs, but makes console output cleaner.
|
|
39
39
|
"""
|
|
40
|
-
if not message.startswith(
|
|
40
|
+
if not message.startswith(_GET_PREFIX):
|
|
41
41
|
return message
|
|
42
42
|
if not message.endswith(")"):
|
|
43
43
|
return message
|
|
@@ -69,7 +69,7 @@ def enter_holder(holder: HolderType) -> Generator[None, None, None]:
|
|
|
69
69
|
"""
|
|
70
70
|
Push a holder object onto the logging stack for this context.
|
|
71
71
|
|
|
72
|
-
Furu calls this automatically during `
|
|
72
|
+
Furu calls this automatically during `get()`, so nested
|
|
73
73
|
dependencies will log to the active dependency's folder and then revert.
|
|
74
74
|
"""
|
|
75
75
|
configure_logging()
|
|
@@ -163,7 +163,7 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
163
163
|
|
|
164
164
|
@staticmethod
|
|
165
165
|
def _format_location(record: logging.LogRecord) -> str:
|
|
166
|
-
# Use caller location if available (for
|
|
166
|
+
# Use caller location if available (for get messages)
|
|
167
167
|
caller_file = getattr(record, "furu_caller_file", None)
|
|
168
168
|
caller_line = getattr(record, "furu_caller_line", None)
|
|
169
169
|
if caller_file is not None and caller_line is not None:
|
|
@@ -174,10 +174,10 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
174
174
|
|
|
175
175
|
@staticmethod
|
|
176
176
|
def _format_message_text(record: logging.LogRecord) -> Text:
|
|
177
|
-
message =
|
|
177
|
+
message = _strip_get_decision_suffix(record.getMessage())
|
|
178
178
|
action_color = getattr(record, "furu_action_color", None)
|
|
179
|
-
if isinstance(action_color, str) and message.startswith(
|
|
180
|
-
prefix =
|
|
179
|
+
if isinstance(action_color, str) and message.startswith(_GET_PREFIX):
|
|
180
|
+
prefix = _GET_PREFIX
|
|
181
181
|
rest = message[len(prefix) :]
|
|
182
182
|
text = Text()
|
|
183
183
|
text.append(prefix, style=action_color)
|
|
@@ -288,7 +288,7 @@ def write_separator(line: str = "------------------") -> Path:
|
|
|
288
288
|
"""
|
|
289
289
|
Write a raw separator line to the current holder's `furu.log`.
|
|
290
290
|
|
|
291
|
-
This bypasses standard formatting so repeated `
|
|
291
|
+
This bypasses standard formatting so repeated `get()` calls are easy to spot.
|
|
292
292
|
"""
|
|
293
293
|
directory = current_log_dir()
|
|
294
294
|
log_path = directory / "furu.log"
|
furu/serialization/serializer.py
CHANGED
|
@@ -6,9 +6,10 @@ import json
|
|
|
6
6
|
import pathlib
|
|
7
7
|
import textwrap
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
9
|
+
from typing import Any, Protocol, Sequence, cast, runtime_checkable
|
|
10
10
|
|
|
11
11
|
import chz
|
|
12
|
+
from chz.util import MISSING as CHZ_MISSING, MISSING_TYPE
|
|
12
13
|
|
|
13
14
|
from ..errors import _FuruMissing
|
|
14
15
|
from pydantic import BaseModel as PydanticBaseModel
|
|
@@ -91,13 +92,34 @@ class FuruSerializer:
|
|
|
91
92
|
def compute_hash(cls, obj: object, verbose: bool = False) -> str:
|
|
92
93
|
"""Compute deterministic hash of object."""
|
|
93
94
|
|
|
95
|
+
@runtime_checkable
|
|
96
|
+
class _DependencyHashProvider(Protocol):
|
|
97
|
+
def _dependency_hashes(self) -> Sequence[str]: ...
|
|
98
|
+
|
|
99
|
+
def _has_required_fields(
|
|
100
|
+
data_class: type[object],
|
|
101
|
+
data: dict[str, JsonValue],
|
|
102
|
+
) -> bool:
|
|
103
|
+
if not chz.is_chz(data_class):
|
|
104
|
+
return False
|
|
105
|
+
for field in chz.chz_fields(data_class).values():
|
|
106
|
+
name = field.logical_name
|
|
107
|
+
if name in data:
|
|
108
|
+
continue
|
|
109
|
+
if field._default is not CHZ_MISSING:
|
|
110
|
+
continue
|
|
111
|
+
if not isinstance(field._default_factory, MISSING_TYPE):
|
|
112
|
+
continue
|
|
113
|
+
return False
|
|
114
|
+
return True
|
|
115
|
+
|
|
94
116
|
def canonicalize(item: object) -> JsonValue:
|
|
95
117
|
if isinstance(item, _FuruMissing):
|
|
96
118
|
raise ValueError("Cannot hash Furu.MISSING")
|
|
97
119
|
|
|
98
120
|
if chz.is_chz(item):
|
|
99
121
|
fields = chz.chz_fields(item)
|
|
100
|
-
|
|
122
|
+
result = {
|
|
101
123
|
"__class__": cls.get_classname(item),
|
|
102
124
|
**{
|
|
103
125
|
name: canonicalize(getattr(item, name))
|
|
@@ -105,8 +127,24 @@ class FuruSerializer:
|
|
|
105
127
|
if not name.startswith("_")
|
|
106
128
|
},
|
|
107
129
|
}
|
|
130
|
+
if isinstance(item, _DependencyHashProvider):
|
|
131
|
+
dependency_hashes = list(item._dependency_hashes())
|
|
132
|
+
if dependency_hashes:
|
|
133
|
+
result["__dependencies__"] = dependency_hashes
|
|
134
|
+
return result
|
|
108
135
|
|
|
109
136
|
if isinstance(item, dict):
|
|
137
|
+
if cls.CLASS_MARKER in item:
|
|
138
|
+
config = cast(dict[str, JsonValue], item)
|
|
139
|
+
module_path, _, class_name = item[cls.CLASS_MARKER].rpartition(".")
|
|
140
|
+
module = importlib.import_module(module_path)
|
|
141
|
+
data_class = getattr(module, class_name, None)
|
|
142
|
+
if (
|
|
143
|
+
data_class is not None
|
|
144
|
+
and hasattr(data_class, "_dependency_hashes")
|
|
145
|
+
and _has_required_fields(data_class, config)
|
|
146
|
+
):
|
|
147
|
+
return canonicalize(cls.from_dict(config))
|
|
110
148
|
filtered = item
|
|
111
149
|
if cls.CLASS_MARKER in item:
|
|
112
150
|
filtered = {
|
furu/storage/metadata.py
CHANGED
|
@@ -124,7 +124,7 @@ class MetadataManager:
|
|
|
124
124
|
try:
|
|
125
125
|
head = cls.run_git_command(["rev-parse", "HEAD"])
|
|
126
126
|
branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
|
|
127
|
-
except subprocess.CalledProcessError:
|
|
127
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
128
128
|
return GitInfo(
|
|
129
129
|
git_commit="<no-git>",
|
|
130
130
|
git_branch="<no-git>",
|
|
@@ -133,15 +133,27 @@ class MetadataManager:
|
|
|
133
133
|
git_submodules={},
|
|
134
134
|
)
|
|
135
135
|
else:
|
|
136
|
-
|
|
137
|
-
|
|
136
|
+
try:
|
|
137
|
+
head = cls.run_git_command(["rev-parse", "HEAD"])
|
|
138
|
+
branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
|
|
139
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
|
140
|
+
raise RuntimeError(
|
|
141
|
+
"Failed to read git commit/branch for provenance. "
|
|
142
|
+
"If this is expected, set FURU_REQUIRE_GIT=0."
|
|
143
|
+
) from e
|
|
138
144
|
|
|
139
145
|
if FURU_CONFIG.require_git_remote:
|
|
140
|
-
|
|
146
|
+
try:
|
|
147
|
+
remote = cls.run_git_command(["remote", "get-url", "origin"])
|
|
148
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
|
149
|
+
raise RuntimeError(
|
|
150
|
+
"Git remote 'origin' is required for provenance but was not found. "
|
|
151
|
+
"Set FURU_REQUIRE_GIT_REMOTE=0 to allow missing origin."
|
|
152
|
+
) from e
|
|
141
153
|
else:
|
|
142
154
|
try:
|
|
143
155
|
remote = cls.run_git_command(["remote", "get-url", "origin"])
|
|
144
|
-
except subprocess.CalledProcessError:
|
|
156
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
145
157
|
remote = None
|
|
146
158
|
|
|
147
159
|
if ignore_diff:
|
furu/storage/state.py
CHANGED
|
@@ -400,15 +400,41 @@ class StateManager:
|
|
|
400
400
|
|
|
401
401
|
@classmethod
def release_lock(cls, fd: int | None, lock_path: Path) -> None:
    """Release a lock acquired via :meth:`try_lock`.

    We best-effort avoid deleting a lock that was broken and replaced by
    another process by verifying the inode of the open fd matches the
    current lock_path inode before unlinking.
    """
    # A None fd means the lock was never acquired; nothing to release.
    if fd is None:
        return
    try:
        fd_stat = os.fstat(fd)
    except OSError:
        # fd is invalid/already closed; ownership can no longer be proven.
        fd_stat = None
    try:
        # NOTE(review): stat() may raise OSErrors other than
        # FileNotFoundError (e.g. PermissionError); those would propagate
        # before the try/finally below and leave the fd open — confirm.
        path_stat = lock_path.stat()
    except FileNotFoundError:
        path_stat = None
    try:
        # Unlink only if the open fd and the path still point at the same
        # inode on the same device, i.e. the lock file is still ours.
        if (
            fd_stat is not None
            and path_stat is not None
            and fd_stat.st_ino == path_stat.st_ino
            and fd_stat.st_dev == path_stat.st_dev
        ):
            lock_path.unlink(missing_ok=True)
    finally:
        # Always close the fd, even when the unlink is skipped or fails.
        os.close(fd)
|
|
405
|
-
lock_path.unlink(missing_ok=True)
|
|
406
429
|
|
|
407
430
|
@classmethod
|
|
408
431
|
def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
|
|
409
432
|
if not lock_path.is_file():
|
|
410
433
|
return None
|
|
411
|
-
|
|
434
|
+
try:
|
|
435
|
+
text = lock_path.read_text().strip()
|
|
436
|
+
except FileNotFoundError:
|
|
437
|
+
return None
|
|
412
438
|
if not text:
|
|
413
439
|
return None
|
|
414
440
|
lines = text.splitlines()
|
|
@@ -977,6 +1003,8 @@ def compute_lock(
|
|
|
977
1003
|
poll_interval_sec: float = 10.0,
|
|
978
1004
|
wait_log_every_sec: float = 10.0,
|
|
979
1005
|
reconcile_fn: Callable[[Path], None] | None = None,
|
|
1006
|
+
allow_failed: bool = False,
|
|
1007
|
+
allow_success: bool = False,
|
|
980
1008
|
) -> Generator[ComputeLockContext, None, None]:
|
|
981
1009
|
"""
|
|
982
1010
|
Context manager that atomically acquires lock + records attempt + starts heartbeat.
|
|
@@ -1000,6 +1028,8 @@ def compute_lock(
|
|
|
1000
1028
|
poll_interval_sec: Interval between lock acquisition attempts
|
|
1001
1029
|
wait_log_every_sec: Interval between "waiting for lock" log messages
|
|
1002
1030
|
reconcile_fn: Optional function to call to reconcile stale attempts
|
|
1031
|
+
allow_failed: Allow recomputation even if state is failed
|
|
1032
|
+
allow_success: Allow recomputation even if state is successful
|
|
1003
1033
|
|
|
1004
1034
|
Yields:
|
|
1005
1035
|
ComputeLockContext with attempt_id and stop_heartbeat callable
|
|
@@ -1008,6 +1038,7 @@ def compute_lock(
|
|
|
1008
1038
|
FuruLockNotAcquired: If lock cannot be acquired (after waiting)
|
|
1009
1039
|
FuruWaitTimeout: If max_wait_time_sec is exceeded
|
|
1010
1040
|
"""
|
|
1041
|
+
|
|
1011
1042
|
def _format_wait_duration(seconds: float) -> str:
|
|
1012
1043
|
if seconds < 60.0:
|
|
1013
1044
|
return f"{seconds:.1f}s"
|
|
@@ -1020,6 +1051,21 @@ def compute_lock(
|
|
|
1020
1051
|
days = hours / 24.0
|
|
1021
1052
|
return f"{days:.1f}d"
|
|
1022
1053
|
|
|
1054
|
+
def _format_owner(attempt: _StateAttempt) -> str:
|
|
1055
|
+
owner = attempt.owner
|
|
1056
|
+
parts: list[str] = []
|
|
1057
|
+
if attempt.id:
|
|
1058
|
+
parts.append(f"attempt {attempt.id}")
|
|
1059
|
+
if owner.host:
|
|
1060
|
+
parts.append(f"host {owner.host}")
|
|
1061
|
+
if owner.pid is not None:
|
|
1062
|
+
parts.append(f"pid {owner.pid}")
|
|
1063
|
+
if owner.user:
|
|
1064
|
+
parts.append(f"user {owner.user}")
|
|
1065
|
+
if not parts:
|
|
1066
|
+
return "owner unknown"
|
|
1067
|
+
return ", ".join(parts)
|
|
1068
|
+
|
|
1023
1069
|
def _describe_wait(attempt: _StateAttempt, waited_sec: float) -> str:
|
|
1024
1070
|
label = "last heartbeat"
|
|
1025
1071
|
timestamp = attempt.heartbeat_at
|
|
@@ -1034,7 +1080,7 @@ def compute_lock(
|
|
|
1034
1080
|
return (
|
|
1035
1081
|
"waited "
|
|
1036
1082
|
f"{_format_wait_duration(waited_sec)}, {label} {timestamp_info}, "
|
|
1037
|
-
f"status {attempt.status}, backend {attempt.backend}"
|
|
1083
|
+
f"status {attempt.status}, backend {attempt.backend}, {_format_owner(attempt)}"
|
|
1038
1084
|
)
|
|
1039
1085
|
|
|
1040
1086
|
lock_path = StateManager.get_lock_path(directory, StateManager.COMPUTE_LOCK)
|
|
@@ -1054,21 +1100,41 @@ def compute_lock(
|
|
|
1054
1100
|
if max_wait_time_sec is not None:
|
|
1055
1101
|
elapsed = time.time() - start_time
|
|
1056
1102
|
if elapsed > max_wait_time_sec:
|
|
1103
|
+
state = StateManager.read_state(directory)
|
|
1104
|
+
attempt = state.attempt
|
|
1105
|
+
attempt_info = "no active attempt"
|
|
1106
|
+
if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
|
|
1107
|
+
attempt_info = _describe_wait(attempt, elapsed)
|
|
1108
|
+
message = (
|
|
1109
|
+
f"Timed out waiting for compute lock after {elapsed:.1f}s."
|
|
1110
|
+
f"\nDirectory: {directory}"
|
|
1111
|
+
f"\nLock file: {lock_path}"
|
|
1112
|
+
f"\nDetails: {attempt_info}"
|
|
1113
|
+
)
|
|
1057
1114
|
raise FuruWaitTimeout(
|
|
1058
|
-
|
|
1115
|
+
message,
|
|
1116
|
+
hints=[
|
|
1117
|
+
"Increase max wait: set FURU_MAX_WAIT_SECS (or override Furu._max_wait_time_sec).",
|
|
1118
|
+
"Change poll cadence: set FURU_POLL_INTERVAL_SECS.",
|
|
1119
|
+
"Change wait logging cadence: set FURU_WAIT_LOG_EVERY_SECS.",
|
|
1120
|
+
"If locks look stale too quickly/slowly: tune FURU_LEASE_SECS and FURU_HEARTBEAT_SECS.",
|
|
1121
|
+
"For more logs: set FURU_LOG_LEVEL=DEBUG.",
|
|
1122
|
+
],
|
|
1059
1123
|
)
|
|
1060
1124
|
|
|
1061
1125
|
lock_fd = StateManager.try_lock(lock_path)
|
|
1062
1126
|
if lock_fd is not None:
|
|
1063
1127
|
state = StateManager.read_state(directory)
|
|
1064
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1128
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1065
1129
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1066
1130
|
raise FuruLockNotAcquired(
|
|
1067
1131
|
"Cannot acquire lock: experiment already succeeded"
|
|
1068
1132
|
)
|
|
1069
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1133
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1070
1134
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1071
|
-
raise FuruLockNotAcquired(
|
|
1135
|
+
raise FuruLockNotAcquired(
|
|
1136
|
+
"Cannot acquire lock: experiment already failed"
|
|
1137
|
+
)
|
|
1072
1138
|
attempt = state.attempt
|
|
1073
1139
|
if (
|
|
1074
1140
|
isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning))
|
|
@@ -1079,11 +1145,11 @@ def compute_lock(
|
|
|
1079
1145
|
if reconcile_fn is not None:
|
|
1080
1146
|
reconcile_fn(directory)
|
|
1081
1147
|
state = StateManager.read_state(directory)
|
|
1082
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1148
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1083
1149
|
raise FuruLockNotAcquired(
|
|
1084
1150
|
"Cannot acquire lock: experiment already succeeded"
|
|
1085
1151
|
)
|
|
1086
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1152
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1087
1153
|
raise FuruLockNotAcquired(
|
|
1088
1154
|
"Cannot acquire lock: experiment already failed"
|
|
1089
1155
|
)
|
|
@@ -1113,11 +1179,11 @@ def compute_lock(
|
|
|
1113
1179
|
attempt = state.attempt
|
|
1114
1180
|
|
|
1115
1181
|
# If result is terminal, no point waiting
|
|
1116
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1182
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1117
1183
|
raise FuruLockNotAcquired(
|
|
1118
1184
|
"Cannot acquire lock: experiment already succeeded"
|
|
1119
1185
|
)
|
|
1120
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1186
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1121
1187
|
raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")
|
|
1122
1188
|
|
|
1123
1189
|
# If no active attempt but lock exists, it's orphaned - clean it up
|