furu 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +11 -1
- furu/adapters/submitit.py +23 -2
- furu/config.py +21 -3
- furu/core/__init__.py +2 -2
- furu/core/furu.py +708 -188
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/frontend/dist/assets/{index-CbdDfSOZ.css → index-BXAIKNNr.css} +1 -1
- furu/dashboard/frontend/dist/assets/{index-DDv_TYB_.js → index-DS3FsqcY.js} +3 -3
- furu/dashboard/frontend/dist/index.html +2 -2
- furu/dashboard/main.py +10 -3
- furu/errors.py +60 -5
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/migration.py +8 -4
- furu/runtime/logging.py +10 -10
- furu/serialization/serializer.py +40 -2
- furu/storage/metadata.py +17 -5
- furu/storage/state.py +78 -12
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/METADATA +83 -33
- furu-0.0.4.dist-info/RECORD +46 -0
- furu-0.0.2.dist-info/RECORD +0 -36
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.2.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Mapping, Protocol
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Scalar value accepted as a Slurm executor parameter.
SlurmSpecValue = str | int | float | bool
# A scalar, or an arbitrarily nested mapping of scalars (used by `SlurmSpec.extra`).
SlurmSpecExtraValue = SlurmSpecValue | Mapping[str, "SlurmSpecExtraValue"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class SlurmSpec:
    """Immutable resource request for a Slurm job.

    Fields are forwarded to a submitit ``AutoExecutor`` (``time_min`` as
    ``timeout_min``, ``cpus`` as ``cpus_per_task``, ``gpus`` as
    ``gpus_per_node``); ``None`` values are dropped before submission.
    """

    # Slurm partition to submit to; None lets the filtering step omit it
    # so the executor/cluster default applies.
    partition: str | None = None
    # GPUs per node; 0 means no gpus_per_node parameter is sent.
    gpus: int = 0
    # CPUs per task.
    cpus: int = 4
    # Memory request in gigabytes.
    mem_gb: int = 16
    # Wall-clock limit in minutes.
    time_min: int = 60
    # Extra executor parameters merged over the derived ones (last write wins).
    extra: Mapping[str, SlurmSpecExtraValue] | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class _SpecNode(Protocol):
    """Structural interface for nodes that can be resolved to a SlurmSpec."""

    # Stable content hash of the node; used in error messages.
    _furu_hash: str

    def _executor_spec_key(self) -> str:
        """Return the key used to look this node up in a spec mapping."""
        ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve_slurm_spec(specs: Mapping[str, SlurmSpec], node: _SpecNode) -> SlurmSpec:
    """Pick the SlurmSpec registered under *node*'s executor-spec key.

    Raises KeyError when *specs* lacks a "default" entry, or when it has
    no entry for the node's own key.
    """
    # NOTE(review): "default" is validated here but never used as a
    # fallback below — confirm that is intended.
    if "default" not in specs:
        raise KeyError("Missing slurm spec for key 'default'.")

    key = node._executor_spec_key()
    if key in specs:
        return specs[key]

    # No entry for this node's key — report which node was being resolved.
    raise KeyError(
        "Missing slurm spec for key "
        f"'{key}' for node {node.__class__.__name__} ({node._furu_hash})."
    )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from .paths import submitit_logs_dir
|
|
7
|
+
from .slurm_spec import SlurmSpec, SlurmSpecExtraValue
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import submitit
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_executor_for_spec(
    spec_key: str,
    spec: SlurmSpec,
    *,
    kind: str,
    submitit_root: Path | None,
    run_id: str | None = None,
) -> submitit.AutoExecutor:
    """Build a submitit AutoExecutor configured from *spec*.

    The executor logs into the per-kind/per-spec-key submitit folder
    (created if missing). Parameters whose value is None are dropped
    before being applied.
    """
    # Imported lazily so the module can be loaded without submitit installed.
    import submitit

    log_folder = submitit_logs_dir(
        kind,
        spec_key,
        override=submitit_root,
        run_id=run_id,
    )
    log_folder.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=str(log_folder))

    # Base parameters derived from the spec; gpus/extra are layered on top.
    requested: dict[str, SlurmSpecExtraValue | None] = {
        "timeout_min": spec.time_min,
        "slurm_partition": spec.partition,
        "cpus_per_task": spec.cpus,
        "mem_gb": spec.mem_gb,
    }
    if spec.gpus:
        requested["gpus_per_node"] = spec.gpus
    if spec.extra:
        requested.update(spec.extra)

    # Strip unset (None) entries before handing everything to submitit.
    effective = {name: value for name, value in requested.items() if value is not None}
    executor.update_parameters(**effective)
    return executor
|
furu/migration.py
CHANGED
|
@@ -507,8 +507,10 @@ def _apply_single_migration(
|
|
|
507
507
|
event: dict[str, str | int] = {
|
|
508
508
|
"type": "migrated",
|
|
509
509
|
"policy": policy,
|
|
510
|
-
"
|
|
511
|
-
"
|
|
510
|
+
"from_namespace": candidate.from_ref.namespace,
|
|
511
|
+
"from_hash": candidate.from_ref.furu_hash,
|
|
512
|
+
"to_namespace": candidate.to_ref.namespace,
|
|
513
|
+
"to_hash": candidate.to_ref.furu_hash,
|
|
512
514
|
}
|
|
513
515
|
if default_values is not None:
|
|
514
516
|
event["default_values"] = json.dumps(default_values, sort_keys=True)
|
|
@@ -519,8 +521,10 @@ def _apply_single_migration(
|
|
|
519
521
|
overwrite_event = {
|
|
520
522
|
"type": "migration_overwrite",
|
|
521
523
|
"policy": policy,
|
|
522
|
-
"
|
|
523
|
-
"
|
|
524
|
+
"from_namespace": candidate.from_ref.namespace,
|
|
525
|
+
"from_hash": candidate.from_ref.furu_hash,
|
|
526
|
+
"to_namespace": candidate.to_ref.namespace,
|
|
527
|
+
"to_hash": candidate.to_ref.furu_hash,
|
|
524
528
|
"reason": "force_overwrite",
|
|
525
529
|
}
|
|
526
530
|
StateManager.append_event(to_dir, overwrite_event)
|
furu/runtime/logging.py
CHANGED
|
@@ -28,16 +28,16 @@ _FURU_HOLDER_STACK: contextvars.ContextVar[tuple[HolderType, ...]] = (
|
|
|
28
28
|
_FURU_LOG_LOCK = threading.Lock()
|
|
29
29
|
_FURU_CONSOLE_LOCK = threading.Lock()
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
_GET_PREFIX = "get"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def
|
|
34
|
+
def _strip_get_decision_suffix(message: str) -> str:
|
|
35
35
|
"""
|
|
36
|
-
Strip a trailing `(<decision>)` suffix from `
|
|
36
|
+
Strip a trailing `(<decision>)` suffix from `get ...` console lines.
|
|
37
37
|
|
|
38
38
|
This keeps detailed decision info in file logs, but makes console output cleaner.
|
|
39
39
|
"""
|
|
40
|
-
if not message.startswith(
|
|
40
|
+
if not message.startswith(_GET_PREFIX):
|
|
41
41
|
return message
|
|
42
42
|
if not message.endswith(")"):
|
|
43
43
|
return message
|
|
@@ -69,7 +69,7 @@ def enter_holder(holder: HolderType) -> Generator[None, None, None]:
|
|
|
69
69
|
"""
|
|
70
70
|
Push a holder object onto the logging stack for this context.
|
|
71
71
|
|
|
72
|
-
Furu calls this automatically during `
|
|
72
|
+
Furu calls this automatically during `get()`, so nested
|
|
73
73
|
dependencies will log to the active dependency's folder and then revert.
|
|
74
74
|
"""
|
|
75
75
|
configure_logging()
|
|
@@ -163,7 +163,7 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
163
163
|
|
|
164
164
|
@staticmethod
|
|
165
165
|
def _format_location(record: logging.LogRecord) -> str:
|
|
166
|
-
# Use caller location if available (for
|
|
166
|
+
# Use caller location if available (for get messages)
|
|
167
167
|
caller_file = getattr(record, "furu_caller_file", None)
|
|
168
168
|
caller_line = getattr(record, "furu_caller_line", None)
|
|
169
169
|
if caller_file is not None and caller_line is not None:
|
|
@@ -174,10 +174,10 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
174
174
|
|
|
175
175
|
@staticmethod
|
|
176
176
|
def _format_message_text(record: logging.LogRecord) -> Text:
|
|
177
|
-
message =
|
|
177
|
+
message = _strip_get_decision_suffix(record.getMessage())
|
|
178
178
|
action_color = getattr(record, "furu_action_color", None)
|
|
179
|
-
if isinstance(action_color, str) and message.startswith(
|
|
180
|
-
prefix =
|
|
179
|
+
if isinstance(action_color, str) and message.startswith(_GET_PREFIX):
|
|
180
|
+
prefix = _GET_PREFIX
|
|
181
181
|
rest = message[len(prefix) :]
|
|
182
182
|
text = Text()
|
|
183
183
|
text.append(prefix, style=action_color)
|
|
@@ -288,7 +288,7 @@ def write_separator(line: str = "------------------") -> Path:
|
|
|
288
288
|
"""
|
|
289
289
|
Write a raw separator line to the current holder's `furu.log`.
|
|
290
290
|
|
|
291
|
-
This bypasses standard formatting so repeated `
|
|
291
|
+
This bypasses standard formatting so repeated `get()` calls are easy to spot.
|
|
292
292
|
"""
|
|
293
293
|
directory = current_log_dir()
|
|
294
294
|
log_path = directory / "furu.log"
|
furu/serialization/serializer.py
CHANGED
|
@@ -6,9 +6,10 @@ import json
|
|
|
6
6
|
import pathlib
|
|
7
7
|
import textwrap
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
9
|
+
from typing import Any, Protocol, Sequence, cast, runtime_checkable
|
|
10
10
|
|
|
11
11
|
import chz
|
|
12
|
+
from chz.util import MISSING as CHZ_MISSING, MISSING_TYPE
|
|
12
13
|
|
|
13
14
|
from ..errors import _FuruMissing
|
|
14
15
|
from pydantic import BaseModel as PydanticBaseModel
|
|
@@ -91,13 +92,34 @@ class FuruSerializer:
|
|
|
91
92
|
def compute_hash(cls, obj: object, verbose: bool = False) -> str:
|
|
92
93
|
"""Compute deterministic hash of object."""
|
|
93
94
|
|
|
95
|
+
@runtime_checkable
|
|
96
|
+
class _DependencyHashProvider(Protocol):
|
|
97
|
+
def _dependency_hashes(self) -> Sequence[str]: ...
|
|
98
|
+
|
|
99
|
+
def _has_required_fields(
|
|
100
|
+
data_class: type[object],
|
|
101
|
+
data: dict[str, JsonValue],
|
|
102
|
+
) -> bool:
|
|
103
|
+
if not chz.is_chz(data_class):
|
|
104
|
+
return False
|
|
105
|
+
for field in chz.chz_fields(data_class).values():
|
|
106
|
+
name = field.logical_name
|
|
107
|
+
if name in data:
|
|
108
|
+
continue
|
|
109
|
+
if field._default is not CHZ_MISSING:
|
|
110
|
+
continue
|
|
111
|
+
if not isinstance(field._default_factory, MISSING_TYPE):
|
|
112
|
+
continue
|
|
113
|
+
return False
|
|
114
|
+
return True
|
|
115
|
+
|
|
94
116
|
def canonicalize(item: object) -> JsonValue:
|
|
95
117
|
if isinstance(item, _FuruMissing):
|
|
96
118
|
raise ValueError("Cannot hash Furu.MISSING")
|
|
97
119
|
|
|
98
120
|
if chz.is_chz(item):
|
|
99
121
|
fields = chz.chz_fields(item)
|
|
100
|
-
|
|
122
|
+
result = {
|
|
101
123
|
"__class__": cls.get_classname(item),
|
|
102
124
|
**{
|
|
103
125
|
name: canonicalize(getattr(item, name))
|
|
@@ -105,8 +127,24 @@ class FuruSerializer:
|
|
|
105
127
|
if not name.startswith("_")
|
|
106
128
|
},
|
|
107
129
|
}
|
|
130
|
+
if isinstance(item, _DependencyHashProvider):
|
|
131
|
+
dependency_hashes = list(item._dependency_hashes())
|
|
132
|
+
if dependency_hashes:
|
|
133
|
+
result["__dependencies__"] = dependency_hashes
|
|
134
|
+
return result
|
|
108
135
|
|
|
109
136
|
if isinstance(item, dict):
|
|
137
|
+
if cls.CLASS_MARKER in item:
|
|
138
|
+
config = cast(dict[str, JsonValue], item)
|
|
139
|
+
module_path, _, class_name = item[cls.CLASS_MARKER].rpartition(".")
|
|
140
|
+
module = importlib.import_module(module_path)
|
|
141
|
+
data_class = getattr(module, class_name, None)
|
|
142
|
+
if (
|
|
143
|
+
data_class is not None
|
|
144
|
+
and hasattr(data_class, "_dependency_hashes")
|
|
145
|
+
and _has_required_fields(data_class, config)
|
|
146
|
+
):
|
|
147
|
+
return canonicalize(cls.from_dict(config))
|
|
110
148
|
filtered = item
|
|
111
149
|
if cls.CLASS_MARKER in item:
|
|
112
150
|
filtered = {
|
furu/storage/metadata.py
CHANGED
|
@@ -124,7 +124,7 @@ class MetadataManager:
|
|
|
124
124
|
try:
|
|
125
125
|
head = cls.run_git_command(["rev-parse", "HEAD"])
|
|
126
126
|
branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
|
|
127
|
-
except subprocess.CalledProcessError:
|
|
127
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
128
128
|
return GitInfo(
|
|
129
129
|
git_commit="<no-git>",
|
|
130
130
|
git_branch="<no-git>",
|
|
@@ -133,15 +133,27 @@ class MetadataManager:
|
|
|
133
133
|
git_submodules={},
|
|
134
134
|
)
|
|
135
135
|
else:
|
|
136
|
-
|
|
137
|
-
|
|
136
|
+
try:
|
|
137
|
+
head = cls.run_git_command(["rev-parse", "HEAD"])
|
|
138
|
+
branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
|
|
139
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
|
140
|
+
raise RuntimeError(
|
|
141
|
+
"Failed to read git commit/branch for provenance. "
|
|
142
|
+
"If this is expected, set FURU_REQUIRE_GIT=0."
|
|
143
|
+
) from e
|
|
138
144
|
|
|
139
145
|
if FURU_CONFIG.require_git_remote:
|
|
140
|
-
|
|
146
|
+
try:
|
|
147
|
+
remote = cls.run_git_command(["remote", "get-url", "origin"])
|
|
148
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
|
149
|
+
raise RuntimeError(
|
|
150
|
+
"Git remote 'origin' is required for provenance but was not found. "
|
|
151
|
+
"Set FURU_REQUIRE_GIT_REMOTE=0 to allow missing origin."
|
|
152
|
+
) from e
|
|
141
153
|
else:
|
|
142
154
|
try:
|
|
143
155
|
remote = cls.run_git_command(["remote", "get-url", "origin"])
|
|
144
|
-
except subprocess.CalledProcessError:
|
|
156
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
145
157
|
remote = None
|
|
146
158
|
|
|
147
159
|
if ignore_diff:
|
furu/storage/state.py
CHANGED
|
@@ -400,15 +400,41 @@ class StateManager:
|
|
|
400
400
|
|
|
401
401
|
@classmethod
def release_lock(cls, fd: int | None, lock_path: Path) -> None:
    """Release a lock acquired via :meth:`try_lock`.

    We best-effort avoid deleting a lock that was broken and replaced by
    another process by verifying the inode of the open fd matches the
    current lock_path inode before unlinking.
    """
    # A None fd means the lock was never acquired; nothing to release.
    if fd is None:
        return
    try:
        fd_stat = os.fstat(fd)
    except OSError:
        # fd is invalid/already closed; ownership can no longer be proven.
        fd_stat = None
    try:
        # NOTE(review): stat() may raise OSErrors other than
        # FileNotFoundError (e.g. PermissionError); those would propagate
        # before the try/finally below and leave the fd open — confirm.
        path_stat = lock_path.stat()
    except FileNotFoundError:
        path_stat = None
    try:
        # Unlink only if the open fd and the path still point at the same
        # inode on the same device, i.e. the lock file is still ours.
        if (
            fd_stat is not None
            and path_stat is not None
            and fd_stat.st_ino == path_stat.st_ino
            and fd_stat.st_dev == path_stat.st_dev
        ):
            lock_path.unlink(missing_ok=True)
    finally:
        # Always close the fd, even when the unlink is skipped or fails.
        os.close(fd)
|
|
405
|
-
lock_path.unlink(missing_ok=True)
|
|
406
429
|
|
|
407
430
|
@classmethod
|
|
408
431
|
def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
|
|
409
432
|
if not lock_path.is_file():
|
|
410
433
|
return None
|
|
411
|
-
|
|
434
|
+
try:
|
|
435
|
+
text = lock_path.read_text().strip()
|
|
436
|
+
except FileNotFoundError:
|
|
437
|
+
return None
|
|
412
438
|
if not text:
|
|
413
439
|
return None
|
|
414
440
|
lines = text.splitlines()
|
|
@@ -977,6 +1003,8 @@ def compute_lock(
|
|
|
977
1003
|
poll_interval_sec: float = 10.0,
|
|
978
1004
|
wait_log_every_sec: float = 10.0,
|
|
979
1005
|
reconcile_fn: Callable[[Path], None] | None = None,
|
|
1006
|
+
allow_failed: bool = False,
|
|
1007
|
+
allow_success: bool = False,
|
|
980
1008
|
) -> Generator[ComputeLockContext, None, None]:
|
|
981
1009
|
"""
|
|
982
1010
|
Context manager that atomically acquires lock + records attempt + starts heartbeat.
|
|
@@ -1000,6 +1028,8 @@ def compute_lock(
|
|
|
1000
1028
|
poll_interval_sec: Interval between lock acquisition attempts
|
|
1001
1029
|
wait_log_every_sec: Interval between "waiting for lock" log messages
|
|
1002
1030
|
reconcile_fn: Optional function to call to reconcile stale attempts
|
|
1031
|
+
allow_failed: Allow recomputation even if state is failed
|
|
1032
|
+
allow_success: Allow recomputation even if state is successful
|
|
1003
1033
|
|
|
1004
1034
|
Yields:
|
|
1005
1035
|
ComputeLockContext with attempt_id and stop_heartbeat callable
|
|
@@ -1008,6 +1038,7 @@ def compute_lock(
|
|
|
1008
1038
|
FuruLockNotAcquired: If lock cannot be acquired (after waiting)
|
|
1009
1039
|
FuruWaitTimeout: If max_wait_time_sec is exceeded
|
|
1010
1040
|
"""
|
|
1041
|
+
|
|
1011
1042
|
def _format_wait_duration(seconds: float) -> str:
|
|
1012
1043
|
if seconds < 60.0:
|
|
1013
1044
|
return f"{seconds:.1f}s"
|
|
@@ -1020,6 +1051,21 @@ def compute_lock(
|
|
|
1020
1051
|
days = hours / 24.0
|
|
1021
1052
|
return f"{days:.1f}d"
|
|
1022
1053
|
|
|
1054
|
+
def _format_owner(attempt: _StateAttempt) -> str:
|
|
1055
|
+
owner = attempt.owner
|
|
1056
|
+
parts: list[str] = []
|
|
1057
|
+
if attempt.id:
|
|
1058
|
+
parts.append(f"attempt {attempt.id}")
|
|
1059
|
+
if owner.host:
|
|
1060
|
+
parts.append(f"host {owner.host}")
|
|
1061
|
+
if owner.pid is not None:
|
|
1062
|
+
parts.append(f"pid {owner.pid}")
|
|
1063
|
+
if owner.user:
|
|
1064
|
+
parts.append(f"user {owner.user}")
|
|
1065
|
+
if not parts:
|
|
1066
|
+
return "owner unknown"
|
|
1067
|
+
return ", ".join(parts)
|
|
1068
|
+
|
|
1023
1069
|
def _describe_wait(attempt: _StateAttempt, waited_sec: float) -> str:
|
|
1024
1070
|
label = "last heartbeat"
|
|
1025
1071
|
timestamp = attempt.heartbeat_at
|
|
@@ -1034,7 +1080,7 @@ def compute_lock(
|
|
|
1034
1080
|
return (
|
|
1035
1081
|
"waited "
|
|
1036
1082
|
f"{_format_wait_duration(waited_sec)}, {label} {timestamp_info}, "
|
|
1037
|
-
f"status {attempt.status}, backend {attempt.backend}"
|
|
1083
|
+
f"status {attempt.status}, backend {attempt.backend}, {_format_owner(attempt)}"
|
|
1038
1084
|
)
|
|
1039
1085
|
|
|
1040
1086
|
lock_path = StateManager.get_lock_path(directory, StateManager.COMPUTE_LOCK)
|
|
@@ -1054,21 +1100,41 @@ def compute_lock(
|
|
|
1054
1100
|
if max_wait_time_sec is not None:
|
|
1055
1101
|
elapsed = time.time() - start_time
|
|
1056
1102
|
if elapsed > max_wait_time_sec:
|
|
1103
|
+
state = StateManager.read_state(directory)
|
|
1104
|
+
attempt = state.attempt
|
|
1105
|
+
attempt_info = "no active attempt"
|
|
1106
|
+
if isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning)):
|
|
1107
|
+
attempt_info = _describe_wait(attempt, elapsed)
|
|
1108
|
+
message = (
|
|
1109
|
+
f"Timed out waiting for compute lock after {elapsed:.1f}s."
|
|
1110
|
+
f"\nDirectory: {directory}"
|
|
1111
|
+
f"\nLock file: {lock_path}"
|
|
1112
|
+
f"\nDetails: {attempt_info}"
|
|
1113
|
+
)
|
|
1057
1114
|
raise FuruWaitTimeout(
|
|
1058
|
-
|
|
1115
|
+
message,
|
|
1116
|
+
hints=[
|
|
1117
|
+
"Increase max wait: set FURU_MAX_WAIT_SECS (or override Furu._max_wait_time_sec).",
|
|
1118
|
+
"Change poll cadence: set FURU_POLL_INTERVAL_SECS.",
|
|
1119
|
+
"Change wait logging cadence: set FURU_WAIT_LOG_EVERY_SECS.",
|
|
1120
|
+
"If locks look stale too quickly/slowly: tune FURU_LEASE_SECS and FURU_HEARTBEAT_SECS.",
|
|
1121
|
+
"For more logs: set FURU_LOG_LEVEL=DEBUG.",
|
|
1122
|
+
],
|
|
1059
1123
|
)
|
|
1060
1124
|
|
|
1061
1125
|
lock_fd = StateManager.try_lock(lock_path)
|
|
1062
1126
|
if lock_fd is not None:
|
|
1063
1127
|
state = StateManager.read_state(directory)
|
|
1064
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1128
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1065
1129
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1066
1130
|
raise FuruLockNotAcquired(
|
|
1067
1131
|
"Cannot acquire lock: experiment already succeeded"
|
|
1068
1132
|
)
|
|
1069
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1133
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1070
1134
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1071
|
-
raise FuruLockNotAcquired(
|
|
1135
|
+
raise FuruLockNotAcquired(
|
|
1136
|
+
"Cannot acquire lock: experiment already failed"
|
|
1137
|
+
)
|
|
1072
1138
|
attempt = state.attempt
|
|
1073
1139
|
if (
|
|
1074
1140
|
isinstance(attempt, (_StateAttemptQueued, _StateAttemptRunning))
|
|
@@ -1079,11 +1145,11 @@ def compute_lock(
|
|
|
1079
1145
|
if reconcile_fn is not None:
|
|
1080
1146
|
reconcile_fn(directory)
|
|
1081
1147
|
state = StateManager.read_state(directory)
|
|
1082
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1148
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1083
1149
|
raise FuruLockNotAcquired(
|
|
1084
1150
|
"Cannot acquire lock: experiment already succeeded"
|
|
1085
1151
|
)
|
|
1086
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1152
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1087
1153
|
raise FuruLockNotAcquired(
|
|
1088
1154
|
"Cannot acquire lock: experiment already failed"
|
|
1089
1155
|
)
|
|
@@ -1113,11 +1179,11 @@ def compute_lock(
|
|
|
1113
1179
|
attempt = state.attempt
|
|
1114
1180
|
|
|
1115
1181
|
# If result is terminal, no point waiting
|
|
1116
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1182
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1117
1183
|
raise FuruLockNotAcquired(
|
|
1118
1184
|
"Cannot acquire lock: experiment already succeeded"
|
|
1119
1185
|
)
|
|
1120
|
-
if isinstance(state.result, _StateResultFailed):
|
|
1186
|
+
if isinstance(state.result, _StateResultFailed) and not allow_failed:
|
|
1121
1187
|
raise FuruLockNotAcquired("Cannot acquire lock: experiment already failed")
|
|
1122
1188
|
|
|
1123
1189
|
# If no active attempt but lock exists, it's orphaned - clean it up
|