furu 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +13 -1
- furu/core/furu.py +355 -196
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/runtime/logging.py +10 -10
- furu/storage/state.py +34 -6
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/METADATA +74 -37
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/RECORD +24 -14
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Mapping, Protocol
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
SlurmSpecValue = str | int | float | bool
|
|
8
|
+
SlurmSpecExtraValue = SlurmSpecValue | Mapping[str, "SlurmSpecExtraValue"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
|
|
12
|
+
class SlurmSpec:
|
|
13
|
+
partition: str | None = None
|
|
14
|
+
gpus: int = 0
|
|
15
|
+
cpus: int = 4
|
|
16
|
+
mem_gb: int = 16
|
|
17
|
+
time_min: int = 60
|
|
18
|
+
extra: Mapping[str, SlurmSpecExtraValue] | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class _SpecNode(Protocol):
    """Structural type for the node objects accepted by ``resolve_slurm_spec``.

    Any object exposing the two members below satisfies the protocol; no
    inheritance is required.
    """

    # Identifier included in the error message when no spec matches the node.
    _furu_hash: str

    def _executor_spec_key(self) -> str:
        """Return the key under which this node's spec is looked up."""
        ...
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve_slurm_spec(specs: Mapping[str, SlurmSpec], node: _SpecNode) -> SlurmSpec:
    """Look up the spec registered for ``node``.

    Args:
        specs: Mapping from spec key to spec. It must contain a ``"default"``
            entry even when ``node`` resolves to a different key.
        node: Object providing ``_executor_spec_key()`` and ``_furu_hash``.

    Returns:
        The entry of ``specs`` stored under ``node._executor_spec_key()``.

    Raises:
        KeyError: If ``specs`` has no ``"default"`` entry, or no entry for the
            node's spec key.
    """
    # The "default" entry is mandatory regardless of which key the node uses.
    if "default" not in specs:
        raise KeyError("Missing slurm spec for key 'default'.")

    wanted = node._executor_spec_key()
    if wanted in specs:
        return specs[wanted]

    node_type = node.__class__.__name__
    raise KeyError(
        "Missing slurm spec for key "
        f"'{wanted}' for node {node_type} ({node._furu_hash})."
    )
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
from .paths import submitit_logs_dir
|
|
7
|
+
from .slurm_spec import SlurmSpec, SlurmSpecExtraValue
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import submitit
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_executor_for_spec(
    spec_key: str,
    spec: SlurmSpec,
    *,
    kind: str,
    submitit_root: Path | None,
    run_id: str | None = None,
) -> submitit.AutoExecutor:
    """Create a submitit ``AutoExecutor`` configured from ``spec``.

    Args:
        spec_key: Spec key, passed to ``submitit_logs_dir`` to pick the folder.
        spec: Resource settings to apply to the executor.
        kind: Passed to ``submitit_logs_dir`` as its first argument.
        submitit_root: Passed to ``submitit_logs_dir`` as ``override``.
        run_id: Passed to ``submitit_logs_dir`` as ``run_id``.

    Returns:
        An executor whose folder exists on disk and whose parameters have
        been updated from ``spec``.
    """
    # Imported at call time; at module level submitit is only imported for
    # type checking.
    import submitit

    log_dir = submitit_logs_dir(kind, spec_key, override=submitit_root, run_id=run_id)
    log_dir.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=str(log_dir))

    settings: dict[str, SlurmSpecExtraValue | None] = dict(
        timeout_min=spec.time_min,
        slurm_partition=spec.partition,
        cpus_per_task=spec.cpus,
        mem_gb=spec.mem_gb,
    )
    # A GPU count of 0 is not forwarded at all.
    if spec.gpus:
        settings["gpus_per_node"] = spec.gpus
    # Extras are merged last, so they override the derived settings above.
    if spec.extra:
        settings.update(spec.extra)

    # Settings whose value is None (e.g. an unset partition) are dropped.
    active = {name: setting for name, setting in settings.items() if setting is not None}
    executor.update_parameters(**active)
    return executor
|
furu/runtime/logging.py
CHANGED
|
@@ -28,16 +28,16 @@ _FURU_HOLDER_STACK: contextvars.ContextVar[tuple[HolderType, ...]] = (
|
|
|
28
28
|
_FURU_LOG_LOCK = threading.Lock()
|
|
29
29
|
_FURU_CONSOLE_LOCK = threading.Lock()
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
_GET_PREFIX = "get"
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def
|
|
34
|
+
def _strip_get_decision_suffix(message: str) -> str:
|
|
35
35
|
"""
|
|
36
|
-
Strip a trailing `(<decision>)` suffix from `
|
|
36
|
+
Strip a trailing `(<decision>)` suffix from `get ...` console lines.
|
|
37
37
|
|
|
38
38
|
This keeps detailed decision info in file logs, but makes console output cleaner.
|
|
39
39
|
"""
|
|
40
|
-
if not message.startswith(
|
|
40
|
+
if not message.startswith(_GET_PREFIX):
|
|
41
41
|
return message
|
|
42
42
|
if not message.endswith(")"):
|
|
43
43
|
return message
|
|
@@ -69,7 +69,7 @@ def enter_holder(holder: HolderType) -> Generator[None, None, None]:
|
|
|
69
69
|
"""
|
|
70
70
|
Push a holder object onto the logging stack for this context.
|
|
71
71
|
|
|
72
|
-
Furu calls this automatically during `
|
|
72
|
+
Furu calls this automatically during `get()`, so nested
|
|
73
73
|
dependencies will log to the active dependency's folder and then revert.
|
|
74
74
|
"""
|
|
75
75
|
configure_logging()
|
|
@@ -163,7 +163,7 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
163
163
|
|
|
164
164
|
@staticmethod
|
|
165
165
|
def _format_location(record: logging.LogRecord) -> str:
|
|
166
|
-
# Use caller location if available (for
|
|
166
|
+
# Use caller location if available (for get messages)
|
|
167
167
|
caller_file = getattr(record, "furu_caller_file", None)
|
|
168
168
|
caller_line = getattr(record, "furu_caller_line", None)
|
|
169
169
|
if caller_file is not None and caller_line is not None:
|
|
@@ -174,10 +174,10 @@ class _FuruRichConsoleHandler(logging.Handler):
|
|
|
174
174
|
|
|
175
175
|
@staticmethod
|
|
176
176
|
def _format_message_text(record: logging.LogRecord) -> Text:
|
|
177
|
-
message =
|
|
177
|
+
message = _strip_get_decision_suffix(record.getMessage())
|
|
178
178
|
action_color = getattr(record, "furu_action_color", None)
|
|
179
|
-
if isinstance(action_color, str) and message.startswith(
|
|
180
|
-
prefix =
|
|
179
|
+
if isinstance(action_color, str) and message.startswith(_GET_PREFIX):
|
|
180
|
+
prefix = _GET_PREFIX
|
|
181
181
|
rest = message[len(prefix) :]
|
|
182
182
|
text = Text()
|
|
183
183
|
text.append(prefix, style=action_color)
|
|
@@ -288,7 +288,7 @@ def write_separator(line: str = "------------------") -> Path:
|
|
|
288
288
|
"""
|
|
289
289
|
Write a raw separator line to the current holder's `furu.log`.
|
|
290
290
|
|
|
291
|
-
This bypasses standard formatting so repeated `
|
|
291
|
+
This bypasses standard formatting so repeated `get()` calls are easy to spot.
|
|
292
292
|
"""
|
|
293
293
|
directory = current_log_dir()
|
|
294
294
|
log_path = directory / "furu.log"
|
furu/storage/state.py
CHANGED
|
@@ -400,15 +400,41 @@ class StateManager:
|
|
|
400
400
|
|
|
401
401
|
@classmethod
|
|
402
402
|
def release_lock(cls, fd: int | None, lock_path: Path) -> None:
    """Release a lock acquired via :meth:`try_lock`.

    We best-effort avoid deleting a lock that was broken and replaced by
    another process by verifying the inode of the open fd matches the
    current lock_path inode before unlinking.

    Args:
        fd: File descriptor of the held lock file, or ``None`` when no lock
            is held (the call is then a no-op and nothing is unlinked).
        lock_path: Path of the lock file on disk.

    Raises:
        OSError: If inspecting or unlinking ``lock_path`` fails for a reason
            other than the file being absent, or if closing ``fd`` fails.
            Closing ``fd`` is attempted in every case.
    """
    if fd is None:
        return
    # Everything after this point runs under try/finally so the descriptor is
    # closed even when stat()/unlink() raise an unexpected OSError.
    try:
        try:
            fd_stat = os.fstat(fd)
        except OSError:
            # Cannot identify our own lock file: never unlink in that case.
            fd_stat = None
        try:
            path_stat = lock_path.stat()
        except FileNotFoundError:
            # Lock file already gone: nothing to unlink.
            path_stat = None
        if (
            fd_stat is not None
            and path_stat is not None
            and fd_stat.st_ino == path_stat.st_ino
            and fd_stat.st_dev == path_stat.st_dev
        ):
            # The path still refers to the file we hold open, so it is ours.
            lock_path.unlink(missing_ok=True)
    finally:
        os.close(fd)
|
|
405
|
-
lock_path.unlink(missing_ok=True)
|
|
406
429
|
|
|
407
430
|
@classmethod
|
|
408
431
|
def _read_lock_info(cls, lock_path: Path) -> _LockInfoDict | None:
|
|
409
432
|
if not lock_path.is_file():
|
|
410
433
|
return None
|
|
411
|
-
|
|
434
|
+
try:
|
|
435
|
+
text = lock_path.read_text().strip()
|
|
436
|
+
except FileNotFoundError:
|
|
437
|
+
return None
|
|
412
438
|
if not text:
|
|
413
439
|
return None
|
|
414
440
|
lines = text.splitlines()
|
|
@@ -978,6 +1004,7 @@ def compute_lock(
|
|
|
978
1004
|
wait_log_every_sec: float = 10.0,
|
|
979
1005
|
reconcile_fn: Callable[[Path], None] | None = None,
|
|
980
1006
|
allow_failed: bool = False,
|
|
1007
|
+
allow_success: bool = False,
|
|
981
1008
|
) -> Generator[ComputeLockContext, None, None]:
|
|
982
1009
|
"""
|
|
983
1010
|
Context manager that atomically acquires lock + records attempt + starts heartbeat.
|
|
@@ -1002,6 +1029,7 @@ def compute_lock(
|
|
|
1002
1029
|
wait_log_every_sec: Interval between "waiting for lock" log messages
|
|
1003
1030
|
reconcile_fn: Optional function to call to reconcile stale attempts
|
|
1004
1031
|
allow_failed: Allow recomputation even if state is failed
|
|
1032
|
+
allow_success: Allow recomputation even if state is successful
|
|
1005
1033
|
|
|
1006
1034
|
Yields:
|
|
1007
1035
|
ComputeLockContext with attempt_id and stop_heartbeat callable
|
|
@@ -1097,7 +1125,7 @@ def compute_lock(
|
|
|
1097
1125
|
lock_fd = StateManager.try_lock(lock_path)
|
|
1098
1126
|
if lock_fd is not None:
|
|
1099
1127
|
state = StateManager.read_state(directory)
|
|
1100
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1128
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1101
1129
|
StateManager.release_lock(lock_fd, lock_path)
|
|
1102
1130
|
raise FuruLockNotAcquired(
|
|
1103
1131
|
"Cannot acquire lock: experiment already succeeded"
|
|
@@ -1117,7 +1145,7 @@ def compute_lock(
|
|
|
1117
1145
|
if reconcile_fn is not None:
|
|
1118
1146
|
reconcile_fn(directory)
|
|
1119
1147
|
state = StateManager.read_state(directory)
|
|
1120
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1148
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1121
1149
|
raise FuruLockNotAcquired(
|
|
1122
1150
|
"Cannot acquire lock: experiment already succeeded"
|
|
1123
1151
|
)
|
|
@@ -1151,7 +1179,7 @@ def compute_lock(
|
|
|
1151
1179
|
attempt = state.attempt
|
|
1152
1180
|
|
|
1153
1181
|
# If result is terminal, no point waiting
|
|
1154
|
-
if isinstance(state.result, _StateResultSuccess):
|
|
1182
|
+
if isinstance(state.result, _StateResultSuccess) and not allow_success:
|
|
1155
1183
|
raise FuruLockNotAcquired(
|
|
1156
1184
|
"Cannot acquire lock: experiment already succeeded"
|
|
1157
1185
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: furu
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs.
|
|
5
5
|
Author: Herman Brunborg
|
|
6
6
|
Author-email: Herman Brunborg <herman@brunborg.com>
|
|
@@ -44,7 +44,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
|
|
|
44
44
|
1. Subclass `furu.Furu[T]`
|
|
45
45
|
2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
|
|
46
46
|
3. Implement `_load(self) -> T` (load from `self.furu_dir`)
|
|
47
|
-
4. Call `
|
|
47
|
+
4. Call `get()`
|
|
48
48
|
|
|
49
49
|
```python
|
|
50
50
|
# my_project/pipelines.py
|
|
@@ -75,10 +75,10 @@ class TrainModel(furu.Furu[Path]):
|
|
|
75
75
|
from my_project.pipelines import TrainModel
|
|
76
76
|
|
|
77
77
|
# First call: runs _create(), caches result
|
|
78
|
-
artifact = TrainModel(lr=3e-4, steps=5000).
|
|
78
|
+
artifact = TrainModel(lr=3e-4, steps=5000).get()
|
|
79
79
|
|
|
80
80
|
# Second call with same config: loads from cache via _load()
|
|
81
|
-
artifact = TrainModel(lr=3e-4, steps=5000).
|
|
81
|
+
artifact = TrainModel(lr=3e-4, steps=5000).get()
|
|
82
82
|
```
|
|
83
83
|
|
|
84
84
|
> **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
|
|
@@ -96,7 +96,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
|
|
|
96
96
|
- **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
|
|
97
97
|
- **hash**: Computed from the object's config values using Blake2s
|
|
98
98
|
|
|
99
|
-
When you call `
|
|
99
|
+
When you call `get()`:
|
|
100
100
|
1. If no cached result exists → run `_create()`, save state as "success"
|
|
101
101
|
2. If cached result exists → run `_load()` to retrieve it
|
|
102
102
|
3. If another process is running → wait for it to finish, then load
|
|
@@ -123,7 +123,7 @@ class TrainTextModel(furu.Furu[str]):
|
|
|
123
123
|
dataset: Dataset = furu.chz.field(default_factory=Dataset)
|
|
124
124
|
|
|
125
125
|
def _create(self) -> str:
|
|
126
|
-
data = self.dataset.
|
|
126
|
+
data = self.dataset.get() # Triggers Dataset cache
|
|
127
127
|
(self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
|
|
128
128
|
return "trained"
|
|
129
129
|
|
|
@@ -131,6 +131,58 @@ class TrainTextModel(furu.Furu[str]):
|
|
|
131
131
|
return (self.furu_dir / "model.txt").read_text()
|
|
132
132
|
```
|
|
133
133
|
|
|
134
|
+
### Executors (Local + Slurm)
|
|
135
|
+
|
|
136
|
+
Use the execution helpers for batch runs and cluster scheduling:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from furu.execution import run_local
|
|
140
|
+
|
|
141
|
+
run_local(
|
|
142
|
+
[TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
|
|
143
|
+
max_workers=8,
|
|
144
|
+
window_size="bfs",
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from furu.execution import SlurmSpec, submit_slurm_dag
|
|
150
|
+
|
|
151
|
+
specs = {
|
|
152
|
+
"default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
|
|
153
|
+
"gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from furu.execution import run_slurm_pool
|
|
161
|
+
|
|
162
|
+
run_slurm_pool(
|
|
163
|
+
[TrainModel(lr=3e-4, steps=5000)],
|
|
164
|
+
specs=specs,
|
|
165
|
+
max_workers_total=50,
|
|
166
|
+
window_size="bfs",
|
|
167
|
+
)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
|
|
171
|
+
`FURU_SUBMITIT_PATH` when you want a different logs root.
|
|
172
|
+
|
|
173
|
+
### Breaking Changes and Executor Semantics
|
|
174
|
+
|
|
175
|
+
- `load_or_create()` is removed; use `get()` exclusively.
|
|
176
|
+
- `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
|
|
177
|
+
`FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
|
|
178
|
+
- Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
|
|
179
|
+
dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
|
|
180
|
+
compute nodes are retried (at most `FURU_MAX_COMPUTE_RETRIES` times).
|
|
181
|
+
- Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
|
|
182
|
+
fatal even when `retry_failed` is enabled; only compute failures are retried.
|
|
183
|
+
- `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
|
|
184
|
+
repeated references in the same run reuse that result.
|
|
185
|
+
|
|
134
186
|
### Storage Structure
|
|
135
187
|
|
|
136
188
|
Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
|
|
@@ -176,7 +228,7 @@ class MyExperiments(furu.FuruList[TrainModel]):
|
|
|
176
228
|
|
|
177
229
|
# Iterate over all experiments
|
|
178
230
|
for exp in MyExperiments:
|
|
179
|
-
exp.
|
|
231
|
+
exp.get()
|
|
180
232
|
|
|
181
233
|
# Access by name
|
|
182
234
|
exp = MyExperiments.by_name("baseline")
|
|
@@ -191,14 +243,17 @@ for name, exp in MyExperiments.items():
|
|
|
191
243
|
|
|
192
244
|
### Custom Validation
|
|
193
245
|
|
|
194
|
-
Override `_validate()` to add custom cache invalidation logic
|
|
246
|
+
Override `_validate()` to add custom cache invalidation logic. Return False or
|
|
247
|
+
raise `furu.FuruValidationError` to force re-computation. In executor planning,
|
|
248
|
+
any other exception is logged and treated as invalid (no crash); in interactive
|
|
249
|
+
`exists()` calls, exceptions still surface:
|
|
195
250
|
|
|
196
251
|
```python
|
|
197
252
|
class ModelWithValidation(furu.Furu[Path]):
|
|
198
253
|
checkpoint_name: str = "model.pt"
|
|
199
254
|
|
|
200
255
|
def _validate(self) -> bool:
|
|
201
|
-
# Return False to force re-computation
|
|
256
|
+
# Return False (or raise FuruValidationError) to force re-computation
|
|
202
257
|
ckpt = self.furu_dir / self.checkpoint_name
|
|
203
258
|
return ckpt.exists() and ckpt.stat().st_size > 0
|
|
204
259
|
|
|
@@ -220,7 +275,7 @@ if obj.exists():
|
|
|
220
275
|
|
|
221
276
|
# Get metadata without triggering computation
|
|
222
277
|
metadata = obj.get_metadata()
|
|
223
|
-
print(f"Hash: {obj.
|
|
278
|
+
print(f"Hash: {obj.furu_hash}")
|
|
224
279
|
print(f"Dir: {obj.furu_dir}")
|
|
225
280
|
```
|
|
226
281
|
|
|
@@ -251,7 +306,7 @@ class LargeDataProcessor(furu.Furu[Path]):
|
|
|
251
306
|
def _create(self) -> Path:
|
|
252
307
|
# self.raw_dir is shared across all configs
|
|
253
308
|
# Create a subfolder for isolation if needed
|
|
254
|
-
my_raw = self.raw_dir / self.
|
|
309
|
+
my_raw = self.raw_dir / self.furu_hash
|
|
255
310
|
my_raw.mkdir(exist_ok=True)
|
|
256
311
|
|
|
257
312
|
large_file = my_raw / "huge_dataset.bin"
|
|
@@ -303,8 +358,8 @@ HHMMSS file.py:line message
|
|
|
303
358
|
|
|
304
359
|
Furu emits status messages like:
|
|
305
360
|
```
|
|
306
|
-
|
|
307
|
-
|
|
361
|
+
get TrainModel abc123def (missing->create)
|
|
362
|
+
get TrainModel abc123def (success->load)
|
|
308
363
|
```
|
|
309
364
|
|
|
310
365
|
### Explicit Setup
|
|
@@ -325,7 +380,7 @@ logger = furu.get_logger()
|
|
|
325
380
|
from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired
|
|
326
381
|
|
|
327
382
|
try:
|
|
328
|
-
result = obj.
|
|
383
|
+
result = obj.get()
|
|
329
384
|
except FuruComputeError as e:
|
|
330
385
|
print(f"Computation failed: {e}")
|
|
331
386
|
print(f"State file: {e.state_path}")
|
|
@@ -336,8 +391,8 @@ except FuruLockNotAcquired:
|
|
|
336
391
|
print("Could not acquire lock")
|
|
337
392
|
```
|
|
338
393
|
|
|
339
|
-
By default, failed artifacts are retried on the next `
|
|
340
|
-
`FURU_RETRY_FAILED=0`
|
|
394
|
+
By default, failed artifacts are retried on the next `get()` call. Set
|
|
395
|
+
`FURU_RETRY_FAILED=0` to keep failures sticky.
|
|
341
396
|
|
|
342
397
|
`FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
|
|
343
398
|
timeout used when waiting for compute locks before raising `FuruWaitTimeout`.
|
|
@@ -349,27 +404,8 @@ and `furu.log`.
|
|
|
349
404
|
|
|
350
405
|
## Submitit Integration
|
|
351
406
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
```python
|
|
355
|
-
import submitit
|
|
356
|
-
import furu
|
|
357
|
-
|
|
358
|
-
executor = submitit.AutoExecutor(folder="submitit_logs")
|
|
359
|
-
executor.update_parameters(
|
|
360
|
-
timeout_min=60,
|
|
361
|
-
slurm_partition="gpu",
|
|
362
|
-
gpus_per_node=1,
|
|
363
|
-
)
|
|
364
|
-
|
|
365
|
-
# Submit job and return immediately
|
|
366
|
-
job = my_furu_obj.load_or_create(executor=executor)
|
|
367
|
-
|
|
368
|
-
# Job ID is tracked in .furu/state.json
|
|
369
|
-
print(job.job_id)
|
|
370
|
-
```
|
|
371
|
-
|
|
372
|
-
Furu handles preemption, requeuing, and state tracking automatically.
|
|
407
|
+
Furu includes a `SubmititAdapter` for integrating submitit executors with the
|
|
408
|
+
state system. Executor helpers in `furu.execution` handle submission workflows.
|
|
373
409
|
|
|
374
410
|
## Dashboard
|
|
375
411
|
|
|
@@ -427,6 +463,7 @@ The `/api/experiments` endpoint supports:
|
|
|
427
463
|
| `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
|
|
428
464
|
| `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
|
|
429
465
|
| `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
|
|
466
|
+
| `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
|
|
430
467
|
| `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
|
|
431
468
|
| `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
|
|
432
469
|
| `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
furu/__init__.py,sha256=
|
|
1
|
+
furu/__init__.py,sha256=Z8VssTuQm2nH7bgB8SQc8pXsNGc-H1QGHFffKzNzqk8,2018
|
|
2
2
|
furu/adapters/__init__.py,sha256=onLzEj9hccPK15g8a8va2T19nqQXoxb9rQlJIjKSKnE,69
|
|
3
|
-
furu/adapters/submitit.py,sha256=
|
|
4
|
-
furu/config.py,sha256=
|
|
3
|
+
furu/adapters/submitit.py,sha256=FV3XEUSQuS5vIyzkW-Iuqtf8SRL-fsokPG67u7tMF5I,7276
|
|
4
|
+
furu/config.py,sha256=1nlJff4KNrWDvLhmnuLrsc7FJIxFLFhz3eOXZ8-ngX4,7349
|
|
5
5
|
furu/core/__init__.py,sha256=6hH7i6r627c0FZn6eQVsSG7LD4QmTta6iQw0AiPQPTM,156
|
|
6
|
-
furu/core/furu.py,sha256=
|
|
7
|
-
furu/core/list.py,sha256=
|
|
8
|
-
furu/dashboard/__init__.py,sha256=
|
|
6
|
+
furu/core/furu.py,sha256=Cy2cOnM5vsQoSk9nIVYj2Fx017wOQFPbxhnvYQsh7nI,58881
|
|
7
|
+
furu/core/list.py,sha256=xSuBT35p1anJ2fKQPxb-3cRTONUamFjfzkreVaI9Jo4,3614
|
|
8
|
+
furu/dashboard/__init__.py,sha256=ziAordJfkbbXNIM7iA9O7vR2gsCq34AInYiMYOCfWOc,362
|
|
9
9
|
furu/dashboard/__main__.py,sha256=cNs65IMl4kwZFpxa9xLXmFSy4-M5D1X1ZBfTDxW11vo,144
|
|
10
10
|
furu/dashboard/api/__init__.py,sha256=9-WyWOt-VQJJBIsdW29D-7JvR-BivJd9G_SRaRptCz0,80
|
|
11
11
|
furu/dashboard/api/models.py,sha256=SCu-kLJyW7dwSKswdgQNS3wQuj25ORs0pHkvX9xBbo4,4767
|
|
@@ -14,14 +14,24 @@ furu/dashboard/frontend/dist/assets/index-BXAIKNNr.css,sha256=qhsN0Td3mM-GAR8mZ0
|
|
|
14
14
|
furu/dashboard/frontend/dist/assets/index-DS3FsqcY.js,sha256=nfrKjhWThPtL8n5iTd9_1W-bsyMGwg2O8Iq2jkjj9Lg,544699
|
|
15
15
|
furu/dashboard/frontend/dist/favicon.svg,sha256=3TSLHNZITFe3JTPoYHZnDgiGsJxIzf39v97l2A1Hodo,369
|
|
16
16
|
furu/dashboard/frontend/dist/index.html,sha256=d9a8ZFKZ5uDtN3urqVNmS8LWMBhOC0eW7X0noT0RcYQ,810
|
|
17
|
-
furu/dashboard/main.py,sha256=
|
|
17
|
+
furu/dashboard/main.py,sha256=gj9Cdj2qyaSCEkmfNHUMQXlXv6GpWTQ9IZEi7WzlCSo,4463
|
|
18
18
|
furu/dashboard/scanner.py,sha256=qXCvkvFByBc09TUdth5Js67rS8zpRBlRkVQ9dJ7YbdE,34696
|
|
19
|
-
furu/errors.py,sha256=
|
|
19
|
+
furu/errors.py,sha256=FFbV4M0-ipVGizv5ee80L-NZFVjaRjy8i19mClr6R0g,3959
|
|
20
|
+
furu/execution/__init__.py,sha256=ixVw1Shvg2ulS597OYYeGgSSTwv25j_McuQdDXIiEL8,625
|
|
21
|
+
furu/execution/context.py,sha256=0tAbM0azqEus8hknf_A9-Zs9Sq99bnUkFyV4RO4ZMRU,666
|
|
22
|
+
furu/execution/local.py,sha256=TkKrRdmaQrN7i7Sxe87eHibRJOnz5OxU0Oj8qL_xP4I,7059
|
|
23
|
+
furu/execution/paths.py,sha256=0MfQk5Kh7bxvJiWvG40TJe7RF5Q5Na6uvi6qV0OT3Vc,460
|
|
24
|
+
furu/execution/plan.py,sha256=fM7CkXm_M0lL3vqdiNnWzbvMJAoSYKDBAnC82Af_rYM,6860
|
|
25
|
+
furu/execution/plan_utils.py,sha256=TAQqlPeJfOdH2MT-X7g3j1Se_0e4oKvG0tJaWC1kM40,381
|
|
26
|
+
furu/execution/slurm_dag.py,sha256=FOJcPKmIzRyrbJIq7heqGjKN0EFRMyOcV-yP7Ci87Qs,9360
|
|
27
|
+
furu/execution/slurm_pool.py,sha256=bi90fzZXAnoWHSPQba8Z3tk4_QMaqikWxCCzRfvDMvk,30400
|
|
28
|
+
furu/execution/slurm_spec.py,sha256=A1VX5K6aG8Ricg4fhnkz3Alkw_fx1bx53D0p4Ms3FqA,979
|
|
29
|
+
furu/execution/submitit_factory.py,sha256=B2vkDtmscuAX0sBaj9V5pNlgOtkkV35yJ1fZ7A-DSvU,1119
|
|
20
30
|
furu/migrate.py,sha256=x_Uh7oXAv40L5ZAHJhdnw-o7ct56rWUSZLbHHfRObeY,1313
|
|
21
31
|
furu/migration.py,sha256=R2-tARMx4VKryiqJ7WHia_dPVxRbTqofPpCFVE9zQ8U,31411
|
|
22
32
|
furu/runtime/__init__.py,sha256=fQqE7wUuWunLD73Vm3lss7BFSij3UVxXOKQXBAOS8zw,504
|
|
23
33
|
furu/runtime/env.py,sha256=o1phhoTDhOnhALr3Ozf1ldrdvk2ClyEvBWbebHM6BXg,160
|
|
24
|
-
furu/runtime/logging.py,sha256=
|
|
34
|
+
furu/runtime/logging.py,sha256=WS3mB8VqMYUxPPI0yv1K-LnzVBj84Mnu1Qf9P2hCUUE,9652
|
|
25
35
|
furu/runtime/tracebacks.py,sha256=PGCuOq8QkWSoun791gjUXM8frOP2wWV8IBlqaA4nuGE,1631
|
|
26
36
|
furu/serialization/__init__.py,sha256=L7oHuIbxdSh7GCY3thMQnDwlt_ERH-TMy0YKEAZLrPs,341
|
|
27
37
|
furu/serialization/migrations.py,sha256=HD5g8JCBdH3Y0rHJYc4Ug1IXBVcUDxLE7nfiXZnXcUE,7772
|
|
@@ -29,8 +39,8 @@ furu/serialization/serializer.py,sha256=_nfUaAOy_KHegvfXlpPh4rCuvkzalJva75OvDg5n
|
|
|
29
39
|
furu/storage/__init__.py,sha256=cLLL-GPpSu9C72Mdk5S6TGu3g-SnBfEuxzfpx5ZJPtw,616
|
|
30
40
|
furu/storage/metadata.py,sha256=MH6w5hs-2rwHD6G9erMPM5pE3hm0h5Pk_G3Z6eyyGB0,9899
|
|
31
41
|
furu/storage/migration.py,sha256=Ars9aYwvhXpIBDf6L9ojGjp_l656-RfdtEAFKN0sZZY,2640
|
|
32
|
-
furu/storage/state.py,sha256=
|
|
33
|
-
furu-0.0.
|
|
34
|
-
furu-0.0.
|
|
35
|
-
furu-0.0.
|
|
36
|
-
furu-0.0.
|
|
42
|
+
furu/storage/state.py,sha256=SFonqragT2eMCZbBKIvcA4JVe78rVmDRvo4Ky2IcNgc,43632
|
|
43
|
+
furu-0.0.4.dist-info/WHEEL,sha256=XV0cjMrO7zXhVAIyyc8aFf1VjZ33Fen4IiJk5zFlC3g,80
|
|
44
|
+
furu-0.0.4.dist-info/entry_points.txt,sha256=hZkjtFzNlb33Zk-aUfLMRj-XgVDxdT82-JXG9d4bu2E,60
|
|
45
|
+
furu-0.0.4.dist-info/METADATA,sha256=fdUBvn-vEnVim9V5hAamE1sFuaKzWdwWPI17VU2Vyfc,16162
|
|
46
|
+
furu-0.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|