furu 0.0.2.tar.gz → 0.0.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {furu-0.0.2 → furu-0.0.4}/PKG-INFO +83 -33
- {furu-0.0.2 → furu-0.0.4}/README.md +82 -32
- {furu-0.0.2 → furu-0.0.4}/pyproject.toml +4 -1
- {furu-0.0.2 → furu-0.0.4}/src/furu/__init__.py +11 -1
- {furu-0.0.2 → furu-0.0.4}/src/furu/adapters/submitit.py +23 -2
- {furu-0.0.2 → furu-0.0.4}/src/furu/config.py +21 -3
- furu-0.0.4/src/furu/core/__init__.py +4 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/core/furu.py +708 -188
- {furu-0.0.2 → furu-0.0.4}/src/furu/core/list.py +1 -1
- furu-0.0.4/src/furu/dashboard/__init__.py +18 -0
- furu-0.0.2/src/furu/dashboard/frontend/dist/assets/index-CbdDfSOZ.css → furu-0.0.4/src/furu/dashboard/frontend/dist/assets/index-BXAIKNNr.css +1 -1
- furu-0.0.2/src/furu/dashboard/frontend/dist/assets/index-DDv_TYB_.js → furu-0.0.4/src/furu/dashboard/frontend/dist/assets/index-DS3FsqcY.js +3 -3
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/frontend/dist/index.html +2 -2
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/main.py +10 -3
- furu-0.0.4/src/furu/errors.py +131 -0
- furu-0.0.4/src/furu/execution/__init__.py +22 -0
- furu-0.0.4/src/furu/execution/context.py +30 -0
- furu-0.0.4/src/furu/execution/local.py +184 -0
- furu-0.0.4/src/furu/execution/paths.py +20 -0
- furu-0.0.4/src/furu/execution/plan.py +238 -0
- furu-0.0.4/src/furu/execution/plan_utils.py +13 -0
- furu-0.0.4/src/furu/execution/slurm_dag.py +271 -0
- furu-0.0.4/src/furu/execution/slurm_pool.py +878 -0
- furu-0.0.4/src/furu/execution/slurm_spec.py +38 -0
- furu-0.0.4/src/furu/execution/submitit_factory.py +47 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/migration.py +8 -4
- {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/logging.py +10 -10
- {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/serializer.py +40 -2
- {furu-0.0.2 → furu-0.0.4}/src/furu/storage/metadata.py +17 -5
- {furu-0.0.2 → furu-0.0.4}/src/furu/storage/state.py +78 -12
- furu-0.0.2/src/furu/core/__init__.py +0 -4
- furu-0.0.2/src/furu/dashboard/__init__.py +0 -9
- furu-0.0.2/src/furu/errors.py +0 -76
- {furu-0.0.2 → furu-0.0.4}/src/furu/adapters/__init__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/__main__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/__init__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/models.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/routes.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/frontend/dist/favicon.svg +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/scanner.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/migrate.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/__init__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/env.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/tracebacks.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/__init__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/migrations.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/storage/__init__.py +0 -0
- {furu-0.0.2 → furu-0.0.4}/src/furu/storage/migration.py +0 -0
{furu-0.0.2 → furu-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: furu
-Version: 0.0.2
+Version: 0.0.4
 Summary: Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs.
 Author: Herman Brunborg
 Author-email: Herman Brunborg <herman@brunborg.com>
@@ -44,7 +44,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
 1. Subclass `furu.Furu[T]`
 2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
 3. Implement `_load(self) -> T` (load from `self.furu_dir`)
-4. Call `load_or_create()`
+4. Call `get()`
 
 ```python
 # my_project/pipelines.py
@@ -75,10 +75,10 @@ class TrainModel(furu.Furu[Path]):
 from my_project.pipelines import TrainModel
 
 # First call: runs _create(), caches result
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 
 # Second call with same config: loads from cache via _load()
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 ```
 
 > **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
@@ -96,7 +96,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
 - **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
 - **hash**: Computed from the object's config values using Blake2s
 
-When you call `load_or_create()`:
+When you call `get()`:
 1. If no cached result exists → run `_create()`, save state as "success"
 2. If cached result exists → run `_load()` to retrieve it
 3. If another process is running → wait for it to finish, then load
@@ -123,7 +123,7 @@ class TrainTextModel(furu.Furu[str]):
     dataset: Dataset = furu.chz.field(default_factory=Dataset)
 
     def _create(self) -> str:
-        data = self.dataset.load_or_create()
+        data = self.dataset.get()  # Triggers Dataset cache
         (self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
         return "trained"
 
@@ -131,6 +131,58 @@ class TrainTextModel(furu.Furu[str]):
         return (self.furu_dir / "model.txt").read_text()
 ```
 
+### Executors (Local + Slurm)
+
+Use the execution helpers for batch runs and cluster scheduling:
+
+```python
+from furu.execution import run_local
+
+run_local(
+    [TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
+    max_workers=8,
+    window_size="bfs",
+)
+```
+
+```python
+from furu.execution import SlurmSpec, submit_slurm_dag
+
+specs = {
+    "default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
+    "gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
+}
+
+submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
+```
+
+```python
+from furu.execution import run_slurm_pool
+
+run_slurm_pool(
+    [TrainModel(lr=3e-4, steps=5000)],
+    specs=specs,
+    max_workers_total=50,
+    window_size="bfs",
+)
+```
+
+Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
+`FURU_SUBMITIT_PATH` when you want a different logs root.
+
+### Breaking Changes and Executor Semantics
+
+- `load_or_create()` is removed; use `get()` exclusively.
+- `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
+  `FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
+- Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
+  dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
+  compute nodes are retried (bounded by `FURU_MAX_COMPUTE_RETRIES` retries).
+- Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
+  fatal even when `retry_failed` is enabled; only compute failures are retried.
+- `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
+  repeated references in the same run reuse that result.
+
 ### Storage Structure
 
 Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
@@ -176,7 +228,7 @@ class MyExperiments(furu.FuruList[TrainModel]):
 
 # Iterate over all experiments
 for exp in MyExperiments:
-    exp.load_or_create()
+    exp.get()
 
 # Access by name
 exp = MyExperiments.by_name("baseline")
@@ -191,14 +243,17 @@ for name, exp in MyExperiments.items():
 
 ### Custom Validation
 
-Override `_validate()` to add custom cache invalidation logic
+Override `_validate()` to add custom cache invalidation logic. Return False or
+raise `furu.FuruValidationError` to force re-computation. In executor planning,
+any other exception is logged and treated as invalid (no crash); in interactive
+`exists()` calls, exceptions still surface:
 
 ```python
 class ModelWithValidation(furu.Furu[Path]):
     checkpoint_name: str = "model.pt"
 
     def _validate(self) -> bool:
-        # Return False to force re-computation
+        # Return False (or raise FuruValidationError) to force re-computation
         ckpt = self.furu_dir / self.checkpoint_name
         return ckpt.exists() and ckpt.stat().st_size > 0
 
@@ -220,7 +275,7 @@ if obj.exists():
 
 # Get metadata without triggering computation
 metadata = obj.get_metadata()
-print(f"Hash: {obj.
+print(f"Hash: {obj.furu_hash}")
 print(f"Dir: {obj.furu_dir}")
 ```
 
@@ -251,7 +306,7 @@ class LargeDataProcessor(furu.Furu[Path]):
     def _create(self) -> Path:
         # self.raw_dir is shared across all configs
         # Create a subfolder for isolation if needed
-        my_raw = self.raw_dir / self.
+        my_raw = self.raw_dir / self.furu_hash
        my_raw.mkdir(exist_ok=True)
 
         large_file = my_raw / "huge_dataset.bin"
@@ -303,8 +358,8 @@ HHMMSS file.py:line message
 
 Furu emits status messages like:
 ```
-
-
+get TrainModel abc123def (missing->create)
+get TrainModel abc123def (success->load)
 ```
 
 ### Explicit Setup
@@ -325,7 +380,7 @@ logger = furu.get_logger()
 from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired
 
 try:
-    result = obj.load_or_create()
+    result = obj.get()
 except FuruComputeError as e:
     print(f"Computation failed: {e}")
     print(f"State file: {e.state_path}")
@@ -336,29 +391,21 @@ except FuruLockNotAcquired:
     print("Could not acquire lock")
 ```
 
-
-
-Run computations on SLURM clusters via [submitit](https://github.com/facebookincubator/submitit):
+By default, failed artifacts are retried on the next `get()` call. Set
+`FURU_RETRY_FAILED=0` to keep failures sticky.
 
-
-
-import furu
+`FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
+timeout used when waiting for compute locks before raising `FuruWaitTimeout`.
 
-
-
-
-
-    gpus_per_node=1,
-)
+Failures during metadata collection or signal handler setup (before `_create()`
+runs) raise `FuruComputeError` with the original exception attached. These
+failures still mark the attempt as failed and record details in `state.json`
+and `furu.log`.
 
-
-job = my_furu_obj.load_or_create(executor=executor)
-
-# Job ID is tracked in .furu/state.json
-print(job.job_id)
-```
+## Submitit Integration
 
-Furu
+Furu includes a `SubmititAdapter` for integrating submitit executors with the
+state system. Executor helpers in `furu.execution` handle submission workflows.
 
 ## Dashboard
 
@@ -415,7 +462,10 @@ The `/api/experiments` endpoint supports:
 | `FURU_LOG_LEVEL` | `INFO` | Console verbosity (`DEBUG`, `INFO`, `WARNING`, `ERROR`) |
 | `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
 | `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
+| `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
+| `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
 | `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
+| `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
 | `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
 | `FURU_STALE_AFTER_SECS` | `1800` | Consider running jobs stale after this duration |
 | `FURU_LEASE_SECS` | `120` | Compute lock lease duration |
{furu-0.0.2 → furu-0.0.4}/README.md

@@ -25,7 +25,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
 1. Subclass `furu.Furu[T]`
 2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
 3. Implement `_load(self) -> T` (load from `self.furu_dir`)
-4. Call `load_or_create()`
+4. Call `get()`
 
 ```python
 # my_project/pipelines.py
@@ -56,10 +56,10 @@ class TrainModel(furu.Furu[Path]):
 from my_project.pipelines import TrainModel
 
 # First call: runs _create(), caches result
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 
 # Second call with same config: loads from cache via _load()
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 ```
 
 > **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
@@ -77,7 +77,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
 - **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
 - **hash**: Computed from the object's config values using Blake2s
 
-When you call `load_or_create()`:
+When you call `get()`:
 1. If no cached result exists → run `_create()`, save state as "success"
 2. If cached result exists → run `_load()` to retrieve it
 3. If another process is running → wait for it to finish, then load
@@ -104,7 +104,7 @@ class TrainTextModel(furu.Furu[str]):
     dataset: Dataset = furu.chz.field(default_factory=Dataset)
 
     def _create(self) -> str:
-        data = self.dataset.load_or_create()
+        data = self.dataset.get()  # Triggers Dataset cache
         (self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
         return "trained"
 
@@ -112,6 +112,58 @@ class TrainTextModel(furu.Furu[str]):
         return (self.furu_dir / "model.txt").read_text()
 ```
 
+### Executors (Local + Slurm)
+
+Use the execution helpers for batch runs and cluster scheduling:
+
+```python
+from furu.execution import run_local
+
+run_local(
+    [TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
+    max_workers=8,
+    window_size="bfs",
+)
+```
+
+```python
+from furu.execution import SlurmSpec, submit_slurm_dag
+
+specs = {
+    "default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
+    "gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
+}
+
+submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
+```
+
+```python
+from furu.execution import run_slurm_pool
+
+run_slurm_pool(
+    [TrainModel(lr=3e-4, steps=5000)],
+    specs=specs,
+    max_workers_total=50,
+    window_size="bfs",
+)
+```
+
+Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
+`FURU_SUBMITIT_PATH` when you want a different logs root.
+
+### Breaking Changes and Executor Semantics
+
+- `load_or_create()` is removed; use `get()` exclusively.
+- `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
+  `FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
+- Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
+  dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
+  compute nodes are retried (bounded by `FURU_MAX_COMPUTE_RETRIES` retries).
+- Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
+  fatal even when `retry_failed` is enabled; only compute failures are retried.
+- `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
+  repeated references in the same run reuse that result.
+
 ### Storage Structure
 
 Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
@@ -157,7 +209,7 @@ class MyExperiments(furu.FuruList[TrainModel]):
 
 # Iterate over all experiments
 for exp in MyExperiments:
-    exp.load_or_create()
+    exp.get()
 
 # Access by name
 exp = MyExperiments.by_name("baseline")
@@ -172,14 +224,17 @@ for name, exp in MyExperiments.items():
 
 ### Custom Validation
 
-Override `_validate()` to add custom cache invalidation logic
+Override `_validate()` to add custom cache invalidation logic. Return False or
+raise `furu.FuruValidationError` to force re-computation. In executor planning,
+any other exception is logged and treated as invalid (no crash); in interactive
+`exists()` calls, exceptions still surface:
 
 ```python
 class ModelWithValidation(furu.Furu[Path]):
     checkpoint_name: str = "model.pt"
 
     def _validate(self) -> bool:
-        # Return False to force re-computation
+        # Return False (or raise FuruValidationError) to force re-computation
         ckpt = self.furu_dir / self.checkpoint_name
         return ckpt.exists() and ckpt.stat().st_size > 0
 
@@ -201,7 +256,7 @@ if obj.exists():
 
 # Get metadata without triggering computation
 metadata = obj.get_metadata()
-print(f"Hash: {obj.
+print(f"Hash: {obj.furu_hash}")
 print(f"Dir: {obj.furu_dir}")
 ```
 
@@ -232,7 +287,7 @@ class LargeDataProcessor(furu.Furu[Path]):
     def _create(self) -> Path:
         # self.raw_dir is shared across all configs
         # Create a subfolder for isolation if needed
-        my_raw = self.raw_dir / self.
+        my_raw = self.raw_dir / self.furu_hash
         my_raw.mkdir(exist_ok=True)
 
         large_file = my_raw / "huge_dataset.bin"
@@ -284,8 +339,8 @@ HHMMSS file.py:line message
 
 Furu emits status messages like:
 ```
-
-
+get TrainModel abc123def (missing->create)
+get TrainModel abc123def (success->load)
 ```
 
 ### Explicit Setup
@@ -306,7 +361,7 @@ logger = furu.get_logger()
 from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired
 
 try:
-    result = obj.load_or_create()
+    result = obj.get()
 except FuruComputeError as e:
     print(f"Computation failed: {e}")
     print(f"State file: {e.state_path}")
@@ -317,29 +372,21 @@ except FuruLockNotAcquired:
     print("Could not acquire lock")
 ```
 
-
-
-Run computations on SLURM clusters via [submitit](https://github.com/facebookincubator/submitit):
+By default, failed artifacts are retried on the next `get()` call. Set
+`FURU_RETRY_FAILED=0` to keep failures sticky.
 
-
-
-import furu
+`FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
+timeout used when waiting for compute locks before raising `FuruWaitTimeout`.
 
-
-
-
-
-    gpus_per_node=1,
-)
+Failures during metadata collection or signal handler setup (before `_create()`
+runs) raise `FuruComputeError` with the original exception attached. These
+failures still mark the attempt as failed and record details in `state.json`
+and `furu.log`.
 
-
-job = my_furu_obj.load_or_create(executor=executor)
-
-# Job ID is tracked in .furu/state.json
-print(job.job_id)
-```
+## Submitit Integration
 
-Furu
+Furu includes a `SubmititAdapter` for integrating submitit executors with the
+state system. Executor helpers in `furu.execution` handle submission workflows.
 
 ## Dashboard
 
@@ -396,7 +443,10 @@ The `/api/experiments` endpoint supports:
 | `FURU_LOG_LEVEL` | `INFO` | Console verbosity (`DEBUG`, `INFO`, `WARNING`, `ERROR`) |
 | `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
 | `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
+| `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
+| `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
 | `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
+| `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
 | `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
 | `FURU_STALE_AFTER_SECS` | `1800` | Consider running jobs stale after this duration |
 | `FURU_LEASE_SECS` | `120` | Compute lock lease duration |
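Taken together, the README changes above reduce to a one-line migration for callers. A before/after sketch, reusing the `TrainModel` class from the README examples:

```python
# furu 0.0.2: load_or_create() ran or loaded the cached artifact
artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()

# furu 0.0.4: get() is the single entry point; retries are configured
# globally via FURU_RETRY_FAILED / FURU_CONFIG.retry_failed, not per call
artifact = TrainModel(lr=3e-4, steps=5000).get()
```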
{furu-0.0.2 → furu-0.0.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "furu"
-version = "0.0.2"
+version = "0.0.4"
 description = "Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs."
 readme = "README.md"
 authors = [
@@ -41,6 +41,9 @@ test = [
     "httpx>=0.27.0",
 ]
 
+[tool.uv.build-backend]
+source-include = ["src/furu/dashboard/frontend/dist/**"]
+
 [build-system]
 requires = ["uv_build>=0.9.26,<0.10.0"]
 build-backend = "uv_build"
{furu-0.0.2 → furu-0.0.4}/src/furu/__init__.py

@@ -13,12 +13,16 @@ __version__ = version("furu")
 
 from .config import FURU_CONFIG, FuruConfig, get_furu_root, set_furu_root
 from .adapters import SubmititAdapter
-from .core import Furu, FuruList
+from .core import DependencyChzSpec, DependencySpec, Furu, FuruList
 from .errors import (
     FuruComputeError,
     FuruError,
+    FuruExecutionError,
     FuruLockNotAcquired,
+    FuruMissingArtifact,
     FuruMigrationRequired,
+    FuruSpecMismatch,
+    FuruValidationError,
     FuruWaitTimeout,
     MISSING,
 )
@@ -51,11 +55,17 @@ __all__ = [
     "FuruComputeError",
     "FuruConfig",
     "FuruError",
+    "FuruExecutionError",
     "FuruList",
     "FuruLockNotAcquired",
+    "FuruMissingArtifact",
     "FuruMigrationRequired",
+    "FuruSpecMismatch",
+    "FuruValidationError",
     "FuruSerializer",
     "FuruWaitTimeout",
+    "DependencyChzSpec",
+    "DependencySpec",
     "MISSING",
     "migrate",
     "NamespacePair",
{furu-0.0.2 → furu-0.0.4}/src/furu/adapters/submitit.py

@@ -6,6 +6,7 @@ from typing import Any, Callable, Protocol
 
 from ..config import FURU_CONFIG
 from ..storage import StateManager
+from ..runtime.logging import get_logger
 from ..storage.state import _FuruState, ProbeResult
 
 
@@ -102,19 +103,39 @@ class SubmititAdapter:
         """Watch for job ID in background thread and update state."""
 
         def watcher():
+            _ = attempt_id  # intentionally unused; queued->running attempt swap is expected
             while True:
                 job_id = self.get_job_id(job)
                 if job_id:
 
                     def mutate(state: _FuruState) -> None:
                         attempt = state.attempt
-                        if attempt is None
+                        if attempt is None:
+                            return
+                        if attempt.backend != "submitit":
+                            return
+                        if (
+                            attempt.status not in {"queued", "running"}
+                            and attempt.status not in StateManager.TERMINAL_STATUSES
+                        ):
+                            return
+                        existing = attempt.scheduler.get("job_id")
+                        if existing == job_id:
                             return
                         attempt.scheduler["job_id"] = job_id
 
                     StateManager.update_state(directory, mutate)
                     if callback:
-                        callback(job_id)
+                        try:
+                            callback(job_id)
+                        except Exception:
+                            # Avoid killing the watcher thread; state update already happened.
+                            logger = get_logger()
+                            logger.exception(
+                                "submitit watcher: job_id callback failed for %s: %s",
+                                directory,
+                                job_id,
+                            )
                     break
 
             if self.is_done(job):
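The watcher change above illustrates a general rule for background threads: a user-supplied callback must never be able to kill the thread. A minimal standalone sketch of the same guard (the names here are illustrative, not furu APIs):

```python
import logging
import threading
import time

logger = logging.getLogger(__name__)


def watch_job_id(poll_job_id, on_job_id, interval_sec=1.0):
    """Poll until a job id appears, notify once, and swallow callback errors."""

    def watcher():
        while True:
            job_id = poll_job_id()
            if job_id:
                try:
                    on_job_id(job_id)
                except Exception:
                    # Log and fall through so the thread still exits cleanly,
                    # mirroring the guarded callback in the diff above.
                    logger.exception("job_id callback failed for %s", job_id)
                break
            time.sleep(interval_sec)

    threading.Thread(target=watcher, daemon=True).start()
```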
{furu-0.0.2 → furu-0.0.4}/src/furu/config.py

@@ -18,16 +18,29 @@ class FuruConfig:
             return (project_root / self.DEFAULT_ROOT_DIR).resolve()
 
         self.base_root = _get_base_root()
+        self.submitit_root = (
+            Path(os.getenv("FURU_SUBMITIT_PATH", str(self.base_root / "submitit")))
+            .expanduser()
+            .resolve()
+        )
         self.version_controlled_root_override = self._get_version_controlled_override()
         self.poll_interval = float(os.getenv("FURU_POLL_INTERVAL_SECS", "10"))
         self.wait_log_every_sec = float(os.getenv("FURU_WAIT_LOG_EVERY_SECS", "10"))
         self.stale_timeout = float(os.getenv("FURU_STALE_AFTER_SECS", str(30 * 60)))
+        max_wait_env = os.getenv("FURU_MAX_WAIT_SECS")
+        self.max_wait_time_sec = float(max_wait_env) if max_wait_env else None
         self.lease_duration_sec = float(os.getenv("FURU_LEASE_SECS", "120"))
         hb = os.getenv("FURU_HEARTBEAT_SECS")
         self.heartbeat_interval_sec = (
             float(hb) if hb is not None else max(1.0, self.lease_duration_sec / 3.0)
         )
         self.max_requeues = int(os.getenv("FURU_PREEMPT_MAX", "5"))
+        self.max_compute_retries = int(os.getenv("FURU_MAX_COMPUTE_RETRIES", "3"))
+        self.retry_failed = os.getenv("FURU_RETRY_FAILED", "1").lower() in {
+            "1",
+            "true",
+            "yes",
+        }
         self.ignore_git_diff = os.getenv("FURU_IGNORE_DIFF", "0").lower() in {
             "1",
             "true",
@@ -102,6 +115,9 @@ class FuruConfig:
             return self._resolve_version_controlled_root()
         return self.base_root / "data"
 
+    def get_submitit_root(self) -> Path:
+        return self.submitit_root
+
     @classmethod
     def _get_version_controlled_override(cls) -> Path | None:
         env = os.getenv("FURU_VERSION_CONTROLLED_PATH")
@@ -151,8 +167,7 @@ class FuruConfig:
             value = getattr(target, attr, missing_sentinel)
             if value is missing_sentinel:
                 raise ValueError(
-                    "FURU_ALWAYS_RERUN entry does not exist: "
-                    f"{namespace!r}"
+                    f"FURU_ALWAYS_RERUN entry does not exist: {namespace!r}"
                 )
             target = value
 
@@ -169,4 +184,7 @@ def get_furu_root(*, version_controlled: bool = False) -> Path:
 
 
 def set_furu_root(path: Path) -> None:
-
+    root = path.resolve()
+    FURU_CONFIG.base_root = root
+    if os.getenv("FURU_SUBMITIT_PATH") is None:
+        FURU_CONFIG.submitit_root = (root / "submitit").resolve()