furu 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {furu-0.0.3 → furu-0.0.4}/PKG-INFO +74 -37
- {furu-0.0.3 → furu-0.0.4}/README.md +73 -36
- {furu-0.0.3 → furu-0.0.4}/pyproject.toml +4 -1
- {furu-0.0.3 → furu-0.0.4}/src/furu/__init__.py +8 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/adapters/submitit.py +23 -2
- {furu-0.0.3 → furu-0.0.4}/src/furu/config.py +13 -1
- {furu-0.0.3 → furu-0.0.4}/src/furu/core/furu.py +355 -196
- {furu-0.0.3 → furu-0.0.4}/src/furu/core/list.py +1 -1
- furu-0.0.4/src/furu/dashboard/__init__.py +18 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/main.py +10 -3
- {furu-0.0.3 → furu-0.0.4}/src/furu/errors.py +17 -4
- furu-0.0.4/src/furu/execution/__init__.py +22 -0
- furu-0.0.4/src/furu/execution/context.py +30 -0
- furu-0.0.4/src/furu/execution/local.py +184 -0
- furu-0.0.4/src/furu/execution/paths.py +20 -0
- furu-0.0.4/src/furu/execution/plan.py +238 -0
- furu-0.0.4/src/furu/execution/plan_utils.py +13 -0
- furu-0.0.4/src/furu/execution/slurm_dag.py +271 -0
- furu-0.0.4/src/furu/execution/slurm_pool.py +878 -0
- furu-0.0.4/src/furu/execution/slurm_spec.py +38 -0
- furu-0.0.4/src/furu/execution/submitit_factory.py +47 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/runtime/logging.py +10 -10
- {furu-0.0.3 → furu-0.0.4}/src/furu/storage/state.py +34 -6
- furu-0.0.3/src/furu/dashboard/__init__.py +0 -9
- {furu-0.0.3 → furu-0.0.4}/src/furu/adapters/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/core/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/__main__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/api/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/api/models.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/api/routes.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/frontend/dist/assets/index-BXAIKNNr.css +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/frontend/dist/assets/index-DS3FsqcY.js +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/frontend/dist/favicon.svg +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/frontend/dist/index.html +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/dashboard/scanner.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/migrate.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/migration.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/runtime/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/runtime/env.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/runtime/tracebacks.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/serialization/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/serialization/migrations.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/serialization/serializer.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/storage/__init__.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/storage/metadata.py +0 -0
- {furu-0.0.3 → furu-0.0.4}/src/furu/storage/migration.py +0 -0
{furu-0.0.3 → furu-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: furu
-Version: 0.0.3
+Version: 0.0.4
 Summary: Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs.
 Author: Herman Brunborg
 Author-email: Herman Brunborg <herman@brunborg.com>
@@ -44,7 +44,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
 1. Subclass `furu.Furu[T]`
 2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
 3. Implement `_load(self) -> T` (load from `self.furu_dir`)
-4. Call `load_or_create()`
+4. Call `get()`

 ```python
 # my_project/pipelines.py
@@ -75,10 +75,10 @@ class TrainModel(furu.Furu[Path]):
 from my_project.pipelines import TrainModel

 # First call: runs _create(), caches result
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()

 # Second call with same config: loads from cache via _load()
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 ```

 > **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
@@ -96,7 +96,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
 - **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
 - **hash**: Computed from the object's config values using Blake2s

-When you call `load_or_create()`:
+When you call `get()`:
 1. If no cached result exists → run `_create()`, save state as "success"
 2. If cached result exists → run `_load()` to retrieve it
 3. If another process is running → wait for it to finish, then load
@@ -123,7 +123,7 @@ class TrainTextModel(furu.Furu[str]):
     dataset: Dataset = furu.chz.field(default_factory=Dataset)

     def _create(self) -> str:
-        data = self.dataset.load_or_create()  # Triggers Dataset cache
+        data = self.dataset.get()  # Triggers Dataset cache
         (self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
         return "trained"

@@ -131,6 +131,58 @@ class TrainTextModel(furu.Furu[str]):
         return (self.furu_dir / "model.txt").read_text()
 ```

+### Executors (Local + Slurm)
+
+Use the execution helpers for batch runs and cluster scheduling:
+
+```python
+from furu.execution import run_local
+
+run_local(
+    [TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
+    max_workers=8,
+    window_size="bfs",
+)
+```
+
+```python
+from furu.execution import SlurmSpec, submit_slurm_dag
+
+specs = {
+    "default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
+    "gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
+}
+
+submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
+```
+
+```python
+from furu.execution import run_slurm_pool
+
+run_slurm_pool(
+    [TrainModel(lr=3e-4, steps=5000)],
+    specs=specs,
+    max_workers_total=50,
+    window_size="bfs",
+)
+```
+
+Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
+`FURU_SUBMITIT_PATH` when you want a different logs root.
+
+### Breaking Changes and Executor Semantics
+
+- `load_or_create()` is removed; use `get()` exclusively.
+- `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
+  `FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
+- Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
+  dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
+  compute nodes are retried (bounded by `FURU_MAX_COMPUTE_RETRIES` retries).
+- Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
+  fatal even when `retry_failed` is enabled; only compute failures are retried.
+- `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
+  repeated references in the same run reuse that result.
+
 ### Storage Structure

 Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
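To make the executor retry semantics above concrete, here is a minimal sketch of the gating those bullets describe. `Node`, `may_run`, and the status strings are hypothetical illustrations, not furu's internals:

```python
from dataclasses import dataclass


@dataclass
class Node:
    name: str
    status: str  # e.g. "missing", "success", "failed"
    retries_used: int = 0


def may_run(node: Node, *, retry_failed: bool, max_compute_retries: int) -> bool:
    """Decide whether an executor should (re)compute this node."""
    if node.status != "failed":
        return True
    if not retry_failed:
        # Mirrors the fail-fast rule: a FAILED node aborts the run when
        # retries are disabled.
        raise RuntimeError(f"{node.name} is FAILED and retry_failed is off")
    # With retries enabled, compute failures are retried up to a bound,
    # as with FURU_MAX_COMPUTE_RETRIES.
    return node.retries_used < max_compute_retries


print(may_run(Node("train", "failed", retries_used=1), retry_failed=True, max_compute_retries=3))
```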
@@ -176,7 +228,7 @@ class MyExperiments(furu.FuruList[TrainModel]):

 # Iterate over all experiments
 for exp in MyExperiments:
-    exp.load_or_create()
+    exp.get()

 # Access by name
 exp = MyExperiments.by_name("baseline")
@@ -191,14 +243,17 @@ for name, exp in MyExperiments.items():

 ### Custom Validation

-Override `_validate()` to add custom cache invalidation logic
+Override `_validate()` to add custom cache invalidation logic. Return False or
+raise `furu.FuruValidationError` to force re-computation. In executor planning,
+any other exception is logged and treated as invalid (no crash); in interactive
+`exists()` calls, exceptions still surface:

 ```python
 class ModelWithValidation(furu.Furu[Path]):
     checkpoint_name: str = "model.pt"

     def _validate(self) -> bool:
-        # Return False to force re-computation
+        # Return False (or raise FuruValidationError) to force re-computation
         ckpt = self.furu_dir / self.checkpoint_name
         return ckpt.exists() and ckpt.stat().st_size > 0

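The validation note above distinguishes two paths. A minimal sketch, assuming a hypothetical object with a `_validate()` method (furu's planner may differ in detail): planning logs and downgrades unexpected errors, while the interactive path lets them propagate.

```python
import logging

logger = logging.getLogger("furu-sketch")


def is_valid_for_planning(obj) -> bool:
    # Executor planning: an unexpected exception is logged and the cached
    # result treated as invalid, so the run plans a recompute instead of crashing.
    try:
        return bool(obj._validate())
    except Exception:
        logger.exception("validation raised; treating %r as invalid", obj)
        return False


def is_valid_interactive(obj) -> bool:
    # Interactive exists(): exceptions surface to the caller unchanged.
    return bool(obj._validate())
```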
@@ -220,7 +275,7 @@ if obj.exists():

 # Get metadata without triggering computation
 metadata = obj.get_metadata()
-print(f"Hash: {obj.
+print(f"Hash: {obj.furu_hash}")
 print(f"Dir: {obj.furu_dir}")
 ```

@@ -251,7 +306,7 @@ class LargeDataProcessor(furu.Furu[Path]):
     def _create(self) -> Path:
         # self.raw_dir is shared across all configs
         # Create a subfolder for isolation if needed
-        my_raw = self.raw_dir / self.
+        my_raw = self.raw_dir / self.furu_hash
         my_raw.mkdir(exist_ok=True)

         large_file = my_raw / "huge_dataset.bin"
@@ -303,8 +358,8 @@ HHMMSS file.py:line message

 Furu emits status messages like:
 ```
-
-
+get TrainModel abc123def (missing->create)
+get TrainModel abc123def (success->load)
 ```

 ### Explicit Setup
@@ -325,7 +380,7 @@ logger = furu.get_logger()
 from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired

 try:
-    result = obj.load_or_create()
+    result = obj.get()
 except FuruComputeError as e:
     print(f"Computation failed: {e}")
     print(f"State file: {e.state_path}")
@@ -336,8 +391,8 @@ except FuruLockNotAcquired:
     print("Could not acquire lock")
 ```

-By default, failed artifacts are retried on the next `
-`FURU_RETRY_FAILED=0`
+By default, failed artifacts are retried on the next `get()` call. Set
+`FURU_RETRY_FAILED=0` to keep failures sticky.

 `FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
 timeout used when waiting for compute locks before raising `FuruWaitTimeout`.
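The timeout precedence described above is environment override first, then the per-class default. A tiny sketch of that resolution, with an illustrative function name:

```python
import os


def resolve_max_wait(cls_max_wait_time_sec: float = 600.0) -> float:
    # FURU_MAX_WAIT_SECS, when set, overrides the per-class _max_wait_time_sec.
    env = os.getenv("FURU_MAX_WAIT_SECS")
    return float(env) if env is not None else cls_max_wait_time_sec
```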
@@ -349,27 +404,8 @@ and `furu.log`.

 ## Submitit Integration

-
-
-```python
-import submitit
-import furu
-
-executor = submitit.AutoExecutor(folder="submitit_logs")
-executor.update_parameters(
-    timeout_min=60,
-    slurm_partition="gpu",
-    gpus_per_node=1,
-)
-
-# Submit job and return immediately
-job = my_furu_obj.load_or_create(executor=executor)
-
-# Job ID is tracked in .furu/state.json
-print(job.job_id)
-```
-
-Furu handles preemption, requeuing, and state tracking automatically.
+Furu includes a `SubmititAdapter` for integrating submitit executors with the
+state system. Executor helpers in `furu.execution` handle submission workflows.

 ## Dashboard

@@ -427,6 +463,7 @@ The `/api/experiments` endpoint supports:
 | `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
 | `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
 | `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
+| `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
 | `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
 | `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
 | `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
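The `config.py` hunks later in this diff show these variables being read when `FuruConfig` is constructed, so setting them before furu is imported is the safe order. A usage sketch:

```python
import os

os.environ["FURU_RETRY_FAILED"] = "0"          # keep failures sticky
os.environ["FURU_MAX_COMPUTE_RETRIES"] = "5"   # allow more executor retries
os.environ["FURU_POLL_INTERVAL_SECS"] = "30"   # poll queued jobs less often

import furu  # noqa: E402  (imported after configuring the environment)
```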
{furu-0.0.3 → furu-0.0.4}/README.md

@@ -25,7 +25,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
 1. Subclass `furu.Furu[T]`
 2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
 3. Implement `_load(self) -> T` (load from `self.furu_dir`)
-4. Call `load_or_create()`
+4. Call `get()`

 ```python
 # my_project/pipelines.py
@@ -56,10 +56,10 @@ class TrainModel(furu.Furu[Path]):
 from my_project.pipelines import TrainModel

 # First call: runs _create(), caches result
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()

 # Second call with same config: loads from cache via _load()
-artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+artifact = TrainModel(lr=3e-4, steps=5000).get()
 ```

 > **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
@@ -77,7 +77,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
 - **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
 - **hash**: Computed from the object's config values using Blake2s

-When you call `load_or_create()`:
+When you call `get()`:
 1. If no cached result exists → run `_create()`, save state as "success"
 2. If cached result exists → run `_load()` to retrieve it
 3. If another process is running → wait for it to finish, then load
@@ -104,7 +104,7 @@ class TrainTextModel(furu.Furu[str]):
     dataset: Dataset = furu.chz.field(default_factory=Dataset)

     def _create(self) -> str:
-        data = self.dataset.load_or_create()  # Triggers Dataset cache
+        data = self.dataset.get()  # Triggers Dataset cache
         (self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
         return "trained"

@@ -112,6 +112,58 @@ class TrainTextModel(furu.Furu[str]):
         return (self.furu_dir / "model.txt").read_text()
 ```

+### Executors (Local + Slurm)
+
+Use the execution helpers for batch runs and cluster scheduling:
+
+```python
+from furu.execution import run_local
+
+run_local(
+    [TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
+    max_workers=8,
+    window_size="bfs",
+)
+```
+
+```python
+from furu.execution import SlurmSpec, submit_slurm_dag
+
+specs = {
+    "default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
+    "gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
+}
+
+submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
+```
+
+```python
+from furu.execution import run_slurm_pool
+
+run_slurm_pool(
+    [TrainModel(lr=3e-4, steps=5000)],
+    specs=specs,
+    max_workers_total=50,
+    window_size="bfs",
+)
+```
+
+Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
+`FURU_SUBMITIT_PATH` when you want a different logs root.
+
+### Breaking Changes and Executor Semantics
+
+- `load_or_create()` is removed; use `get()` exclusively.
+- `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
+  `FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
+- Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
+  dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
+  compute nodes are retried (bounded by `FURU_MAX_COMPUTE_RETRIES` retries).
+- Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
+  fatal even when `retry_failed` is enabled; only compute failures are retried.
+- `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
+  repeated references in the same run reuse that result.
+
 ### Storage Structure

 Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
@@ -157,7 +209,7 @@ class MyExperiments(furu.FuruList[TrainModel]):

 # Iterate over all experiments
 for exp in MyExperiments:
-    exp.load_or_create()
+    exp.get()

 # Access by name
 exp = MyExperiments.by_name("baseline")
@@ -172,14 +224,17 @@ for name, exp in MyExperiments.items():

 ### Custom Validation

-Override `_validate()` to add custom cache invalidation logic
+Override `_validate()` to add custom cache invalidation logic. Return False or
+raise `furu.FuruValidationError` to force re-computation. In executor planning,
+any other exception is logged and treated as invalid (no crash); in interactive
+`exists()` calls, exceptions still surface:

 ```python
 class ModelWithValidation(furu.Furu[Path]):
     checkpoint_name: str = "model.pt"

     def _validate(self) -> bool:
-        # Return False to force re-computation
+        # Return False (or raise FuruValidationError) to force re-computation
         ckpt = self.furu_dir / self.checkpoint_name
         return ckpt.exists() and ckpt.stat().st_size > 0

@@ -201,7 +256,7 @@ if obj.exists():

 # Get metadata without triggering computation
 metadata = obj.get_metadata()
-print(f"Hash: {obj.
+print(f"Hash: {obj.furu_hash}")
 print(f"Dir: {obj.furu_dir}")
 ```

@@ -232,7 +287,7 @@ class LargeDataProcessor(furu.Furu[Path]):
     def _create(self) -> Path:
         # self.raw_dir is shared across all configs
         # Create a subfolder for isolation if needed
-        my_raw = self.raw_dir / self.
+        my_raw = self.raw_dir / self.furu_hash
         my_raw.mkdir(exist_ok=True)

         large_file = my_raw / "huge_dataset.bin"
@@ -284,8 +339,8 @@ HHMMSS file.py:line message

 Furu emits status messages like:
 ```
-
-
+get TrainModel abc123def (missing->create)
+get TrainModel abc123def (success->load)
 ```

 ### Explicit Setup
@@ -306,7 +361,7 @@ logger = furu.get_logger()
 from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired

 try:
-    result = obj.load_or_create()
+    result = obj.get()
 except FuruComputeError as e:
     print(f"Computation failed: {e}")
     print(f"State file: {e.state_path}")
@@ -317,8 +372,8 @@ except FuruLockNotAcquired:
     print("Could not acquire lock")
 ```

-By default, failed artifacts are retried on the next `
-`FURU_RETRY_FAILED=0`
+By default, failed artifacts are retried on the next `get()` call. Set
+`FURU_RETRY_FAILED=0` to keep failures sticky.

 `FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
 timeout used when waiting for compute locks before raising `FuruWaitTimeout`.
@@ -330,27 +385,8 @@ and `furu.log`.

 ## Submitit Integration

-
-
-```python
-import submitit
-import furu
-
-executor = submitit.AutoExecutor(folder="submitit_logs")
-executor.update_parameters(
-    timeout_min=60,
-    slurm_partition="gpu",
-    gpus_per_node=1,
-)
-
-# Submit job and return immediately
-job = my_furu_obj.load_or_create(executor=executor)
-
-# Job ID is tracked in .furu/state.json
-print(job.job_id)
-```
-
-Furu handles preemption, requeuing, and state tracking automatically.
+Furu includes a `SubmititAdapter` for integrating submitit executors with the
+state system. Executor helpers in `furu.execution` handle submission workflows.

 ## Dashboard

@@ -408,6 +444,7 @@ The `/api/experiments` endpoint supports:
 | `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
 | `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
 | `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
+| `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
 | `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
 | `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
 | `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
{furu-0.0.3 → furu-0.0.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "furu"
-version = "0.0.3"
+version = "0.0.4"
 description = "Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs."
 readme = "README.md"
 authors = [
@@ -41,6 +41,9 @@ test = [
     "httpx>=0.27.0",
 ]

+[tool.uv.build-backend]
+source-include = ["src/furu/dashboard/frontend/dist/**"]
+
 [build-system]
 requires = ["uv_build>=0.9.26,<0.10.0"]
 build-backend = "uv_build"
{furu-0.0.3 → furu-0.0.4}/src/furu/__init__.py

@@ -17,8 +17,12 @@ from .core import DependencyChzSpec, DependencySpec, Furu, FuruList
 from .errors import (
     FuruComputeError,
     FuruError,
+    FuruExecutionError,
     FuruLockNotAcquired,
+    FuruMissingArtifact,
     FuruMigrationRequired,
+    FuruSpecMismatch,
+    FuruValidationError,
     FuruWaitTimeout,
     MISSING,
 )
@@ -51,9 +55,13 @@ __all__ = [
     "FuruComputeError",
     "FuruConfig",
     "FuruError",
+    "FuruExecutionError",
     "FuruList",
     "FuruLockNotAcquired",
+    "FuruMissingArtifact",
     "FuruMigrationRequired",
+    "FuruSpecMismatch",
+    "FuruValidationError",
     "FuruSerializer",
     "FuruWaitTimeout",
     "DependencyChzSpec",
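The newly exported exceptions can be caught alongside the existing ones. A sketch; exactly which operations raise `FuruSpecMismatch` or `FuruMissingArtifact` follows the pool-failure semantics described in the README portion of this diff:

```python
from furu import FuruExecutionError, FuruMissingArtifact, FuruSpecMismatch


def run_with_reporting(run) -> None:
    # `run` is any executor invocation, e.g. lambda: run_slurm_pool(...).
    try:
        run()
    except (FuruSpecMismatch, FuruMissingArtifact) as exc:
        # Pool protocol/queue failures are fatal even with retries enabled.
        print(f"fatal pool failure: {exc}")
        raise
    except FuruExecutionError as exc:
        print(f"execution failed: {exc}")
        raise
```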
{furu-0.0.3 → furu-0.0.4}/src/furu/adapters/submitit.py

@@ -6,6 +6,7 @@ from typing import Any, Callable, Protocol

 from ..config import FURU_CONFIG
 from ..storage import StateManager
+from ..runtime.logging import get_logger
 from ..storage.state import _FuruState, ProbeResult


@@ -102,19 +103,39 @@ class SubmititAdapter:
         """Watch for job ID in background thread and update state."""

         def watcher():
+            _ = attempt_id  # intentionally unused; queued->running attempt swap is expected
             while True:
                 job_id = self.get_job_id(job)
                 if job_id:

                     def mutate(state: _FuruState) -> None:
                         attempt = state.attempt
-                        if attempt is None
+                        if attempt is None:
+                            return
+                        if attempt.backend != "submitit":
+                            return
+                        if (
+                            attempt.status not in {"queued", "running"}
+                            and attempt.status not in StateManager.TERMINAL_STATUSES
+                        ):
+                            return
+                        existing = attempt.scheduler.get("job_id")
+                        if existing == job_id:
                             return
                         attempt.scheduler["job_id"] = job_id

                     StateManager.update_state(directory, mutate)
                     if callback:
-                        callback(job_id)
+                        try:
+                            callback(job_id)
+                        except Exception:
+                            # Avoid killing the watcher thread; state update already happened.
+                            logger = get_logger()
+                            logger.exception(
+                                "submitit watcher: job_id callback failed for %s: %s",
+                                directory,
+                                job_id,
+                            )
                     break

             if self.is_done(job):
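The reworked watcher shows a general pattern: persist the job id first, then invoke the callback, and never let a callback exception kill the polling thread. A self-contained sketch of that pattern (generic code, not furu's API):

```python
import logging
import threading
import time

logger = logging.getLogger("watcher-sketch")


def watch(poll, callback, interval_sec: float = 1.0) -> None:
    """Poll until a value appears, hand it to the callback, swallow callback errors."""

    def run() -> None:
        while True:
            value = poll()
            if value is not None:
                try:
                    callback(value)
                except Exception:
                    # Same rationale as the diff above: the thread must survive
                    # a failing callback.
                    logger.exception("callback failed for %r", value)
                break
            time.sleep(interval_sec)

    threading.Thread(target=run, daemon=True).start()
```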
{furu-0.0.3 → furu-0.0.4}/src/furu/config.py

@@ -18,6 +18,11 @@ class FuruConfig:
             return (project_root / self.DEFAULT_ROOT_DIR).resolve()

         self.base_root = _get_base_root()
+        self.submitit_root = (
+            Path(os.getenv("FURU_SUBMITIT_PATH", str(self.base_root / "submitit")))
+            .expanduser()
+            .resolve()
+        )
         self.version_controlled_root_override = self._get_version_controlled_override()
         self.poll_interval = float(os.getenv("FURU_POLL_INTERVAL_SECS", "10"))
         self.wait_log_every_sec = float(os.getenv("FURU_WAIT_LOG_EVERY_SECS", "10"))
@@ -30,6 +35,7 @@ class FuruConfig:
             float(hb) if hb is not None else max(1.0, self.lease_duration_sec / 3.0)
         )
         self.max_requeues = int(os.getenv("FURU_PREEMPT_MAX", "5"))
+        self.max_compute_retries = int(os.getenv("FURU_MAX_COMPUTE_RETRIES", "3"))
         self.retry_failed = os.getenv("FURU_RETRY_FAILED", "1").lower() in {
             "1",
             "true",
@@ -109,6 +115,9 @@ class FuruConfig:
             return self._resolve_version_controlled_root()
         return self.base_root / "data"

+    def get_submitit_root(self) -> Path:
+        return self.submitit_root
+
     @classmethod
     def _get_version_controlled_override(cls) -> Path | None:
         env = os.getenv("FURU_VERSION_CONTROLLED_PATH")
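Restated as a standalone sketch, the resolution order in the first hunk is: an explicit `FURU_SUBMITIT_PATH` wins, otherwise the logs root defaults to `<base_root>/submitit`. The helper name here is illustrative:

```python
import os
from pathlib import Path


def submitit_root(base_root: Path) -> Path:
    # Same expression as FuruConfig.__init__ in the hunk above.
    return (
        Path(os.getenv("FURU_SUBMITIT_PATH", str(base_root / "submitit")))
        .expanduser()
        .resolve()
    )


print(submitit_root(Path("~/.furu")))
```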
@@ -175,4 +184,7 @@ def get_furu_root(*, version_controlled: bool = False) -> Path:


 def set_furu_root(path: Path) -> None:
-
+    root = path.resolve()
+    FURU_CONFIG.base_root = root
+    if os.getenv("FURU_SUBMITIT_PATH") is None:
+        FURU_CONFIG.submitit_root = (root / "submitit").resolve()
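A usage sketch for the new `set_furu_root` body: moving the base root also moves the derived submitit root, unless `FURU_SUBMITIT_PATH` pins it (this assumes the variable is unset in the current process):

```python
from pathlib import Path

from furu.config import FURU_CONFIG, set_furu_root

set_furu_root(Path("/tmp/furu-demo"))
assert FURU_CONFIG.base_root == Path("/tmp/furu-demo").resolve()
# Derived from the new base root because FURU_SUBMITIT_PATH is not set:
assert FURU_CONFIG.submitit_root == Path("/tmp/furu-demo/submitit").resolve()
```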