furu 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {furu-0.0.2 → furu-0.0.4}/PKG-INFO +83 -33
  2. {furu-0.0.2 → furu-0.0.4}/README.md +82 -32
  3. {furu-0.0.2 → furu-0.0.4}/pyproject.toml +4 -1
  4. {furu-0.0.2 → furu-0.0.4}/src/furu/__init__.py +11 -1
  5. {furu-0.0.2 → furu-0.0.4}/src/furu/adapters/submitit.py +23 -2
  6. {furu-0.0.2 → furu-0.0.4}/src/furu/config.py +21 -3
  7. furu-0.0.4/src/furu/core/__init__.py +4 -0
  8. {furu-0.0.2 → furu-0.0.4}/src/furu/core/furu.py +708 -188
  9. {furu-0.0.2 → furu-0.0.4}/src/furu/core/list.py +1 -1
  10. furu-0.0.4/src/furu/dashboard/__init__.py +18 -0
  11. furu-0.0.2/src/furu/dashboard/frontend/dist/assets/index-CbdDfSOZ.css → furu-0.0.4/src/furu/dashboard/frontend/dist/assets/index-BXAIKNNr.css +1 -1
  12. furu-0.0.2/src/furu/dashboard/frontend/dist/assets/index-DDv_TYB_.js → furu-0.0.4/src/furu/dashboard/frontend/dist/assets/index-DS3FsqcY.js +3 -3
  13. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/frontend/dist/index.html +2 -2
  14. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/main.py +10 -3
  15. furu-0.0.4/src/furu/errors.py +131 -0
  16. furu-0.0.4/src/furu/execution/__init__.py +22 -0
  17. furu-0.0.4/src/furu/execution/context.py +30 -0
  18. furu-0.0.4/src/furu/execution/local.py +184 -0
  19. furu-0.0.4/src/furu/execution/paths.py +20 -0
  20. furu-0.0.4/src/furu/execution/plan.py +238 -0
  21. furu-0.0.4/src/furu/execution/plan_utils.py +13 -0
  22. furu-0.0.4/src/furu/execution/slurm_dag.py +271 -0
  23. furu-0.0.4/src/furu/execution/slurm_pool.py +878 -0
  24. furu-0.0.4/src/furu/execution/slurm_spec.py +38 -0
  25. furu-0.0.4/src/furu/execution/submitit_factory.py +47 -0
  26. {furu-0.0.2 → furu-0.0.4}/src/furu/migration.py +8 -4
  27. {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/logging.py +10 -10
  28. {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/serializer.py +40 -2
  29. {furu-0.0.2 → furu-0.0.4}/src/furu/storage/metadata.py +17 -5
  30. {furu-0.0.2 → furu-0.0.4}/src/furu/storage/state.py +78 -12
  31. furu-0.0.2/src/furu/core/__init__.py +0 -4
  32. furu-0.0.2/src/furu/dashboard/__init__.py +0 -9
  33. furu-0.0.2/src/furu/errors.py +0 -76
  34. {furu-0.0.2 → furu-0.0.4}/src/furu/adapters/__init__.py +0 -0
  35. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/__main__.py +0 -0
  36. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/__init__.py +0 -0
  37. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/models.py +0 -0
  38. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/api/routes.py +0 -0
  39. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/frontend/dist/favicon.svg +0 -0
  40. {furu-0.0.2 → furu-0.0.4}/src/furu/dashboard/scanner.py +0 -0
  41. {furu-0.0.2 → furu-0.0.4}/src/furu/migrate.py +0 -0
  42. {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/__init__.py +0 -0
  43. {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/env.py +0 -0
  44. {furu-0.0.2 → furu-0.0.4}/src/furu/runtime/tracebacks.py +0 -0
  45. {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/__init__.py +0 -0
  46. {furu-0.0.2 → furu-0.0.4}/src/furu/serialization/migrations.py +0 -0
  47. {furu-0.0.2 → furu-0.0.4}/src/furu/storage/__init__.py +0 -0
  48. {furu-0.0.2 → furu-0.0.4}/src/furu/storage/migration.py +0 -0
{furu-0.0.2 → furu-0.0.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: furu
- Version: 0.0.2
+ Version: 0.0.4
  Summary: Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs.
  Author: Herman Brunborg
  Author-email: Herman Brunborg <herman@brunborg.com>
@@ -44,7 +44,7 @@ The `[dashboard]` extra includes the web dashboard. Omit it for the core library
  1. Subclass `furu.Furu[T]`
  2. Implement `_create(self) -> T` (compute and write to `self.furu_dir`)
  3. Implement `_load(self) -> T` (load from `self.furu_dir`)
- 4. Call `load_or_create()`
+ 4. Call `get()`

  ```python
  # my_project/pipelines.py
@@ -75,10 +75,10 @@ class TrainModel(furu.Furu[Path]):
  from my_project.pipelines import TrainModel

  # First call: runs _create(), caches result
- artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+ artifact = TrainModel(lr=3e-4, steps=5000).get()

  # Second call with same config: loads from cache via _load()
- artifact = TrainModel(lr=3e-4, steps=5000).load_or_create()
+ artifact = TrainModel(lr=3e-4, steps=5000).get()
  ```

  > **Tip:** Define Furu classes in importable modules (not `__main__`); the artifact namespace is derived from the class's module + qualified name.
@@ -96,7 +96,7 @@ Each `Furu` instance maps deterministically to a directory based on its config:
  - **namespace**: Derived from the class's module + qualified name (e.g., `my_project.pipelines/TrainModel`)
  - **hash**: Computed from the object's config values using Blake2s

- When you call `load_or_create()`:
+ When you call `get()`:
  1. If no cached result exists → run `_create()`, save state as "success"
  2. If cached result exists → run `_load()` to retrieve it
  3. If another process is running → wait for it to finish, then load
@@ -123,7 +123,7 @@ class TrainTextModel(furu.Furu[str]):
      dataset: Dataset = furu.chz.field(default_factory=Dataset)

      def _create(self) -> str:
-         data = self.dataset.load_or_create()  # Triggers Dataset cache
+         data = self.dataset.get()  # Triggers Dataset cache
          (self.furu_dir / "model.txt").write_text(f"trained on:\n{data}")
          return "trained"

@@ -131,6 +131,58 @@ class TrainTextModel(furu.Furu[str]):
          return (self.furu_dir / "model.txt").read_text()
  ```

+ ### Executors (Local + Slurm)
+
+ Use the execution helpers for batch runs and cluster scheduling:
+
+ ```python
+ from furu.execution import run_local
+
+ run_local(
+     [TrainModel(lr=3e-4, steps=5000), TrainModel(lr=1e-3, steps=2000)],
+     max_workers=8,
+     window_size="bfs",
+ )
+ ```
+
+ ```python
+ from furu.execution import SlurmSpec, submit_slurm_dag
+
+ specs = {
+     "default": SlurmSpec(partition="cpu", cpus=8, mem_gb=32, time_min=120),
+     "gpu": SlurmSpec(partition="gpu", gpus=1, cpus=8, mem_gb=64, time_min=720),
+ }
+
+ submit_slurm_dag([TrainModel(lr=3e-4, steps=5000)], specs=specs)
+ ```
+
+ ```python
+ from furu.execution import run_slurm_pool
+
+ run_slurm_pool(
+     [TrainModel(lr=3e-4, steps=5000)],
+     specs=specs,
+     max_workers_total=50,
+     window_size="bfs",
+ )
+ ```
+
+ Submitit logs are stored under `<FURU_PATH>/submitit` by default. Override with
+ `FURU_SUBMITIT_PATH` when you want a different logs root.
+
+ ### Breaking Changes and Executor Semantics
+
+ - `load_or_create()` is removed; use `get()` exclusively.
+ - `get()` no longer accepts per-call `retry_failed` overrides. Configure retries via
+   `FURU_RETRY_FAILED` or `FURU_CONFIG.retry_failed`.
+ - Executor runs (`run_local`, `run_slurm_pool`, `submit_slurm_dag`) fail fast if a
+   dependency is FAILED while `retry_failed` is disabled; with retries enabled, failed
+   compute nodes are retried (bounded by `FURU_MAX_COMPUTE_RETRIES` retries).
+ - Pool protocol/queue failures (invalid payloads, spec mismatch, missing artifacts) are
+   fatal even when `retry_failed` is enabled; only compute failures are retried.
+ - `FURU_ALWAYS_RERUN` causes matching nodes to recompute once per executor run, but
+   repeated references in the same run reuse that result.
+
  ### Storage Structure

  Furu uses two roots: `FURU_PATH` for `data/` + `raw/`, and
@@ -176,7 +228,7 @@ class MyExperiments(furu.FuruList[TrainModel]):

  # Iterate over all experiments
  for exp in MyExperiments:
-     exp.load_or_create()
+     exp.get()

  # Access by name
  exp = MyExperiments.by_name("baseline")
@@ -191,14 +243,17 @@ for name, exp in MyExperiments.items():

  ### Custom Validation

- Override `_validate()` to add custom cache invalidation logic:
+ Override `_validate()` to add custom cache invalidation logic. Return False or
+ raise `furu.FuruValidationError` to force re-computation. In executor planning,
+ any other exception is logged and treated as invalid (no crash); in interactive
+ `exists()` calls, exceptions still surface:

  ```python
  class ModelWithValidation(furu.Furu[Path]):
      checkpoint_name: str = "model.pt"

      def _validate(self) -> bool:
-         # Return False to force re-computation
+         # Return False (or raise FuruValidationError) to force re-computation
          ckpt = self.furu_dir / self.checkpoint_name
          return ckpt.exists() and ckpt.stat().st_size > 0

@@ -220,7 +275,7 @@ if obj.exists():

  # Get metadata without triggering computation
  metadata = obj.get_metadata()
- print(f"Hash: {obj._furu_hash}")
+ print(f"Hash: {obj.furu_hash}")
  print(f"Dir: {obj.furu_dir}")
  ```

@@ -251,7 +306,7 @@ class LargeDataProcessor(furu.Furu[Path]):
      def _create(self) -> Path:
          # self.raw_dir is shared across all configs
          # Create a subfolder for isolation if needed
-         my_raw = self.raw_dir / self._furu_hash
+         my_raw = self.raw_dir / self.furu_hash
          my_raw.mkdir(exist_ok=True)

          large_file = my_raw / "huge_dataset.bin"
@@ -303,8 +358,8 @@ HHMMSS file.py:line message

  Furu emits status messages like:
  ```
- load_or_create TrainModel abc123def (missing->create)
- load_or_create TrainModel abc123def (success->load)
+ get TrainModel abc123def (missing->create)
+ get TrainModel abc123def (success->load)
  ```

  ### Explicit Setup
@@ -325,7 +380,7 @@ logger = furu.get_logger()
  from furu import FuruComputeError, FuruWaitTimeout, FuruLockNotAcquired

  try:
-     result = obj.load_or_create()
+     result = obj.get()
  except FuruComputeError as e:
      print(f"Computation failed: {e}")
      print(f"State file: {e.state_path}")
@@ -336,29 +391,21 @@ except FuruLockNotAcquired:
      print("Could not acquire lock")
  ```

- ## Submitit Integration
-
- Run computations on SLURM clusters via [submitit](https://github.com/facebookincubator/submitit):
+ By default, failed artifacts are retried on the next `get()` call. Set
+ `FURU_RETRY_FAILED=0` to keep failures sticky.

- ```python
- import submitit
- import furu
+ `FURU_MAX_WAIT_SECS` overrides the per-class `_max_wait_time_sec` (default 600s)
+ timeout used when waiting for compute locks before raising `FuruWaitTimeout`.

- executor = submitit.AutoExecutor(folder="submitit_logs")
- executor.update_parameters(
-     timeout_min=60,
-     slurm_partition="gpu",
-     gpus_per_node=1,
- )
+ Failures during metadata collection or signal handler setup (before `_create()`
+ runs) raise `FuruComputeError` with the original exception attached. These
+ failures still mark the attempt as failed and record details in `state.json`
+ and `furu.log`.

- # Submit job and return immediately
- job = my_furu_obj.load_or_create(executor=executor)
-
- # Job ID is tracked in .furu/state.json
- print(job.job_id)
- ```
+ ## Submitit Integration

- Furu handles preemption, requeuing, and state tracking automatically.
+ Furu includes a `SubmititAdapter` for integrating submitit executors with the
+ state system. Executor helpers in `furu.execution` handle submission workflows.

  ## Dashboard

@@ -415,7 +462,10 @@ The `/api/experiments` endpoint supports:
  | `FURU_LOG_LEVEL` | `INFO` | Console verbosity (`DEBUG`, `INFO`, `WARNING`, `ERROR`) |
  | `FURU_IGNORE_DIFF` | `false` | Skip embedding git diff in metadata |
  | `FURU_ALWAYS_RERUN` | `""` | Comma-separated class qualnames to always rerun (use `ALL` to bypass cache globally; cannot combine with other entries; entries must be importable) |
+ | `FURU_RETRY_FAILED` | `true` | Retry failed artifacts by default (set to `0` to keep failures sticky) |
+ | `FURU_MAX_COMPUTE_RETRIES` | `3` | Maximum compute retries per node after the first failure |
  | `FURU_POLL_INTERVAL_SECS` | `10` | Polling interval for queued/running jobs |
+ | `FURU_MAX_WAIT_SECS` | unset | Override wait timeout (falls back to `_max_wait_time_sec`, default 600s) |
  | `FURU_WAIT_LOG_EVERY_SECS` | `10` | Interval between "waiting" log messages |
  | `FURU_STALE_AFTER_SECS` | `1800` | Consider running jobs stale after this duration |
  | `FURU_LEASE_SECS` | `120` | Compute lock lease duration |
{furu-0.0.2 → furu-0.0.4}/README.md
  (Identical to the PKG-INFO changes above: PKG-INFO embeds the README, so the same
  thirteen content hunks apply, offset by the 19 metadata header lines. PKG-INFO's
  first hunk touches only the metadata `Version:` field.)
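
The retry and wait-timeout knobs documented above are read from the environment when `furu` is imported (see the `FuruConfig.__init__` changes in the `config.py` diff below). A minimal sketch of pinning them before import; `TrainModel` is the README's example class and the values are illustrative:

```python
import os

# Read once at import time by FuruConfig (see the config.py diff below).
os.environ["FURU_RETRY_FAILED"] = "0"         # keep failures sticky
os.environ["FURU_MAX_COMPUTE_RETRIES"] = "1"  # bound executor compute retries
os.environ["FURU_MAX_WAIT_SECS"] = "120"      # override _max_wait_time_sec (default 600s)

import furu
from my_project.pipelines import TrainModel  # the README's example class

try:
    artifact = TrainModel(lr=3e-4, steps=5000).get()
except furu.FuruComputeError as exc:
    # With FURU_RETRY_FAILED=0, a previously failed artifact stays failed
    # instead of being recomputed on the next get().
    print(f"compute failed: {exc}")
```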
{furu-0.0.2 → furu-0.0.4}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "furu"
- version = "0.0.2"
+ version = "0.0.4"
  description = "Cacheable, nested pipelines for Python. Define computations as configs; furu handles caching, state tracking, and result reuse across runs."
  readme = "README.md"
  authors = [
@@ -41,6 +41,9 @@ test = [
      "httpx>=0.27.0",
  ]

+ [tool.uv.build-backend]
+ source-include = ["src/furu/dashboard/frontend/dist/**"]
+
  [build-system]
  requires = ["uv_build>=0.9.26,<0.10.0"]
  build-backend = "uv_build"
{furu-0.0.2 → furu-0.0.4}/src/furu/__init__.py
@@ -13,12 +13,16 @@ __version__ = version("furu")

  from .config import FURU_CONFIG, FuruConfig, get_furu_root, set_furu_root
  from .adapters import SubmititAdapter
- from .core import Furu, FuruList
+ from .core import DependencyChzSpec, DependencySpec, Furu, FuruList
  from .errors import (
      FuruComputeError,
      FuruError,
+     FuruExecutionError,
      FuruLockNotAcquired,
+     FuruMissingArtifact,
      FuruMigrationRequired,
+     FuruSpecMismatch,
+     FuruValidationError,
      FuruWaitTimeout,
      MISSING,
  )
@@ -51,11 +55,17 @@ __all__ = [
      "FuruComputeError",
      "FuruConfig",
      "FuruError",
+     "FuruExecutionError",
      "FuruList",
      "FuruLockNotAcquired",
+     "FuruMissingArtifact",
      "FuruMigrationRequired",
+     "FuruSpecMismatch",
+     "FuruValidationError",
      "FuruSerializer",
      "FuruWaitTimeout",
+     "DependencyChzSpec",
+     "DependencySpec",
      "MISSING",
      "migrate",
      "NamespacePair",
{furu-0.0.2 → furu-0.0.4}/src/furu/adapters/submitit.py
@@ -6,6 +6,7 @@ from typing import Any, Callable, Protocol

  from ..config import FURU_CONFIG
  from ..storage import StateManager
+ from ..runtime.logging import get_logger
  from ..storage.state import _FuruState, ProbeResult


@@ -102,19 +103,39 @@ class SubmititAdapter:
          """Watch for job ID in background thread and update state."""

          def watcher():
+             _ = attempt_id  # intentionally unused; queued->running attempt swap is expected
              while True:
                  job_id = self.get_job_id(job)
                  if job_id:

                      def mutate(state: _FuruState) -> None:
                          attempt = state.attempt
-                         if attempt is None or attempt.id != attempt_id:
+                         if attempt is None:
+                             return
+                         if attempt.backend != "submitit":
+                             return
+                         if (
+                             attempt.status not in {"queued", "running"}
+                             and attempt.status not in StateManager.TERMINAL_STATUSES
+                         ):
+                             return
+                         existing = attempt.scheduler.get("job_id")
+                         if existing == job_id:
                              return
                          attempt.scheduler["job_id"] = job_id

                      StateManager.update_state(directory, mutate)
                      if callback:
-                         callback(job_id)
+                         try:
+                             callback(job_id)
+                         except Exception:
+                             # Avoid killing the watcher thread; state update already happened.
+                             logger = get_logger()
+                             logger.exception(
+                                 "submitit watcher: job_id callback failed for %s: %s",
+                                 directory,
+                                 job_id,
+                             )
                      break

                  if self.is_done(job):
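
The watcher change above wraps the user callback so an exception cannot kill the background thread after the state update has already been persisted. A standalone sketch of that pattern in plain Python (not furu API; the one-second poll interval is illustrative — furu's own job polling is governed by `FURU_POLL_INTERVAL_SECS`):

```python
import logging
import threading
import time
from typing import Callable, Optional

def watch_for_value(
    poll: Callable[[], Optional[str]],
    on_ready: Callable[[str], None],
) -> None:
    """Poll until a value appears, invoke the callback once, never crash the thread."""

    def watcher() -> None:
        while True:
            value = poll()
            if value is not None:
                try:
                    on_ready(value)
                except Exception:
                    # Log and move on: the durable side effect observed by
                    # poll() already happened before the callback ran.
                    logging.getLogger(__name__).exception("callback failed for %r", value)
                break
            time.sleep(1.0)  # illustrative poll interval

    threading.Thread(target=watcher, daemon=True).start()
```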
{furu-0.0.2 → furu-0.0.4}/src/furu/config.py
@@ -18,16 +18,29 @@ class FuruConfig:
              return (project_root / self.DEFAULT_ROOT_DIR).resolve()

          self.base_root = _get_base_root()
+         self.submitit_root = (
+             Path(os.getenv("FURU_SUBMITIT_PATH", str(self.base_root / "submitit")))
+             .expanduser()
+             .resolve()
+         )
          self.version_controlled_root_override = self._get_version_controlled_override()
          self.poll_interval = float(os.getenv("FURU_POLL_INTERVAL_SECS", "10"))
          self.wait_log_every_sec = float(os.getenv("FURU_WAIT_LOG_EVERY_SECS", "10"))
          self.stale_timeout = float(os.getenv("FURU_STALE_AFTER_SECS", str(30 * 60)))
+         max_wait_env = os.getenv("FURU_MAX_WAIT_SECS")
+         self.max_wait_time_sec = float(max_wait_env) if max_wait_env else None
          self.lease_duration_sec = float(os.getenv("FURU_LEASE_SECS", "120"))
          hb = os.getenv("FURU_HEARTBEAT_SECS")
          self.heartbeat_interval_sec = (
              float(hb) if hb is not None else max(1.0, self.lease_duration_sec / 3.0)
          )
          self.max_requeues = int(os.getenv("FURU_PREEMPT_MAX", "5"))
+         self.max_compute_retries = int(os.getenv("FURU_MAX_COMPUTE_RETRIES", "3"))
+         self.retry_failed = os.getenv("FURU_RETRY_FAILED", "1").lower() in {
+             "1",
+             "true",
+             "yes",
+         }
          self.ignore_git_diff = os.getenv("FURU_IGNORE_DIFF", "0").lower() in {
              "1",
              "true",
@@ -102,6 +115,9 @@ class FuruConfig:
              return self._resolve_version_controlled_root()
          return self.base_root / "data"

+     def get_submitit_root(self) -> Path:
+         return self.submitit_root
+
      @classmethod
      def _get_version_controlled_override(cls) -> Path | None:
          env = os.getenv("FURU_VERSION_CONTROLLED_PATH")
@@ -151,8 +167,7 @@ class FuruConfig:
              value = getattr(target, attr, missing_sentinel)
              if value is missing_sentinel:
                  raise ValueError(
-                     "FURU_ALWAYS_RERUN entry does not exist: "
-                     f"{namespace!r}"
+                     f"FURU_ALWAYS_RERUN entry does not exist: {namespace!r}"
                  )
              target = value

@@ -169,4 +184,7 @@ def get_furu_root(*, version_controlled: bool = False) -> Path:


  def set_furu_root(path: Path) -> None:
-     FURU_CONFIG.base_root = path.resolve()
+     root = path.resolve()
+     FURU_CONFIG.base_root = root
+     if os.getenv("FURU_SUBMITIT_PATH") is None:
+         FURU_CONFIG.submitit_root = (root / "submitit").resolve()
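
A short sketch of what the `set_furu_root` change above buys: the submitit logs root now tracks the base root unless `FURU_SUBMITIT_PATH` pins it explicitly (the path below is illustrative):

```python
from pathlib import Path
import furu

# Repoint the furu root at runtime (illustrative path).
furu.set_furu_root(Path("/tmp/furu-demo"))

# With FURU_SUBMITIT_PATH unset, submitit_root follows the new base root.
print(furu.FURU_CONFIG.submitit_root)  # <new root>/submitit
```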
furu-0.0.4/src/furu/core/__init__.py (new file)
@@ -0,0 +1,4 @@
+ from .furu import DependencyChzSpec, DependencySpec, Furu
+ from .list import FuruList
+
+ __all__ = ["DependencyChzSpec", "DependencySpec", "Furu", "FuruList"]
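
Taken together, the export changes in `src/furu/__init__.py` and the new `furu.execution` package give 0.0.4 this import surface. A quick smoke check using only names that appear in the diffs above:

```python
# Error types and dependency specs re-exported at the top level
# (see the __init__.py diff).
from furu import (
    DependencyChzSpec,
    DependencySpec,
    FuruExecutionError,
    FuruMissingArtifact,
    FuruSpecMismatch,
    FuruValidationError,
)

# Executor helpers from the new furu.execution package (see the README diff).
from furu.execution import SlurmSpec, run_local, run_slurm_pool, submit_slurm_dag

print("furu 0.0.4 import surface OK")
```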