furu 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +3 -1
- furu/config.py +85 -5
- furu/core/__init__.py +2 -2
- furu/core/furu.py +438 -75
- furu/dashboard/frontend/dist/assets/{index-CbdDfSOZ.css → index-BXAIKNNr.css} +1 -1
- furu/dashboard/frontend/dist/assets/{index-DDv_TYB_.js → index-DS3FsqcY.js} +3 -3
- furu/dashboard/frontend/dist/index.html +2 -2
- furu/errors.py +47 -5
- furu/migration.py +8 -4
- furu/serialization/serializer.py +40 -2
- furu/storage/metadata.py +17 -5
- furu/storage/state.py +115 -3
- {furu-0.0.1.dist-info → furu-0.0.3.dist-info}/METADATA +48 -20
- {furu-0.0.1.dist-info → furu-0.0.3.dist-info}/RECORD +19 -19
- {furu-0.0.1.dist-info → furu-0.0.3.dist-info}/WHEEL +1 -1
- {furu-0.0.1.dist-info → furu-0.0.3.dist-info}/entry_points.txt +1 -0
furu/core/furu.py
CHANGED
|
@@ -10,12 +10,29 @@ import traceback
|
|
|
10
10
|
from abc import ABC, abstractmethod
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from types import FrameType
|
|
13
|
-
from typing import
|
|
13
|
+
from typing import (
|
|
14
|
+
AbstractSet,
|
|
15
|
+
Any,
|
|
16
|
+
Callable,
|
|
17
|
+
ClassVar,
|
|
18
|
+
Hashable,
|
|
19
|
+
Mapping,
|
|
20
|
+
Protocol,
|
|
21
|
+
Self,
|
|
22
|
+
Sequence,
|
|
23
|
+
TypedDict,
|
|
24
|
+
TypeAlias,
|
|
25
|
+
TypeVar,
|
|
26
|
+
cast,
|
|
27
|
+
overload,
|
|
28
|
+
)
|
|
14
29
|
|
|
15
30
|
import chz
|
|
16
31
|
import submitit
|
|
17
32
|
from typing_extensions import dataclass_transform
|
|
18
33
|
|
|
34
|
+
from chz.field import Field as ChzField
|
|
35
|
+
|
|
19
36
|
from ..adapters import SubmititAdapter
|
|
20
37
|
from ..adapters.submitit import SubmititJob
|
|
21
38
|
from ..config import FURU_CONFIG
|
|
@@ -41,7 +58,6 @@ from ..storage import (
|
|
|
41
58
|
from ..storage.state import (
|
|
42
59
|
_FuruState,
|
|
43
60
|
_OwnerDict,
|
|
44
|
-
_StateAttemptFailed,
|
|
45
61
|
_StateAttemptQueued,
|
|
46
62
|
_StateAttemptRunning,
|
|
47
63
|
_StateResultAbsent,
|
|
@@ -177,6 +193,29 @@ class Furu[T](ABC):
|
|
|
177
193
|
"""Validate that result is complete and correct (override if needed)."""
|
|
178
194
|
return True
|
|
179
195
|
|
|
196
|
+
def _dependencies(self: Self) -> "DependencySpec | None":
|
|
197
|
+
"""Return extra dependencies not captured by fields."""
|
|
198
|
+
return None
|
|
199
|
+
|
|
200
|
+
def _get_dependencies(self: Self, *, recursive: bool = True) -> list["Furu"]:
|
|
201
|
+
"""Collect Furu dependencies from fields and `_dependencies()`."""
|
|
202
|
+
seen = {self._furu_hash}
|
|
203
|
+
dependencies: list[Furu] = []
|
|
204
|
+
_collect_dependencies(self, dependencies, seen, recursive=recursive)
|
|
205
|
+
return dependencies
|
|
206
|
+
|
|
207
|
+
def _dependency_hashes(self: Self) -> list[str]:
|
|
208
|
+
dependencies = _direct_dependencies(self)
|
|
209
|
+
if not dependencies:
|
|
210
|
+
return []
|
|
211
|
+
|
|
212
|
+
digests: set[str] = set()
|
|
213
|
+
for dependency in dependencies:
|
|
214
|
+
if dependency is self:
|
|
215
|
+
raise ValueError("Furu dependencies cannot include self")
|
|
216
|
+
digests.add(dependency._furu_hash)
|
|
217
|
+
return sorted(digests)
|
|
218
|
+
|
|
180
219
|
def _invalidate_cached_success(self: Self, directory: Path, *, reason: str) -> None:
|
|
181
220
|
logger = get_logger()
|
|
182
221
|
logger.warning(
|
|
@@ -204,11 +243,13 @@ class Furu[T](ABC):
|
|
|
204
243
|
"""Compute hash of this object's content for storage identification."""
|
|
205
244
|
return FuruSerializer.compute_hash(self)
|
|
206
245
|
|
|
207
|
-
def
|
|
208
|
-
if
|
|
246
|
+
def _always_rerun(self: Self) -> bool:
|
|
247
|
+
if FURU_CONFIG.always_rerun_all:
|
|
248
|
+
return True
|
|
249
|
+
if not FURU_CONFIG.always_rerun:
|
|
209
250
|
return False
|
|
210
251
|
qualname = f"{self.__class__.__module__}.{self.__class__.__qualname__}"
|
|
211
|
-
return qualname in FURU_CONFIG.
|
|
252
|
+
return qualname in FURU_CONFIG.always_rerun
|
|
212
253
|
|
|
213
254
|
def _base_furu_dir(self: Self) -> Path:
|
|
214
255
|
root = FURU_CONFIG.get_root(self.version_controlled)
|
|
@@ -273,20 +314,33 @@ class Furu[T](ABC):
|
|
|
273
314
|
return MigrationManager.read_migration(self._base_furu_dir())
|
|
274
315
|
|
|
275
316
|
@overload
|
|
276
|
-
def load_or_create(
|
|
317
|
+
def load_or_create(
|
|
318
|
+
self,
|
|
319
|
+
executor: submitit.Executor,
|
|
320
|
+
*,
|
|
321
|
+
retry_failed: bool | None = None,
|
|
322
|
+
) -> T | submitit.Job[T]: ...
|
|
277
323
|
|
|
278
324
|
@overload
|
|
279
|
-
def load_or_create(
|
|
325
|
+
def load_or_create(
|
|
326
|
+
self,
|
|
327
|
+
executor: None = None,
|
|
328
|
+
*,
|
|
329
|
+
retry_failed: bool | None = None,
|
|
330
|
+
) -> T: ...
|
|
280
331
|
|
|
281
332
|
def load_or_create(
|
|
282
333
|
self: Self,
|
|
283
334
|
executor: submitit.Executor | None = None,
|
|
335
|
+
*,
|
|
336
|
+
retry_failed: bool | None = None,
|
|
284
337
|
) -> T | submitit.Job[T]:
|
|
285
338
|
"""
|
|
286
339
|
Load result if it exists, computing if necessary.
|
|
287
340
|
|
|
288
341
|
Args:
|
|
289
342
|
executor: Optional executor for batch submission (e.g., submitit.Executor)
|
|
343
|
+
retry_failed: Whether to retry failed results (default uses FURU_RETRY_FAILED)
|
|
290
344
|
|
|
291
345
|
Returns:
|
|
292
346
|
Result if wait=True, job handle if wait=False, or None if already exists
|
|
@@ -297,6 +351,9 @@ class Furu[T](ABC):
|
|
|
297
351
|
logger = get_logger()
|
|
298
352
|
parent_holder = current_holder()
|
|
299
353
|
has_parent = parent_holder is not None and parent_holder is not self
|
|
354
|
+
retry_failed_effective = (
|
|
355
|
+
retry_failed if retry_failed is not None else FURU_CONFIG.retry_failed
|
|
356
|
+
)
|
|
300
357
|
if has_parent:
|
|
301
358
|
logger.debug(
|
|
302
359
|
"dep: begin %s %s %s",
|
|
@@ -333,12 +390,12 @@ class Furu[T](ABC):
|
|
|
333
390
|
)
|
|
334
391
|
migration = MigrationManager.read_migration(base_dir)
|
|
335
392
|
|
|
336
|
-
if alias_active and self.
|
|
393
|
+
if alias_active and self._always_rerun():
|
|
337
394
|
if migration is not None:
|
|
338
395
|
self._maybe_detach_alias(
|
|
339
396
|
directory=base_dir,
|
|
340
397
|
record=migration,
|
|
341
|
-
reason="
|
|
398
|
+
reason="always_rerun",
|
|
342
399
|
)
|
|
343
400
|
migration = MigrationManager.read_migration(base_dir)
|
|
344
401
|
alias_active = False
|
|
@@ -350,9 +407,9 @@ class Furu[T](ABC):
|
|
|
350
407
|
success_marker = StateManager.get_success_marker_path(directory)
|
|
351
408
|
if success_marker.is_file():
|
|
352
409
|
# We have a success marker. Check if we can use it.
|
|
353
|
-
if self.
|
|
410
|
+
if self._always_rerun():
|
|
354
411
|
self._invalidate_cached_success(
|
|
355
|
-
directory, reason="
|
|
412
|
+
directory, reason="always_rerun enabled"
|
|
356
413
|
)
|
|
357
414
|
# Fall through to normal load
|
|
358
415
|
else:
|
|
@@ -378,12 +435,22 @@ class Furu[T](ABC):
|
|
|
378
435
|
|
|
379
436
|
state0 = StateManager.read_state(directory)
|
|
380
437
|
|
|
438
|
+
if (
|
|
439
|
+
isinstance(state0.result, _StateResultFailed)
|
|
440
|
+
and not retry_failed_effective
|
|
441
|
+
):
|
|
442
|
+
raise self._build_failed_state_error(
|
|
443
|
+
directory,
|
|
444
|
+
state0,
|
|
445
|
+
message="Computation previously failed",
|
|
446
|
+
)
|
|
447
|
+
|
|
381
448
|
needs_reconcile = True
|
|
382
449
|
if isinstance(state0.result, _StateResultSuccess):
|
|
383
450
|
# Double check logic if we fell through to here (e.g. race condition or invalidation above)
|
|
384
|
-
if self.
|
|
451
|
+
if self._always_rerun():
|
|
385
452
|
self._invalidate_cached_success(
|
|
386
|
-
directory, reason="
|
|
453
|
+
directory, reason="always_rerun enabled"
|
|
387
454
|
)
|
|
388
455
|
state0 = StateManager.read_state(directory)
|
|
389
456
|
else:
|
|
@@ -460,7 +527,8 @@ class Furu[T](ABC):
|
|
|
460
527
|
# Synchronous execution
|
|
461
528
|
if executor is None:
|
|
462
529
|
status, created_here, result = self._run_locally(
|
|
463
|
-
start_time=start_time
|
|
530
|
+
start_time=start_time,
|
|
531
|
+
allow_failed=retry_failed_effective,
|
|
464
532
|
)
|
|
465
533
|
if status == "success":
|
|
466
534
|
ok = True
|
|
@@ -476,19 +544,10 @@ class Furu[T](ABC):
|
|
|
476
544
|
)
|
|
477
545
|
return self._load()
|
|
478
546
|
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
if isinstance(attempt, _StateAttemptFailed)
|
|
484
|
-
else None
|
|
485
|
-
)
|
|
486
|
-
suffix = (
|
|
487
|
-
f": {message}" if isinstance(message, str) and message else ""
|
|
488
|
-
)
|
|
489
|
-
raise FuruComputeError(
|
|
490
|
-
f"Computation {status}{suffix}",
|
|
491
|
-
StateManager.get_state_path(directory),
|
|
547
|
+
raise self._build_failed_state_error(
|
|
548
|
+
directory,
|
|
549
|
+
None,
|
|
550
|
+
message="Computation previously failed",
|
|
492
551
|
)
|
|
493
552
|
|
|
494
553
|
# Asynchronous execution with submitit
|
|
@@ -502,7 +561,12 @@ class Furu[T](ABC):
|
|
|
502
561
|
"load_or_create: %s -> submitit submit_once()",
|
|
503
562
|
self.__class__.__name__,
|
|
504
563
|
)
|
|
505
|
-
job = self._submit_once(
|
|
564
|
+
job = self._submit_once(
|
|
565
|
+
adapter,
|
|
566
|
+
directory,
|
|
567
|
+
None,
|
|
568
|
+
allow_failed=retry_failed_effective,
|
|
569
|
+
)
|
|
506
570
|
ok = True
|
|
507
571
|
return cast(submitit.Job[T], job)
|
|
508
572
|
finally:
|
|
@@ -545,12 +609,56 @@ class Furu[T](ABC):
|
|
|
545
609
|
},
|
|
546
610
|
)
|
|
547
611
|
|
|
612
|
+
def _add_exception_breadcrumbs(self, exc: BaseException, directory: Path) -> None:
|
|
613
|
+
if not hasattr(exc, "add_note"):
|
|
614
|
+
return
|
|
615
|
+
state_path = StateManager.get_state_path(directory)
|
|
616
|
+
log_path = StateManager.get_internal_dir(directory) / "furu.log"
|
|
617
|
+
note = (
|
|
618
|
+
f"Furu directory: {directory}\n"
|
|
619
|
+
f"State file: {state_path}\n"
|
|
620
|
+
f"Log file: {log_path}"
|
|
621
|
+
)
|
|
622
|
+
exc.add_note(note)
|
|
623
|
+
|
|
624
|
+
@staticmethod
|
|
625
|
+
def _failed_state_hints() -> list[str]:
|
|
626
|
+
return [
|
|
627
|
+
"To retry this failed artifact: set FURU_RETRY_FAILED=1 or call load_or_create(retry_failed=True).",
|
|
628
|
+
"To inspect details: open the state file and furu.log shown above.",
|
|
629
|
+
]
|
|
630
|
+
|
|
631
|
+
def _build_failed_state_error(
|
|
632
|
+
self,
|
|
633
|
+
directory: Path,
|
|
634
|
+
state: _FuruState | None,
|
|
635
|
+
*,
|
|
636
|
+
message: str,
|
|
637
|
+
) -> FuruComputeError:
|
|
638
|
+
current_state = state or StateManager.read_state(directory)
|
|
639
|
+
attempt = current_state.attempt
|
|
640
|
+
error = getattr(attempt, "error", None) if attempt is not None else None
|
|
641
|
+
return FuruComputeError(
|
|
642
|
+
message,
|
|
643
|
+
StateManager.get_state_path(directory),
|
|
644
|
+
recorded_error_type=getattr(error, "type", None),
|
|
645
|
+
recorded_error_message=getattr(error, "message", None),
|
|
646
|
+
recorded_traceback=getattr(error, "traceback", None),
|
|
647
|
+
hints=self._failed_state_hints(),
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
def _effective_max_wait_time_sec(self) -> float | None:
|
|
651
|
+
if FURU_CONFIG.max_wait_time_sec is not None:
|
|
652
|
+
return FURU_CONFIG.max_wait_time_sec
|
|
653
|
+
return self._max_wait_time_sec
|
|
654
|
+
|
|
548
655
|
def _check_timeout(self, start_time: float) -> None:
|
|
549
656
|
"""Check if operation has timed out."""
|
|
550
|
-
|
|
551
|
-
|
|
657
|
+
max_wait_time = self._effective_max_wait_time_sec()
|
|
658
|
+
if max_wait_time is not None:
|
|
659
|
+
if time.time() - start_time > max_wait_time:
|
|
552
660
|
raise FuruWaitTimeout(
|
|
553
|
-
f"Furu operation timed out after {
|
|
661
|
+
f"Furu operation timed out after {max_wait_time} seconds."
|
|
554
662
|
)
|
|
555
663
|
|
|
556
664
|
def _is_migrated_state(self, directory: Path) -> bool:
|
|
@@ -611,8 +719,10 @@ class Furu[T](ABC):
|
|
|
611
719
|
event: dict[str, str | int] = {
|
|
612
720
|
"type": "migration_overwrite",
|
|
613
721
|
"policy": record.policy,
|
|
614
|
-
"
|
|
615
|
-
"
|
|
722
|
+
"from_namespace": record.from_namespace,
|
|
723
|
+
"from_hash": record.from_hash,
|
|
724
|
+
"to_namespace": record.to_namespace,
|
|
725
|
+
"to_hash": record.to_hash,
|
|
616
726
|
"reason": reason,
|
|
617
727
|
}
|
|
618
728
|
StateManager.append_event(directory, event.copy())
|
|
@@ -623,6 +733,8 @@ class Furu[T](ABC):
|
|
|
623
733
|
adapter: SubmititAdapter,
|
|
624
734
|
directory: Path,
|
|
625
735
|
on_job_id: Callable[[str], None] | None,
|
|
736
|
+
*,
|
|
737
|
+
allow_failed: bool,
|
|
626
738
|
) -> SubmititJob | None:
|
|
627
739
|
"""Submit job once without waiting (fire-and-forget mode)."""
|
|
628
740
|
logger = get_logger()
|
|
@@ -691,7 +803,7 @@ class Furu[T](ABC):
|
|
|
691
803
|
scheduler={},
|
|
692
804
|
)
|
|
693
805
|
|
|
694
|
-
job = adapter.submit(lambda: self._worker_entry())
|
|
806
|
+
job = adapter.submit(lambda: self._worker_entry(allow_failed=allow_failed))
|
|
695
807
|
|
|
696
808
|
# Save job handle and watch for job ID
|
|
697
809
|
adapter.pickle_job(job, directory)
|
|
@@ -727,7 +839,7 @@ class Furu[T](ABC):
|
|
|
727
839
|
finally:
|
|
728
840
|
StateManager.release_lock(lock_fd, lock_path)
|
|
729
841
|
|
|
730
|
-
def _worker_entry(self: Self) -> None:
|
|
842
|
+
def _worker_entry(self: Self, *, allow_failed: bool | None = None) -> None:
|
|
731
843
|
"""Entry point for worker process (called by submitit or locally)."""
|
|
732
844
|
with enter_holder(self):
|
|
733
845
|
logger = get_logger()
|
|
@@ -735,6 +847,9 @@ class Furu[T](ABC):
|
|
|
735
847
|
directory.mkdir(parents=True, exist_ok=True)
|
|
736
848
|
|
|
737
849
|
env_info = self._collect_submitit_env()
|
|
850
|
+
allow_failed_effective = (
|
|
851
|
+
allow_failed if allow_failed is not None else FURU_CONFIG.retry_failed
|
|
852
|
+
)
|
|
738
853
|
|
|
739
854
|
try:
|
|
740
855
|
with compute_lock(
|
|
@@ -756,19 +871,23 @@ class Furu[T](ABC):
|
|
|
756
871
|
poll_interval_sec=FURU_CONFIG.poll_interval,
|
|
757
872
|
wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
|
|
758
873
|
reconcile_fn=lambda d: self._reconcile(d),
|
|
874
|
+
allow_failed=allow_failed_effective,
|
|
759
875
|
) as ctx:
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
876
|
+
stage = "metadata"
|
|
877
|
+
try:
|
|
878
|
+
# Refresh metadata (now safe - attempt is already recorded)
|
|
879
|
+
metadata = MetadataManager.create_metadata(
|
|
880
|
+
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
881
|
+
)
|
|
882
|
+
MetadataManager.write_metadata(metadata, directory)
|
|
765
883
|
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
884
|
+
# Set up signal handlers
|
|
885
|
+
stage = "signal handler setup"
|
|
886
|
+
self._setup_signal_handlers(
|
|
887
|
+
directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
|
|
888
|
+
)
|
|
770
889
|
|
|
771
|
-
|
|
890
|
+
stage = "_create"
|
|
772
891
|
# Run computation
|
|
773
892
|
logger.debug(
|
|
774
893
|
"_create: begin %s %s %s",
|
|
@@ -796,13 +915,23 @@ class Furu[T](ABC):
|
|
|
796
915
|
extra={"furu_console_only": True},
|
|
797
916
|
)
|
|
798
917
|
except Exception as e:
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
918
|
+
if stage == "_create":
|
|
919
|
+
logger.error(
|
|
920
|
+
"_create failed %s %s %s",
|
|
921
|
+
self.__class__.__name__,
|
|
922
|
+
self._furu_hash,
|
|
923
|
+
directory,
|
|
924
|
+
extra={"furu_file_only": True},
|
|
925
|
+
)
|
|
926
|
+
else:
|
|
927
|
+
logger.error(
|
|
928
|
+
"attempt failed (%s) %s %s %s",
|
|
929
|
+
stage,
|
|
930
|
+
self.__class__.__name__,
|
|
931
|
+
self._furu_hash,
|
|
932
|
+
directory,
|
|
933
|
+
extra={"furu_file_only": True},
|
|
934
|
+
)
|
|
806
935
|
logger.error(
|
|
807
936
|
"%s", format_traceback(e), extra={"furu_file_only": True}
|
|
808
937
|
)
|
|
@@ -819,6 +948,18 @@ class Furu[T](ABC):
|
|
|
819
948
|
"traceback": tb,
|
|
820
949
|
},
|
|
821
950
|
)
|
|
951
|
+
self._add_exception_breadcrumbs(e, directory)
|
|
952
|
+
if stage != "_create":
|
|
953
|
+
message = (
|
|
954
|
+
"Failed to create metadata"
|
|
955
|
+
if stage == "metadata"
|
|
956
|
+
else "Failed to set up signal handlers"
|
|
957
|
+
)
|
|
958
|
+
raise FuruComputeError(
|
|
959
|
+
message,
|
|
960
|
+
StateManager.get_state_path(directory),
|
|
961
|
+
e,
|
|
962
|
+
) from e
|
|
822
963
|
raise
|
|
823
964
|
except FuruLockNotAcquired:
|
|
824
965
|
# Experiment already completed (success or failed), nothing to do
|
|
@@ -848,16 +989,22 @@ class Furu[T](ABC):
|
|
|
848
989
|
|
|
849
990
|
return info
|
|
850
991
|
|
|
851
|
-
def _run_locally(
|
|
992
|
+
def _run_locally(
|
|
993
|
+
self: Self,
|
|
994
|
+
start_time: float,
|
|
995
|
+
*,
|
|
996
|
+
allow_failed: bool,
|
|
997
|
+
) -> tuple[str, bool, T | None]:
|
|
852
998
|
"""Run computation locally, returning (status, created_here, result)."""
|
|
853
999
|
logger = get_logger()
|
|
854
1000
|
directory = self._base_furu_dir()
|
|
855
1001
|
|
|
856
1002
|
# Calculate remaining time for the lock wait
|
|
857
1003
|
max_wait: float | None = None
|
|
858
|
-
|
|
1004
|
+
max_wait_time = self._effective_max_wait_time_sec()
|
|
1005
|
+
if max_wait_time is not None:
|
|
859
1006
|
elapsed = time.time() - start_time
|
|
860
|
-
max_wait = max(0.0,
|
|
1007
|
+
max_wait = max(0.0, max_wait_time - elapsed)
|
|
861
1008
|
|
|
862
1009
|
try:
|
|
863
1010
|
with compute_lock(
|
|
@@ -876,26 +1023,23 @@ class Furu[T](ABC):
|
|
|
876
1023
|
poll_interval_sec=FURU_CONFIG.poll_interval,
|
|
877
1024
|
wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
|
|
878
1025
|
reconcile_fn=lambda d: self._reconcile(d),
|
|
1026
|
+
allow_failed=allow_failed,
|
|
879
1027
|
) as ctx:
|
|
880
|
-
|
|
1028
|
+
stage = "metadata"
|
|
881
1029
|
try:
|
|
1030
|
+
# Create metadata (now safe - attempt is already recorded)
|
|
882
1031
|
metadata = MetadataManager.create_metadata(
|
|
883
1032
|
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
884
1033
|
)
|
|
885
1034
|
MetadataManager.write_metadata(metadata, directory)
|
|
886
|
-
except Exception as e:
|
|
887
|
-
raise FuruComputeError(
|
|
888
|
-
"Failed to create metadata",
|
|
889
|
-
StateManager.get_state_path(directory),
|
|
890
|
-
e,
|
|
891
|
-
) from e
|
|
892
|
-
|
|
893
|
-
# Set up preemption handler
|
|
894
|
-
self._setup_signal_handlers(
|
|
895
|
-
directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
|
|
896
|
-
)
|
|
897
1035
|
|
|
898
|
-
|
|
1036
|
+
# Set up preemption handler
|
|
1037
|
+
stage = "signal handler setup"
|
|
1038
|
+
self._setup_signal_handlers(
|
|
1039
|
+
directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
|
|
1040
|
+
)
|
|
1041
|
+
|
|
1042
|
+
stage = "_create"
|
|
899
1043
|
# Run the computation
|
|
900
1044
|
logger.debug(
|
|
901
1045
|
"_create: begin %s %s %s",
|
|
@@ -924,13 +1068,23 @@ class Furu[T](ABC):
|
|
|
924
1068
|
)
|
|
925
1069
|
return "success", True, result
|
|
926
1070
|
except Exception as e:
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
1071
|
+
if stage == "_create":
|
|
1072
|
+
logger.error(
|
|
1073
|
+
"_create failed %s %s %s",
|
|
1074
|
+
self.__class__.__name__,
|
|
1075
|
+
self._furu_hash,
|
|
1076
|
+
directory,
|
|
1077
|
+
extra={"furu_file_only": True},
|
|
1078
|
+
)
|
|
1079
|
+
else:
|
|
1080
|
+
logger.error(
|
|
1081
|
+
"attempt failed (%s) %s %s %s",
|
|
1082
|
+
stage,
|
|
1083
|
+
self.__class__.__name__,
|
|
1084
|
+
self._furu_hash,
|
|
1085
|
+
directory,
|
|
1086
|
+
extra={"furu_file_only": True},
|
|
1087
|
+
)
|
|
934
1088
|
logger.error(
|
|
935
1089
|
"%s", format_traceback(e), extra={"furu_file_only": True}
|
|
936
1090
|
)
|
|
@@ -948,6 +1102,18 @@ class Furu[T](ABC):
|
|
|
948
1102
|
"traceback": tb,
|
|
949
1103
|
},
|
|
950
1104
|
)
|
|
1105
|
+
self._add_exception_breadcrumbs(e, directory)
|
|
1106
|
+
if stage != "_create":
|
|
1107
|
+
message = (
|
|
1108
|
+
"Failed to create metadata"
|
|
1109
|
+
if stage == "metadata"
|
|
1110
|
+
else "Failed to set up signal handlers"
|
|
1111
|
+
)
|
|
1112
|
+
raise FuruComputeError(
|
|
1113
|
+
message,
|
|
1114
|
+
StateManager.get_state_path(directory),
|
|
1115
|
+
e,
|
|
1116
|
+
) from e
|
|
951
1117
|
raise
|
|
952
1118
|
except FuruLockNotAcquired:
|
|
953
1119
|
# Lock couldn't be acquired because experiment already completed
|
|
@@ -996,4 +1162,201 @@ class Furu[T](ABC):
|
|
|
996
1162
|
signal.signal(sig, handle_signal)
|
|
997
1163
|
|
|
998
1164
|
|
|
1165
|
+
class DependencyChzSpec(Protocol):
|
|
1166
|
+
__chz_fields__: dict[str, ChzField]
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
DependencySequence: TypeAlias = Sequence[Furu]
|
|
1170
|
+
DependencySet: TypeAlias = AbstractSet[Furu]
|
|
1171
|
+
DependencyMapping: TypeAlias = Mapping[str, Furu]
|
|
1172
|
+
DependencyCollection: TypeAlias = DependencySequence | DependencySet | DependencyMapping
|
|
1173
|
+
DependencyValue: TypeAlias = Furu | DependencyCollection
|
|
1174
|
+
DependencySpec: TypeAlias = DependencyValue | DependencyChzSpec
|
|
1175
|
+
DependencyLeaf: TypeAlias = str | int | float | bool | None | Path | bytes
|
|
1176
|
+
DependencyScanValue: TypeAlias = (
|
|
1177
|
+
DependencyLeaf
|
|
1178
|
+
| Furu
|
|
1179
|
+
| Mapping[Hashable, "DependencyScanValue"]
|
|
1180
|
+
| Sequence["DependencyScanValue"]
|
|
1181
|
+
| AbstractSet["DependencyScanValue"]
|
|
1182
|
+
| DependencyChzSpec
|
|
1183
|
+
)
|
|
1184
|
+
|
|
1185
|
+
|
|
1186
|
+
def _collect_dependencies(
|
|
1187
|
+
obj: Furu,
|
|
1188
|
+
dependencies: list[Furu],
|
|
1189
|
+
seen: set[str],
|
|
1190
|
+
*,
|
|
1191
|
+
recursive: bool,
|
|
1192
|
+
) -> None:
|
|
1193
|
+
for dependency in _direct_dependencies(obj):
|
|
1194
|
+
digest = dependency._furu_hash
|
|
1195
|
+
if digest in seen:
|
|
1196
|
+
continue
|
|
1197
|
+
seen.add(digest)
|
|
1198
|
+
dependencies.append(dependency)
|
|
1199
|
+
if recursive:
|
|
1200
|
+
_collect_dependencies(
|
|
1201
|
+
dependency,
|
|
1202
|
+
dependencies,
|
|
1203
|
+
seen,
|
|
1204
|
+
recursive=recursive,
|
|
1205
|
+
)
|
|
1206
|
+
|
|
1207
|
+
|
|
1208
|
+
def _direct_dependencies(obj: Furu) -> list[Furu]:
|
|
1209
|
+
dependencies: list[Furu] = []
|
|
1210
|
+
for field in chz.chz_fields(obj).values():
|
|
1211
|
+
value = cast(DependencyScanValue, getattr(obj, field.logical_name))
|
|
1212
|
+
dependencies.extend(_collect_dependencies_from_value(value))
|
|
1213
|
+
extra = obj._dependencies()
|
|
1214
|
+
if extra is not None:
|
|
1215
|
+
dependencies.extend(_collect_dependencies_from_spec(extra, path="dependencies"))
|
|
1216
|
+
return dependencies
|
|
1217
|
+
|
|
1218
|
+
|
|
1219
|
+
def _collect_dependencies_from_value(value: DependencyScanValue) -> list[Furu]:
|
|
1220
|
+
dependencies: list[Furu] = []
|
|
1221
|
+
if isinstance(value, Furu):
|
|
1222
|
+
dependencies.append(value)
|
|
1223
|
+
return dependencies
|
|
1224
|
+
if isinstance(value, dict):
|
|
1225
|
+
mapping = cast(Mapping[Hashable, DependencyScanValue], value)
|
|
1226
|
+
for item in mapping.values():
|
|
1227
|
+
dependencies.extend(_collect_dependencies_from_value(item))
|
|
1228
|
+
return dependencies
|
|
1229
|
+
if isinstance(value, (list, tuple)):
|
|
1230
|
+
sequence = cast(Sequence[DependencyScanValue], value)
|
|
1231
|
+
for item in sequence:
|
|
1232
|
+
dependencies.extend(_collect_dependencies_from_value(item))
|
|
1233
|
+
return dependencies
|
|
1234
|
+
if isinstance(value, (set, frozenset)):
|
|
1235
|
+
items = _sorted_dependency_set(cast(AbstractSet[DependencyScanValue], value))
|
|
1236
|
+
for item in items:
|
|
1237
|
+
dependencies.extend(_collect_dependencies_from_value(item))
|
|
1238
|
+
return dependencies
|
|
1239
|
+
if chz.is_chz(value):
|
|
1240
|
+
for field in chz.chz_fields(value).values():
|
|
1241
|
+
field_value = cast(DependencyScanValue, getattr(value, field.logical_name))
|
|
1242
|
+
dependencies.extend(_collect_dependencies_from_value(field_value))
|
|
1243
|
+
return dependencies
|
|
1244
|
+
|
|
1245
|
+
|
|
1246
|
+
def _collect_dependencies_from_spec(value: DependencySpec, path: str) -> list[Furu]:
|
|
1247
|
+
if isinstance(value, Furu):
|
|
1248
|
+
return [value]
|
|
1249
|
+
if isinstance(value, dict):
|
|
1250
|
+
return _collect_dependencies_from_mapping(
|
|
1251
|
+
cast(Mapping[Hashable, DependencyValue], value),
|
|
1252
|
+
path,
|
|
1253
|
+
)
|
|
1254
|
+
if isinstance(value, (list, tuple)):
|
|
1255
|
+
return _collect_dependencies_from_sequence(
|
|
1256
|
+
cast(Sequence[DependencyValue], value),
|
|
1257
|
+
path,
|
|
1258
|
+
)
|
|
1259
|
+
if isinstance(value, (set, frozenset)):
|
|
1260
|
+
return _collect_dependencies_from_set(
|
|
1261
|
+
cast(AbstractSet[DependencyValue], value),
|
|
1262
|
+
path,
|
|
1263
|
+
)
|
|
1264
|
+
if chz.is_chz(value):
|
|
1265
|
+
dependencies: list[Furu] = []
|
|
1266
|
+
for field in chz.chz_fields(value).values():
|
|
1267
|
+
field_value = getattr(value, field.logical_name)
|
|
1268
|
+
field_path = f"{path}.{field.logical_name}"
|
|
1269
|
+
dependencies.extend(
|
|
1270
|
+
_collect_dependencies_from_value_spec(field_value, field_path)
|
|
1271
|
+
)
|
|
1272
|
+
return dependencies
|
|
1273
|
+
raise _dependency_type_error(path, value)
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _collect_dependencies_from_value_spec(
|
|
1277
|
+
value: DependencyValue,
|
|
1278
|
+
path: str,
|
|
1279
|
+
) -> list[Furu]:
|
|
1280
|
+
if isinstance(value, Furu):
|
|
1281
|
+
return [value]
|
|
1282
|
+
if isinstance(value, dict):
|
|
1283
|
+
return _collect_dependencies_from_mapping(
|
|
1284
|
+
cast(Mapping[Hashable, DependencyValue], value),
|
|
1285
|
+
path,
|
|
1286
|
+
)
|
|
1287
|
+
if isinstance(value, (list, tuple)):
|
|
1288
|
+
return _collect_dependencies_from_sequence(
|
|
1289
|
+
cast(Sequence[DependencyValue], value),
|
|
1290
|
+
path,
|
|
1291
|
+
)
|
|
1292
|
+
if isinstance(value, (set, frozenset)):
|
|
1293
|
+
return _collect_dependencies_from_set(
|
|
1294
|
+
cast(AbstractSet[DependencyValue], value),
|
|
1295
|
+
path,
|
|
1296
|
+
)
|
|
1297
|
+
raise _dependency_type_error(path, value)
|
|
1298
|
+
|
|
1299
|
+
|
|
1300
|
+
def _collect_dependencies_from_mapping(
|
|
1301
|
+
mapping: Mapping[Hashable, DependencyValue],
|
|
1302
|
+
path: str,
|
|
1303
|
+
) -> list[Furu]:
|
|
1304
|
+
dependencies: list[Furu] = []
|
|
1305
|
+
for key, item in mapping.items():
|
|
1306
|
+
if not isinstance(item, Furu):
|
|
1307
|
+
raise _dependency_type_error(f"{path}[{key!r}]", item)
|
|
1308
|
+
dependencies.append(item)
|
|
1309
|
+
return dependencies
|
|
1310
|
+
|
|
1311
|
+
|
|
1312
|
+
def _collect_dependencies_from_sequence(
|
|
1313
|
+
sequence: Sequence[DependencyValue],
|
|
1314
|
+
path: str,
|
|
1315
|
+
) -> list[Furu]:
|
|
1316
|
+
dependencies: list[Furu] = []
|
|
1317
|
+
for index, item in enumerate(sequence):
|
|
1318
|
+
if not isinstance(item, Furu):
|
|
1319
|
+
raise _dependency_type_error(f"{path}[{index}]", item)
|
|
1320
|
+
dependencies.append(item)
|
|
1321
|
+
return dependencies
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
def _collect_dependencies_from_set(
|
|
1325
|
+
values: AbstractSet[DependencyValue],
|
|
1326
|
+
path: str,
|
|
1327
|
+
) -> list[Furu]:
|
|
1328
|
+
dependencies: list[Furu] = []
|
|
1329
|
+
ordered = sorted(
|
|
1330
|
+
list(cast(AbstractSet[DependencyScanValue], values)),
|
|
1331
|
+
key=_dependency_sort_key,
|
|
1332
|
+
)
|
|
1333
|
+
for index, item in enumerate(ordered):
|
|
1334
|
+
if not isinstance(item, Furu):
|
|
1335
|
+
raise _dependency_type_error(f"{path}[{index}]", item)
|
|
1336
|
+
dependencies.append(item)
|
|
1337
|
+
return dependencies
|
|
1338
|
+
|
|
1339
|
+
|
|
1340
|
+
def _sorted_dependency_set(
|
|
1341
|
+
values: AbstractSet[DependencyScanValue],
|
|
1342
|
+
) -> list[DependencyScanValue]:
|
|
1343
|
+
return sorted(list(values), key=_dependency_sort_key)
|
|
1344
|
+
|
|
1345
|
+
|
|
1346
|
+
def _dependency_sort_key(value: DependencyScanValue) -> tuple[int, str]:
|
|
1347
|
+
if isinstance(value, Furu):
|
|
1348
|
+
return (0, value._furu_hash)
|
|
1349
|
+
return (1, f"{type(value).__name__}:{value!r}")
|
|
1350
|
+
|
|
1351
|
+
|
|
1352
|
+
def _dependency_type_error(
|
|
1353
|
+
path: str,
|
|
1354
|
+
value: DependencySpec | DependencyValue | DependencyScanValue,
|
|
1355
|
+
) -> TypeError:
|
|
1356
|
+
return TypeError(
|
|
1357
|
+
f"{path} must be a Furu instance or a collection of Furu instances; "
|
|
1358
|
+
f"got {type(value).__name__}"
|
|
1359
|
+
)
|
|
1360
|
+
|
|
1361
|
+
|
|
999
1362
|
_H = TypeVar("_H", bound=Furu, covariant=True)
|