furu 0.0.2-py3-none-any.whl → 0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
furu/core/furu.py CHANGED
@@ -5,17 +5,34 @@ import os
  import signal
  import socket
  import sys
+ import threading
  import time
  import traceback
  from abc import ABC, abstractmethod
  from pathlib import Path
  from types import FrameType
- from typing import Any, Callable, ClassVar, Self, TypedDict, TypeVar, cast, overload
+ from typing import (
+ AbstractSet,
+ Any,
+ Callable,
+ ClassVar,
+ Hashable,
+ Mapping,
+ Protocol,
+ Self,
+ Sequence,
+ TypedDict,
+ TypeAlias,
+ TypeVar,
+ cast,
+ )

  import chz
  import submitit
  from typing_extensions import dataclass_transform

+ from chz.field import Field as ChzField
+
  from ..adapters import SubmititAdapter
  from ..adapters.submitit import SubmititJob
  from ..config import FURU_CONFIG
@@ -23,6 +40,7 @@ from ..errors import (
  MISSING,
  FuruComputeError,
  FuruLockNotAcquired,
+ FuruValidationError,
  FuruWaitTimeout,
  )
  from ..runtime import current_holder
@@ -41,7 +59,6 @@ from ..storage import (
  from ..storage.state import (
  _FuruState,
  _OwnerDict,
- _StateAttemptFailed,
  _StateAttemptQueued,
  _StateAttemptRunning,
  _StateResultAbsent,
@@ -174,9 +191,39 @@ class Furu[T](ABC):
  raise NotImplementedError(f"{self.__class__.__name__}._load() not implemented")

  def _validate(self: Self) -> bool:
- """Validate that result is complete and correct (override if needed)."""
+ """
+ Validate that result is complete and correct (override if needed).
+
+ Return False or raise FuruValidationError to mark artifacts as invalid.
+ """
  return True

+ def _dependencies(self: Self) -> "DependencySpec | None":
+ """Return extra dependencies not captured by fields."""
+ return None
+
+ def _executor_spec_key(self: Self) -> str:
+ return "default"
+
+ def _get_dependencies(self: Self, *, recursive: bool = True) -> list["Furu"]:
+ """Collect Furu dependencies from fields and `_dependencies()`."""
+ seen = {self._furu_hash}
+ dependencies: list[Furu] = []
+ _collect_dependencies(self, dependencies, seen, recursive=recursive)
+ return dependencies
+
+ def _dependency_hashes(self: Self) -> list[str]:
+ dependencies = _direct_dependencies(self)
+ if not dependencies:
+ return []
+
+ digests: set[str] = set()
+ for dependency in dependencies:
+ if dependency is self:
+ raise ValueError("Furu dependencies cannot include self")
+ digests.add(dependency._furu_hash)
+ return sorted(digests)
+
  def _invalidate_cached_success(self: Self, directory: Path, *, reason: str) -> None:
  logger = get_logger()
  logger.warning(
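The hooks added in this hunk are the subclass extension points: `_validate()` may now return False or raise `FuruValidationError` to mark a cached artifact invalid, `_dependencies()` reports dependencies that are not visible as chz fields, and `_get_dependencies()` / `_dependency_hashes()` collect them. A rough sketch of a subclass using these hooks (the `TrainModel` class, its field, and the `model.pt` file are hypothetical, not part of the package):

    # Hypothetical subclass: only the hook names and signatures come from the diff above.
    class TrainModel(Furu[Path]):
        dataset: "BuildDataset"  # a chz field holding another Furu is picked up as a dependency

        def _create(self) -> Path:
            ...  # train and write model.pt under self._base_furu_dir()

        def _load(self) -> Path:
            return self._base_furu_dir() / "model.pt"

        def _validate(self) -> bool:
            # Returning False (or raising FuruValidationError) invalidates the cached
            # success, so exists() reports False and get() recomputes.
            return (self._base_furu_dir() / "model.pt").exists()

        def _dependencies(self) -> "DependencySpec | None":
            # Extra dependencies not captured by fields; None (the default) means none.
            return None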
@@ -199,6 +246,27 @@ class Furu[T](ABC):
  directory, {"type": "result_invalidated", "reason": reason, "at": now}
  )

+ def _prepare_executor_rerun(self: Self, directory: Path) -> None:
+ if not self._always_rerun():
+ return
+ if not directory.exists():
+ return
+ migration = self._alias_record(directory)
+ if migration is not None and self._alias_is_active(directory, migration):
+ self._maybe_detach_alias(
+ directory=directory,
+ record=migration,
+ reason="always_rerun",
+ )
+ state = StateManager.read_state(directory)
+ if isinstance(state.result, _StateResultSuccess):
+ self._invalidate_cached_success(directory, reason="always_rerun enabled")
+
+ @property
+ def furu_hash(self: Self) -> str:
+ """Return the stable content hash for this Furu object."""
+ return self._furu_hash
+
  @property
  def _furu_hash(self: Self) -> str:
  """Compute hash of this object's content for storage identification."""
@@ -251,6 +319,33 @@ class Furu[T](ABC):
  """Log a message to the current holder's `furu.log`."""
  return log(message, level=level)

+ def _exists_quiet(self: Self) -> bool:
+ directory = self._base_furu_dir()
+ state = self.get_state(directory)
+
+ if not isinstance(state.result, _StateResultSuccess):
+ return False
+ try:
+ return self._validate()
+ except FuruValidationError as exc:
+ logger = get_logger()
+ logger.warning(
+ "exists %s -> false (validate invalid for %s: %s)",
+ directory,
+ f"{self.__class__.__name__}({self._furu_hash})",
+ exc,
+ )
+ return False
+ except Exception as exc:
+ logger = get_logger()
+ logger.exception(
+ "exists %s -> false (validate crashed for %s: %s)",
+ directory,
+ f"{self.__class__.__name__}({self._furu_hash})",
+ exc,
+ )
+ return False
+
  def exists(self: Self) -> bool:
  """Check if result exists and is valid."""
  logger = get_logger()
@@ -274,31 +369,93 @@ class Furu[T](ABC):
  """Get migration record for this object."""
  return MigrationManager.read_migration(self._base_furu_dir())

- @overload
- def load_or_create(self, executor: submitit.Executor) -> T | submitit.Job[T]: ...
-
- @overload
- def load_or_create(self, executor: None = None) -> T: ...
-
- def load_or_create(
- self: Self,
- executor: submitit.Executor | None = None,
- ) -> T | submitit.Job[T]:
+ def get(self: Self, *, force: bool = False) -> T:
  """
  Load result if it exists, computing if necessary.

  Args:
- executor: Optional executor for batch submission (e.g., submitit.Executor)
+ force: Allow computation inside executor contexts if the spec matches.

  Returns:
- Result if wait=True, job handle if wait=False, or None if already exists
+ Loaded or computed result.

  Raises:
  FuruComputeError: If computation fails with detailed error information
  """
+ from furu.execution.context import EXEC_CONTEXT
+ from furu.errors import (
+ FuruExecutionError,
+ FuruMissingArtifact,
+ FuruSpecMismatch,
+ )
+
+ ctx = EXEC_CONTEXT.get()
+ if ctx.mode == "executor":
+ directory = self._base_furu_dir()
+ if force:
+ if (
+ ctx.current_node_hash is None
+ or self._furu_hash != ctx.current_node_hash
+ ):
+ raise FuruExecutionError(
+ "force=True not allowed: only the current node may compute in executor mode. "
+ f"current_node_hash={ctx.current_node_hash!r} "
+ f"obj={self.__class__.__name__}({self._furu_hash})",
+ hints=[
+ "Declare this object as a dependency instead of calling dep.get(force=True).",
+ "Inside executor mode, use get(force=True) only on the node being executed.",
+ ],
+ )
+ self._prepare_executor_rerun(directory)
+
+ exists_ok = self._exists_quiet()
+ if exists_ok and not (force and self._always_rerun()):
+ return self._load()
+
+ if force and not exists_ok:
+ state = self.get_state(directory)
+ if isinstance(state.result, _StateResultSuccess):
+ self._invalidate_cached_success(
+ directory, reason="_validate returned false (executor)"
+ )
+
+ if not force:
+ raise FuruMissingArtifact(
+ "Missing artifact "
+ f"{self.__class__.__name__}({self._furu_hash}) in executor mode. "
+ f"Requested by {ctx.current_node_hash}. Declare it as a dependency."
+ )
+
+ required = self._executor_spec_key()
+ if ctx.spec_key is None or required != ctx.spec_key:
+ raise FuruSpecMismatch(
+ "force=True not allowed: "
+ f"required={required!r} != worker={ctx.spec_key!r} (v1 exact match)"
+ )
+
+ status, created_here, result = self._run_locally(
+ start_time=time.time(),
+ allow_failed=FURU_CONFIG.retry_failed,
+ executor_mode=True,
+ )
+ if status == "success":
+ if created_here:
+ return cast(T, result)
+ return self._load()
+
+ raise self._build_failed_state_error(
+ self._base_furu_dir(),
+ None,
+ message="Computation previously failed",
+ )
+
+ return self._get_impl_interactive(force=force)
+
+ def _get_impl_interactive(self: Self, *, force: bool) -> T:
  logger = get_logger()
  parent_holder = current_holder()
  has_parent = parent_holder is not None and parent_holder is not self
+ retry_failed_effective = FURU_CONFIG.retry_failed
  if has_parent:
  logger.debug(
  "dep: begin %s %s %s",
@@ -380,7 +537,16 @@

  state0 = StateManager.read_state(directory)

- needs_reconcile = True
+ if (
+ isinstance(state0.result, _StateResultFailed)
+ and not retry_failed_effective
+ ):
+ raise self._build_failed_state_error(
+ directory,
+ state0,
+ message="Computation previously failed",
+ )
+
  if isinstance(state0.result, _StateResultSuccess):
  # Double check logic if we fell through to here (e.g. race condition or invalidation above)
  if self._always_rerun():
@@ -395,9 +561,6 @@ class Furu[T](ABC):
  directory, reason="_validate returned false"
  )
  state0 = StateManager.read_state(directory)
- else:
- # Valid success found, skip reconcile
- needs_reconcile = False
  except Exception as e:
  self._invalidate_cached_success(
  directory,
@@ -405,11 +568,6 @@ class Furu[T](ABC):
  )
  state0 = StateManager.read_state(directory)

- if needs_reconcile and executor is not None:
- adapter0 = SubmititAdapter(executor)
- self._reconcile(directory, adapter=adapter0)
- state0 = StateManager.read_state(directory)
-
  attempt0 = state0.attempt
  if isinstance(state0.result, _StateResultSuccess):
  decision = "success->load"
@@ -429,7 +587,7 @@ class Furu[T](ABC):
  if decision != "success->load":
  write_separator()
  logger.debug(
- "load_or_create %s %s %s (%s)",
+ "get %s %s %s (%s)",
  self.__class__.__name__,
  self._furu_hash,
  directory,
@@ -449,7 +607,7 @@ class Furu[T](ABC):
  # failures even when we suppressed the cache-hit header line.
  write_separator()
  logger.error(
- "load_or_create %s %s (load failed)",
+ "get %s %s (load failed)",
  self.__class__.__name__,
  self._furu_hash,
  )
@@ -459,54 +617,30 @@ class Furu[T](ABC):
  e,
  ) from e

- # Synchronous execution
- if executor is None:
- status, created_here, result = self._run_locally(
- start_time=start_time
- )
- if status == "success":
- ok = True
- if created_here:
- logger.debug(
- "load_or_create: %s created -> return",
- self.__class__.__name__,
- )
- return cast(T, result)
+ status, created_here, result = self._run_locally(
+ start_time=start_time,
+ allow_failed=retry_failed_effective,
+ executor_mode=False,
+ )
+ if status == "success":
+ ok = True
+ if created_here:
  logger.debug(
- "load_or_create: %s success -> _load()",
+ "get: %s created -> return",
  self.__class__.__name__,
  )
- return self._load()
-
- state = StateManager.read_state(directory)
- attempt = state.attempt
- message = (
- attempt.error.message
- if isinstance(attempt, _StateAttemptFailed)
- else None
- )
- suffix = (
- f": {message}" if isinstance(message, str) and message else ""
- )
- raise FuruComputeError(
- f"Computation {status}{suffix}",
- StateManager.get_state_path(directory),
+ return cast(T, result)
+ logger.debug(
+ "get: %s success -> _load()",
+ self.__class__.__name__,
  )
+ return self._load()

- # Asynchronous execution with submitit
- (submitit_folder := self._base_furu_dir() / "submitit").mkdir(
- exist_ok=True, parents=True
- )
- executor.folder = submitit_folder
- adapter = SubmititAdapter(executor)
-
- logger.debug(
- "load_or_create: %s -> submitit submit_once()",
- self.__class__.__name__,
+ raise self._build_failed_state_error(
+ directory,
+ None,
+ message="Computation previously failed",
  )
- job = self._submit_once(adapter, directory, None)
- ok = True
- return cast(submitit.Job[T], job)
  finally:
  if has_parent:
  logger.debug(
@@ -517,7 +651,7 @@ class Furu[T](ABC):
  )

  def _log_console_start(self, action_color: str) -> None:
- """Log the start of load_or_create to console with caller info."""
+ """Log the start of get to console with caller info."""
  logger = get_logger()
  frame = sys._getframe(1)

@@ -537,7 +671,7 @@ class Furu[T](ABC):
  frame = frame.f_back

  logger.info(
- "load_or_create %s %s",
+ "get %s %s",
  self.__class__.__name__,
  self._furu_hash,
  extra={
@@ -547,12 +681,50 @@ class Furu[T](ABC):
  },
  )

+ def _add_exception_breadcrumbs(self, exc: BaseException, directory: Path) -> None:
+ if not hasattr(exc, "add_note"):
+ return
+ note = f"Furu dir: {directory}"
+ exc.add_note(note)
+
+ @staticmethod
+ def _failed_state_hints() -> list[str]:
+ return [
+ "To retry this failed artifact: set FURU_RETRY_FAILED=1 or call get() again.",
+ "To inspect details: open the furu dir shown above.",
+ ]
+
+ def _build_failed_state_error(
+ self,
+ directory: Path,
+ state: _FuruState | None,
+ *,
+ message: str,
+ ) -> FuruComputeError:
+ current_state = state or StateManager.read_state(directory)
+ attempt = current_state.attempt
+ error = getattr(attempt, "error", None) if attempt is not None else None
+ return FuruComputeError(
+ message,
+ StateManager.get_state_path(directory),
+ recorded_error_type=getattr(error, "type", None),
+ recorded_error_message=getattr(error, "message", None),
+ recorded_traceback=getattr(error, "traceback", None),
+ hints=self._failed_state_hints(),
+ )
+
+ def _effective_max_wait_time_sec(self) -> float | None:
+ if FURU_CONFIG.max_wait_time_sec is not None:
+ return FURU_CONFIG.max_wait_time_sec
+ return self._max_wait_time_sec
+
  def _check_timeout(self, start_time: float) -> None:
  """Check if operation has timed out."""
- if self._max_wait_time_sec is not None:
- if time.time() - start_time > self._max_wait_time_sec:
+ max_wait_time = self._effective_max_wait_time_sec()
+ if max_wait_time is not None:
+ if time.time() - start_time > max_wait_time:
  raise FuruWaitTimeout(
- f"Furu operation timed out after {self._max_wait_time_sec} seconds."
+ f"Furu operation timed out after {max_wait_time} seconds."
  )

  def _is_migrated_state(self, directory: Path) -> bool:
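Two behavioural knobs surface in this hunk: the failure hints point to `FURU_RETRY_FAILED` (or simply calling `get()` again) for retrying a failed artifact, and `_effective_max_wait_time_sec()` lets the global config override the per-object wait limit. The precedence it implements, restated in isolation (no new behaviour; `task` stands for any Furu instance):

    # FURU_CONFIG.max_wait_time_sec wins when set; otherwise the object's own
    # _max_wait_time_sec applies (None means wait indefinitely).
    effective = (
        FURU_CONFIG.max_wait_time_sec
        if FURU_CONFIG.max_wait_time_sec is not None
        else task._max_wait_time_sec
    )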
@@ -613,8 +785,10 @@ class Furu[T](ABC):
  event: dict[str, str | int] = {
  "type": "migration_overwrite",
  "policy": record.policy,
- "from": f"{record.from_namespace}:{record.from_hash}",
- "to": f"{record.to_namespace}:{record.to_hash}",
+ "from_namespace": record.from_namespace,
+ "from_hash": record.from_hash,
+ "to_namespace": record.to_namespace,
+ "to_hash": record.to_hash,
  "reason": reason,
  }
  StateManager.append_event(directory, event.copy())
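Splitting the fused `namespace:hash` strings into separate keys makes the recorded event queryable without string parsing. With placeholder values, the payload changes roughly like this:

    # 0.0.2 (namespace and hash fused into one string)
    {"type": "migration_overwrite", "policy": "<policy>", "from": "ns_a:abc123", "to": "ns_b:def456", "reason": "<reason>"}

    # 0.0.4 (separate fields)
    {
        "type": "migration_overwrite",
        "policy": "<policy>",
        "from_namespace": "ns_a",
        "from_hash": "abc123",
        "to_namespace": "ns_b",
        "to_hash": "def456",
        "reason": "<reason>",
    }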
@@ -625,6 +799,8 @@ class Furu[T](ABC):
  adapter: SubmititAdapter,
  directory: Path,
  on_job_id: Callable[[str], None] | None,
+ *,
+ allow_failed: bool,
  ) -> SubmititJob | None:
  """Submit job once without waiting (fire-and-forget mode)."""
  logger = get_logger()
@@ -693,7 +869,7 @@ class Furu[T](ABC):
  scheduler={},
  )

- job = adapter.submit(lambda: self._worker_entry())
+ job = adapter.submit(lambda: self._worker_entry(allow_failed=allow_failed))

  # Save job handle and watch for job ID
  adapter.pickle_job(job, directory)
@@ -729,102 +905,205 @@ class Furu[T](ABC):
  finally:
  StateManager.release_lock(lock_fd, lock_path)

- def _worker_entry(self: Self) -> None:
+ def _worker_entry(self: Self, *, allow_failed: bool | None = None) -> None:
  """Entry point for worker process (called by submitit or locally)."""
  with enter_holder(self):
  logger = get_logger()
- directory = self._base_furu_dir()
- directory.mkdir(parents=True, exist_ok=True)
-
- env_info = self._collect_submitit_env()
-
- try:
- with compute_lock(
- directory,
+ # Ensure executor semantics apply to *all* work in the worker, not
+ # just `_create()`. This prevents accidental dependency computation
+ # (e.g., from within `_validate()` or metadata hooks).
+ from furu.execution.context import EXEC_CONTEXT, ExecContext
+
+ exec_token = EXEC_CONTEXT.set(
+ ExecContext(
+ mode="executor",
+ spec_key=self._executor_spec_key(),
  backend="submitit",
- lease_duration_sec=FURU_CONFIG.lease_duration_sec,
- heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
- owner={
- "pid": os.getpid(),
- "host": socket.gethostname(),
- "user": getpass.getuser(),
- "command": " ".join(sys.argv) if sys.argv else "<unknown>",
- },
- scheduler={
- "backend": env_info.get("backend"),
- "job_id": env_info.get("slurm_job_id"),
- },
- max_wait_time_sec=None, # Workers wait indefinitely
- poll_interval_sec=FURU_CONFIG.poll_interval,
- wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
- reconcile_fn=lambda d: self._reconcile(d),
- ) as ctx:
- # Refresh metadata (now safe - attempt is already recorded)
- metadata = MetadataManager.create_metadata(
- self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
- )
- MetadataManager.write_metadata(metadata, directory)
+ current_node_hash=self._furu_hash,
+ )
+ )
+ try:
+ directory = self._base_furu_dir()
+ directory.mkdir(parents=True, exist_ok=True)
+ always_rerun = self._always_rerun()
+ needs_success_invalidation = False
+ if not always_rerun:
+ exists_ok = self._exists_quiet()
+ if not exists_ok:
+ state = self.get_state(directory)
+ if isinstance(state.result, _StateResultSuccess):
+ needs_success_invalidation = True
+
+ env_info = self._collect_submitit_env()
+ allow_failed_effective = (
+ allow_failed
+ if allow_failed is not None
+ else FURU_CONFIG.retry_failed
+ )
+ allow_success = always_rerun or needs_success_invalidation

- # Set up signal handlers
- self._setup_signal_handlers(
- directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
- )
+ try:
+ with compute_lock(
+ directory,
+ backend="submitit",
+ lease_duration_sec=FURU_CONFIG.lease_duration_sec,
+ heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
+ owner={
+ "pid": os.getpid(),
+ "host": socket.gethostname(),
+ "user": getpass.getuser(),
+ "command": " ".join(sys.argv) if sys.argv else "<unknown>",
+ },
+ scheduler={
+ "backend": env_info.get("backend"),
+ "job_id": env_info.get("slurm_job_id"),
+ },
+ max_wait_time_sec=None, # Workers wait indefinitely
+ poll_interval_sec=FURU_CONFIG.poll_interval,
+ wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
+ reconcile_fn=lambda d: self._reconcile(d),
+ allow_failed=allow_failed_effective,
+ allow_success=allow_success,
+ ) as ctx:
+ self._prepare_executor_rerun(directory)
+ if not always_rerun:
+ exists_ok = self._exists_quiet()
+ if not exists_ok:
+ state = self.get_state(directory)
+ if isinstance(state.result, _StateResultSuccess):
+ self._invalidate_cached_success(
+ directory,
+ reason="_validate returned false (worker)",
+ )
+
+ stage = "metadata"
+ try:
+ # Refresh metadata (now safe - attempt is already recorded)
+ metadata = MetadataManager.create_metadata(
+ self,
+ directory,
+ ignore_diff=FURU_CONFIG.ignore_git_diff,
+ )
+ MetadataManager.write_metadata(metadata, directory)

- try:
- # Run computation
- logger.debug(
- "_create: begin %s %s %s",
- self.__class__.__name__,
- self._furu_hash,
- directory,
- )
- self._create()
- logger.debug(
- "_create: ok %s %s %s",
- self.__class__.__name__,
- self._furu_hash,
- directory,
- )
- StateManager.write_success_marker(
- directory, attempt_id=ctx.attempt_id
- )
- StateManager.finish_attempt_success(
- directory, attempt_id=ctx.attempt_id
- )
- logger.info(
- "_create ok %s %s",
- self.__class__.__name__,
- self._furu_hash,
- extra={"furu_console_only": True},
- )
- except Exception as e:
- logger.error(
- "_create failed %s %s %s",
- self.__class__.__name__,
- self._furu_hash,
- directory,
- extra={"furu_file_only": True},
- )
- logger.error(
- "%s", format_traceback(e), extra={"furu_file_only": True}
- )
+ # Set up signal handlers
+ stage = "signal handler setup"
+ self._setup_signal_handlers(
+ directory,
+ ctx.stop_heartbeat,
+ attempt_id=ctx.attempt_id,
+ )

- tb = "".join(
- traceback.format_exception(type(e), e, e.__traceback__)
- )
- StateManager.finish_attempt_failed(
- directory,
- attempt_id=ctx.attempt_id,
- error={
- "type": type(e).__name__,
- "message": str(e),
- "traceback": tb,
- },
+ stage = "_create"
+ # Run computation
+ logger.debug(
+ "_create: begin %s %s %s",
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ )
+ self._create()
+ logger.debug(
+ "_create: ok %s %s %s",
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ )
+ StateManager.write_success_marker(
+ directory, attempt_id=ctx.attempt_id
+ )
+ StateManager.finish_attempt_success(
+ directory, attempt_id=ctx.attempt_id
+ )
+ logger.info(
+ "_create ok %s %s",
+ self.__class__.__name__,
+ self._furu_hash,
+ extra={"furu_console_only": True},
+ )
+ except Exception as e:
+ if stage == "_create":
+ logger.error(
+ "_create failed %s %s %s",
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ extra={"furu_file_only": True},
+ )
+ else:
+ logger.error(
+ "attempt failed (%s) %s %s %s",
+ stage,
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ extra={"furu_file_only": True},
+ )
+ logger.error(
+ "%s",
+ format_traceback(e),
+ extra={"furu_file_only": True},
+ )
+
+ tb = "".join(
+ traceback.format_exception(type(e), e, e.__traceback__)
+ )
+ StateManager.finish_attempt_failed(
+ directory,
+ attempt_id=ctx.attempt_id,
+ error={
+ "type": type(e).__name__,
+ "message": str(e),
+ "traceback": tb,
+ },
+ )
+ self._add_exception_breadcrumbs(e, directory)
+ if stage != "_create":
+ message = (
+ "Failed to create metadata"
+ if stage == "metadata"
+ else "Failed to set up signal handlers"
+ )
+ raise FuruComputeError(
+ message,
+ StateManager.get_state_path(directory),
+ e,
+ ) from e
+ raise
+ except FuruLockNotAcquired as exc:
+ # Experiment already completed; succeed if success, fail if failed.
+ state = StateManager.read_state(directory)
+ state_path = StateManager.get_state_path(directory)
+ attempt = state.attempt
+ attempt_info = "no active attempt"
+ if attempt is not None:
+ attempt_info = (
+ f"attempt {attempt.id} status {attempt.status} "
+ f"backend {attempt.backend}"
  )
- raise
- except FuruLockNotAcquired:
- # Experiment already completed (success or failed), nothing to do
- return
+ hints = [
+ f"Furu hash: {self._furu_hash}",
+ f"Directory: {directory}",
+ f"State file: {state_path}",
+ f"Attempt: {attempt_info}",
+ ]
+ if isinstance(state.result, _StateResultSuccess):
+ return
+ if isinstance(state.result, _StateResultFailed):
+ if allow_failed_effective:
+ return
+ raise FuruComputeError(
+ "Worker refused to run: experiment already failed",
+ state_path,
+ exc,
+ hints=hints,
+ ) from exc
+ raise FuruLockNotAcquired(
+ "Worker refused to run: experiment already running elsewhere",
+ hints=hints,
+ ) from exc
+ finally:
+ EXEC_CONTEXT.reset(exec_token)

  def _collect_submitit_env(self: Self) -> _SubmititEnvInfo:
  """Collect submitit/slurm environment information."""
@@ -850,16 +1129,23 @@

  return info

- def _run_locally(self: Self, start_time: float) -> tuple[str, bool, T | None]:
+ def _run_locally(
+ self: Self,
+ start_time: float,
+ *,
+ allow_failed: bool,
+ executor_mode: bool = False,
+ ) -> tuple[str, bool, T | None]:
  """Run computation locally, returning (status, created_here, result)."""
  logger = get_logger()
  directory = self._base_furu_dir()

  # Calculate remaining time for the lock wait
  max_wait: float | None = None
- if self._max_wait_time_sec is not None:
+ max_wait_time = self._effective_max_wait_time_sec()
+ if max_wait_time is not None:
  elapsed = time.time() - start_time
- max_wait = max(0.0, self._max_wait_time_sec - elapsed)
+ max_wait = max(0.0, max_wait_time - elapsed)

  try:
  with compute_lock(
@@ -878,26 +1164,23 @@ class Furu[T](ABC):
  poll_interval_sec=FURU_CONFIG.poll_interval,
  wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
  reconcile_fn=lambda d: self._reconcile(d),
+ allow_failed=allow_failed,
  ) as ctx:
- # Create metadata (now safe - attempt is already recorded)
+ stage = "metadata"
  try:
+ # Create metadata (now safe - attempt is already recorded)
  metadata = MetadataManager.create_metadata(
  self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
  )
  MetadataManager.write_metadata(metadata, directory)
- except Exception as e:
- raise FuruComputeError(
- "Failed to create metadata",
- StateManager.get_state_path(directory),
- e,
- ) from e
-
- # Set up preemption handler
- self._setup_signal_handlers(
- directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
- )

- try:
+ # Set up preemption handler
+ stage = "signal handler setup"
+ self._setup_signal_handlers(
+ directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
+ )
+
+ stage = "_create"
  # Run the computation
  logger.debug(
  "_create: begin %s %s %s",
@@ -905,7 +1188,23 @@
  self._furu_hash,
  directory,
  )
- result = self._create()
+ token = None
+ if executor_mode:
+ from furu.execution.context import EXEC_CONTEXT, ExecContext
+
+ token = EXEC_CONTEXT.set(
+ ExecContext(
+ mode="executor",
+ spec_key=self._executor_spec_key(),
+ backend="local",
+ current_node_hash=self._furu_hash,
+ )
+ )
+ try:
+ result = self._create()
+ finally:
+ if token is not None:
+ EXEC_CONTEXT.reset(token)
  logger.debug(
  "_create: ok %s %s %s",
  self.__class__.__name__,
@@ -926,13 +1225,23 @@
  )
  return "success", True, result
  except Exception as e:
- logger.error(
- "_create failed %s %s %s",
- self.__class__.__name__,
- self._furu_hash,
- directory,
- extra={"furu_file_only": True},
- )
+ if stage == "_create":
+ logger.error(
+ "_create failed %s %s %s",
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ extra={"furu_file_only": True},
+ )
+ else:
+ logger.error(
+ "attempt failed (%s) %s %s %s",
+ stage,
+ self.__class__.__name__,
+ self._furu_hash,
+ directory,
+ extra={"furu_file_only": True},
+ )
  logger.error(
  "%s", format_traceback(e), extra={"furu_file_only": True}
  )
@@ -950,6 +1259,18 @@
  "traceback": tb,
  },
  )
+ self._add_exception_breadcrumbs(e, directory)
+ if stage != "_create":
+ message = (
+ "Failed to create metadata"
+ if stage == "metadata"
+ else "Failed to set up signal handlers"
+ )
+ raise FuruComputeError(
+ message,
+ StateManager.get_state_path(directory),
+ e,
+ ) from e
  raise
  except FuruLockNotAcquired:
  # Lock couldn't be acquired because experiment already completed
@@ -981,6 +1302,8 @@
  attempt_id: str,
  ) -> None:
  """Set up signal handlers for graceful preemption."""
+ if threading.current_thread() is not threading.main_thread():
+ return

  def handle_signal(signum: int, frame: FrameType | None) -> None:
  try:
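The early return added here matters because CPython only allows signal handlers to be installed from the main thread; from any other thread `signal.signal()` raises `ValueError`. A standalone illustration of the same guard (not furu code):

    import signal
    import threading

    def install_handler() -> None:
        # Outside the main thread, signal.signal() raises
        # "ValueError: signal only works in main thread of the main interpreter".
        if threading.current_thread() is not threading.main_thread():
            return
        signal.signal(signal.SIGTERM, lambda signum, frame: None)

    t = threading.Thread(target=install_handler)
    t.start()            # returns without installing anything
    t.join()
    install_handler()    # installs the handler in the main thread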
@@ -998,4 +1321,201 @@
  signal.signal(sig, handle_signal)


+ class DependencyChzSpec(Protocol):
+ __chz_fields__: dict[str, ChzField]
+
+
+ DependencySequence: TypeAlias = Sequence[Furu]
+ DependencySet: TypeAlias = AbstractSet[Furu]
+ DependencyMapping: TypeAlias = Mapping[str, Furu]
+ DependencyCollection: TypeAlias = DependencySequence | DependencySet | DependencyMapping
+ DependencyValue: TypeAlias = Furu | DependencyCollection
+ DependencySpec: TypeAlias = DependencyValue | DependencyChzSpec
+ DependencyLeaf: TypeAlias = str | int | float | bool | None | Path | bytes
+ DependencyScanValue: TypeAlias = (
+ DependencyLeaf
+ | Furu
+ | Mapping[Hashable, "DependencyScanValue"]
+ | Sequence["DependencyScanValue"]
+ | AbstractSet["DependencyScanValue"]
+ | DependencyChzSpec
+ )
+
+
+ def _collect_dependencies(
+ obj: Furu,
+ dependencies: list[Furu],
+ seen: set[str],
+ *,
+ recursive: bool,
+ ) -> None:
+ for dependency in _direct_dependencies(obj):
+ digest = dependency._furu_hash
+ if digest in seen:
+ continue
+ seen.add(digest)
+ dependencies.append(dependency)
+ if recursive:
+ _collect_dependencies(
+ dependency,
+ dependencies,
+ seen,
+ recursive=recursive,
+ )
+
+
+ def _direct_dependencies(obj: Furu) -> list[Furu]:
+ dependencies: list[Furu] = []
+ for field in chz.chz_fields(obj).values():
+ value = cast(DependencyScanValue, getattr(obj, field.logical_name))
+ dependencies.extend(_collect_dependencies_from_value(value))
+ extra = obj._dependencies()
+ if extra is not None:
+ dependencies.extend(_collect_dependencies_from_spec(extra, path="dependencies"))
+ return dependencies
+
+
+ def _collect_dependencies_from_value(value: DependencyScanValue) -> list[Furu]:
+ dependencies: list[Furu] = []
+ if isinstance(value, Furu):
+ dependencies.append(value)
+ return dependencies
+ if isinstance(value, dict):
+ mapping = cast(Mapping[Hashable, DependencyScanValue], value)
+ for item in mapping.values():
+ dependencies.extend(_collect_dependencies_from_value(item))
+ return dependencies
+ if isinstance(value, (list, tuple)):
+ sequence = cast(Sequence[DependencyScanValue], value)
+ for item in sequence:
+ dependencies.extend(_collect_dependencies_from_value(item))
+ return dependencies
+ if isinstance(value, (set, frozenset)):
+ items = _sorted_dependency_set(cast(AbstractSet[DependencyScanValue], value))
+ for item in items:
+ dependencies.extend(_collect_dependencies_from_value(item))
+ return dependencies
+ if chz.is_chz(value):
+ for field in chz.chz_fields(value).values():
+ field_value = cast(DependencyScanValue, getattr(value, field.logical_name))
+ dependencies.extend(_collect_dependencies_from_value(field_value))
+ return dependencies
+
+
+ def _collect_dependencies_from_spec(value: DependencySpec, path: str) -> list[Furu]:
+ if isinstance(value, Furu):
+ return [value]
+ if isinstance(value, dict):
+ return _collect_dependencies_from_mapping(
+ cast(Mapping[Hashable, DependencyValue], value),
+ path,
+ )
+ if isinstance(value, (list, tuple)):
+ return _collect_dependencies_from_sequence(
+ cast(Sequence[DependencyValue], value),
+ path,
+ )
+ if isinstance(value, (set, frozenset)):
+ return _collect_dependencies_from_set(
+ cast(AbstractSet[DependencyValue], value),
+ path,
+ )
+ if chz.is_chz(value):
+ dependencies: list[Furu] = []
+ for field in chz.chz_fields(value).values():
+ field_value = getattr(value, field.logical_name)
+ field_path = f"{path}.{field.logical_name}"
+ dependencies.extend(
+ _collect_dependencies_from_value_spec(field_value, field_path)
+ )
+ return dependencies
+ raise _dependency_type_error(path, value)
+
+
+ def _collect_dependencies_from_value_spec(
+ value: DependencyValue,
+ path: str,
+ ) -> list[Furu]:
+ if isinstance(value, Furu):
+ return [value]
+ if isinstance(value, dict):
+ return _collect_dependencies_from_mapping(
+ cast(Mapping[Hashable, DependencyValue], value),
+ path,
+ )
+ if isinstance(value, (list, tuple)):
+ return _collect_dependencies_from_sequence(
+ cast(Sequence[DependencyValue], value),
+ path,
+ )
+ if isinstance(value, (set, frozenset)):
+ return _collect_dependencies_from_set(
+ cast(AbstractSet[DependencyValue], value),
+ path,
+ )
+ raise _dependency_type_error(path, value)
+
+
+ def _collect_dependencies_from_mapping(
+ mapping: Mapping[Hashable, DependencyValue],
+ path: str,
+ ) -> list[Furu]:
+ dependencies: list[Furu] = []
+ for key, item in mapping.items():
+ if not isinstance(item, Furu):
+ raise _dependency_type_error(f"{path}[{key!r}]", item)
+ dependencies.append(item)
+ return dependencies
+
+
+ def _collect_dependencies_from_sequence(
+ sequence: Sequence[DependencyValue],
+ path: str,
+ ) -> list[Furu]:
+ dependencies: list[Furu] = []
+ for index, item in enumerate(sequence):
+ if not isinstance(item, Furu):
+ raise _dependency_type_error(f"{path}[{index}]", item)
+ dependencies.append(item)
+ return dependencies
+
+
+ def _collect_dependencies_from_set(
+ values: AbstractSet[DependencyValue],
+ path: str,
+ ) -> list[Furu]:
+ dependencies: list[Furu] = []
+ ordered = sorted(
+ list(cast(AbstractSet[DependencyScanValue], values)),
+ key=_dependency_sort_key,
+ )
+ for index, item in enumerate(ordered):
+ if not isinstance(item, Furu):
+ raise _dependency_type_error(f"{path}[{index}]", item)
+ dependencies.append(item)
+ return dependencies
+
+
+ def _sorted_dependency_set(
+ values: AbstractSet[DependencyScanValue],
+ ) -> list[DependencyScanValue]:
+ return sorted(list(values), key=_dependency_sort_key)
+
+
+ def _dependency_sort_key(value: DependencyScanValue) -> tuple[int, str]:
+ if isinstance(value, Furu):
+ return (0, value._furu_hash)
+ return (1, f"{type(value).__name__}:{value!r}")
+
+
+ def _dependency_type_error(
+ path: str,
+ value: DependencySpec | DependencyValue | DependencyScanValue,
+ ) -> TypeError:
+ return TypeError(
+ f"{path} must be a Furu instance or a collection of Furu instances; "
+ f"got {type(value).__name__}"
+ )
+
+
  _H = TypeVar("_H", bound=Furu, covariant=True)
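These module-level helpers define what `_dependencies()` may return: a single `Furu`, a sequence, a set (ordered deterministically via `_dependency_sort_key` before use), a mapping, or a chz object whose fields are scanned; anything else raises the `TypeError` built by `_dependency_type_error`. An illustrative override showing the accepted shapes (class and field names are hypothetical):

    class Evaluate(Furu[dict]):                   # hypothetical
        model: "TrainModel"                        # chz field, found by _direct_dependencies()

        def _dependencies(self) -> "DependencySpec | None":
            # Each of the following shapes would be accepted:
            # return self.model                     # single Furu
            # return [self.model]                   # sequence of Furu
            # return {"model": self.model}          # mapping of str -> Furu
            return {self.model}                     # set of Furu (deterministically ordered)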