furu 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
furu/core/furu.py CHANGED
@@ -5,9 +5,11 @@ import os
5
5
  import signal
6
6
  import socket
7
7
  import sys
8
+ import threading
8
9
  import time
9
10
  import traceback
10
11
  from abc import ABC, abstractmethod
12
+ from functools import cached_property
11
13
  from pathlib import Path
12
14
  from types import FrameType
13
15
  from typing import (
@@ -20,18 +22,16 @@ from typing import (
20
22
  Protocol,
21
23
  Self,
22
24
  Sequence,
23
- TypedDict,
24
25
  TypeAlias,
26
+ TypedDict,
25
27
  TypeVar,
26
28
  cast,
27
- overload,
28
29
  )
29
30
 
30
31
  import chz
31
32
  import submitit
32
- from typing_extensions import dataclass_transform
33
-
34
33
  from chz.field import Field as ChzField
34
+ from typing_extensions import dataclass_transform
35
35
 
36
36
  from ..adapters import SubmititAdapter
37
37
  from ..adapters.submitit import SubmititJob
@@ -40,6 +40,7 @@ from ..errors import (
40
40
  MISSING,
41
41
  FuruComputeError,
42
42
  FuruLockNotAcquired,
43
+ FuruValidationError,
43
44
  FuruWaitTimeout,
44
45
  )
45
46
  from ..runtime import current_holder
@@ -62,7 +63,6 @@ from ..storage.state import (
62
63
  _StateAttemptRunning,
63
64
  _StateResultAbsent,
64
65
  _StateResultFailed,
65
- _StateResultMigrated,
66
66
  _StateResultSuccess,
67
67
  compute_lock,
68
68
  )
@@ -190,16 +190,23 @@ class Furu[T](ABC):
190
190
  raise NotImplementedError(f"{self.__class__.__name__}._load() not implemented")
191
191
 
192
192
  def _validate(self: Self) -> bool:
193
- """Validate that result is complete and correct (override if needed)."""
193
+ """
194
+ Validate that result is complete and correct (override if needed).
195
+
196
+ Return False or raise FuruValidationError to mark artifacts as invalid.
197
+ """
194
198
  return True
195
199
 
196
200
  def _dependencies(self: Self) -> "DependencySpec | None":
197
201
  """Return extra dependencies not captured by fields."""
198
202
  return None
199
203
 
204
+ def _executor_spec_key(self: Self) -> str:
205
+ return "default"
206
+
200
207
  def _get_dependencies(self: Self, *, recursive: bool = True) -> list["Furu"]:
201
208
  """Collect Furu dependencies from fields and `_dependencies()`."""
202
- seen = {self._furu_hash}
209
+ seen = {self.furu_hash}
203
210
  dependencies: list[Furu] = []
204
211
  _collect_dependencies(self, dependencies, seen, recursive=recursive)
205
212
  return dependencies
@@ -213,7 +220,7 @@ class Furu[T](ABC):
213
220
  for dependency in dependencies:
214
221
  if dependency is self:
215
222
  raise ValueError("Furu dependencies cannot include self")
216
- digests.add(dependency._furu_hash)
223
+ digests.add(dependency.furu_hash)
217
224
  return sorted(digests)
218
225
 
219
226
  def _invalidate_cached_success(self: Self, directory: Path, *, reason: str) -> None:
@@ -221,7 +228,7 @@ class Furu[T](ABC):
221
228
  logger.warning(
222
229
  "invalidate %s %s %s (%s)",
223
230
  self.__class__.__name__,
224
- self._furu_hash,
231
+ self.furu_hash,
225
232
  directory,
226
233
  reason,
227
234
  )
@@ -238,9 +245,25 @@ class Furu[T](ABC):
238
245
  directory, {"type": "result_invalidated", "reason": reason, "at": now}
239
246
  )
240
247
 
241
- @property
242
- def _furu_hash(self: Self) -> str:
243
- """Compute hash of this object's content for storage identification."""
248
+ def _prepare_executor_rerun(self: Self, directory: Path) -> None:
249
+ if not self._always_rerun():
250
+ return
251
+ if not directory.exists():
252
+ return
253
+ migration = self._alias_record(directory)
254
+ if migration is not None and self._alias_is_active(directory, migration):
255
+ self._maybe_detach_alias(
256
+ directory=directory,
257
+ record=migration,
258
+ reason="always_rerun",
259
+ )
260
+ state = StateManager.read_state(directory)
261
+ if isinstance(state.result, _StateResultSuccess):
262
+ self._invalidate_cached_success(directory, reason="always_rerun enabled")
263
+
264
+ @cached_property
265
+ def furu_hash(self: Self) -> str:
266
+ """Return the stable content hash for this Furu object."""
244
267
  return FuruSerializer.compute_hash(self)
245
268
 
246
269
  def _always_rerun(self: Self) -> bool:
@@ -253,15 +276,17 @@ class Furu[T](ABC):
253
276
 
254
277
  def _base_furu_dir(self: Self) -> Path:
255
278
  root = FURU_CONFIG.get_root(self.version_controlled)
256
- return root / self.__class__._namespace() / self._furu_hash
279
+ return root / self.__class__._namespace() / self.furu_hash
257
280
 
258
- @property
281
+ @cached_property
259
282
  def furu_dir(self: Self) -> Path:
260
283
  """Get the directory for this Furu object."""
261
284
  directory = self._base_furu_dir()
262
285
  migration = self._alias_record(directory)
263
- if migration is not None and self._alias_is_active(directory, migration):
264
- return MigrationManager.resolve_dir(migration, target="from")
286
+ if migration is not None:
287
+ target_dir = self._alias_target_dir(directory, migration)
288
+ if target_dir is not None:
289
+ return target_dir
265
290
  return directory
266
291
 
267
292
  @property
@@ -290,13 +315,38 @@ class Furu[T](ABC):
290
315
  """Log a message to the current holder's `furu.log`."""
291
316
  return log(message, level=level)
292
317
 
318
+ def _exists_quiet(self: Self) -> bool:
319
+ directory = self._base_furu_dir()
320
+ success_dir = self._success_marker_dir(directory)
321
+ if success_dir is None:
322
+ return False
323
+ try:
324
+ return self._validate()
325
+ except FuruValidationError as exc:
326
+ logger = get_logger()
327
+ logger.warning(
328
+ "exists %s -> false (validate invalid for %s: %s)",
329
+ directory,
330
+ f"{self.__class__.__name__}({self.furu_hash})",
331
+ exc,
332
+ )
333
+ return False
334
+ except Exception as exc:
335
+ logger = get_logger()
336
+ logger.exception(
337
+ "exists %s -> false (validate crashed for %s: %s)",
338
+ directory,
339
+ f"{self.__class__.__name__}({self.furu_hash})",
340
+ exc,
341
+ )
342
+ return False
343
+
293
344
  def exists(self: Self) -> bool:
294
345
  """Check if result exists and is valid."""
295
346
  logger = get_logger()
296
347
  directory = self._base_furu_dir()
297
- state = self.get_state(directory)
298
-
299
- if not isinstance(state.result, _StateResultSuccess):
348
+ success_dir = self._success_marker_dir(directory)
349
+ if success_dir is None:
300
350
  logger.info("exists %s -> false", directory)
301
351
  return False
302
352
 
@@ -313,53 +363,139 @@ class Furu[T](ABC):
313
363
  """Get migration record for this object."""
314
364
  return MigrationManager.read_migration(self._base_furu_dir())
315
365
 
316
- @overload
317
- def load_or_create(
318
- self,
319
- executor: submitit.Executor,
320
- *,
321
- retry_failed: bool | None = None,
322
- ) -> T | submitit.Job[T]: ...
323
-
324
- @overload
325
- def load_or_create(
326
- self,
327
- executor: None = None,
328
- *,
329
- retry_failed: bool | None = None,
330
- ) -> T: ...
331
-
332
- def load_or_create(
333
- self: Self,
334
- executor: submitit.Executor | None = None,
335
- *,
336
- retry_failed: bool | None = None,
337
- ) -> T | submitit.Job[T]:
366
+ def get(self: Self, *, force: bool = False) -> T:
338
367
  """
339
368
  Load result if it exists, computing if necessary.
340
369
 
341
370
  Args:
342
- executor: Optional executor for batch submission (e.g., submitit.Executor)
343
- retry_failed: Whether to retry failed results (default uses FURU_RETRY_FAILED)
371
+ force: Allow computation inside executor contexts if the spec matches.
344
372
 
345
373
  Returns:
346
- Result if wait=True, job handle if wait=False, or None if already exists
374
+ Loaded or computed result.
347
375
 
348
376
  Raises:
349
377
  FuruComputeError: If computation fails with detailed error information
350
378
  """
379
+ from furu.errors import (
380
+ FuruExecutionError,
381
+ FuruMissingArtifact,
382
+ FuruSpecMismatch,
383
+ )
384
+ from furu.execution.context import EXEC_CONTEXT
385
+
386
+ ctx = EXEC_CONTEXT.get()
387
+ if ctx.mode == "executor":
388
+ logger = get_logger()
389
+ parent_holder = current_holder()
390
+ has_parent = parent_holder is not None and parent_holder is not self
391
+ needs_holder = parent_holder is None or has_parent
392
+ caller_info: _CallerInfo = {}
393
+ if has_parent:
394
+ caller_info = self._get_caller_info()
395
+
396
+ def _executor_get() -> T:
397
+ directory = self._base_furu_dir()
398
+ if force:
399
+ if (
400
+ ctx.current_node_hash is None
401
+ or self.furu_hash != ctx.current_node_hash
402
+ ):
403
+ raise FuruExecutionError(
404
+ "force=True not allowed: only the current node may compute in executor mode. "
405
+ f"current_node_hash={ctx.current_node_hash!r} "
406
+ f"obj={self.__class__.__name__}({self.furu_hash})",
407
+ hints=[
408
+ "Declare this object as a dependency instead of calling dep.get(force=True).",
409
+ "Inside executor mode, use get(force=True) only on the node being executed.",
410
+ ],
411
+ )
412
+ self._prepare_executor_rerun(directory)
413
+
414
+ exists_ok = self._exists_quiet()
415
+ if exists_ok and not (force and self._always_rerun()):
416
+ return self._load()
417
+
418
+ if force and not exists_ok:
419
+ state = self.get_state(directory)
420
+ if isinstance(state.result, _StateResultSuccess):
421
+ self._invalidate_cached_success(
422
+ directory, reason="_validate returned false (executor)"
423
+ )
424
+
425
+ if not force:
426
+ raise FuruMissingArtifact(
427
+ "Missing artifact "
428
+ f"{self.__class__.__name__}({self.furu_hash}) in executor mode. "
429
+ f"Requested by {ctx.current_node_hash}. Declare it as a dependency."
430
+ )
431
+
432
+ required = self._executor_spec_key()
433
+ if ctx.spec_key is None or required != ctx.spec_key:
434
+ raise FuruSpecMismatch(
435
+ "force=True not allowed: "
436
+ f"required={required!r} != worker={ctx.spec_key!r} (v1 exact match)"
437
+ )
438
+
439
+ StateManager.ensure_internal_dir(directory)
440
+ status, created_here, result = self._run_locally(
441
+ start_time=time.time(),
442
+ allow_failed=FURU_CONFIG.retry_failed,
443
+ executor_mode=True,
444
+ )
445
+ if status == "success":
446
+ if created_here:
447
+ return cast(T, result)
448
+ return self._load()
449
+
450
+ raise self._build_failed_state_error(
451
+ self._base_furu_dir(),
452
+ None,
453
+ message="Computation previously failed",
454
+ )
455
+
456
+ if has_parent:
457
+ logger.debug(
458
+ "dep: begin %s %s %s",
459
+ self.__class__.__name__,
460
+ self.furu_hash,
461
+ self._base_furu_dir(),
462
+ extra=caller_info,
463
+ )
464
+
465
+ ok = False
466
+ try:
467
+ if needs_holder:
468
+ with enter_holder(self):
469
+ result = _executor_get()
470
+ else:
471
+ result = _executor_get()
472
+ ok = True
473
+ return result
474
+ finally:
475
+ if has_parent:
476
+ logger.debug(
477
+ "dep: end %s %s (%s)",
478
+ self.__class__.__name__,
479
+ self.furu_hash,
480
+ "ok" if ok else "error",
481
+ extra=caller_info,
482
+ )
483
+
484
+ return self._get_impl_interactive(force=force)
485
+
486
+ def _get_impl_interactive(self: Self, *, force: bool) -> T:
351
487
  logger = get_logger()
352
488
  parent_holder = current_holder()
353
489
  has_parent = parent_holder is not None and parent_holder is not self
354
- retry_failed_effective = (
355
- retry_failed if retry_failed is not None else FURU_CONFIG.retry_failed
356
- )
490
+ caller_info = self._get_caller_info()
491
+ retry_failed_effective = FURU_CONFIG.retry_failed
357
492
  if has_parent:
358
493
  logger.debug(
359
494
  "dep: begin %s %s %s",
360
495
  self.__class__.__name__,
361
- self._furu_hash,
496
+ self.furu_hash,
362
497
  self._base_furu_dir(),
498
+ extra=caller_info,
363
499
  )
364
500
 
365
501
  ok = False
@@ -367,19 +503,21 @@ class Furu[T](ABC):
367
503
  with enter_holder(self):
368
504
  start_time = time.time()
369
505
  base_dir = self._base_furu_dir()
370
- base_dir.mkdir(parents=True, exist_ok=True)
371
506
  directory = base_dir
372
507
  migration = self._alias_record(base_dir)
373
508
  alias_active = False
509
+ base_marker = StateManager.success_marker_exists(base_dir)
374
510
 
375
511
  if (
376
512
  migration is not None
377
513
  and migration.kind == "alias"
378
514
  and migration.overwritten_at is None
515
+ and not base_marker
379
516
  ):
380
- target_dir = MigrationManager.resolve_dir(migration, target="from")
381
- target_state = StateManager.read_state(target_dir)
382
- if isinstance(target_state.result, _StateResultSuccess):
517
+ target_dir = self._alias_target_dir(
518
+ base_dir, migration, base_marker=base_marker
519
+ )
520
+ if target_dir is not None:
383
521
  alias_active = True
384
522
  directory = target_dir
385
523
  else:
@@ -445,7 +583,6 @@ class Furu[T](ABC):
445
583
  message="Computation previously failed",
446
584
  )
447
585
 
448
- needs_reconcile = True
449
586
  if isinstance(state0.result, _StateResultSuccess):
450
587
  # Double check logic if we fell through to here (e.g. race condition or invalidation above)
451
588
  if self._always_rerun():
@@ -460,9 +597,6 @@ class Furu[T](ABC):
460
597
  directory, reason="_validate returned false"
461
598
  )
462
599
  state0 = StateManager.read_state(directory)
463
- else:
464
- # Valid success found, skip reconcile
465
- needs_reconcile = False
466
600
  except Exception as e:
467
601
  self._invalidate_cached_success(
468
602
  directory,
@@ -470,11 +604,6 @@ class Furu[T](ABC):
470
604
  )
471
605
  state0 = StateManager.read_state(directory)
472
606
 
473
- if needs_reconcile and executor is not None:
474
- adapter0 = SubmititAdapter(executor)
475
- self._reconcile(directory, adapter=adapter0)
476
- state0 = StateManager.read_state(directory)
477
-
478
607
  attempt0 = state0.attempt
479
608
  if isinstance(state0.result, _StateResultSuccess):
480
609
  decision = "success->load"
@@ -489,17 +618,25 @@ class Furu[T](ABC):
489
618
  # Cache hits can be extremely noisy in pipelines; keep logs for state
490
619
  # transitions (create/wait) and error cases, but suppress repeated
491
620
  # "success->load" lines and the raw separator on successful loads.
492
- self._log_console_start(action_color=action_color)
621
+ self._log_console_start(
622
+ action_color=action_color,
623
+ caller_info=caller_info,
624
+ )
493
625
 
494
626
  if decision != "success->load":
627
+ if decision == "create":
628
+ StateManager.ensure_internal_dir(directory)
495
629
  write_separator()
496
630
  logger.debug(
497
- "load_or_create %s %s %s (%s)",
631
+ "get %s %s %s (%s)",
498
632
  self.__class__.__name__,
499
- self._furu_hash,
633
+ self.furu_hash,
500
634
  directory,
501
635
  decision,
502
- extra={"furu_action_color": action_color},
636
+ extra={
637
+ "furu_action_color": action_color,
638
+ **caller_info,
639
+ },
503
640
  )
504
641
 
505
642
  # Fast path: already successful
@@ -514,9 +651,9 @@ class Furu[T](ABC):
514
651
  # failures even when we suppressed the cache-hit header line.
515
652
  write_separator()
516
653
  logger.error(
517
- "load_or_create %s %s (load failed)",
654
+ "get %s %s (load failed)",
518
655
  self.__class__.__name__,
519
- self._furu_hash,
656
+ self.furu_hash,
520
657
  )
521
658
  raise FuruComputeError(
522
659
  f"Failed to load result from {directory}",
@@ -524,65 +661,43 @@ class Furu[T](ABC):
524
661
  e,
525
662
  ) from e
526
663
 
527
- # Synchronous execution
528
- if executor is None:
529
- status, created_here, result = self._run_locally(
530
- start_time=start_time,
531
- allow_failed=retry_failed_effective,
532
- )
533
- if status == "success":
534
- ok = True
535
- if created_here:
536
- logger.debug(
537
- "load_or_create: %s created -> return",
538
- self.__class__.__name__,
539
- )
540
- return cast(T, result)
664
+ status, created_here, result = self._run_locally(
665
+ start_time=start_time,
666
+ allow_failed=retry_failed_effective,
667
+ executor_mode=False,
668
+ )
669
+ if status == "success":
670
+ ok = True
671
+ if created_here:
541
672
  logger.debug(
542
- "load_or_create: %s success -> _load()",
673
+ "get: %s created -> return",
543
674
  self.__class__.__name__,
544
675
  )
545
- return self._load()
546
-
547
- raise self._build_failed_state_error(
548
- directory,
549
- None,
550
- message="Computation previously failed",
676
+ return cast(T, result)
677
+ logger.debug(
678
+ "get: %s success -> _load()",
679
+ self.__class__.__name__,
551
680
  )
681
+ return self._load()
552
682
 
553
- # Asynchronous execution with submitit
554
- (submitit_folder := self._base_furu_dir() / "submitit").mkdir(
555
- exist_ok=True, parents=True
556
- )
557
- executor.folder = submitit_folder
558
- adapter = SubmititAdapter(executor)
559
-
560
- logger.debug(
561
- "load_or_create: %s -> submitit submit_once()",
562
- self.__class__.__name__,
563
- )
564
- job = self._submit_once(
565
- adapter,
683
+ raise self._build_failed_state_error(
566
684
  directory,
567
685
  None,
568
- allow_failed=retry_failed_effective,
686
+ message="Computation previously failed",
569
687
  )
570
- ok = True
571
- return cast(submitit.Job[T], job)
572
688
  finally:
573
689
  if has_parent:
574
690
  logger.debug(
575
691
  "dep: end %s %s (%s)",
576
692
  self.__class__.__name__,
577
- self._furu_hash,
693
+ self.furu_hash,
578
694
  "ok" if ok else "error",
695
+ extra=caller_info,
579
696
  )
580
697
 
581
- def _log_console_start(self, action_color: str) -> None:
582
- """Log the start of load_or_create to console with caller info."""
583
- logger = get_logger()
698
+ @staticmethod
699
+ def _get_caller_info() -> _CallerInfo:
584
700
  frame = sys._getframe(1)
585
-
586
701
  caller_info: _CallerInfo = {}
587
702
  if frame is not None:
588
703
  # Walk up the stack to find the caller outside of furu package
@@ -597,11 +712,20 @@ class Furu[T](ABC):
597
712
  }
598
713
  break
599
714
  frame = frame.f_back
715
+ return caller_info
716
+
717
+ def _log_console_start(
718
+ self, action_color: str, caller_info: _CallerInfo | None = None
719
+ ) -> None:
720
+ """Log the start of get to console with caller info."""
721
+ logger = get_logger()
722
+ if caller_info is None:
723
+ caller_info = self._get_caller_info()
600
724
 
601
725
  logger.info(
602
- "load_or_create %s %s",
726
+ "get %s %s",
603
727
  self.__class__.__name__,
604
- self._furu_hash,
728
+ self.furu_hash,
605
729
  extra={
606
730
  "furu_console_only": True,
607
731
  "furu_action_color": action_color,
@@ -612,20 +736,14 @@ class Furu[T](ABC):
612
736
  def _add_exception_breadcrumbs(self, exc: BaseException, directory: Path) -> None:
613
737
  if not hasattr(exc, "add_note"):
614
738
  return
615
- state_path = StateManager.get_state_path(directory)
616
- log_path = StateManager.get_internal_dir(directory) / "furu.log"
617
- note = (
618
- f"Furu directory: {directory}\n"
619
- f"State file: {state_path}\n"
620
- f"Log file: {log_path}"
621
- )
739
+ note = f"Furu dir: {directory}"
622
740
  exc.add_note(note)
623
741
 
624
742
  @staticmethod
625
743
  def _failed_state_hints() -> list[str]:
626
744
  return [
627
- "To retry this failed artifact: set FURU_RETRY_FAILED=1 or call load_or_create(retry_failed=True).",
628
- "To inspect details: open the state file and furu.log shown above.",
745
+ "To retry this failed artifact: set FURU_RETRY_FAILED=1 or call get() again.",
746
+ "To inspect details: open the furu dir shown above.",
629
747
  ]
630
748
 
631
749
  def _build_failed_state_error(
@@ -678,9 +796,11 @@ class Furu[T](ABC):
678
796
  """Return the alias-aware state for this Furu directory."""
679
797
  base_dir = directory or self._base_furu_dir()
680
798
  record = self._alias_record(base_dir)
681
- if record is None or not self._alias_is_active(base_dir, record):
799
+ if record is None:
800
+ return StateManager.read_state(base_dir)
801
+ target_dir = self._alias_target_dir(base_dir, record)
802
+ if target_dir is None:
682
803
  return StateManager.read_state(base_dir)
683
- target_dir = MigrationManager.resolve_dir(record, target="from")
684
804
  return StateManager.read_state(target_dir)
685
805
 
686
806
  def _alias_record(self, directory: Path) -> MigrationRecord | None:
@@ -689,15 +809,36 @@ class Furu[T](ABC):
689
809
  return None
690
810
  return record
691
811
 
692
- def _alias_is_active(self, directory: Path, record: MigrationRecord) -> bool:
812
+ def _alias_target_dir(
813
+ self,
814
+ directory: Path,
815
+ record: MigrationRecord,
816
+ *,
817
+ base_marker: bool | None = None,
818
+ ) -> Path | None:
693
819
  if record.overwritten_at is not None:
694
- return False
695
- state = StateManager.read_state(directory)
696
- if not isinstance(state.result, _StateResultMigrated):
697
- return False
820
+ return None
821
+ if base_marker is None:
822
+ base_marker = StateManager.success_marker_exists(directory)
823
+ if base_marker:
824
+ return None
698
825
  target = MigrationManager.resolve_dir(record, target="from")
699
- target_state = StateManager.read_state(target)
700
- return isinstance(target_state.result, _StateResultSuccess)
826
+ if StateManager.success_marker_exists(target):
827
+ return target
828
+ return None
829
+
830
+ def _success_marker_dir(self, directory: Path) -> Path | None:
831
+ base_marker = StateManager.success_marker_exists(directory)
832
+ record = self._alias_record(directory)
833
+ if record is None:
834
+ return directory if base_marker else None
835
+ target_dir = self._alias_target_dir(directory, record, base_marker=base_marker)
836
+ if target_dir is not None:
837
+ return target_dir
838
+ return directory if base_marker else None
839
+
840
+ def _alias_is_active(self, directory: Path, record: MigrationRecord) -> bool:
841
+ return self._alias_target_dir(directory, record) is not None
701
842
 
702
843
  def _maybe_detach_alias(
703
844
  self: Self,
@@ -738,6 +879,7 @@ class Furu[T](ABC):
738
879
  ) -> SubmititJob | None:
739
880
  """Submit job once without waiting (fire-and-forget mode)."""
740
881
  logger = get_logger()
882
+ StateManager.ensure_internal_dir(directory)
741
883
  self._reconcile(directory, adapter=adapter)
742
884
  state = StateManager.read_state(directory)
743
885
  attempt = state.attempt
@@ -758,7 +900,7 @@ class Furu[T](ABC):
758
900
  logger.debug(
759
901
  "submit: waiting for submit lock %s %s %s",
760
902
  self.__class__.__name__,
761
- self._furu_hash,
903
+ self.furu_hash,
762
904
  directory,
763
905
  )
764
906
  time.sleep(0.5)
@@ -767,9 +909,7 @@ class Furu[T](ABC):
767
909
  attempt_id: str | None = None
768
910
  try:
769
911
  # Create metadata
770
- metadata = MetadataManager.create_metadata(
771
- self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
772
- )
912
+ metadata = MetadataManager.create_metadata(self, directory)
773
913
  MetadataManager.write_metadata(metadata, directory)
774
914
 
775
915
  env_info = MetadataManager.collect_environment_info()
@@ -843,127 +983,197 @@ class Furu[T](ABC):
843
983
  """Entry point for worker process (called by submitit or locally)."""
844
984
  with enter_holder(self):
845
985
  logger = get_logger()
846
- directory = self._base_furu_dir()
847
- directory.mkdir(parents=True, exist_ok=True)
848
-
849
- env_info = self._collect_submitit_env()
850
- allow_failed_effective = (
851
- allow_failed if allow_failed is not None else FURU_CONFIG.retry_failed
986
+ # Ensure executor semantics apply to *all* work in the worker, not
987
+ # just `_create()`. This prevents accidental dependency computation
988
+ # (e.g., from within `_validate()` or metadata hooks).
989
+ from furu.execution.context import EXEC_CONTEXT, ExecContext
990
+
991
+ exec_token = EXEC_CONTEXT.set(
992
+ ExecContext(
993
+ mode="executor",
994
+ spec_key=self._executor_spec_key(),
995
+ backend="submitit",
996
+ current_node_hash=self.furu_hash,
997
+ )
852
998
  )
853
-
854
999
  try:
855
- with compute_lock(
856
- directory,
857
- backend="submitit",
858
- lease_duration_sec=FURU_CONFIG.lease_duration_sec,
859
- heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
860
- owner={
861
- "pid": os.getpid(),
862
- "host": socket.gethostname(),
863
- "user": getpass.getuser(),
864
- "command": " ".join(sys.argv) if sys.argv else "<unknown>",
865
- },
866
- scheduler={
867
- "backend": env_info.get("backend"),
868
- "job_id": env_info.get("slurm_job_id"),
869
- },
870
- max_wait_time_sec=None, # Workers wait indefinitely
871
- poll_interval_sec=FURU_CONFIG.poll_interval,
872
- wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
873
- reconcile_fn=lambda d: self._reconcile(d),
874
- allow_failed=allow_failed_effective,
875
- ) as ctx:
876
- stage = "metadata"
877
- try:
878
- # Refresh metadata (now safe - attempt is already recorded)
879
- metadata = MetadataManager.create_metadata(
880
- self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
881
- )
882
- MetadataManager.write_metadata(metadata, directory)
1000
+ directory = self._base_furu_dir()
1001
+ StateManager.ensure_internal_dir(directory)
1002
+ always_rerun = self._always_rerun()
1003
+ needs_success_invalidation = False
1004
+ if not always_rerun:
1005
+ exists_ok = self._exists_quiet()
1006
+ if not exists_ok:
1007
+ state = self.get_state(directory)
1008
+ if isinstance(state.result, _StateResultSuccess):
1009
+ needs_success_invalidation = True
1010
+
1011
+ env_info = self._collect_submitit_env()
1012
+ allow_failed_effective = (
1013
+ allow_failed
1014
+ if allow_failed is not None
1015
+ else FURU_CONFIG.retry_failed
1016
+ )
1017
+ allow_success = always_rerun or needs_success_invalidation
883
1018
 
884
- # Set up signal handlers
885
- stage = "signal handler setup"
886
- self._setup_signal_handlers(
887
- directory, ctx.stop_heartbeat, attempt_id=ctx.attempt_id
888
- )
1019
+ try:
1020
+ with compute_lock(
1021
+ directory,
1022
+ backend="submitit",
1023
+ lease_duration_sec=FURU_CONFIG.lease_duration_sec,
1024
+ heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
1025
+ owner={
1026
+ "pid": os.getpid(),
1027
+ "host": socket.gethostname(),
1028
+ "user": getpass.getuser(),
1029
+ "command": " ".join(sys.argv) if sys.argv else "<unknown>",
1030
+ },
1031
+ scheduler={
1032
+ "backend": env_info.get("backend"),
1033
+ "job_id": env_info.get("slurm_job_id"),
1034
+ },
1035
+ max_wait_time_sec=None, # Workers wait indefinitely
1036
+ poll_interval_sec=FURU_CONFIG.poll_interval,
1037
+ wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
1038
+ reconcile_fn=lambda d: self._reconcile(d),
1039
+ allow_failed=allow_failed_effective,
1040
+ allow_success=allow_success,
1041
+ ) as ctx:
1042
+ self._prepare_executor_rerun(directory)
1043
+ if not always_rerun:
1044
+ exists_ok = self._exists_quiet()
1045
+ if not exists_ok:
1046
+ state = self.get_state(directory)
1047
+ if isinstance(state.result, _StateResultSuccess):
1048
+ self._invalidate_cached_success(
1049
+ directory,
1050
+ reason="_validate returned false (worker)",
1051
+ )
1052
+
1053
+ stage = "metadata"
1054
+ try:
1055
+ # Refresh metadata (now safe - attempt is already recorded)
1056
+ metadata = MetadataManager.create_metadata(self, directory)
1057
+ MetadataManager.write_metadata(metadata, directory)
889
1058
 
890
- stage = "_create"
891
- # Run computation
892
- logger.debug(
893
- "_create: begin %s %s %s",
894
- self.__class__.__name__,
895
- self._furu_hash,
896
- directory,
897
- )
898
- self._create()
899
- logger.debug(
900
- "_create: ok %s %s %s",
901
- self.__class__.__name__,
902
- self._furu_hash,
903
- directory,
904
- )
905
- StateManager.write_success_marker(
906
- directory, attempt_id=ctx.attempt_id
907
- )
908
- StateManager.finish_attempt_success(
909
- directory, attempt_id=ctx.attempt_id
910
- )
911
- logger.info(
912
- "_create ok %s %s",
913
- self.__class__.__name__,
914
- self._furu_hash,
915
- extra={"furu_console_only": True},
916
- )
917
- except Exception as e:
918
- if stage == "_create":
919
- logger.error(
920
- "_create failed %s %s %s",
1059
+ # Set up signal handlers
1060
+ stage = "signal handler setup"
1061
+ self._setup_signal_handlers(
1062
+ directory,
1063
+ ctx.stop_heartbeat,
1064
+ attempt_id=ctx.attempt_id,
1065
+ )
1066
+
1067
+ stage = "_create"
1068
+ # Run computation
1069
+ logger.debug(
1070
+ "_create: begin %s %s %s",
921
1071
  self.__class__.__name__,
922
- self._furu_hash,
1072
+ self.furu_hash,
923
1073
  directory,
924
- extra={"furu_file_only": True},
925
1074
  )
926
- else:
927
- logger.error(
928
- "attempt failed (%s) %s %s %s",
929
- stage,
1075
+ self._create()
1076
+ logger.debug(
1077
+ "_create: ok %s %s %s",
930
1078
  self.__class__.__name__,
931
- self._furu_hash,
1079
+ self.furu_hash,
932
1080
  directory,
1081
+ )
1082
+ StateManager.write_success_marker(
1083
+ directory, attempt_id=ctx.attempt_id
1084
+ )
1085
+ StateManager.finish_attempt_success(
1086
+ directory, attempt_id=ctx.attempt_id
1087
+ )
1088
+ logger.info(
1089
+ "_create ok %s %s",
1090
+ self.__class__.__name__,
1091
+ self.furu_hash,
1092
+ extra={"furu_console_only": True},
1093
+ )
1094
+ except Exception as e:
1095
+ if stage == "_create":
1096
+ logger.error(
1097
+ "_create failed %s %s %s",
1098
+ self.__class__.__name__,
1099
+ self.furu_hash,
1100
+ directory,
1101
+ extra={"furu_file_only": True},
1102
+ )
1103
+ else:
1104
+ logger.error(
1105
+ "attempt failed (%s) %s %s %s",
1106
+ stage,
1107
+ self.__class__.__name__,
1108
+ self.furu_hash,
1109
+ directory,
1110
+ extra={"furu_file_only": True},
1111
+ )
1112
+ logger.error(
1113
+ "%s",
1114
+ format_traceback(e),
933
1115
  extra={"furu_file_only": True},
934
1116
  )
935
- logger.error(
936
- "%s", format_traceback(e), extra={"furu_file_only": True}
937
- )
938
1117
 
939
- tb = "".join(
940
- traceback.format_exception(type(e), e, e.__traceback__)
941
- )
942
- StateManager.finish_attempt_failed(
943
- directory,
944
- attempt_id=ctx.attempt_id,
945
- error={
946
- "type": type(e).__name__,
947
- "message": str(e),
948
- "traceback": tb,
949
- },
950
- )
951
- self._add_exception_breadcrumbs(e, directory)
952
- if stage != "_create":
953
- message = (
954
- "Failed to create metadata"
955
- if stage == "metadata"
956
- else "Failed to set up signal handlers"
1118
+ tb = "".join(
1119
+ traceback.format_exception(type(e), e, e.__traceback__)
957
1120
  )
958
- raise FuruComputeError(
959
- message,
960
- StateManager.get_state_path(directory),
961
- e,
962
- ) from e
963
- raise
964
- except FuruLockNotAcquired:
965
- # Experiment already completed (success or failed), nothing to do
966
- return
1121
+ StateManager.finish_attempt_failed(
1122
+ directory,
1123
+ attempt_id=ctx.attempt_id,
1124
+ error={
1125
+ "type": type(e).__name__,
1126
+ "message": str(e),
1127
+ "traceback": tb,
1128
+ },
1129
+ )
1130
+ self._add_exception_breadcrumbs(e, directory)
1131
+ if stage != "_create":
1132
+ message = (
1133
+ "Failed to create metadata"
1134
+ if stage == "metadata"
1135
+ else "Failed to set up signal handlers"
1136
+ )
1137
+ raise FuruComputeError(
1138
+ message,
1139
+ StateManager.get_state_path(directory),
1140
+ e,
1141
+ ) from e
1142
+ raise
1143
+ except FuruLockNotAcquired as exc:
1144
+ # Experiment already completed; succeed if success, fail if failed.
1145
+ state = StateManager.read_state(directory)
1146
+ state_path = StateManager.get_state_path(directory)
1147
+ attempt = state.attempt
1148
+ attempt_info = "no active attempt"
1149
+ if attempt is not None:
1150
+ attempt_info = (
1151
+ f"attempt {attempt.id} status {attempt.status} "
1152
+ f"backend {attempt.backend}"
1153
+ )
1154
+ hints = [
1155
+ f"Furu hash: {self.furu_hash}",
1156
+ f"Directory: {directory}",
1157
+ f"State file: {state_path}",
1158
+ f"Attempt: {attempt_info}",
1159
+ ]
1160
+ if isinstance(state.result, _StateResultSuccess):
1161
+ return
1162
+ if isinstance(state.result, _StateResultFailed):
1163
+ if allow_failed_effective:
1164
+ return
1165
+ raise FuruComputeError(
1166
+ "Worker refused to run: experiment already failed",
1167
+ state_path,
1168
+ exc,
1169
+ hints=hints,
1170
+ ) from exc
1171
+ raise FuruLockNotAcquired(
1172
+ "Worker refused to run: experiment already running elsewhere",
1173
+ hints=hints,
1174
+ ) from exc
1175
+ finally:
1176
+ EXEC_CONTEXT.reset(exec_token)
967
1177
 
968
1178
  def _collect_submitit_env(self: Self) -> _SubmititEnvInfo:
969
1179
  """Collect submitit/slurm environment information."""
@@ -994,6 +1204,7 @@ class Furu[T](ABC):
994
1204
  start_time: float,
995
1205
  *,
996
1206
  allow_failed: bool,
1207
+ executor_mode: bool = False,
997
1208
  ) -> tuple[str, bool, T | None]:
998
1209
  """Run computation locally, returning (status, created_here, result)."""
999
1210
  logger = get_logger()
@@ -1028,9 +1239,7 @@ class Furu[T](ABC):
1028
1239
  stage = "metadata"
1029
1240
  try:
1030
1241
  # Create metadata (now safe - attempt is already recorded)
1031
- metadata = MetadataManager.create_metadata(
1032
- self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
1033
- )
1242
+ metadata = MetadataManager.create_metadata(self, directory)
1034
1243
  MetadataManager.write_metadata(metadata, directory)
1035
1244
 
1036
1245
  # Set up preemption handler
@@ -1044,14 +1253,30 @@ class Furu[T](ABC):
1044
1253
  logger.debug(
1045
1254
  "_create: begin %s %s %s",
1046
1255
  self.__class__.__name__,
1047
- self._furu_hash,
1256
+ self.furu_hash,
1048
1257
  directory,
1049
1258
  )
1050
- result = self._create()
1259
+ token = None
1260
+ if executor_mode:
1261
+ from furu.execution.context import EXEC_CONTEXT, ExecContext
1262
+
1263
+ token = EXEC_CONTEXT.set(
1264
+ ExecContext(
1265
+ mode="executor",
1266
+ spec_key=self._executor_spec_key(),
1267
+ backend="local",
1268
+ current_node_hash=self.furu_hash,
1269
+ )
1270
+ )
1271
+ try:
1272
+ result = self._create()
1273
+ finally:
1274
+ if token is not None:
1275
+ EXEC_CONTEXT.reset(token)
1051
1276
  logger.debug(
1052
1277
  "_create: ok %s %s %s",
1053
1278
  self.__class__.__name__,
1054
- self._furu_hash,
1279
+ self.furu_hash,
1055
1280
  directory,
1056
1281
  )
1057
1282
  StateManager.write_success_marker(
@@ -1063,7 +1288,7 @@ class Furu[T](ABC):
1063
1288
  logger.info(
1064
1289
  "_create ok %s %s",
1065
1290
  self.__class__.__name__,
1066
- self._furu_hash,
1291
+ self.furu_hash,
1067
1292
  extra={"furu_console_only": True},
1068
1293
  )
1069
1294
  return "success", True, result
@@ -1072,7 +1297,7 @@ class Furu[T](ABC):
1072
1297
  logger.error(
1073
1298
  "_create failed %s %s %s",
1074
1299
  self.__class__.__name__,
1075
- self._furu_hash,
1300
+ self.furu_hash,
1076
1301
  directory,
1077
1302
  extra={"furu_file_only": True},
1078
1303
  )
@@ -1081,7 +1306,7 @@ class Furu[T](ABC):
1081
1306
  "attempt failed (%s) %s %s %s",
1082
1307
  stage,
1083
1308
  self.__class__.__name__,
1084
- self._furu_hash,
1309
+ self.furu_hash,
1085
1310
  directory,
1086
1311
  extra={"furu_file_only": True},
1087
1312
  )
@@ -1145,6 +1370,8 @@ class Furu[T](ABC):
1145
1370
  attempt_id: str,
1146
1371
  ) -> None:
1147
1372
  """Set up signal handlers for graceful preemption."""
1373
+ if threading.current_thread() is not threading.main_thread():
1374
+ return
1148
1375
 
1149
1376
  def handle_signal(signum: int, frame: FrameType | None) -> None:
1150
1377
  try:
@@ -1191,7 +1418,7 @@ def _collect_dependencies(
1191
1418
  recursive: bool,
1192
1419
  ) -> None:
1193
1420
  for dependency in _direct_dependencies(obj):
1194
- digest = dependency._furu_hash
1421
+ digest = dependency.furu_hash
1195
1422
  if digest in seen:
1196
1423
  continue
1197
1424
  seen.add(digest)
@@ -1345,7 +1572,7 @@ def _sorted_dependency_set(
1345
1572
 
1346
1573
  def _dependency_sort_key(value: DependencyScanValue) -> tuple[int, str]:
1347
1574
  if isinstance(value, Furu):
1348
- return (0, value._furu_hash)
1575
+ return (0, cast(str, value.furu_hash))
1349
1576
  return (1, f"{type(value).__name__}:{value!r}")
1350
1577
 
1351
1578