furu 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +13 -1
- furu/core/furu.py +355 -196
- furu/core/list.py +1 -1
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +184 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +238 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +271 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/runtime/logging.py +10 -10
- furu/storage/state.py +34 -6
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/METADATA +74 -37
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/RECORD +24 -14
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/WHEEL +0 -0
- {furu-0.0.3.dist-info → furu-0.0.4.dist-info}/entry_points.txt +0 -0
furu/core/furu.py
CHANGED
|
@@ -5,6 +5,7 @@ import os
|
|
|
5
5
|
import signal
|
|
6
6
|
import socket
|
|
7
7
|
import sys
|
|
8
|
+
import threading
|
|
8
9
|
import time
|
|
9
10
|
import traceback
|
|
10
11
|
from abc import ABC, abstractmethod
|
|
@@ -24,7 +25,6 @@ from typing import (
|
|
|
24
25
|
TypeAlias,
|
|
25
26
|
TypeVar,
|
|
26
27
|
cast,
|
|
27
|
-
overload,
|
|
28
28
|
)
|
|
29
29
|
|
|
30
30
|
import chz
|
|
@@ -40,6 +40,7 @@ from ..errors import (
|
|
|
40
40
|
MISSING,
|
|
41
41
|
FuruComputeError,
|
|
42
42
|
FuruLockNotAcquired,
|
|
43
|
+
FuruValidationError,
|
|
43
44
|
FuruWaitTimeout,
|
|
44
45
|
)
|
|
45
46
|
from ..runtime import current_holder
|
|
@@ -190,13 +191,20 @@ class Furu[T](ABC):
|
|
|
190
191
|
raise NotImplementedError(f"{self.__class__.__name__}._load() not implemented")
|
|
191
192
|
|
|
192
193
|
def _validate(self: Self) -> bool:
|
|
193
|
-
"""
|
|
194
|
+
"""
|
|
195
|
+
Validate that result is complete and correct (override if needed).
|
|
196
|
+
|
|
197
|
+
Return False or raise FuruValidationError to mark artifacts as invalid.
|
|
198
|
+
"""
|
|
194
199
|
return True
|
|
195
200
|
|
|
196
201
|
def _dependencies(self: Self) -> "DependencySpec | None":
|
|
197
202
|
"""Return extra dependencies not captured by fields."""
|
|
198
203
|
return None
|
|
199
204
|
|
|
205
|
+
def _executor_spec_key(self: Self) -> str:
|
|
206
|
+
return "default"
|
|
207
|
+
|
|
200
208
|
def _get_dependencies(self: Self, *, recursive: bool = True) -> list["Furu"]:
|
|
201
209
|
"""Collect Furu dependencies from fields and `_dependencies()`."""
|
|
202
210
|
seen = {self._furu_hash}
|
|
@@ -238,6 +246,27 @@ class Furu[T](ABC):
|
|
|
238
246
|
directory, {"type": "result_invalidated", "reason": reason, "at": now}
|
|
239
247
|
)
|
|
240
248
|
|
|
249
|
+
def _prepare_executor_rerun(self: Self, directory: Path) -> None:
|
|
250
|
+
if not self._always_rerun():
|
|
251
|
+
return
|
|
252
|
+
if not directory.exists():
|
|
253
|
+
return
|
|
254
|
+
migration = self._alias_record(directory)
|
|
255
|
+
if migration is not None and self._alias_is_active(directory, migration):
|
|
256
|
+
self._maybe_detach_alias(
|
|
257
|
+
directory=directory,
|
|
258
|
+
record=migration,
|
|
259
|
+
reason="always_rerun",
|
|
260
|
+
)
|
|
261
|
+
state = StateManager.read_state(directory)
|
|
262
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
263
|
+
self._invalidate_cached_success(directory, reason="always_rerun enabled")
|
|
264
|
+
|
|
265
|
+
@property
|
|
266
|
+
def furu_hash(self: Self) -> str:
|
|
267
|
+
"""Return the stable content hash for this Furu object."""
|
|
268
|
+
return self._furu_hash
|
|
269
|
+
|
|
241
270
|
@property
|
|
242
271
|
def _furu_hash(self: Self) -> str:
|
|
243
272
|
"""Compute hash of this object's content for storage identification."""
|
|
@@ -290,6 +319,33 @@ class Furu[T](ABC):
|
|
|
290
319
|
"""Log a message to the current holder's `furu.log`."""
|
|
291
320
|
return log(message, level=level)
|
|
292
321
|
|
|
322
|
+
def _exists_quiet(self: Self) -> bool:
|
|
323
|
+
directory = self._base_furu_dir()
|
|
324
|
+
state = self.get_state(directory)
|
|
325
|
+
|
|
326
|
+
if not isinstance(state.result, _StateResultSuccess):
|
|
327
|
+
return False
|
|
328
|
+
try:
|
|
329
|
+
return self._validate()
|
|
330
|
+
except FuruValidationError as exc:
|
|
331
|
+
logger = get_logger()
|
|
332
|
+
logger.warning(
|
|
333
|
+
"exists %s -> false (validate invalid for %s: %s)",
|
|
334
|
+
directory,
|
|
335
|
+
f"{self.__class__.__name__}({self._furu_hash})",
|
|
336
|
+
exc,
|
|
337
|
+
)
|
|
338
|
+
return False
|
|
339
|
+
except Exception as exc:
|
|
340
|
+
logger = get_logger()
|
|
341
|
+
logger.exception(
|
|
342
|
+
"exists %s -> false (validate crashed for %s: %s)",
|
|
343
|
+
directory,
|
|
344
|
+
f"{self.__class__.__name__}({self._furu_hash})",
|
|
345
|
+
exc,
|
|
346
|
+
)
|
|
347
|
+
return False
|
|
348
|
+
|
|
293
349
|
def exists(self: Self) -> bool:
|
|
294
350
|
"""Check if result exists and is valid."""
|
|
295
351
|
logger = get_logger()
|
|
@@ -313,47 +369,93 @@ class Furu[T](ABC):
|
|
|
313
369
|
"""Get migration record for this object."""
|
|
314
370
|
return MigrationManager.read_migration(self._base_furu_dir())
|
|
315
371
|
|
|
316
|
-
|
|
317
|
-
def load_or_create(
|
|
318
|
-
self,
|
|
319
|
-
executor: submitit.Executor,
|
|
320
|
-
*,
|
|
321
|
-
retry_failed: bool | None = None,
|
|
322
|
-
) -> T | submitit.Job[T]: ...
|
|
323
|
-
|
|
324
|
-
@overload
|
|
325
|
-
def load_or_create(
|
|
326
|
-
self,
|
|
327
|
-
executor: None = None,
|
|
328
|
-
*,
|
|
329
|
-
retry_failed: bool | None = None,
|
|
330
|
-
) -> T: ...
|
|
331
|
-
|
|
332
|
-
def load_or_create(
|
|
333
|
-
self: Self,
|
|
334
|
-
executor: submitit.Executor | None = None,
|
|
335
|
-
*,
|
|
336
|
-
retry_failed: bool | None = None,
|
|
337
|
-
) -> T | submitit.Job[T]:
|
|
372
|
+
def get(self: Self, *, force: bool = False) -> T:
|
|
338
373
|
"""
|
|
339
374
|
Load result if it exists, computing if necessary.
|
|
340
375
|
|
|
341
376
|
Args:
|
|
342
|
-
|
|
343
|
-
retry_failed: Whether to retry failed results (default uses FURU_RETRY_FAILED)
|
|
377
|
+
force: Allow computation inside executor contexts if the spec matches.
|
|
344
378
|
|
|
345
379
|
Returns:
|
|
346
|
-
|
|
380
|
+
Loaded or computed result.
|
|
347
381
|
|
|
348
382
|
Raises:
|
|
349
383
|
FuruComputeError: If computation fails with detailed error information
|
|
350
384
|
"""
|
|
385
|
+
from furu.execution.context import EXEC_CONTEXT
|
|
386
|
+
from furu.errors import (
|
|
387
|
+
FuruExecutionError,
|
|
388
|
+
FuruMissingArtifact,
|
|
389
|
+
FuruSpecMismatch,
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
ctx = EXEC_CONTEXT.get()
|
|
393
|
+
if ctx.mode == "executor":
|
|
394
|
+
directory = self._base_furu_dir()
|
|
395
|
+
if force:
|
|
396
|
+
if (
|
|
397
|
+
ctx.current_node_hash is None
|
|
398
|
+
or self._furu_hash != ctx.current_node_hash
|
|
399
|
+
):
|
|
400
|
+
raise FuruExecutionError(
|
|
401
|
+
"force=True not allowed: only the current node may compute in executor mode. "
|
|
402
|
+
f"current_node_hash={ctx.current_node_hash!r} "
|
|
403
|
+
f"obj={self.__class__.__name__}({self._furu_hash})",
|
|
404
|
+
hints=[
|
|
405
|
+
"Declare this object as a dependency instead of calling dep.get(force=True).",
|
|
406
|
+
"Inside executor mode, use get(force=True) only on the node being executed.",
|
|
407
|
+
],
|
|
408
|
+
)
|
|
409
|
+
self._prepare_executor_rerun(directory)
|
|
410
|
+
|
|
411
|
+
exists_ok = self._exists_quiet()
|
|
412
|
+
if exists_ok and not (force and self._always_rerun()):
|
|
413
|
+
return self._load()
|
|
414
|
+
|
|
415
|
+
if force and not exists_ok:
|
|
416
|
+
state = self.get_state(directory)
|
|
417
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
418
|
+
self._invalidate_cached_success(
|
|
419
|
+
directory, reason="_validate returned false (executor)"
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
if not force:
|
|
423
|
+
raise FuruMissingArtifact(
|
|
424
|
+
"Missing artifact "
|
|
425
|
+
f"{self.__class__.__name__}({self._furu_hash}) in executor mode. "
|
|
426
|
+
f"Requested by {ctx.current_node_hash}. Declare it as a dependency."
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
required = self._executor_spec_key()
|
|
430
|
+
if ctx.spec_key is None or required != ctx.spec_key:
|
|
431
|
+
raise FuruSpecMismatch(
|
|
432
|
+
"force=True not allowed: "
|
|
433
|
+
f"required={required!r} != worker={ctx.spec_key!r} (v1 exact match)"
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
status, created_here, result = self._run_locally(
|
|
437
|
+
start_time=time.time(),
|
|
438
|
+
allow_failed=FURU_CONFIG.retry_failed,
|
|
439
|
+
executor_mode=True,
|
|
440
|
+
)
|
|
441
|
+
if status == "success":
|
|
442
|
+
if created_here:
|
|
443
|
+
return cast(T, result)
|
|
444
|
+
return self._load()
|
|
445
|
+
|
|
446
|
+
raise self._build_failed_state_error(
|
|
447
|
+
self._base_furu_dir(),
|
|
448
|
+
None,
|
|
449
|
+
message="Computation previously failed",
|
|
450
|
+
)
|
|
451
|
+
|
|
452
|
+
return self._get_impl_interactive(force=force)
|
|
453
|
+
|
|
454
|
+
def _get_impl_interactive(self: Self, *, force: bool) -> T:
|
|
351
455
|
logger = get_logger()
|
|
352
456
|
parent_holder = current_holder()
|
|
353
457
|
has_parent = parent_holder is not None and parent_holder is not self
|
|
354
|
-
retry_failed_effective =
|
|
355
|
-
retry_failed if retry_failed is not None else FURU_CONFIG.retry_failed
|
|
356
|
-
)
|
|
458
|
+
retry_failed_effective = FURU_CONFIG.retry_failed
|
|
357
459
|
if has_parent:
|
|
358
460
|
logger.debug(
|
|
359
461
|
"dep: begin %s %s %s",
|
|
@@ -445,7 +547,6 @@ class Furu[T](ABC):
|
|
|
445
547
|
message="Computation previously failed",
|
|
446
548
|
)
|
|
447
549
|
|
|
448
|
-
needs_reconcile = True
|
|
449
550
|
if isinstance(state0.result, _StateResultSuccess):
|
|
450
551
|
# Double check logic if we fell through to here (e.g. race condition or invalidation above)
|
|
451
552
|
if self._always_rerun():
|
|
@@ -460,9 +561,6 @@ class Furu[T](ABC):
|
|
|
460
561
|
directory, reason="_validate returned false"
|
|
461
562
|
)
|
|
462
563
|
state0 = StateManager.read_state(directory)
|
|
463
|
-
else:
|
|
464
|
-
# Valid success found, skip reconcile
|
|
465
|
-
needs_reconcile = False
|
|
466
564
|
except Exception as e:
|
|
467
565
|
self._invalidate_cached_success(
|
|
468
566
|
directory,
|
|
@@ -470,11 +568,6 @@ class Furu[T](ABC):
|
|
|
470
568
|
)
|
|
471
569
|
state0 = StateManager.read_state(directory)
|
|
472
570
|
|
|
473
|
-
if needs_reconcile and executor is not None:
|
|
474
|
-
adapter0 = SubmititAdapter(executor)
|
|
475
|
-
self._reconcile(directory, adapter=adapter0)
|
|
476
|
-
state0 = StateManager.read_state(directory)
|
|
477
|
-
|
|
478
571
|
attempt0 = state0.attempt
|
|
479
572
|
if isinstance(state0.result, _StateResultSuccess):
|
|
480
573
|
decision = "success->load"
|
|
@@ -494,7 +587,7 @@ class Furu[T](ABC):
|
|
|
494
587
|
if decision != "success->load":
|
|
495
588
|
write_separator()
|
|
496
589
|
logger.debug(
|
|
497
|
-
"
|
|
590
|
+
"get %s %s %s (%s)",
|
|
498
591
|
self.__class__.__name__,
|
|
499
592
|
self._furu_hash,
|
|
500
593
|
directory,
|
|
@@ -514,7 +607,7 @@ class Furu[T](ABC):
|
|
|
514
607
|
# failures even when we suppressed the cache-hit header line.
|
|
515
608
|
write_separator()
|
|
516
609
|
logger.error(
|
|
517
|
-
"
|
|
610
|
+
"get %s %s (load failed)",
|
|
518
611
|
self.__class__.__name__,
|
|
519
612
|
self._furu_hash,
|
|
520
613
|
)
|
|
@@ -524,51 +617,30 @@ class Furu[T](ABC):
|
|
|
524
617
|
e,
|
|
525
618
|
) from e
|
|
526
619
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
if created_here:
|
|
536
|
-
logger.debug(
|
|
537
|
-
"load_or_create: %s created -> return",
|
|
538
|
-
self.__class__.__name__,
|
|
539
|
-
)
|
|
540
|
-
return cast(T, result)
|
|
620
|
+
status, created_here, result = self._run_locally(
|
|
621
|
+
start_time=start_time,
|
|
622
|
+
allow_failed=retry_failed_effective,
|
|
623
|
+
executor_mode=False,
|
|
624
|
+
)
|
|
625
|
+
if status == "success":
|
|
626
|
+
ok = True
|
|
627
|
+
if created_here:
|
|
541
628
|
logger.debug(
|
|
542
|
-
"
|
|
629
|
+
"get: %s created -> return",
|
|
543
630
|
self.__class__.__name__,
|
|
544
631
|
)
|
|
545
|
-
return
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
None,
|
|
550
|
-
message="Computation previously failed",
|
|
632
|
+
return cast(T, result)
|
|
633
|
+
logger.debug(
|
|
634
|
+
"get: %s success -> _load()",
|
|
635
|
+
self.__class__.__name__,
|
|
551
636
|
)
|
|
637
|
+
return self._load()
|
|
552
638
|
|
|
553
|
-
|
|
554
|
-
(submitit_folder := self._base_furu_dir() / "submitit").mkdir(
|
|
555
|
-
exist_ok=True, parents=True
|
|
556
|
-
)
|
|
557
|
-
executor.folder = submitit_folder
|
|
558
|
-
adapter = SubmititAdapter(executor)
|
|
559
|
-
|
|
560
|
-
logger.debug(
|
|
561
|
-
"load_or_create: %s -> submitit submit_once()",
|
|
562
|
-
self.__class__.__name__,
|
|
563
|
-
)
|
|
564
|
-
job = self._submit_once(
|
|
565
|
-
adapter,
|
|
639
|
+
raise self._build_failed_state_error(
|
|
566
640
|
directory,
|
|
567
641
|
None,
|
|
568
|
-
|
|
642
|
+
message="Computation previously failed",
|
|
569
643
|
)
|
|
570
|
-
ok = True
|
|
571
|
-
return cast(submitit.Job[T], job)
|
|
572
644
|
finally:
|
|
573
645
|
if has_parent:
|
|
574
646
|
logger.debug(
|
|
@@ -579,7 +651,7 @@ class Furu[T](ABC):
|
|
|
579
651
|
)
|
|
580
652
|
|
|
581
653
|
def _log_console_start(self, action_color: str) -> None:
|
|
582
|
-
"""Log the start of
|
|
654
|
+
"""Log the start of get to console with caller info."""
|
|
583
655
|
logger = get_logger()
|
|
584
656
|
frame = sys._getframe(1)
|
|
585
657
|
|
|
@@ -599,7 +671,7 @@ class Furu[T](ABC):
|
|
|
599
671
|
frame = frame.f_back
|
|
600
672
|
|
|
601
673
|
logger.info(
|
|
602
|
-
"
|
|
674
|
+
"get %s %s",
|
|
603
675
|
self.__class__.__name__,
|
|
604
676
|
self._furu_hash,
|
|
605
677
|
extra={
|
|
@@ -612,20 +684,14 @@ class Furu[T](ABC):
|
|
|
612
684
|
def _add_exception_breadcrumbs(self, exc: BaseException, directory: Path) -> None:
|
|
613
685
|
if not hasattr(exc, "add_note"):
|
|
614
686
|
return
|
|
615
|
-
|
|
616
|
-
log_path = StateManager.get_internal_dir(directory) / "furu.log"
|
|
617
|
-
note = (
|
|
618
|
-
f"Furu directory: {directory}\n"
|
|
619
|
-
f"State file: {state_path}\n"
|
|
620
|
-
f"Log file: {log_path}"
|
|
621
|
-
)
|
|
687
|
+
note = f"Furu dir: {directory}"
|
|
622
688
|
exc.add_note(note)
|
|
623
689
|
|
|
624
690
|
@staticmethod
|
|
625
691
|
def _failed_state_hints() -> list[str]:
|
|
626
692
|
return [
|
|
627
|
-
"To retry this failed artifact: set FURU_RETRY_FAILED=1 or call
|
|
628
|
-
"To inspect details: open the
|
|
693
|
+
"To retry this failed artifact: set FURU_RETRY_FAILED=1 or call get() again.",
|
|
694
|
+
"To inspect details: open the furu dir shown above.",
|
|
629
695
|
]
|
|
630
696
|
|
|
631
697
|
def _build_failed_state_error(
|
|
@@ -843,127 +909,201 @@ class Furu[T](ABC):
|
|
|
843
909
|
"""Entry point for worker process (called by submitit or locally)."""
|
|
844
910
|
with enter_holder(self):
|
|
845
911
|
logger = get_logger()
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
912
|
+
# Ensure executor semantics apply to *all* work in the worker, not
|
|
913
|
+
# just `_create()`. This prevents accidental dependency computation
|
|
914
|
+
# (e.g., from within `_validate()` or metadata hooks).
|
|
915
|
+
from furu.execution.context import EXEC_CONTEXT, ExecContext
|
|
916
|
+
|
|
917
|
+
exec_token = EXEC_CONTEXT.set(
|
|
918
|
+
ExecContext(
|
|
919
|
+
mode="executor",
|
|
920
|
+
spec_key=self._executor_spec_key(),
|
|
921
|
+
backend="submitit",
|
|
922
|
+
current_node_hash=self._furu_hash,
|
|
923
|
+
)
|
|
852
924
|
)
|
|
853
|
-
|
|
854
925
|
try:
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
reconcile_fn=lambda d: self._reconcile(d),
|
|
874
|
-
allow_failed=allow_failed_effective,
|
|
875
|
-
) as ctx:
|
|
876
|
-
stage = "metadata"
|
|
877
|
-
try:
|
|
878
|
-
# Refresh metadata (now safe - attempt is already recorded)
|
|
879
|
-
metadata = MetadataManager.create_metadata(
|
|
880
|
-
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
881
|
-
)
|
|
882
|
-
MetadataManager.write_metadata(metadata, directory)
|
|
926
|
+
directory = self._base_furu_dir()
|
|
927
|
+
directory.mkdir(parents=True, exist_ok=True)
|
|
928
|
+
always_rerun = self._always_rerun()
|
|
929
|
+
needs_success_invalidation = False
|
|
930
|
+
if not always_rerun:
|
|
931
|
+
exists_ok = self._exists_quiet()
|
|
932
|
+
if not exists_ok:
|
|
933
|
+
state = self.get_state(directory)
|
|
934
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
935
|
+
needs_success_invalidation = True
|
|
936
|
+
|
|
937
|
+
env_info = self._collect_submitit_env()
|
|
938
|
+
allow_failed_effective = (
|
|
939
|
+
allow_failed
|
|
940
|
+
if allow_failed is not None
|
|
941
|
+
else FURU_CONFIG.retry_failed
|
|
942
|
+
)
|
|
943
|
+
allow_success = always_rerun or needs_success_invalidation
|
|
883
944
|
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
945
|
+
try:
|
|
946
|
+
with compute_lock(
|
|
947
|
+
directory,
|
|
948
|
+
backend="submitit",
|
|
949
|
+
lease_duration_sec=FURU_CONFIG.lease_duration_sec,
|
|
950
|
+
heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
|
|
951
|
+
owner={
|
|
952
|
+
"pid": os.getpid(),
|
|
953
|
+
"host": socket.gethostname(),
|
|
954
|
+
"user": getpass.getuser(),
|
|
955
|
+
"command": " ".join(sys.argv) if sys.argv else "<unknown>",
|
|
956
|
+
},
|
|
957
|
+
scheduler={
|
|
958
|
+
"backend": env_info.get("backend"),
|
|
959
|
+
"job_id": env_info.get("slurm_job_id"),
|
|
960
|
+
},
|
|
961
|
+
max_wait_time_sec=None, # Workers wait indefinitely
|
|
962
|
+
poll_interval_sec=FURU_CONFIG.poll_interval,
|
|
963
|
+
wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
|
|
964
|
+
reconcile_fn=lambda d: self._reconcile(d),
|
|
965
|
+
allow_failed=allow_failed_effective,
|
|
966
|
+
allow_success=allow_success,
|
|
967
|
+
) as ctx:
|
|
968
|
+
self._prepare_executor_rerun(directory)
|
|
969
|
+
if not always_rerun:
|
|
970
|
+
exists_ok = self._exists_quiet()
|
|
971
|
+
if not exists_ok:
|
|
972
|
+
state = self.get_state(directory)
|
|
973
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
974
|
+
self._invalidate_cached_success(
|
|
975
|
+
directory,
|
|
976
|
+
reason="_validate returned false (worker)",
|
|
977
|
+
)
|
|
978
|
+
|
|
979
|
+
stage = "metadata"
|
|
980
|
+
try:
|
|
981
|
+
# Refresh metadata (now safe - attempt is already recorded)
|
|
982
|
+
metadata = MetadataManager.create_metadata(
|
|
983
|
+
self,
|
|
984
|
+
directory,
|
|
985
|
+
ignore_diff=FURU_CONFIG.ignore_git_diff,
|
|
986
|
+
)
|
|
987
|
+
MetadataManager.write_metadata(metadata, directory)
|
|
889
988
|
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
self._furu_hash,
|
|
903
|
-
directory,
|
|
904
|
-
)
|
|
905
|
-
StateManager.write_success_marker(
|
|
906
|
-
directory, attempt_id=ctx.attempt_id
|
|
907
|
-
)
|
|
908
|
-
StateManager.finish_attempt_success(
|
|
909
|
-
directory, attempt_id=ctx.attempt_id
|
|
910
|
-
)
|
|
911
|
-
logger.info(
|
|
912
|
-
"_create ok %s %s",
|
|
913
|
-
self.__class__.__name__,
|
|
914
|
-
self._furu_hash,
|
|
915
|
-
extra={"furu_console_only": True},
|
|
916
|
-
)
|
|
917
|
-
except Exception as e:
|
|
918
|
-
if stage == "_create":
|
|
919
|
-
logger.error(
|
|
920
|
-
"_create failed %s %s %s",
|
|
989
|
+
# Set up signal handlers
|
|
990
|
+
stage = "signal handler setup"
|
|
991
|
+
self._setup_signal_handlers(
|
|
992
|
+
directory,
|
|
993
|
+
ctx.stop_heartbeat,
|
|
994
|
+
attempt_id=ctx.attempt_id,
|
|
995
|
+
)
|
|
996
|
+
|
|
997
|
+
stage = "_create"
|
|
998
|
+
# Run computation
|
|
999
|
+
logger.debug(
|
|
1000
|
+
"_create: begin %s %s %s",
|
|
921
1001
|
self.__class__.__name__,
|
|
922
1002
|
self._furu_hash,
|
|
923
1003
|
directory,
|
|
924
|
-
extra={"furu_file_only": True},
|
|
925
1004
|
)
|
|
926
|
-
|
|
927
|
-
logger.
|
|
928
|
-
"
|
|
929
|
-
stage,
|
|
1005
|
+
self._create()
|
|
1006
|
+
logger.debug(
|
|
1007
|
+
"_create: ok %s %s %s",
|
|
930
1008
|
self.__class__.__name__,
|
|
931
1009
|
self._furu_hash,
|
|
932
1010
|
directory,
|
|
1011
|
+
)
|
|
1012
|
+
StateManager.write_success_marker(
|
|
1013
|
+
directory, attempt_id=ctx.attempt_id
|
|
1014
|
+
)
|
|
1015
|
+
StateManager.finish_attempt_success(
|
|
1016
|
+
directory, attempt_id=ctx.attempt_id
|
|
1017
|
+
)
|
|
1018
|
+
logger.info(
|
|
1019
|
+
"_create ok %s %s",
|
|
1020
|
+
self.__class__.__name__,
|
|
1021
|
+
self._furu_hash,
|
|
1022
|
+
extra={"furu_console_only": True},
|
|
1023
|
+
)
|
|
1024
|
+
except Exception as e:
|
|
1025
|
+
if stage == "_create":
|
|
1026
|
+
logger.error(
|
|
1027
|
+
"_create failed %s %s %s",
|
|
1028
|
+
self.__class__.__name__,
|
|
1029
|
+
self._furu_hash,
|
|
1030
|
+
directory,
|
|
1031
|
+
extra={"furu_file_only": True},
|
|
1032
|
+
)
|
|
1033
|
+
else:
|
|
1034
|
+
logger.error(
|
|
1035
|
+
"attempt failed (%s) %s %s %s",
|
|
1036
|
+
stage,
|
|
1037
|
+
self.__class__.__name__,
|
|
1038
|
+
self._furu_hash,
|
|
1039
|
+
directory,
|
|
1040
|
+
extra={"furu_file_only": True},
|
|
1041
|
+
)
|
|
1042
|
+
logger.error(
|
|
1043
|
+
"%s",
|
|
1044
|
+
format_traceback(e),
|
|
933
1045
|
extra={"furu_file_only": True},
|
|
934
1046
|
)
|
|
935
|
-
logger.error(
|
|
936
|
-
"%s", format_traceback(e), extra={"furu_file_only": True}
|
|
937
|
-
)
|
|
938
1047
|
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
)
|
|
951
|
-
self._add_exception_breadcrumbs(e, directory)
|
|
952
|
-
if stage != "_create":
|
|
953
|
-
message = (
|
|
954
|
-
"Failed to create metadata"
|
|
955
|
-
if stage == "metadata"
|
|
956
|
-
else "Failed to set up signal handlers"
|
|
1048
|
+
tb = "".join(
|
|
1049
|
+
traceback.format_exception(type(e), e, e.__traceback__)
|
|
1050
|
+
)
|
|
1051
|
+
StateManager.finish_attempt_failed(
|
|
1052
|
+
directory,
|
|
1053
|
+
attempt_id=ctx.attempt_id,
|
|
1054
|
+
error={
|
|
1055
|
+
"type": type(e).__name__,
|
|
1056
|
+
"message": str(e),
|
|
1057
|
+
"traceback": tb,
|
|
1058
|
+
},
|
|
957
1059
|
)
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
1060
|
+
self._add_exception_breadcrumbs(e, directory)
|
|
1061
|
+
if stage != "_create":
|
|
1062
|
+
message = (
|
|
1063
|
+
"Failed to create metadata"
|
|
1064
|
+
if stage == "metadata"
|
|
1065
|
+
else "Failed to set up signal handlers"
|
|
1066
|
+
)
|
|
1067
|
+
raise FuruComputeError(
|
|
1068
|
+
message,
|
|
1069
|
+
StateManager.get_state_path(directory),
|
|
1070
|
+
e,
|
|
1071
|
+
) from e
|
|
1072
|
+
raise
|
|
1073
|
+
except FuruLockNotAcquired as exc:
|
|
1074
|
+
# Experiment already completed; succeed if success, fail if failed.
|
|
1075
|
+
state = StateManager.read_state(directory)
|
|
1076
|
+
state_path = StateManager.get_state_path(directory)
|
|
1077
|
+
attempt = state.attempt
|
|
1078
|
+
attempt_info = "no active attempt"
|
|
1079
|
+
if attempt is not None:
|
|
1080
|
+
attempt_info = (
|
|
1081
|
+
f"attempt {attempt.id} status {attempt.status} "
|
|
1082
|
+
f"backend {attempt.backend}"
|
|
1083
|
+
)
|
|
1084
|
+
hints = [
|
|
1085
|
+
f"Furu hash: {self._furu_hash}",
|
|
1086
|
+
f"Directory: {directory}",
|
|
1087
|
+
f"State file: {state_path}",
|
|
1088
|
+
f"Attempt: {attempt_info}",
|
|
1089
|
+
]
|
|
1090
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
1091
|
+
return
|
|
1092
|
+
if isinstance(state.result, _StateResultFailed):
|
|
1093
|
+
if allow_failed_effective:
|
|
1094
|
+
return
|
|
1095
|
+
raise FuruComputeError(
|
|
1096
|
+
"Worker refused to run: experiment already failed",
|
|
1097
|
+
state_path,
|
|
1098
|
+
exc,
|
|
1099
|
+
hints=hints,
|
|
1100
|
+
) from exc
|
|
1101
|
+
raise FuruLockNotAcquired(
|
|
1102
|
+
"Worker refused to run: experiment already running elsewhere",
|
|
1103
|
+
hints=hints,
|
|
1104
|
+
) from exc
|
|
1105
|
+
finally:
|
|
1106
|
+
EXEC_CONTEXT.reset(exec_token)
|
|
967
1107
|
|
|
968
1108
|
def _collect_submitit_env(self: Self) -> _SubmititEnvInfo:
|
|
969
1109
|
"""Collect submitit/slurm environment information."""
|
|
@@ -994,6 +1134,7 @@ class Furu[T](ABC):
|
|
|
994
1134
|
start_time: float,
|
|
995
1135
|
*,
|
|
996
1136
|
allow_failed: bool,
|
|
1137
|
+
executor_mode: bool = False,
|
|
997
1138
|
) -> tuple[str, bool, T | None]:
|
|
998
1139
|
"""Run computation locally, returning (status, created_here, result)."""
|
|
999
1140
|
logger = get_logger()
|
|
@@ -1047,7 +1188,23 @@ class Furu[T](ABC):
|
|
|
1047
1188
|
self._furu_hash,
|
|
1048
1189
|
directory,
|
|
1049
1190
|
)
|
|
1050
|
-
|
|
1191
|
+
token = None
|
|
1192
|
+
if executor_mode:
|
|
1193
|
+
from furu.execution.context import EXEC_CONTEXT, ExecContext
|
|
1194
|
+
|
|
1195
|
+
token = EXEC_CONTEXT.set(
|
|
1196
|
+
ExecContext(
|
|
1197
|
+
mode="executor",
|
|
1198
|
+
spec_key=self._executor_spec_key(),
|
|
1199
|
+
backend="local",
|
|
1200
|
+
current_node_hash=self._furu_hash,
|
|
1201
|
+
)
|
|
1202
|
+
)
|
|
1203
|
+
try:
|
|
1204
|
+
result = self._create()
|
|
1205
|
+
finally:
|
|
1206
|
+
if token is not None:
|
|
1207
|
+
EXEC_CONTEXT.reset(token)
|
|
1051
1208
|
logger.debug(
|
|
1052
1209
|
"_create: ok %s %s %s",
|
|
1053
1210
|
self.__class__.__name__,
|
|
@@ -1145,6 +1302,8 @@ class Furu[T](ABC):
|
|
|
1145
1302
|
attempt_id: str,
|
|
1146
1303
|
) -> None:
|
|
1147
1304
|
"""Set up signal handlers for graceful preemption."""
|
|
1305
|
+
if threading.current_thread() is not threading.main_thread():
|
|
1306
|
+
return
|
|
1148
1307
|
|
|
1149
1308
|
def handle_signal(signum: int, frame: FrameType | None) -> None:
|
|
1150
1309
|
try:
|