furu 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- furu/__init__.py +8 -0
- furu/adapters/submitit.py +23 -2
- furu/config.py +40 -41
- furu/core/furu.py +479 -252
- furu/core/list.py +4 -3
- furu/dashboard/__init__.py +10 -1
- furu/dashboard/frontend/dist/assets/{index-DS3FsqcY.js → index-BjyrY-Zz.js} +1 -1
- furu/dashboard/frontend/dist/index.html +1 -1
- furu/dashboard/main.py +10 -3
- furu/errors.py +17 -4
- furu/execution/__init__.py +22 -0
- furu/execution/context.py +30 -0
- furu/execution/local.py +186 -0
- furu/execution/paths.py +20 -0
- furu/execution/plan.py +330 -0
- furu/execution/plan_utils.py +13 -0
- furu/execution/slurm_dag.py +273 -0
- furu/execution/slurm_pool.py +878 -0
- furu/execution/slurm_spec.py +38 -0
- furu/execution/submitit_factory.py +47 -0
- furu/migration.py +1 -2
- furu/runtime/env.py +1 -1
- furu/runtime/logging.py +40 -14
- furu/storage/metadata.py +25 -29
- furu/storage/migration.py +0 -1
- furu/storage/state.py +120 -98
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/METADATA +91 -42
- furu-0.0.5.dist-info/RECORD +46 -0
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/WHEEL +1 -1
- furu-0.0.3.dist-info/RECORD +0 -36
- {furu-0.0.3.dist-info → furu-0.0.5.dist-info}/entry_points.txt +0 -0
furu/core/furu.py
CHANGED
|
@@ -5,9 +5,11 @@ import os
|
|
|
5
5
|
import signal
|
|
6
6
|
import socket
|
|
7
7
|
import sys
|
|
8
|
+
import threading
|
|
8
9
|
import time
|
|
9
10
|
import traceback
|
|
10
11
|
from abc import ABC, abstractmethod
|
|
12
|
+
from functools import cached_property
|
|
11
13
|
from pathlib import Path
|
|
12
14
|
from types import FrameType
|
|
13
15
|
from typing import (
|
|
@@ -20,18 +22,16 @@ from typing import (
|
|
|
20
22
|
Protocol,
|
|
21
23
|
Self,
|
|
22
24
|
Sequence,
|
|
23
|
-
TypedDict,
|
|
24
25
|
TypeAlias,
|
|
26
|
+
TypedDict,
|
|
25
27
|
TypeVar,
|
|
26
28
|
cast,
|
|
27
|
-
overload,
|
|
28
29
|
)
|
|
29
30
|
|
|
30
31
|
import chz
|
|
31
32
|
import submitit
|
|
32
|
-
from typing_extensions import dataclass_transform
|
|
33
|
-
|
|
34
33
|
from chz.field import Field as ChzField
|
|
34
|
+
from typing_extensions import dataclass_transform
|
|
35
35
|
|
|
36
36
|
from ..adapters import SubmititAdapter
|
|
37
37
|
from ..adapters.submitit import SubmititJob
|
|
@@ -40,6 +40,7 @@ from ..errors import (
|
|
|
40
40
|
MISSING,
|
|
41
41
|
FuruComputeError,
|
|
42
42
|
FuruLockNotAcquired,
|
|
43
|
+
FuruValidationError,
|
|
43
44
|
FuruWaitTimeout,
|
|
44
45
|
)
|
|
45
46
|
from ..runtime import current_holder
|
|
@@ -62,7 +63,6 @@ from ..storage.state import (
|
|
|
62
63
|
_StateAttemptRunning,
|
|
63
64
|
_StateResultAbsent,
|
|
64
65
|
_StateResultFailed,
|
|
65
|
-
_StateResultMigrated,
|
|
66
66
|
_StateResultSuccess,
|
|
67
67
|
compute_lock,
|
|
68
68
|
)
|
|
@@ -190,16 +190,23 @@ class Furu[T](ABC):
|
|
|
190
190
|
raise NotImplementedError(f"{self.__class__.__name__}._load() not implemented")
|
|
191
191
|
|
|
192
192
|
def _validate(self: Self) -> bool:
|
|
193
|
-
"""
|
|
193
|
+
"""
|
|
194
|
+
Validate that result is complete and correct (override if needed).
|
|
195
|
+
|
|
196
|
+
Return False or raise FuruValidationError to mark artifacts as invalid.
|
|
197
|
+
"""
|
|
194
198
|
return True
|
|
195
199
|
|
|
196
200
|
def _dependencies(self: Self) -> "DependencySpec | None":
|
|
197
201
|
"""Return extra dependencies not captured by fields."""
|
|
198
202
|
return None
|
|
199
203
|
|
|
204
|
+
def _executor_spec_key(self: Self) -> str:
|
|
205
|
+
return "default"
|
|
206
|
+
|
|
200
207
|
def _get_dependencies(self: Self, *, recursive: bool = True) -> list["Furu"]:
|
|
201
208
|
"""Collect Furu dependencies from fields and `_dependencies()`."""
|
|
202
|
-
seen = {self.
|
|
209
|
+
seen = {self.furu_hash}
|
|
203
210
|
dependencies: list[Furu] = []
|
|
204
211
|
_collect_dependencies(self, dependencies, seen, recursive=recursive)
|
|
205
212
|
return dependencies
|
|
@@ -213,7 +220,7 @@ class Furu[T](ABC):
|
|
|
213
220
|
for dependency in dependencies:
|
|
214
221
|
if dependency is self:
|
|
215
222
|
raise ValueError("Furu dependencies cannot include self")
|
|
216
|
-
digests.add(dependency.
|
|
223
|
+
digests.add(dependency.furu_hash)
|
|
217
224
|
return sorted(digests)
|
|
218
225
|
|
|
219
226
|
def _invalidate_cached_success(self: Self, directory: Path, *, reason: str) -> None:
|
|
@@ -221,7 +228,7 @@ class Furu[T](ABC):
|
|
|
221
228
|
logger.warning(
|
|
222
229
|
"invalidate %s %s %s (%s)",
|
|
223
230
|
self.__class__.__name__,
|
|
224
|
-
self.
|
|
231
|
+
self.furu_hash,
|
|
225
232
|
directory,
|
|
226
233
|
reason,
|
|
227
234
|
)
|
|
@@ -238,9 +245,25 @@ class Furu[T](ABC):
|
|
|
238
245
|
directory, {"type": "result_invalidated", "reason": reason, "at": now}
|
|
239
246
|
)
|
|
240
247
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
248
|
+
def _prepare_executor_rerun(self: Self, directory: Path) -> None:
|
|
249
|
+
if not self._always_rerun():
|
|
250
|
+
return
|
|
251
|
+
if not directory.exists():
|
|
252
|
+
return
|
|
253
|
+
migration = self._alias_record(directory)
|
|
254
|
+
if migration is not None and self._alias_is_active(directory, migration):
|
|
255
|
+
self._maybe_detach_alias(
|
|
256
|
+
directory=directory,
|
|
257
|
+
record=migration,
|
|
258
|
+
reason="always_rerun",
|
|
259
|
+
)
|
|
260
|
+
state = StateManager.read_state(directory)
|
|
261
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
262
|
+
self._invalidate_cached_success(directory, reason="always_rerun enabled")
|
|
263
|
+
|
|
264
|
+
@cached_property
|
|
265
|
+
def furu_hash(self: Self) -> str:
|
|
266
|
+
"""Return the stable content hash for this Furu object."""
|
|
244
267
|
return FuruSerializer.compute_hash(self)
|
|
245
268
|
|
|
246
269
|
def _always_rerun(self: Self) -> bool:
|
|
@@ -253,15 +276,17 @@ class Furu[T](ABC):
|
|
|
253
276
|
|
|
254
277
|
def _base_furu_dir(self: Self) -> Path:
|
|
255
278
|
root = FURU_CONFIG.get_root(self.version_controlled)
|
|
256
|
-
return root / self.__class__._namespace() / self.
|
|
279
|
+
return root / self.__class__._namespace() / self.furu_hash
|
|
257
280
|
|
|
258
|
-
@
|
|
281
|
+
@cached_property
|
|
259
282
|
def furu_dir(self: Self) -> Path:
|
|
260
283
|
"""Get the directory for this Furu object."""
|
|
261
284
|
directory = self._base_furu_dir()
|
|
262
285
|
migration = self._alias_record(directory)
|
|
263
|
-
if migration is not None
|
|
264
|
-
|
|
286
|
+
if migration is not None:
|
|
287
|
+
target_dir = self._alias_target_dir(directory, migration)
|
|
288
|
+
if target_dir is not None:
|
|
289
|
+
return target_dir
|
|
265
290
|
return directory
|
|
266
291
|
|
|
267
292
|
@property
|
|
@@ -290,13 +315,38 @@ class Furu[T](ABC):
|
|
|
290
315
|
"""Log a message to the current holder's `furu.log`."""
|
|
291
316
|
return log(message, level=level)
|
|
292
317
|
|
|
318
|
+
def _exists_quiet(self: Self) -> bool:
|
|
319
|
+
directory = self._base_furu_dir()
|
|
320
|
+
success_dir = self._success_marker_dir(directory)
|
|
321
|
+
if success_dir is None:
|
|
322
|
+
return False
|
|
323
|
+
try:
|
|
324
|
+
return self._validate()
|
|
325
|
+
except FuruValidationError as exc:
|
|
326
|
+
logger = get_logger()
|
|
327
|
+
logger.warning(
|
|
328
|
+
"exists %s -> false (validate invalid for %s: %s)",
|
|
329
|
+
directory,
|
|
330
|
+
f"{self.__class__.__name__}({self.furu_hash})",
|
|
331
|
+
exc,
|
|
332
|
+
)
|
|
333
|
+
return False
|
|
334
|
+
except Exception as exc:
|
|
335
|
+
logger = get_logger()
|
|
336
|
+
logger.exception(
|
|
337
|
+
"exists %s -> false (validate crashed for %s: %s)",
|
|
338
|
+
directory,
|
|
339
|
+
f"{self.__class__.__name__}({self.furu_hash})",
|
|
340
|
+
exc,
|
|
341
|
+
)
|
|
342
|
+
return False
|
|
343
|
+
|
|
293
344
|
def exists(self: Self) -> bool:
|
|
294
345
|
"""Check if result exists and is valid."""
|
|
295
346
|
logger = get_logger()
|
|
296
347
|
directory = self._base_furu_dir()
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
if not isinstance(state.result, _StateResultSuccess):
|
|
348
|
+
success_dir = self._success_marker_dir(directory)
|
|
349
|
+
if success_dir is None:
|
|
300
350
|
logger.info("exists %s -> false", directory)
|
|
301
351
|
return False
|
|
302
352
|
|
|
@@ -313,53 +363,139 @@ class Furu[T](ABC):
|
|
|
313
363
|
"""Get migration record for this object."""
|
|
314
364
|
return MigrationManager.read_migration(self._base_furu_dir())
|
|
315
365
|
|
|
316
|
-
|
|
317
|
-
def load_or_create(
|
|
318
|
-
self,
|
|
319
|
-
executor: submitit.Executor,
|
|
320
|
-
*,
|
|
321
|
-
retry_failed: bool | None = None,
|
|
322
|
-
) -> T | submitit.Job[T]: ...
|
|
323
|
-
|
|
324
|
-
@overload
|
|
325
|
-
def load_or_create(
|
|
326
|
-
self,
|
|
327
|
-
executor: None = None,
|
|
328
|
-
*,
|
|
329
|
-
retry_failed: bool | None = None,
|
|
330
|
-
) -> T: ...
|
|
331
|
-
|
|
332
|
-
def load_or_create(
|
|
333
|
-
self: Self,
|
|
334
|
-
executor: submitit.Executor | None = None,
|
|
335
|
-
*,
|
|
336
|
-
retry_failed: bool | None = None,
|
|
337
|
-
) -> T | submitit.Job[T]:
|
|
366
|
+
def get(self: Self, *, force: bool = False) -> T:
|
|
338
367
|
"""
|
|
339
368
|
Load result if it exists, computing if necessary.
|
|
340
369
|
|
|
341
370
|
Args:
|
|
342
|
-
|
|
343
|
-
retry_failed: Whether to retry failed results (default uses FURU_RETRY_FAILED)
|
|
371
|
+
force: Allow computation inside executor contexts if the spec matches.
|
|
344
372
|
|
|
345
373
|
Returns:
|
|
346
|
-
|
|
374
|
+
Loaded or computed result.
|
|
347
375
|
|
|
348
376
|
Raises:
|
|
349
377
|
FuruComputeError: If computation fails with detailed error information
|
|
350
378
|
"""
|
|
379
|
+
from furu.errors import (
|
|
380
|
+
FuruExecutionError,
|
|
381
|
+
FuruMissingArtifact,
|
|
382
|
+
FuruSpecMismatch,
|
|
383
|
+
)
|
|
384
|
+
from furu.execution.context import EXEC_CONTEXT
|
|
385
|
+
|
|
386
|
+
ctx = EXEC_CONTEXT.get()
|
|
387
|
+
if ctx.mode == "executor":
|
|
388
|
+
logger = get_logger()
|
|
389
|
+
parent_holder = current_holder()
|
|
390
|
+
has_parent = parent_holder is not None and parent_holder is not self
|
|
391
|
+
needs_holder = parent_holder is None or has_parent
|
|
392
|
+
caller_info: _CallerInfo = {}
|
|
393
|
+
if has_parent:
|
|
394
|
+
caller_info = self._get_caller_info()
|
|
395
|
+
|
|
396
|
+
def _executor_get() -> T:
|
|
397
|
+
directory = self._base_furu_dir()
|
|
398
|
+
if force:
|
|
399
|
+
if (
|
|
400
|
+
ctx.current_node_hash is None
|
|
401
|
+
or self.furu_hash != ctx.current_node_hash
|
|
402
|
+
):
|
|
403
|
+
raise FuruExecutionError(
|
|
404
|
+
"force=True not allowed: only the current node may compute in executor mode. "
|
|
405
|
+
f"current_node_hash={ctx.current_node_hash!r} "
|
|
406
|
+
f"obj={self.__class__.__name__}({self.furu_hash})",
|
|
407
|
+
hints=[
|
|
408
|
+
"Declare this object as a dependency instead of calling dep.get(force=True).",
|
|
409
|
+
"Inside executor mode, use get(force=True) only on the node being executed.",
|
|
410
|
+
],
|
|
411
|
+
)
|
|
412
|
+
self._prepare_executor_rerun(directory)
|
|
413
|
+
|
|
414
|
+
exists_ok = self._exists_quiet()
|
|
415
|
+
if exists_ok and not (force and self._always_rerun()):
|
|
416
|
+
return self._load()
|
|
417
|
+
|
|
418
|
+
if force and not exists_ok:
|
|
419
|
+
state = self.get_state(directory)
|
|
420
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
421
|
+
self._invalidate_cached_success(
|
|
422
|
+
directory, reason="_validate returned false (executor)"
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
if not force:
|
|
426
|
+
raise FuruMissingArtifact(
|
|
427
|
+
"Missing artifact "
|
|
428
|
+
f"{self.__class__.__name__}({self.furu_hash}) in executor mode. "
|
|
429
|
+
f"Requested by {ctx.current_node_hash}. Declare it as a dependency."
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
required = self._executor_spec_key()
|
|
433
|
+
if ctx.spec_key is None or required != ctx.spec_key:
|
|
434
|
+
raise FuruSpecMismatch(
|
|
435
|
+
"force=True not allowed: "
|
|
436
|
+
f"required={required!r} != worker={ctx.spec_key!r} (v1 exact match)"
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
StateManager.ensure_internal_dir(directory)
|
|
440
|
+
status, created_here, result = self._run_locally(
|
|
441
|
+
start_time=time.time(),
|
|
442
|
+
allow_failed=FURU_CONFIG.retry_failed,
|
|
443
|
+
executor_mode=True,
|
|
444
|
+
)
|
|
445
|
+
if status == "success":
|
|
446
|
+
if created_here:
|
|
447
|
+
return cast(T, result)
|
|
448
|
+
return self._load()
|
|
449
|
+
|
|
450
|
+
raise self._build_failed_state_error(
|
|
451
|
+
self._base_furu_dir(),
|
|
452
|
+
None,
|
|
453
|
+
message="Computation previously failed",
|
|
454
|
+
)
|
|
455
|
+
|
|
456
|
+
if has_parent:
|
|
457
|
+
logger.debug(
|
|
458
|
+
"dep: begin %s %s %s",
|
|
459
|
+
self.__class__.__name__,
|
|
460
|
+
self.furu_hash,
|
|
461
|
+
self._base_furu_dir(),
|
|
462
|
+
extra=caller_info,
|
|
463
|
+
)
|
|
464
|
+
|
|
465
|
+
ok = False
|
|
466
|
+
try:
|
|
467
|
+
if needs_holder:
|
|
468
|
+
with enter_holder(self):
|
|
469
|
+
result = _executor_get()
|
|
470
|
+
else:
|
|
471
|
+
result = _executor_get()
|
|
472
|
+
ok = True
|
|
473
|
+
return result
|
|
474
|
+
finally:
|
|
475
|
+
if has_parent:
|
|
476
|
+
logger.debug(
|
|
477
|
+
"dep: end %s %s (%s)",
|
|
478
|
+
self.__class__.__name__,
|
|
479
|
+
self.furu_hash,
|
|
480
|
+
"ok" if ok else "error",
|
|
481
|
+
extra=caller_info,
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
return self._get_impl_interactive(force=force)
|
|
485
|
+
|
|
486
|
+
def _get_impl_interactive(self: Self, *, force: bool) -> T:
|
|
351
487
|
logger = get_logger()
|
|
352
488
|
parent_holder = current_holder()
|
|
353
489
|
has_parent = parent_holder is not None and parent_holder is not self
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
)
|
|
490
|
+
caller_info = self._get_caller_info()
|
|
491
|
+
retry_failed_effective = FURU_CONFIG.retry_failed
|
|
357
492
|
if has_parent:
|
|
358
493
|
logger.debug(
|
|
359
494
|
"dep: begin %s %s %s",
|
|
360
495
|
self.__class__.__name__,
|
|
361
|
-
self.
|
|
496
|
+
self.furu_hash,
|
|
362
497
|
self._base_furu_dir(),
|
|
498
|
+
extra=caller_info,
|
|
363
499
|
)
|
|
364
500
|
|
|
365
501
|
ok = False
|
|
@@ -367,19 +503,21 @@ class Furu[T](ABC):
|
|
|
367
503
|
with enter_holder(self):
|
|
368
504
|
start_time = time.time()
|
|
369
505
|
base_dir = self._base_furu_dir()
|
|
370
|
-
base_dir.mkdir(parents=True, exist_ok=True)
|
|
371
506
|
directory = base_dir
|
|
372
507
|
migration = self._alias_record(base_dir)
|
|
373
508
|
alias_active = False
|
|
509
|
+
base_marker = StateManager.success_marker_exists(base_dir)
|
|
374
510
|
|
|
375
511
|
if (
|
|
376
512
|
migration is not None
|
|
377
513
|
and migration.kind == "alias"
|
|
378
514
|
and migration.overwritten_at is None
|
|
515
|
+
and not base_marker
|
|
379
516
|
):
|
|
380
|
-
target_dir =
|
|
381
|
-
|
|
382
|
-
|
|
517
|
+
target_dir = self._alias_target_dir(
|
|
518
|
+
base_dir, migration, base_marker=base_marker
|
|
519
|
+
)
|
|
520
|
+
if target_dir is not None:
|
|
383
521
|
alias_active = True
|
|
384
522
|
directory = target_dir
|
|
385
523
|
else:
|
|
@@ -445,7 +583,6 @@ class Furu[T](ABC):
|
|
|
445
583
|
message="Computation previously failed",
|
|
446
584
|
)
|
|
447
585
|
|
|
448
|
-
needs_reconcile = True
|
|
449
586
|
if isinstance(state0.result, _StateResultSuccess):
|
|
450
587
|
# Double check logic if we fell through to here (e.g. race condition or invalidation above)
|
|
451
588
|
if self._always_rerun():
|
|
@@ -460,9 +597,6 @@ class Furu[T](ABC):
|
|
|
460
597
|
directory, reason="_validate returned false"
|
|
461
598
|
)
|
|
462
599
|
state0 = StateManager.read_state(directory)
|
|
463
|
-
else:
|
|
464
|
-
# Valid success found, skip reconcile
|
|
465
|
-
needs_reconcile = False
|
|
466
600
|
except Exception as e:
|
|
467
601
|
self._invalidate_cached_success(
|
|
468
602
|
directory,
|
|
@@ -470,11 +604,6 @@ class Furu[T](ABC):
|
|
|
470
604
|
)
|
|
471
605
|
state0 = StateManager.read_state(directory)
|
|
472
606
|
|
|
473
|
-
if needs_reconcile and executor is not None:
|
|
474
|
-
adapter0 = SubmititAdapter(executor)
|
|
475
|
-
self._reconcile(directory, adapter=adapter0)
|
|
476
|
-
state0 = StateManager.read_state(directory)
|
|
477
|
-
|
|
478
607
|
attempt0 = state0.attempt
|
|
479
608
|
if isinstance(state0.result, _StateResultSuccess):
|
|
480
609
|
decision = "success->load"
|
|
@@ -489,17 +618,25 @@ class Furu[T](ABC):
|
|
|
489
618
|
# Cache hits can be extremely noisy in pipelines; keep logs for state
|
|
490
619
|
# transitions (create/wait) and error cases, but suppress repeated
|
|
491
620
|
# "success->load" lines and the raw separator on successful loads.
|
|
492
|
-
self._log_console_start(
|
|
621
|
+
self._log_console_start(
|
|
622
|
+
action_color=action_color,
|
|
623
|
+
caller_info=caller_info,
|
|
624
|
+
)
|
|
493
625
|
|
|
494
626
|
if decision != "success->load":
|
|
627
|
+
if decision == "create":
|
|
628
|
+
StateManager.ensure_internal_dir(directory)
|
|
495
629
|
write_separator()
|
|
496
630
|
logger.debug(
|
|
497
|
-
"
|
|
631
|
+
"get %s %s %s (%s)",
|
|
498
632
|
self.__class__.__name__,
|
|
499
|
-
self.
|
|
633
|
+
self.furu_hash,
|
|
500
634
|
directory,
|
|
501
635
|
decision,
|
|
502
|
-
extra={
|
|
636
|
+
extra={
|
|
637
|
+
"furu_action_color": action_color,
|
|
638
|
+
**caller_info,
|
|
639
|
+
},
|
|
503
640
|
)
|
|
504
641
|
|
|
505
642
|
# Fast path: already successful
|
|
@@ -514,9 +651,9 @@ class Furu[T](ABC):
|
|
|
514
651
|
# failures even when we suppressed the cache-hit header line.
|
|
515
652
|
write_separator()
|
|
516
653
|
logger.error(
|
|
517
|
-
"
|
|
654
|
+
"get %s %s (load failed)",
|
|
518
655
|
self.__class__.__name__,
|
|
519
|
-
self.
|
|
656
|
+
self.furu_hash,
|
|
520
657
|
)
|
|
521
658
|
raise FuruComputeError(
|
|
522
659
|
f"Failed to load result from {directory}",
|
|
@@ -524,65 +661,43 @@ class Furu[T](ABC):
|
|
|
524
661
|
e,
|
|
525
662
|
) from e
|
|
526
663
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
if created_here:
|
|
536
|
-
logger.debug(
|
|
537
|
-
"load_or_create: %s created -> return",
|
|
538
|
-
self.__class__.__name__,
|
|
539
|
-
)
|
|
540
|
-
return cast(T, result)
|
|
664
|
+
status, created_here, result = self._run_locally(
|
|
665
|
+
start_time=start_time,
|
|
666
|
+
allow_failed=retry_failed_effective,
|
|
667
|
+
executor_mode=False,
|
|
668
|
+
)
|
|
669
|
+
if status == "success":
|
|
670
|
+
ok = True
|
|
671
|
+
if created_here:
|
|
541
672
|
logger.debug(
|
|
542
|
-
"
|
|
673
|
+
"get: %s created -> return",
|
|
543
674
|
self.__class__.__name__,
|
|
544
675
|
)
|
|
545
|
-
return
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
None,
|
|
550
|
-
message="Computation previously failed",
|
|
676
|
+
return cast(T, result)
|
|
677
|
+
logger.debug(
|
|
678
|
+
"get: %s success -> _load()",
|
|
679
|
+
self.__class__.__name__,
|
|
551
680
|
)
|
|
681
|
+
return self._load()
|
|
552
682
|
|
|
553
|
-
|
|
554
|
-
(submitit_folder := self._base_furu_dir() / "submitit").mkdir(
|
|
555
|
-
exist_ok=True, parents=True
|
|
556
|
-
)
|
|
557
|
-
executor.folder = submitit_folder
|
|
558
|
-
adapter = SubmititAdapter(executor)
|
|
559
|
-
|
|
560
|
-
logger.debug(
|
|
561
|
-
"load_or_create: %s -> submitit submit_once()",
|
|
562
|
-
self.__class__.__name__,
|
|
563
|
-
)
|
|
564
|
-
job = self._submit_once(
|
|
565
|
-
adapter,
|
|
683
|
+
raise self._build_failed_state_error(
|
|
566
684
|
directory,
|
|
567
685
|
None,
|
|
568
|
-
|
|
686
|
+
message="Computation previously failed",
|
|
569
687
|
)
|
|
570
|
-
ok = True
|
|
571
|
-
return cast(submitit.Job[T], job)
|
|
572
688
|
finally:
|
|
573
689
|
if has_parent:
|
|
574
690
|
logger.debug(
|
|
575
691
|
"dep: end %s %s (%s)",
|
|
576
692
|
self.__class__.__name__,
|
|
577
|
-
self.
|
|
693
|
+
self.furu_hash,
|
|
578
694
|
"ok" if ok else "error",
|
|
695
|
+
extra=caller_info,
|
|
579
696
|
)
|
|
580
697
|
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
logger = get_logger()
|
|
698
|
+
@staticmethod
|
|
699
|
+
def _get_caller_info() -> _CallerInfo:
|
|
584
700
|
frame = sys._getframe(1)
|
|
585
|
-
|
|
586
701
|
caller_info: _CallerInfo = {}
|
|
587
702
|
if frame is not None:
|
|
588
703
|
# Walk up the stack to find the caller outside of furu package
|
|
@@ -597,11 +712,20 @@ class Furu[T](ABC):
|
|
|
597
712
|
}
|
|
598
713
|
break
|
|
599
714
|
frame = frame.f_back
|
|
715
|
+
return caller_info
|
|
716
|
+
|
|
717
|
+
def _log_console_start(
|
|
718
|
+
self, action_color: str, caller_info: _CallerInfo | None = None
|
|
719
|
+
) -> None:
|
|
720
|
+
"""Log the start of get to console with caller info."""
|
|
721
|
+
logger = get_logger()
|
|
722
|
+
if caller_info is None:
|
|
723
|
+
caller_info = self._get_caller_info()
|
|
600
724
|
|
|
601
725
|
logger.info(
|
|
602
|
-
"
|
|
726
|
+
"get %s %s",
|
|
603
727
|
self.__class__.__name__,
|
|
604
|
-
self.
|
|
728
|
+
self.furu_hash,
|
|
605
729
|
extra={
|
|
606
730
|
"furu_console_only": True,
|
|
607
731
|
"furu_action_color": action_color,
|
|
@@ -612,20 +736,14 @@ class Furu[T](ABC):
|
|
|
612
736
|
def _add_exception_breadcrumbs(self, exc: BaseException, directory: Path) -> None:
|
|
613
737
|
if not hasattr(exc, "add_note"):
|
|
614
738
|
return
|
|
615
|
-
|
|
616
|
-
log_path = StateManager.get_internal_dir(directory) / "furu.log"
|
|
617
|
-
note = (
|
|
618
|
-
f"Furu directory: {directory}\n"
|
|
619
|
-
f"State file: {state_path}\n"
|
|
620
|
-
f"Log file: {log_path}"
|
|
621
|
-
)
|
|
739
|
+
note = f"Furu dir: {directory}"
|
|
622
740
|
exc.add_note(note)
|
|
623
741
|
|
|
624
742
|
@staticmethod
|
|
625
743
|
def _failed_state_hints() -> list[str]:
|
|
626
744
|
return [
|
|
627
|
-
"To retry this failed artifact: set FURU_RETRY_FAILED=1 or call
|
|
628
|
-
"To inspect details: open the
|
|
745
|
+
"To retry this failed artifact: set FURU_RETRY_FAILED=1 or call get() again.",
|
|
746
|
+
"To inspect details: open the furu dir shown above.",
|
|
629
747
|
]
|
|
630
748
|
|
|
631
749
|
def _build_failed_state_error(
|
|
@@ -678,9 +796,11 @@ class Furu[T](ABC):
|
|
|
678
796
|
"""Return the alias-aware state for this Furu directory."""
|
|
679
797
|
base_dir = directory or self._base_furu_dir()
|
|
680
798
|
record = self._alias_record(base_dir)
|
|
681
|
-
if record is None
|
|
799
|
+
if record is None:
|
|
800
|
+
return StateManager.read_state(base_dir)
|
|
801
|
+
target_dir = self._alias_target_dir(base_dir, record)
|
|
802
|
+
if target_dir is None:
|
|
682
803
|
return StateManager.read_state(base_dir)
|
|
683
|
-
target_dir = MigrationManager.resolve_dir(record, target="from")
|
|
684
804
|
return StateManager.read_state(target_dir)
|
|
685
805
|
|
|
686
806
|
def _alias_record(self, directory: Path) -> MigrationRecord | None:
|
|
@@ -689,15 +809,36 @@ class Furu[T](ABC):
|
|
|
689
809
|
return None
|
|
690
810
|
return record
|
|
691
811
|
|
|
692
|
-
def
|
|
812
|
+
def _alias_target_dir(
|
|
813
|
+
self,
|
|
814
|
+
directory: Path,
|
|
815
|
+
record: MigrationRecord,
|
|
816
|
+
*,
|
|
817
|
+
base_marker: bool | None = None,
|
|
818
|
+
) -> Path | None:
|
|
693
819
|
if record.overwritten_at is not None:
|
|
694
|
-
return
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
820
|
+
return None
|
|
821
|
+
if base_marker is None:
|
|
822
|
+
base_marker = StateManager.success_marker_exists(directory)
|
|
823
|
+
if base_marker:
|
|
824
|
+
return None
|
|
698
825
|
target = MigrationManager.resolve_dir(record, target="from")
|
|
699
|
-
|
|
700
|
-
|
|
826
|
+
if StateManager.success_marker_exists(target):
|
|
827
|
+
return target
|
|
828
|
+
return None
|
|
829
|
+
|
|
830
|
+
def _success_marker_dir(self, directory: Path) -> Path | None:
|
|
831
|
+
base_marker = StateManager.success_marker_exists(directory)
|
|
832
|
+
record = self._alias_record(directory)
|
|
833
|
+
if record is None:
|
|
834
|
+
return directory if base_marker else None
|
|
835
|
+
target_dir = self._alias_target_dir(directory, record, base_marker=base_marker)
|
|
836
|
+
if target_dir is not None:
|
|
837
|
+
return target_dir
|
|
838
|
+
return directory if base_marker else None
|
|
839
|
+
|
|
840
|
+
def _alias_is_active(self, directory: Path, record: MigrationRecord) -> bool:
|
|
841
|
+
return self._alias_target_dir(directory, record) is not None
|
|
701
842
|
|
|
702
843
|
def _maybe_detach_alias(
|
|
703
844
|
self: Self,
|
|
@@ -738,6 +879,7 @@ class Furu[T](ABC):
|
|
|
738
879
|
) -> SubmititJob | None:
|
|
739
880
|
"""Submit job once without waiting (fire-and-forget mode)."""
|
|
740
881
|
logger = get_logger()
|
|
882
|
+
StateManager.ensure_internal_dir(directory)
|
|
741
883
|
self._reconcile(directory, adapter=adapter)
|
|
742
884
|
state = StateManager.read_state(directory)
|
|
743
885
|
attempt = state.attempt
|
|
@@ -758,7 +900,7 @@ class Furu[T](ABC):
|
|
|
758
900
|
logger.debug(
|
|
759
901
|
"submit: waiting for submit lock %s %s %s",
|
|
760
902
|
self.__class__.__name__,
|
|
761
|
-
self.
|
|
903
|
+
self.furu_hash,
|
|
762
904
|
directory,
|
|
763
905
|
)
|
|
764
906
|
time.sleep(0.5)
|
|
@@ -767,9 +909,7 @@ class Furu[T](ABC):
|
|
|
767
909
|
attempt_id: str | None = None
|
|
768
910
|
try:
|
|
769
911
|
# Create metadata
|
|
770
|
-
metadata = MetadataManager.create_metadata(
|
|
771
|
-
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
772
|
-
)
|
|
912
|
+
metadata = MetadataManager.create_metadata(self, directory)
|
|
773
913
|
MetadataManager.write_metadata(metadata, directory)
|
|
774
914
|
|
|
775
915
|
env_info = MetadataManager.collect_environment_info()
|
|
@@ -843,127 +983,197 @@ class Furu[T](ABC):
|
|
|
843
983
|
"""Entry point for worker process (called by submitit or locally)."""
|
|
844
984
|
with enter_holder(self):
|
|
845
985
|
logger = get_logger()
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
986
|
+
# Ensure executor semantics apply to *all* work in the worker, not
|
|
987
|
+
# just `_create()`. This prevents accidental dependency computation
|
|
988
|
+
# (e.g., from within `_validate()` or metadata hooks).
|
|
989
|
+
from furu.execution.context import EXEC_CONTEXT, ExecContext
|
|
990
|
+
|
|
991
|
+
exec_token = EXEC_CONTEXT.set(
|
|
992
|
+
ExecContext(
|
|
993
|
+
mode="executor",
|
|
994
|
+
spec_key=self._executor_spec_key(),
|
|
995
|
+
backend="submitit",
|
|
996
|
+
current_node_hash=self.furu_hash,
|
|
997
|
+
)
|
|
852
998
|
)
|
|
853
|
-
|
|
854
999
|
try:
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
reconcile_fn=lambda d: self._reconcile(d),
|
|
874
|
-
allow_failed=allow_failed_effective,
|
|
875
|
-
) as ctx:
|
|
876
|
-
stage = "metadata"
|
|
877
|
-
try:
|
|
878
|
-
# Refresh metadata (now safe - attempt is already recorded)
|
|
879
|
-
metadata = MetadataManager.create_metadata(
|
|
880
|
-
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
881
|
-
)
|
|
882
|
-
MetadataManager.write_metadata(metadata, directory)
|
|
1000
|
+
directory = self._base_furu_dir()
|
|
1001
|
+
StateManager.ensure_internal_dir(directory)
|
|
1002
|
+
always_rerun = self._always_rerun()
|
|
1003
|
+
needs_success_invalidation = False
|
|
1004
|
+
if not always_rerun:
|
|
1005
|
+
exists_ok = self._exists_quiet()
|
|
1006
|
+
if not exists_ok:
|
|
1007
|
+
state = self.get_state(directory)
|
|
1008
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
1009
|
+
needs_success_invalidation = True
|
|
1010
|
+
|
|
1011
|
+
env_info = self._collect_submitit_env()
|
|
1012
|
+
allow_failed_effective = (
|
|
1013
|
+
allow_failed
|
|
1014
|
+
if allow_failed is not None
|
|
1015
|
+
else FURU_CONFIG.retry_failed
|
|
1016
|
+
)
|
|
1017
|
+
allow_success = always_rerun or needs_success_invalidation
|
|
883
1018
|
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
1019
|
+
try:
|
|
1020
|
+
with compute_lock(
|
|
1021
|
+
directory,
|
|
1022
|
+
backend="submitit",
|
|
1023
|
+
lease_duration_sec=FURU_CONFIG.lease_duration_sec,
|
|
1024
|
+
heartbeat_interval_sec=FURU_CONFIG.heartbeat_interval_sec,
|
|
1025
|
+
owner={
|
|
1026
|
+
"pid": os.getpid(),
|
|
1027
|
+
"host": socket.gethostname(),
|
|
1028
|
+
"user": getpass.getuser(),
|
|
1029
|
+
"command": " ".join(sys.argv) if sys.argv else "<unknown>",
|
|
1030
|
+
},
|
|
1031
|
+
scheduler={
|
|
1032
|
+
"backend": env_info.get("backend"),
|
|
1033
|
+
"job_id": env_info.get("slurm_job_id"),
|
|
1034
|
+
},
|
|
1035
|
+
max_wait_time_sec=None, # Workers wait indefinitely
|
|
1036
|
+
poll_interval_sec=FURU_CONFIG.poll_interval,
|
|
1037
|
+
wait_log_every_sec=FURU_CONFIG.wait_log_every_sec,
|
|
1038
|
+
reconcile_fn=lambda d: self._reconcile(d),
|
|
1039
|
+
allow_failed=allow_failed_effective,
|
|
1040
|
+
allow_success=allow_success,
|
|
1041
|
+
) as ctx:
|
|
1042
|
+
self._prepare_executor_rerun(directory)
|
|
1043
|
+
if not always_rerun:
|
|
1044
|
+
exists_ok = self._exists_quiet()
|
|
1045
|
+
if not exists_ok:
|
|
1046
|
+
state = self.get_state(directory)
|
|
1047
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
1048
|
+
self._invalidate_cached_success(
|
|
1049
|
+
directory,
|
|
1050
|
+
reason="_validate returned false (worker)",
|
|
1051
|
+
)
|
|
1052
|
+
|
|
1053
|
+
stage = "metadata"
|
|
1054
|
+
try:
|
|
1055
|
+
# Refresh metadata (now safe - attempt is already recorded)
|
|
1056
|
+
metadata = MetadataManager.create_metadata(self, directory)
|
|
1057
|
+
MetadataManager.write_metadata(metadata, directory)
|
|
889
1058
|
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
self._furu_hash,
|
|
903
|
-
directory,
|
|
904
|
-
)
|
|
905
|
-
StateManager.write_success_marker(
|
|
906
|
-
directory, attempt_id=ctx.attempt_id
|
|
907
|
-
)
|
|
908
|
-
StateManager.finish_attempt_success(
|
|
909
|
-
directory, attempt_id=ctx.attempt_id
|
|
910
|
-
)
|
|
911
|
-
logger.info(
|
|
912
|
-
"_create ok %s %s",
|
|
913
|
-
self.__class__.__name__,
|
|
914
|
-
self._furu_hash,
|
|
915
|
-
extra={"furu_console_only": True},
|
|
916
|
-
)
|
|
917
|
-
except Exception as e:
|
|
918
|
-
if stage == "_create":
|
|
919
|
-
logger.error(
|
|
920
|
-
"_create failed %s %s %s",
|
|
1059
|
+
# Set up signal handlers
|
|
1060
|
+
stage = "signal handler setup"
|
|
1061
|
+
self._setup_signal_handlers(
|
|
1062
|
+
directory,
|
|
1063
|
+
ctx.stop_heartbeat,
|
|
1064
|
+
attempt_id=ctx.attempt_id,
|
|
1065
|
+
)
|
|
1066
|
+
|
|
1067
|
+
stage = "_create"
|
|
1068
|
+
# Run computation
|
|
1069
|
+
logger.debug(
|
|
1070
|
+
"_create: begin %s %s %s",
|
|
921
1071
|
self.__class__.__name__,
|
|
922
|
-
self.
|
|
1072
|
+
self.furu_hash,
|
|
923
1073
|
directory,
|
|
924
|
-
extra={"furu_file_only": True},
|
|
925
1074
|
)
|
|
926
|
-
|
|
927
|
-
logger.
|
|
928
|
-
"
|
|
929
|
-
stage,
|
|
1075
|
+
self._create()
|
|
1076
|
+
logger.debug(
|
|
1077
|
+
"_create: ok %s %s %s",
|
|
930
1078
|
self.__class__.__name__,
|
|
931
|
-
self.
|
|
1079
|
+
self.furu_hash,
|
|
932
1080
|
directory,
|
|
1081
|
+
)
|
|
1082
|
+
StateManager.write_success_marker(
|
|
1083
|
+
directory, attempt_id=ctx.attempt_id
|
|
1084
|
+
)
|
|
1085
|
+
StateManager.finish_attempt_success(
|
|
1086
|
+
directory, attempt_id=ctx.attempt_id
|
|
1087
|
+
)
|
|
1088
|
+
logger.info(
|
|
1089
|
+
"_create ok %s %s",
|
|
1090
|
+
self.__class__.__name__,
|
|
1091
|
+
self.furu_hash,
|
|
1092
|
+
extra={"furu_console_only": True},
|
|
1093
|
+
)
|
|
1094
|
+
except Exception as e:
|
|
1095
|
+
if stage == "_create":
|
|
1096
|
+
logger.error(
|
|
1097
|
+
"_create failed %s %s %s",
|
|
1098
|
+
self.__class__.__name__,
|
|
1099
|
+
self.furu_hash,
|
|
1100
|
+
directory,
|
|
1101
|
+
extra={"furu_file_only": True},
|
|
1102
|
+
)
|
|
1103
|
+
else:
|
|
1104
|
+
logger.error(
|
|
1105
|
+
"attempt failed (%s) %s %s %s",
|
|
1106
|
+
stage,
|
|
1107
|
+
self.__class__.__name__,
|
|
1108
|
+
self.furu_hash,
|
|
1109
|
+
directory,
|
|
1110
|
+
extra={"furu_file_only": True},
|
|
1111
|
+
)
|
|
1112
|
+
logger.error(
|
|
1113
|
+
"%s",
|
|
1114
|
+
format_traceback(e),
|
|
933
1115
|
extra={"furu_file_only": True},
|
|
934
1116
|
)
|
|
935
|
-
logger.error(
|
|
936
|
-
"%s", format_traceback(e), extra={"furu_file_only": True}
|
|
937
|
-
)
|
|
938
1117
|
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
)
|
|
942
|
-
StateManager.finish_attempt_failed(
|
|
943
|
-
directory,
|
|
944
|
-
attempt_id=ctx.attempt_id,
|
|
945
|
-
error={
|
|
946
|
-
"type": type(e).__name__,
|
|
947
|
-
"message": str(e),
|
|
948
|
-
"traceback": tb,
|
|
949
|
-
},
|
|
950
|
-
)
|
|
951
|
-
self._add_exception_breadcrumbs(e, directory)
|
|
952
|
-
if stage != "_create":
|
|
953
|
-
message = (
|
|
954
|
-
"Failed to create metadata"
|
|
955
|
-
if stage == "metadata"
|
|
956
|
-
else "Failed to set up signal handlers"
|
|
1118
|
+
tb = "".join(
|
|
1119
|
+
traceback.format_exception(type(e), e, e.__traceback__)
|
|
957
1120
|
)
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
1121
|
+
StateManager.finish_attempt_failed(
|
|
1122
|
+
directory,
|
|
1123
|
+
attempt_id=ctx.attempt_id,
|
|
1124
|
+
error={
|
|
1125
|
+
"type": type(e).__name__,
|
|
1126
|
+
"message": str(e),
|
|
1127
|
+
"traceback": tb,
|
|
1128
|
+
},
|
|
1129
|
+
)
|
|
1130
|
+
self._add_exception_breadcrumbs(e, directory)
|
|
1131
|
+
if stage != "_create":
|
|
1132
|
+
message = (
|
|
1133
|
+
"Failed to create metadata"
|
|
1134
|
+
if stage == "metadata"
|
|
1135
|
+
else "Failed to set up signal handlers"
|
|
1136
|
+
)
|
|
1137
|
+
raise FuruComputeError(
|
|
1138
|
+
message,
|
|
1139
|
+
StateManager.get_state_path(directory),
|
|
1140
|
+
e,
|
|
1141
|
+
) from e
|
|
1142
|
+
raise
|
|
1143
|
+
except FuruLockNotAcquired as exc:
|
|
1144
|
+
# Experiment already completed; succeed if success, fail if failed.
|
|
1145
|
+
state = StateManager.read_state(directory)
|
|
1146
|
+
state_path = StateManager.get_state_path(directory)
|
|
1147
|
+
attempt = state.attempt
|
|
1148
|
+
attempt_info = "no active attempt"
|
|
1149
|
+
if attempt is not None:
|
|
1150
|
+
attempt_info = (
|
|
1151
|
+
f"attempt {attempt.id} status {attempt.status} "
|
|
1152
|
+
f"backend {attempt.backend}"
|
|
1153
|
+
)
|
|
1154
|
+
hints = [
|
|
1155
|
+
f"Furu hash: {self.furu_hash}",
|
|
1156
|
+
f"Directory: {directory}",
|
|
1157
|
+
f"State file: {state_path}",
|
|
1158
|
+
f"Attempt: {attempt_info}",
|
|
1159
|
+
]
|
|
1160
|
+
if isinstance(state.result, _StateResultSuccess):
|
|
1161
|
+
return
|
|
1162
|
+
if isinstance(state.result, _StateResultFailed):
|
|
1163
|
+
if allow_failed_effective:
|
|
1164
|
+
return
|
|
1165
|
+
raise FuruComputeError(
|
|
1166
|
+
"Worker refused to run: experiment already failed",
|
|
1167
|
+
state_path,
|
|
1168
|
+
exc,
|
|
1169
|
+
hints=hints,
|
|
1170
|
+
) from exc
|
|
1171
|
+
raise FuruLockNotAcquired(
|
|
1172
|
+
"Worker refused to run: experiment already running elsewhere",
|
|
1173
|
+
hints=hints,
|
|
1174
|
+
) from exc
|
|
1175
|
+
finally:
|
|
1176
|
+
EXEC_CONTEXT.reset(exec_token)
|
|
967
1177
|
|
|
968
1178
|
def _collect_submitit_env(self: Self) -> _SubmititEnvInfo:
|
|
969
1179
|
"""Collect submitit/slurm environment information."""
|
|
@@ -994,6 +1204,7 @@ class Furu[T](ABC):
|
|
|
994
1204
|
start_time: float,
|
|
995
1205
|
*,
|
|
996
1206
|
allow_failed: bool,
|
|
1207
|
+
executor_mode: bool = False,
|
|
997
1208
|
) -> tuple[str, bool, T | None]:
|
|
998
1209
|
"""Run computation locally, returning (status, created_here, result)."""
|
|
999
1210
|
logger = get_logger()
|
|
@@ -1028,9 +1239,7 @@ class Furu[T](ABC):
|
|
|
1028
1239
|
stage = "metadata"
|
|
1029
1240
|
try:
|
|
1030
1241
|
# Create metadata (now safe - attempt is already recorded)
|
|
1031
|
-
metadata = MetadataManager.create_metadata(
|
|
1032
|
-
self, directory, ignore_diff=FURU_CONFIG.ignore_git_diff
|
|
1033
|
-
)
|
|
1242
|
+
metadata = MetadataManager.create_metadata(self, directory)
|
|
1034
1243
|
MetadataManager.write_metadata(metadata, directory)
|
|
1035
1244
|
|
|
1036
1245
|
# Set up preemption handler
|
|
@@ -1044,14 +1253,30 @@ class Furu[T](ABC):
|
|
|
1044
1253
|
logger.debug(
|
|
1045
1254
|
"_create: begin %s %s %s",
|
|
1046
1255
|
self.__class__.__name__,
|
|
1047
|
-
self.
|
|
1256
|
+
self.furu_hash,
|
|
1048
1257
|
directory,
|
|
1049
1258
|
)
|
|
1050
|
-
|
|
1259
|
+
token = None
|
|
1260
|
+
if executor_mode:
|
|
1261
|
+
from furu.execution.context import EXEC_CONTEXT, ExecContext
|
|
1262
|
+
|
|
1263
|
+
token = EXEC_CONTEXT.set(
|
|
1264
|
+
ExecContext(
|
|
1265
|
+
mode="executor",
|
|
1266
|
+
spec_key=self._executor_spec_key(),
|
|
1267
|
+
backend="local",
|
|
1268
|
+
current_node_hash=self.furu_hash,
|
|
1269
|
+
)
|
|
1270
|
+
)
|
|
1271
|
+
try:
|
|
1272
|
+
result = self._create()
|
|
1273
|
+
finally:
|
|
1274
|
+
if token is not None:
|
|
1275
|
+
EXEC_CONTEXT.reset(token)
|
|
1051
1276
|
logger.debug(
|
|
1052
1277
|
"_create: ok %s %s %s",
|
|
1053
1278
|
self.__class__.__name__,
|
|
1054
|
-
self.
|
|
1279
|
+
self.furu_hash,
|
|
1055
1280
|
directory,
|
|
1056
1281
|
)
|
|
1057
1282
|
StateManager.write_success_marker(
|
|
@@ -1063,7 +1288,7 @@ class Furu[T](ABC):
|
|
|
1063
1288
|
logger.info(
|
|
1064
1289
|
"_create ok %s %s",
|
|
1065
1290
|
self.__class__.__name__,
|
|
1066
|
-
self.
|
|
1291
|
+
self.furu_hash,
|
|
1067
1292
|
extra={"furu_console_only": True},
|
|
1068
1293
|
)
|
|
1069
1294
|
return "success", True, result
|
|
@@ -1072,7 +1297,7 @@ class Furu[T](ABC):
|
|
|
1072
1297
|
logger.error(
|
|
1073
1298
|
"_create failed %s %s %s",
|
|
1074
1299
|
self.__class__.__name__,
|
|
1075
|
-
self.
|
|
1300
|
+
self.furu_hash,
|
|
1076
1301
|
directory,
|
|
1077
1302
|
extra={"furu_file_only": True},
|
|
1078
1303
|
)
|
|
@@ -1081,7 +1306,7 @@ class Furu[T](ABC):
|
|
|
1081
1306
|
"attempt failed (%s) %s %s %s",
|
|
1082
1307
|
stage,
|
|
1083
1308
|
self.__class__.__name__,
|
|
1084
|
-
self.
|
|
1309
|
+
self.furu_hash,
|
|
1085
1310
|
directory,
|
|
1086
1311
|
extra={"furu_file_only": True},
|
|
1087
1312
|
)
|
|
@@ -1145,6 +1370,8 @@ class Furu[T](ABC):
|
|
|
1145
1370
|
attempt_id: str,
|
|
1146
1371
|
) -> None:
|
|
1147
1372
|
"""Set up signal handlers for graceful preemption."""
|
|
1373
|
+
if threading.current_thread() is not threading.main_thread():
|
|
1374
|
+
return
|
|
1148
1375
|
|
|
1149
1376
|
def handle_signal(signum: int, frame: FrameType | None) -> None:
|
|
1150
1377
|
try:
|
|
@@ -1191,7 +1418,7 @@ def _collect_dependencies(
|
|
|
1191
1418
|
recursive: bool,
|
|
1192
1419
|
) -> None:
|
|
1193
1420
|
for dependency in _direct_dependencies(obj):
|
|
1194
|
-
digest = dependency.
|
|
1421
|
+
digest = dependency.furu_hash
|
|
1195
1422
|
if digest in seen:
|
|
1196
1423
|
continue
|
|
1197
1424
|
seen.add(digest)
|
|
@@ -1345,7 +1572,7 @@ def _sorted_dependency_set(
|
|
|
1345
1572
|
|
|
1346
1573
|
def _dependency_sort_key(value: DependencyScanValue) -> tuple[int, str]:
|
|
1347
1574
|
if isinstance(value, Furu):
|
|
1348
|
-
return (0, value.
|
|
1575
|
+
return (0, cast(str, value.furu_hash))
|
|
1349
1576
|
return (1, f"{type(value).__name__}:{value!r}")
|
|
1350
1577
|
|
|
1351
1578
|
|