experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +130 -5
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +489 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +225 -30
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b4.dist-info}/licenses/LICENSE +0 -0
experimaestro/core/types.py
CHANGED
|
@@ -1,7 +1,18 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
2
3
|
import inspect
|
|
3
4
|
import sys
|
|
4
|
-
from typing import
|
|
5
|
+
from typing import (
|
|
6
|
+
Set,
|
|
7
|
+
TypeVar,
|
|
8
|
+
Union,
|
|
9
|
+
Dict,
|
|
10
|
+
Iterator,
|
|
11
|
+
List,
|
|
12
|
+
Optional,
|
|
13
|
+
get_args,
|
|
14
|
+
get_origin,
|
|
15
|
+
)
|
|
5
16
|
from collections import ChainMap
|
|
6
17
|
from pathlib import Path
|
|
7
18
|
import typing
|
|
@@ -13,15 +24,27 @@ from enum import Enum
|
|
|
13
24
|
import ast
|
|
14
25
|
import textwrap
|
|
15
26
|
|
|
16
|
-
|
|
17
|
-
from typing_extensions import _AnnotatedAlias, get_type_hints
|
|
18
|
-
else:
|
|
19
|
-
from typing import _AnnotatedAlias, get_type_hints
|
|
27
|
+
from typing import _AnnotatedAlias, get_type_hints
|
|
20
28
|
|
|
21
29
|
if typing.TYPE_CHECKING:
|
|
22
30
|
from experimaestro.scheduler.base import Job
|
|
23
31
|
from experimaestro.launchers import Launcher
|
|
24
32
|
from experimaestro.core.objects import Config
|
|
33
|
+
from experimaestro.core.subparameters import Subparameters
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
|
|
37
|
+
class DeprecationInfo:
|
|
38
|
+
"""Information about a deprecated configuration type."""
|
|
39
|
+
|
|
40
|
+
#: The original identifier before deprecation
|
|
41
|
+
original_identifier: "Identifier"
|
|
42
|
+
|
|
43
|
+
#: The target configuration class to convert to
|
|
44
|
+
target: type
|
|
45
|
+
|
|
46
|
+
#: If True, creating an instance immediately converts to the target type
|
|
47
|
+
replace: bool = False
|
|
25
48
|
|
|
26
49
|
|
|
27
50
|
class Identifier:
|
|
@@ -221,6 +244,9 @@ class ObjectType(Type):
|
|
|
221
244
|
self._title = None
|
|
222
245
|
self.submit_hooks = set()
|
|
223
246
|
|
|
247
|
+
# Warning flag for non-resumable task directory cleanup
|
|
248
|
+
self.warned_clean_not_resumable = False
|
|
249
|
+
|
|
224
250
|
# --- Get the identifier
|
|
225
251
|
if identifier is None and hasattr(tp, "__xpmid__"):
|
|
226
252
|
__xpmid__ = getattr(tp, "__xpmid__")
|
|
@@ -279,7 +305,20 @@ class ObjectType(Type):
|
|
|
279
305
|
self.__initialized__ = False
|
|
280
306
|
self._runtype = None
|
|
281
307
|
self.annotations = []
|
|
282
|
-
self.
|
|
308
|
+
self._deprecation: Optional[DeprecationInfo] = None
|
|
309
|
+
|
|
310
|
+
# --- Value class (for external value types, e.g., nn.Module subclasses)
|
|
311
|
+
self._original_type: type = tp # Keep reference to original config class
|
|
312
|
+
|
|
313
|
+
# --- Subparameters for partial identifier computation
|
|
314
|
+
self._subparameters: Dict[str, "Subparameters"] = {}
|
|
315
|
+
|
|
316
|
+
def set_value_type(self, value_class: type) -> None:
|
|
317
|
+
"""Register an explicit value class for this configuration.
|
|
318
|
+
|
|
319
|
+
The value class will be used when creating instances via .instance().
|
|
320
|
+
"""
|
|
321
|
+
self.value_type = value_class
|
|
283
322
|
|
|
284
323
|
def addAnnotation(self, annotation):
|
|
285
324
|
assert not self.__initialized__
|
|
@@ -335,15 +374,18 @@ class ObjectType(Type):
|
|
|
335
374
|
# Add task
|
|
336
375
|
if self.taskcommandfactory is not None:
|
|
337
376
|
self.task = self.taskcommandfactory(self)
|
|
338
|
-
elif issubclass(self.
|
|
377
|
+
elif issubclass(self._original_type, Task):
|
|
339
378
|
self.task = self.getpythontaskcommand()
|
|
340
379
|
|
|
341
380
|
# Add arguments from type hints
|
|
381
|
+
# Use _original_type since value_type may have been overridden by set_value_type
|
|
342
382
|
from .arguments import TypeAnnotation
|
|
343
383
|
|
|
344
|
-
if hasattr(self.
|
|
345
|
-
typekeys = set(
|
|
346
|
-
|
|
384
|
+
if hasattr(self._original_type, "__annotations__"):
|
|
385
|
+
typekeys = set(
|
|
386
|
+
self._original_type.__dict__.get("__annotations__", {}).keys()
|
|
387
|
+
)
|
|
388
|
+
hints = get_type_hints(self._original_type, include_extras=True)
|
|
347
389
|
for key, typehint in hints.items():
|
|
348
390
|
# Filter out hints from parent classes
|
|
349
391
|
if key in typekeys:
|
|
@@ -356,17 +398,27 @@ class ObjectType(Type):
|
|
|
356
398
|
try:
|
|
357
399
|
self.addArgument(
|
|
358
400
|
options.create(
|
|
359
|
-
key, self.
|
|
401
|
+
key, self._original_type, typehint.__args__[0]
|
|
360
402
|
)
|
|
361
403
|
)
|
|
362
404
|
except Exception:
|
|
363
405
|
logger.error(
|
|
364
406
|
"while adding argument %s of %s",
|
|
365
407
|
key,
|
|
366
|
-
self.
|
|
408
|
+
self._original_type,
|
|
367
409
|
)
|
|
368
410
|
raise
|
|
369
411
|
|
|
412
|
+
# Collect subparameters from class attributes
|
|
413
|
+
from .subparameters import Subparameters as SubparametersClass
|
|
414
|
+
|
|
415
|
+
for name, value in self._original_type.__dict__.items():
|
|
416
|
+
if isinstance(value, SubparametersClass):
|
|
417
|
+
# Auto-set name from attribute name if not already set
|
|
418
|
+
if value.name is None:
|
|
419
|
+
value.name = name
|
|
420
|
+
self._subparameters[name] = value
|
|
421
|
+
|
|
370
422
|
def name(self):
|
|
371
423
|
return f"{self.value_type.__module__}.{self.value_type.__qualname__}"
|
|
372
424
|
|
|
@@ -378,7 +430,8 @@ class ObjectType(Type):
|
|
|
378
430
|
self.__initialize__()
|
|
379
431
|
|
|
380
432
|
# Get description from documentation
|
|
381
|
-
|
|
433
|
+
# Use _original_type since value_type may have been overridden
|
|
434
|
+
__doc__ = self._original_type.__dict__.get("__doc__", None)
|
|
382
435
|
if __doc__:
|
|
383
436
|
parseddoc = parse(__doc__)
|
|
384
437
|
self._title = parseddoc.short_description
|
|
@@ -407,24 +460,56 @@ class ObjectType(Type):
|
|
|
407
460
|
|
|
408
461
|
argname = None
|
|
409
462
|
|
|
410
|
-
def deprecate(self):
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
463
|
+
def deprecate(self, target=None, replace: bool = False):
|
|
464
|
+
"""Mark this configuration type as deprecated.
|
|
465
|
+
|
|
466
|
+
Args:
|
|
467
|
+
target: Optional target configuration class. If provided, uses
|
|
468
|
+
target's identifier. If None, uses parent class's identifier
|
|
469
|
+
(legacy behavior requiring single inheritance).
|
|
470
|
+
replace: If True, creating an instance of this class immediately
|
|
471
|
+
returns a converted instance of the target class.
|
|
472
|
+
|
|
473
|
+
When a target is specified, the deprecated class should define a
|
|
474
|
+
__convert__ method that returns an equivalent target configuration.
|
|
475
|
+
The identifier is computed from the converted configuration.
|
|
476
|
+
"""
|
|
477
|
+
assert self._deprecation is None, "Already deprecated"
|
|
478
|
+
|
|
479
|
+
# Save the deprecated identifier for migration tools (fix_deprecated)
|
|
480
|
+
original_identifier = self.identifier
|
|
481
|
+
|
|
482
|
+
if target is not None:
|
|
483
|
+
# New mechanism: explicit target class
|
|
484
|
+
target_xpmtype = target.__getxpmtype__()
|
|
485
|
+
self.identifier = target_xpmtype.identifier
|
|
486
|
+
deprecation_target = target
|
|
487
|
+
else:
|
|
488
|
+
# Legacy mechanism: parent class is the target
|
|
489
|
+
if len(self.value_type.__bases__) != 1:
|
|
490
|
+
raise RuntimeError(
|
|
491
|
+
"Deprecated configurations must have "
|
|
492
|
+
"only one parent (the new configuration)"
|
|
493
|
+
)
|
|
494
|
+
parent = self.value_type.__bases__[0].__getxpmtype__()
|
|
495
|
+
self.identifier = parent.identifier
|
|
496
|
+
deprecation_target = self.value_type.__bases__[0]
|
|
497
|
+
|
|
498
|
+
self._deprecation = DeprecationInfo(
|
|
499
|
+
original_identifier=original_identifier,
|
|
500
|
+
target=deprecation_target,
|
|
501
|
+
replace=replace,
|
|
502
|
+
)
|
|
423
503
|
|
|
424
504
|
@property
|
|
425
505
|
def deprecated(self) -> bool:
|
|
426
506
|
"""Returns true if this type is deprecated"""
|
|
427
|
-
return self.
|
|
507
|
+
return self._deprecation is not None
|
|
508
|
+
|
|
509
|
+
@property
|
|
510
|
+
def _deprecated_identifier(self) -> Optional["Identifier"]:
|
|
511
|
+
"""Returns the original identifier before deprecation (for backwards compatibility)"""
|
|
512
|
+
return self._deprecation.original_identifier if self._deprecation else None
|
|
428
513
|
|
|
429
514
|
@property
|
|
430
515
|
def description(self) -> str:
|
|
@@ -442,23 +527,72 @@ class ObjectType(Type):
|
|
|
442
527
|
return self._arguments
|
|
443
528
|
|
|
444
529
|
def addArgument(self, argument: Argument):
|
|
530
|
+
# Check if this argument overrides a parent argument
|
|
531
|
+
# _arguments is a ChainMap where maps[0] is current class, maps[1:] are parents
|
|
532
|
+
parent_argument = None
|
|
533
|
+
for parent_map in self._arguments.maps[1:]:
|
|
534
|
+
if argument.name in parent_map:
|
|
535
|
+
parent_argument = parent_map[argument.name]
|
|
536
|
+
break
|
|
537
|
+
|
|
538
|
+
if parent_argument is not None:
|
|
539
|
+
# Check type compatibility (child type should be subtype of parent type)
|
|
540
|
+
self._check_override_type_compatibility(argument, parent_argument)
|
|
541
|
+
|
|
542
|
+
# Warn if overrides flag is not set
|
|
543
|
+
if not argument.overrides:
|
|
544
|
+
logger.warning(
|
|
545
|
+
"Parameter '%s' in %s overrides parent parameter from %s. "
|
|
546
|
+
"Use field(overrides=True) to suppress this warning.",
|
|
547
|
+
argument.name,
|
|
548
|
+
self._original_type.__qualname__,
|
|
549
|
+
(
|
|
550
|
+
parent_argument.objecttype._original_type.__qualname__
|
|
551
|
+
if parent_argument.objecttype
|
|
552
|
+
else "unknown"
|
|
553
|
+
),
|
|
554
|
+
)
|
|
555
|
+
|
|
445
556
|
self._arguments[argument.name] = argument
|
|
446
557
|
argument.objecttype = self
|
|
447
558
|
|
|
448
|
-
# The the attribute for the config type
|
|
449
|
-
setattr(
|
|
450
|
-
self.config_type,
|
|
451
|
-
argument.name,
|
|
452
|
-
property(
|
|
453
|
-
lambda _self: _self.__xpm__.get(argument.name),
|
|
454
|
-
lambda _self, value: _self.__xpm__.set(argument.name, value),
|
|
455
|
-
),
|
|
456
|
-
)
|
|
457
|
-
|
|
458
559
|
# Check default value
|
|
459
560
|
if argument.default is not None:
|
|
460
561
|
argument.type.validate(argument.default)
|
|
461
562
|
|
|
563
|
+
def _check_override_type_compatibility(
|
|
564
|
+
self, child_arg: Argument, parent_arg: Argument
|
|
565
|
+
):
|
|
566
|
+
"""Check that the child argument type is compatible with the parent type.
|
|
567
|
+
|
|
568
|
+
For Config types, the child type should be a subtype of the parent type
|
|
569
|
+
(covariant). For other types, we check for exact match.
|
|
570
|
+
"""
|
|
571
|
+
child_type = child_arg.type
|
|
572
|
+
parent_type = parent_arg.type
|
|
573
|
+
|
|
574
|
+
# Check if both are ObjectType (Config types)
|
|
575
|
+
if isinstance(child_type, ObjectType) and isinstance(parent_type, ObjectType):
|
|
576
|
+
child_pytype = child_type.value_type
|
|
577
|
+
parent_pytype = parent_type.value_type
|
|
578
|
+
|
|
579
|
+
# Check if child is a subtype of parent
|
|
580
|
+
if not issubclass(child_pytype, parent_pytype):
|
|
581
|
+
raise TypeError(
|
|
582
|
+
f"Parameter '{child_arg.name}' type {child_pytype.__qualname__} "
|
|
583
|
+
f"is not a subtype of parent type {parent_pytype.__qualname__}. "
|
|
584
|
+
f"Override types must be subtypes of the parent type."
|
|
585
|
+
)
|
|
586
|
+
elif type(child_type) is not type(parent_type):
|
|
587
|
+
# For non-Config types, check for exact type match
|
|
588
|
+
# Different type classes (e.g., IntType vs StrType) are incompatible
|
|
589
|
+
raise TypeError(
|
|
590
|
+
f"Parameter '{child_arg.name}' type {type(child_type).__name__} "
|
|
591
|
+
f"is not compatible with parent type {type(parent_type).__name__}. "
|
|
592
|
+
f"Override types must be the same type or a subtype."
|
|
593
|
+
)
|
|
594
|
+
# Same type class is allowed (e.g., both are IntType)
|
|
595
|
+
|
|
462
596
|
def getArgument(self, key: str) -> Argument:
|
|
463
597
|
self.__initialize__()
|
|
464
598
|
return self._arguments[key]
|
|
@@ -466,7 +600,10 @@ class ObjectType(Type):
|
|
|
466
600
|
def parents(self) -> Iterator["ObjectType"]:
|
|
467
601
|
from .objects import Config, Task
|
|
468
602
|
|
|
469
|
-
|
|
603
|
+
# Use _original_type to avoid issues when value_type has been
|
|
604
|
+
# overridden by set_value_type (the value class would create
|
|
605
|
+
# circular references since it inherits from the config class)
|
|
606
|
+
for tp in self._original_type.__bases__:
|
|
470
607
|
if issubclass(tp, Config) and tp not in [Config, Task]:
|
|
471
608
|
yield tp.__xpmtype__
|
|
472
609
|
|
experimaestro/exceptions.py
CHANGED
|
@@ -1,2 +1,28 @@
|
|
|
1
1
|
class HandledException(Exception):
|
|
2
2
|
pass
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class GracefulTimeout(Exception):
|
|
6
|
+
"""Exception raised to signal a graceful timeout in resumable tasks.
|
|
7
|
+
|
|
8
|
+
Raise this exception when a task needs to checkpoint and exit before
|
|
9
|
+
a time limit (e.g., SLURM walltime). The task will be marked for retry
|
|
10
|
+
rather than as failed.
|
|
11
|
+
|
|
12
|
+
Example::
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
class LongTraining(ResumableTask):
|
|
16
|
+
def execute(self):
|
|
17
|
+
for epoch in range(self.epochs):
|
|
18
|
+
remaining = self.remaining_time()
|
|
19
|
+
if remaining is not None and remaining < 300:
|
|
20
|
+
save_checkpoint(self.checkpoint, epoch)
|
|
21
|
+
raise GracefulTimeout("Not enough time for another epoch")
|
|
22
|
+
train_one_epoch()
|
|
23
|
+
```
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(self, message: str = "Task stopped gracefully before timeout"):
|
|
27
|
+
self.message = message
|
|
28
|
+
super().__init__(message)
|
experimaestro/experiments/cli.py
CHANGED
|
@@ -52,8 +52,7 @@ class ExperimentHelper:
|
|
|
52
52
|
class ExperimentCallable(Protocol):
|
|
53
53
|
"""Protocol for the run function"""
|
|
54
54
|
|
|
55
|
-
def __call__(self, helper: ExperimentHelper, configuration: Any):
|
|
56
|
-
...
|
|
55
|
+
def __call__(self, helper: ExperimentHelper, configuration: Any): ... # noqa: E704
|
|
57
56
|
|
|
58
57
|
|
|
59
58
|
class ConfigurationLoader:
|
|
@@ -126,6 +125,11 @@ class ConfigurationLoader:
|
|
|
126
125
|
default=None,
|
|
127
126
|
help="Port for monitoring (can be defined in the settings.yaml file)",
|
|
128
127
|
)
|
|
128
|
+
@click.option(
|
|
129
|
+
"--console",
|
|
130
|
+
is_flag=True,
|
|
131
|
+
help="Launch Textual console UI for monitoring with logs",
|
|
132
|
+
)
|
|
129
133
|
@click.option(
|
|
130
134
|
"--file",
|
|
131
135
|
"xp_file",
|
|
@@ -162,6 +166,7 @@ def experiments_cli( # noqa: C901
|
|
|
162
166
|
xp_file: str,
|
|
163
167
|
host: str,
|
|
164
168
|
port: int,
|
|
169
|
+
console: bool,
|
|
165
170
|
xpm_config_dir: Path,
|
|
166
171
|
workdir: Optional[Path],
|
|
167
172
|
workspace: Optional[str],
|
|
@@ -298,43 +303,120 @@ def experiments_cli( # noqa: C901
|
|
|
298
303
|
configuration, structured_config_mode=SCMode.INSTANTIATE
|
|
299
304
|
)
|
|
300
305
|
|
|
301
|
-
# Define the workspace
|
|
302
|
-
ws_env = find_workspace(workdir=workdir, workspace=workspace)
|
|
303
|
-
|
|
304
|
-
workdir = ws_env.path
|
|
305
|
-
|
|
306
306
|
# --- Sets up the experiment ID
|
|
307
|
-
|
|
308
|
-
# --- Runs the experiment
|
|
309
307
|
if xp_configuration.add_timestamp:
|
|
310
308
|
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M")
|
|
311
309
|
experiment_id = f"""{xp_configuration.id}-{timestamp}"""
|
|
312
310
|
else:
|
|
313
311
|
experiment_id = xp_configuration.id
|
|
314
312
|
|
|
313
|
+
# Define the workspace (may auto-select based on experiment_id triggers)
|
|
314
|
+
ws_env = find_workspace(
|
|
315
|
+
workdir=workdir, workspace=workspace, experiment_id=experiment_id
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
workdir = ws_env.path
|
|
319
|
+
|
|
315
320
|
logging.info(
|
|
316
321
|
"Running experiment %s working directory %s",
|
|
317
322
|
experiment_id,
|
|
318
323
|
str(workdir.resolve()),
|
|
319
324
|
)
|
|
320
|
-
with experiment(
|
|
321
|
-
ws_env, experiment_id, host=host, port=port, run_mode=run_mode
|
|
322
|
-
) as xp:
|
|
323
|
-
# Set up the environment
|
|
324
|
-
# (1) global settings (2) workspace settings and (3) command line settings
|
|
325
|
-
for key, value in env:
|
|
326
|
-
xp.setenv(key, value)
|
|
327
|
-
|
|
328
|
-
# Sets the python path
|
|
329
|
-
xp.workspace.python_path.extend(python_path)
|
|
330
325
|
|
|
326
|
+
# Define the experiment execution function
|
|
327
|
+
def run_experiment_code(xp_holder=None, xp_ready_event=None, register_signals=True):
|
|
328
|
+
"""Run the experiment code - optionally storing xp in xp_holder"""
|
|
331
329
|
try:
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
330
|
+
with experiment(
|
|
331
|
+
ws_env,
|
|
332
|
+
experiment_id,
|
|
333
|
+
host=host,
|
|
334
|
+
port=port,
|
|
335
|
+
run_mode=run_mode,
|
|
336
|
+
register_signals=register_signals,
|
|
337
|
+
) as xp:
|
|
338
|
+
if xp_holder is not None:
|
|
339
|
+
xp_holder["xp"] = xp
|
|
340
|
+
if xp_ready_event is not None:
|
|
341
|
+
xp_ready_event.set() # Signal that xp is ready
|
|
342
|
+
|
|
343
|
+
# Test logging from experiment thread
|
|
344
|
+
logging.info("Experiment started in background thread")
|
|
345
|
+
|
|
346
|
+
# Set up the environment
|
|
347
|
+
for key, value in env:
|
|
348
|
+
xp.setenv(key, value)
|
|
349
|
+
|
|
350
|
+
# Sets the python path
|
|
351
|
+
xp.workspace.python_path.extend(python_path)
|
|
352
|
+
|
|
353
|
+
# Run the experiment
|
|
354
|
+
helper.xp = xp
|
|
355
|
+
helper.run(list(args), xp_configuration)
|
|
356
|
+
|
|
357
|
+
# ... and wait
|
|
358
|
+
xp.wait()
|
|
338
359
|
|
|
339
360
|
except HandledException:
|
|
340
361
|
sys.exit(1)
|
|
362
|
+
|
|
363
|
+
if console:
|
|
364
|
+
# Run experiment in background thread, console UI in main thread
|
|
365
|
+
import threading
|
|
366
|
+
from experimaestro.tui import ExperimentTUI
|
|
367
|
+
|
|
368
|
+
xp_holder = {"xp": None}
|
|
369
|
+
exception_holder = {"exception": None}
|
|
370
|
+
xp_ready = threading.Event()
|
|
371
|
+
|
|
372
|
+
def run_in_thread():
|
|
373
|
+
try:
|
|
374
|
+
# Don't register signals in background thread
|
|
375
|
+
run_experiment_code(xp_holder, xp_ready, register_signals=False)
|
|
376
|
+
# Add a test message after experiment completes
|
|
377
|
+
logging.info("Experiment thread completed")
|
|
378
|
+
print("Experiment thread print test")
|
|
379
|
+
except Exception as e:
|
|
380
|
+
exception_holder["exception"] = e
|
|
381
|
+
xp_ready.set() # Signal even on error
|
|
382
|
+
|
|
383
|
+
# Start experiment in background thread
|
|
384
|
+
exp_thread = threading.Thread(target=run_in_thread, daemon=True)
|
|
385
|
+
exp_thread.start()
|
|
386
|
+
|
|
387
|
+
# Wait for experiment to start (up to 30 seconds)
|
|
388
|
+
if not xp_ready.wait(timeout=30.0):
|
|
389
|
+
cprint("Timeout waiting for experiment to start", "red", file=sys.stderr)
|
|
390
|
+
sys.exit(1)
|
|
391
|
+
|
|
392
|
+
if xp_holder["xp"] is None:
|
|
393
|
+
cprint("Failed to start experiment", "red", file=sys.stderr)
|
|
394
|
+
if exception_holder["exception"]:
|
|
395
|
+
raise exception_holder["exception"]
|
|
396
|
+
sys.exit(1)
|
|
397
|
+
|
|
398
|
+
# Run TUI in main thread (handles signals via Textual)
|
|
399
|
+
tui_app = ExperimentTUI(
|
|
400
|
+
workdir=workdir,
|
|
401
|
+
state_provider=xp_holder["xp"].state_provider,
|
|
402
|
+
show_logs=True,
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
try:
|
|
406
|
+
# Textual automatically captures stdout/stderr via Print events
|
|
407
|
+
tui_app.run()
|
|
408
|
+
finally:
|
|
409
|
+
# TUI exited (user pressed q or Ctrl+C) - stop the experiment
|
|
410
|
+
if xp_holder["xp"]:
|
|
411
|
+
xp_holder["xp"].stop()
|
|
412
|
+
|
|
413
|
+
# Wait for experiment thread to finish
|
|
414
|
+
exp_thread.join(timeout=5.0)
|
|
415
|
+
|
|
416
|
+
# Handle exceptions
|
|
417
|
+
if exception_holder["exception"]:
|
|
418
|
+
raise exception_holder["exception"]
|
|
419
|
+
|
|
420
|
+
else:
|
|
421
|
+
# Normal mode without TUI - run directly
|
|
422
|
+
run_experiment_code()
|
experimaestro/generators.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import Callable, Union
|
|
4
|
+
from typing import Callable, Union, TYPE_CHECKING
|
|
5
5
|
from experimaestro.core.arguments import ArgumentOptions, TypeAnnotation
|
|
6
6
|
from experimaestro.core.objects import ConfigWalkContext, Config
|
|
7
7
|
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from experimaestro.core.subparameters import Subparameters
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
class Generator(ABC):
|
|
10
13
|
"""Base class for all generators"""
|
|
@@ -15,25 +18,63 @@ class Generator(ABC):
|
|
|
15
18
|
return False
|
|
16
19
|
|
|
17
20
|
@abstractmethod
|
|
18
|
-
def __call__(self, context: ConfigWalkContext, config: Config):
|
|
19
|
-
...
|
|
21
|
+
def __call__(self, context: ConfigWalkContext, config: Config): ...
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
class PathGenerator(Generator):
|
|
23
|
-
"""
|
|
25
|
+
"""Generate paths within the task directory.
|
|
26
|
+
|
|
27
|
+
Use ``PathGenerator`` with ``field(default_factory=...)`` to create
|
|
28
|
+
paths relative to the task's working directory.
|
|
29
|
+
|
|
30
|
+
Example::
|
|
31
|
+
|
|
32
|
+
class MyTask(Task):
|
|
33
|
+
output: Meta[Path] = field(default_factory=PathGenerator("results.json"))
|
|
34
|
+
model: Meta[Path] = field(default_factory=PathGenerator("model.pt"))
|
|
35
|
+
|
|
36
|
+
For shared directories across related tasks, use with subparameters::
|
|
37
|
+
|
|
38
|
+
training_group = param_group("training")
|
|
39
|
+
|
|
40
|
+
class Train(Task):
|
|
41
|
+
epochs: Param[int] = field(groups=[training_group])
|
|
42
|
+
checkpoint: Meta[Path] = field(
|
|
43
|
+
default_factory=PathGenerator(
|
|
44
|
+
"model.pt",
|
|
45
|
+
subparameters=subparameters(exclude=[training_group])
|
|
46
|
+
)
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
:param path: Relative path within the task directory. Can be a string,
|
|
50
|
+
Path, or callable that takes (context, config) and returns a Path.
|
|
51
|
+
:param subparameters: Optional subparameters for partial directory sharing.
|
|
52
|
+
When provided, the path is generated in a shared partial directory.
|
|
53
|
+
"""
|
|
24
54
|
|
|
25
55
|
def __init__(
|
|
26
|
-
self,
|
|
56
|
+
self,
|
|
57
|
+
path: Union[str, Path, Callable[[ConfigWalkContext, Config], Path]] = "",
|
|
58
|
+
*,
|
|
59
|
+
partial: "Subparameters" = None,
|
|
27
60
|
):
|
|
28
61
|
self.path = path
|
|
62
|
+
self.partial = partial
|
|
29
63
|
|
|
30
64
|
def __call__(self, context: ConfigWalkContext, config: Config):
|
|
31
|
-
|
|
32
|
-
|
|
65
|
+
# Determine base path: partial directory or job directory
|
|
66
|
+
if self.partial is not None:
|
|
67
|
+
base_path = context.partial_path(self.partial, config)
|
|
33
68
|
else:
|
|
34
|
-
|
|
69
|
+
base_path = context.currentpath()
|
|
35
70
|
|
|
36
|
-
|
|
71
|
+
# Generate the final path
|
|
72
|
+
if inspect.isfunction(self.path):
|
|
73
|
+
return base_path / self.path(context, config)
|
|
74
|
+
elif self.path:
|
|
75
|
+
return base_path / Path(self.path)
|
|
76
|
+
else:
|
|
77
|
+
return base_path
|
|
37
78
|
|
|
38
79
|
def isoutput(self):
|
|
39
80
|
return True
|
experimaestro/huggingface.py
CHANGED
|
@@ -105,6 +105,35 @@ class Visitor(PTNodeVisitor):
|
|
|
105
105
|
|
|
106
106
|
|
|
107
107
|
def parse(expr: str):
|
|
108
|
+
"""Parse a requirement specification string into a HostRequirement object.
|
|
109
|
+
|
|
110
|
+
The specification string describes hardware requirements for running a task.
|
|
111
|
+
Multiple alternatives can be specified using ``|`` (OR), and requirements
|
|
112
|
+
within an alternative are combined using ``&`` (AND).
|
|
113
|
+
|
|
114
|
+
**Syntax elements:**
|
|
115
|
+
|
|
116
|
+
- ``duration=<N><unit>``: Job duration (units: h/hours, d/days, m/mins)
|
|
117
|
+
- ``cpu(mem=<size>, cores=<N>)``: CPU requirements
|
|
118
|
+
- ``cuda(mem=<size>) * <N>``: GPU requirements (memory and count)
|
|
119
|
+
- Memory sizes: ``<N>G``, ``<N>GiB``, ``<N>M``, ``<N>MiB``
|
|
120
|
+
|
|
121
|
+
:param expr: The requirement specification string
|
|
122
|
+
:return: A :class:`~experimaestro.launcherfinder.specs.HostRequirement` object
|
|
123
|
+
|
|
124
|
+
**Example:**
|
|
125
|
+
|
|
126
|
+
.. code-block:: python
|
|
127
|
+
|
|
128
|
+
from experimaestro.launcherfinder.parser import parse
|
|
129
|
+
|
|
130
|
+
# Request 2 GPUs with 32GB each, 700GB RAM, for 40 hours
|
|
131
|
+
# OR 4 GPUs with 32GB each for 50 hours
|
|
132
|
+
req = parse(
|
|
133
|
+
"duration=40h & cpu(mem=700GiB) & cuda(mem=32GiB) * 2"
|
|
134
|
+
" | duration=50h & cpu(mem=700GiB) & cuda(mem=32GiB) * 4"
|
|
135
|
+
)
|
|
136
|
+
"""
|
|
108
137
|
parser = ParserPython(grammar, syntax_classes={"StrMatch": SuppressStrMatch})
|
|
109
138
|
parse_tree = parser.parse(expr)
|
|
110
139
|
return visit_parse_tree(parse_tree, Visitor(debug=False))
|
|
@@ -31,7 +31,19 @@ SubmitListener = Callable[[Job], None]
|
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
class Launcher(ABC):
|
|
34
|
-
"""
|
|
34
|
+
"""Base class for task launchers.
|
|
35
|
+
|
|
36
|
+
Launchers are responsible for executing tasks on a compute resource.
|
|
37
|
+
They work with a :class:`~experimaestro.connectors.Connector` to
|
|
38
|
+
access the target system and manage process execution.
|
|
39
|
+
|
|
40
|
+
Subclasses include:
|
|
41
|
+
|
|
42
|
+
- :class:`~experimaestro.launchers.direct.DirectLauncher`: Local execution
|
|
43
|
+
- :class:`~experimaestro.launchers.slurm.SlurmLauncher`: SLURM cluster
|
|
44
|
+
|
|
45
|
+
:param connector: The connector to use for accessing the compute resource
|
|
46
|
+
"""
|
|
35
47
|
|
|
36
48
|
submit_listeners: List[SubmitListener]
|
|
37
49
|
|
|
@@ -69,6 +81,19 @@ class Launcher(ABC):
|
|
|
69
81
|
By default, returns the associated connector builder"""
|
|
70
82
|
return self.connector.processbuilder()
|
|
71
83
|
|
|
84
|
+
@abstractmethod
|
|
85
|
+
def launcher_info_code(self) -> str:
|
|
86
|
+
"""Returns Python code to set up launcher info during task execution.
|
|
87
|
+
|
|
88
|
+
This code is inserted into the generated task script to set up
|
|
89
|
+
launcher-specific information (like LauncherInformation for
|
|
90
|
+
querying remaining time).
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
Python code as a string, or empty string if no setup needed.
|
|
94
|
+
"""
|
|
95
|
+
...
|
|
96
|
+
|
|
72
97
|
@staticmethod
|
|
73
98
|
def get(path: Path):
|
|
74
99
|
"""Get a default launcher for a given path"""
|