experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
experimaestro/settings.py
CHANGED
@@ -5,6 +5,7 @@ from functools import lru_cache
 from pathlib import Path
 from typing import Dict, Optional, List
 import logging
+import fnmatch
 
 
 @dataclass
@@ -38,6 +39,12 @@ class WorkspaceSettings:
     alt_workspaces: List[str] = field(default_factory=list)
     """Alternative workspaces to find jobs or experiments"""
 
+    max_retries: int = 3
+    """Maximum number of retries for resumable tasks that timeout (default: 3)"""
+
+    triggers: List[str] = field(default_factory=list)
+    """Glob patterns to automatically select this workspace based on experiment ID"""
+
     def __post_init__(self):
         self.path = self.path.expanduser().resolve()
 
@@ -84,9 +91,21 @@ def get_workspace(id: Optional[str] = None) -> Optional[WorkspaceSettings]:
 
 
 def find_workspace(
-    *,
+    *,
+    workspace: Optional[str] = None,
+    workdir: Optional[Path] = None,
+    experiment_id: Optional[str] = None,
 ) -> WorkspaceSettings:
-    """Find workspace
+    """Find workspace
+
+    Args:
+        workspace: Explicit workspace ID to use
+        workdir: Explicit working directory path
+        experiment_id: Experiment ID to match against workspace triggers
+
+    Returns:
+        WorkspaceSettings object
+    """
     workdir = Path(workdir) if workdir else None
 
     if workspace:
@@ -103,8 +122,28 @@
         logging.info("Using workdir %s", workdir)
         ws_env = WorkspaceSettings("", workdir)
     else:
-        ws_env = get_workspace()
-        assert ws_env is not None, "No workdir or workspace defined, and no default"
-        logging.info("Using default workspace %s", ws_env.id)
+        # Try to match experiment_id against workspace triggers
+        matched_workspace = None
+        if experiment_id:
+            workspaces = get_settings().workspaces
+            for ws in workspaces:
+                for trigger in ws.triggers:
+                    if fnmatch.fnmatch(experiment_id, trigger):
+                        matched_workspace = ws
+                        logging.info(
+                            "Auto-selected workspace %s (matched trigger '%s')",
+                            ws.id,
+                            trigger,
+                        )
+                        break
+                if matched_workspace:
+                    break
+
+        if matched_workspace:
+            ws_env = matched_workspace
+        else:
+            ws_env = get_workspace()
+            assert ws_env is not None, "No workdir or workspace defined, and no default"
+            logging.info("Using default workspace %s", ws_env.id)
 
     return ws_env
experimaestro/sphinx/__init__.py
CHANGED
@@ -113,12 +113,12 @@ class ConfigDocumenter(ClassDocumenter):
     @staticmethod
     def formatDefault(value) -> str:
         if isinstance(value, Config):
-
+            value_type = value.__xpmtype__.value_type
             params = ", ".join(
                 [f"{key}={value}" for key, value in value.__xpm__.values.items()]
             )
             # It would be possible to do better... if not
-            return f"{
+            return f"{value_type.__module__}.{value_type.__qualname__}({params})"
 
         return str(value)
 
@@ -176,7 +176,7 @@ class ConfigDocumenter(ClassDocumenter):
         self.add_line(" " + _("Bases: %s") % ", ".join(base_classes), sourcename)
 
         # Adds return type if different
-        if xpminfo.returntype != xpminfo.
+        if xpminfo.returntype != xpminfo.value_type:
             self.add_line("", sourcename)
             self.add_line(
                 " " + _("Submit type: %s") % restify(xpminfo.returntype), sourcename
experimaestro/taskglobals.py
CHANGED
@@ -3,6 +3,23 @@ from pathlib import Path
 from typing import Optional
 
 
+class LauncherInformation:
+    """Minimal launcher information available during task execution.
+
+    This is a lightweight class used to query launcher-specific information
+    (like remaining time) during task execution. It's set by the generated
+    Python script that runs the task.
+    """
+
+    def remaining_time(self) -> Optional[float]:
+        """Returns the remaining time in seconds before the job times out.
+
+        Returns:
+            The remaining time in seconds, or None if no time limit.
+        """
+        return None
+
+
 class Env:
     _instance = None
 
@@ -12,6 +29,9 @@ class Env:
     # The current task path
     taskpath: Optional[Path] = None
 
+    # Launcher information (only set when running a task)
+    launcher_info: Optional[LauncherInformation] = None
+
     # Set to True when multi-processing when
     # in slave mode:
     # - no progress report
experimaestro/tests/conftest.py
CHANGED
@@ -19,6 +19,86 @@ def xpmdirectory(tmp_path_factory):
     shutil.rmtree(workdir)
 
 
+@pytest.fixture(scope="function", autouse=True)
+def reset_scheduler():
+    """Reset scheduler state between tests to avoid state leakage with singleton pattern"""
+    from experimaestro.scheduler.base import Scheduler
+    from experimaestro.server import Server
+
+    # Get the singleton instance if it exists
+    if Scheduler._instance is not None:
+        scheduler = Scheduler._instance
+        # Clear job registrations but keep scheduler running
+        logging.debug(
+            f"FIXTURE: Clearing scheduler before test - jobs count: {len(scheduler.jobs)}"
+        )
+        # Clear experiment references from all jobs
+        for job in scheduler.jobs.values():
+            job.experiments.clear()
+        scheduler.jobs.clear()
+        scheduler.waitingjobs.clear()
+        scheduler.experiments.clear()
+        # Clear state provider experiment providers to avoid stale references
+        if (
+            hasattr(scheduler, "state_provider")
+            and scheduler.state_provider is not None
+        ):
+            # Close all experiment providers
+            for provider in scheduler.state_provider.experiment_providers.values():
+                provider.close()
+            scheduler.state_provider.experiment_providers.clear()
+            logging.debug("FIXTURE: Cleared state provider experiment providers")
+
+        # Also clear listeners to prevent stale listeners
+        scheduler.clear_listeners()
+
+        # Re-add state_provider as listener if it exists
+        if (
+            hasattr(scheduler, "state_provider")
+            and scheduler.state_provider is not None
+        ):
+            scheduler.addlistener(scheduler.state_provider)
+
+    # Reset server instance too
+    if Server._instance is not None:
+        logging.debug("FIXTURE: Clearing server instance")
+        Server._instance = None
+
+    yield
+
+    # Cleanup after test - clear again
+    if Scheduler._instance is not None:
+        scheduler = Scheduler._instance
+        logging.debug(
+            f"FIXTURE: Clearing scheduler after test - jobs count: {len(scheduler.jobs)}"
+        )
+        # Clear experiment references from all jobs
+        for job in scheduler.jobs.values():
+            job.experiments.clear()
+        scheduler.jobs.clear()
+        scheduler.waitingjobs.clear()
+        scheduler.experiments.clear()
+        # Clear state provider experiment providers
+        if (
+            hasattr(scheduler, "state_provider")
+            and scheduler.state_provider is not None
+        ):
+            for provider in scheduler.state_provider.experiment_providers.values():
+                provider.close()
+            scheduler.state_provider.experiment_providers.clear()
+        scheduler.clear_listeners()
+        # Re-add state_provider as listener if it exists
+        if (
+            hasattr(scheduler, "state_provider")
+            and scheduler.state_provider is not None
+        ):
+            scheduler.addlistener(scheduler.state_provider)
+
+    # Reset server after test
+    if Server._instance is not None:
+        Server._instance = None
+
+
 # Sets a flag
 def pytest_configure(config):
     import sys
experimaestro/tests/core/test_generics.py
CHANGED
@@ -3,7 +3,7 @@
 from typing import Generic, Optional, TypeVar
 
 import pytest
-from experimaestro import Config, Param
+from experimaestro import field, Config, Param
 from experimaestro.core.arguments import Argument
 from experimaestro.core.types import TypeVarType
 
@@ -162,7 +162,7 @@ class TreeGenericConfig(Config, Generic[T]):
 class TagTreeGenericConfig(TreeGenericConfig[T], Generic[T]):
     """A tagged version of TreeGenericConfig to test recursive generics"""
 
-    tag: Param[str] = "default"
+    tag: Param[str] = field(ignore_default="default")
 
 
 def test_core_generics_recursive():
experimaestro/tests/identifier_stability.json
ADDED
@@ -0,0 +1,45 @@
+{
+  "bool_false": "bb61efa2769d20e6665fd63911d8a1e2fcdd2af22ff1e6c860d2b26ab7b04ab2",
+  "bool_true": "e718f2e3a3cc5b6b816a9645f587d3009efb08642bd22db45c8c288b78ff11f4",
+  "cycle_simple": "a73ef01b1c3e4e0187aee95eda96d1c069fd4757ad0137ac66adbf3a9502673f",
+  "default_override": "90951821af9c0d84b3f300fadfab63387bbfad6d1982dfbaa5b4d7ebbbfcf800",
+  "default_with_default": "0203eb7eb6a13e3c4592c9366f76a5f53dd2c5211c576547873184af86558bc3",
+  "dict_empty": "d2c32c9305431266e4ab1f5a70face4cee13b02a01af4ba0a6046fb254971b5f",
+  "dict_multiple": "01994d7bc212a73ea9d80332bf460922ca786a9d4ab8d8f444b3673901c75c99",
+  "dict_nested_empty": "77ebb66bcfe1c24c166dd80ceaae5840c1729f5c435a9d4ae040e8285b9beca7",
+  "dict_nested_multiple": "4476cd6934c5cc4a63cce1594cbbed622d0e70f6291b6d8cb4092d7b2612bb15",
+  "dict_nested_single": "373a3e409042029439fdbcb679b4e6388242901772d273ddefafd5663f9e57e4",
+  "dict_single": "dd879ad5038694c95134926ab3524696437f6ec96e52341a4e8c8fd44a1c2ae2",
+  "enum_value_a": "96c98d4683658b0a8e62d67abcb32c918506f8e455685680ee50f7b174e91366",
+  "enum_value_b": "433e23f7e2ee01a850bd97da097cf158873cb7465d3faa86218d86c0f7c38834",
+  "enum_value_c": "5881adaa535d2c3c842e4f163fbabbe5f537bc18f48a5bc2e9887ad9b3deb00b",
+  "float_negative": "67a86cea76bc90be4ec052e8c2f08829fb989e64a1d419541e68485dff85dba1",
+  "float_simple": "f445ab15c80965e89436c9e58a369fb969f216f392e4cbb19846830731f9a1e4",
+  "float_zero": "35879908ca1652ea07a4b49f0c44e9aa2ca264bedf14974dd9a765b1fafd1921",
+  "instance_different_values": "e175afb36163f56078beb516cf7489238b491571519e25c2b5ff68afbeccc643",
+  "instance_separate": "6d6274a5b541f60833e5d15a556a1b8adfaaa6dd0970a09a57849edd7a0c6fdd",
+  "instance_shared": "d9a76235da634b81b7559d561322263698852fa2012005781569154d7ad3cfc5",
+  "int_negative": "4e2ad6ee44e1c9429b56900aea7ba81a6b746a38678d2e29a89805bfb32b9153",
+  "int_positive": "2c57a590b8bf1bb5283a54d971814e196f395269f2973096dc277dbc24218962",
+  "int_zero": "2696ea881e0f601d4ad75679560e0e3305fa2f15552952d88ac87df4cc6f9f49",
+  "list_empty": "457140939f4e4ea43c5cfa4e05f4db0ed82891f0b550e23bfedf1510aea94d0c",
+  "list_multiple": "14575fd83be49b8f23d43d54fab90768ea0c296a829eeaa1b5a312f8322fb2ef",
+  "list_nested_empty": "fa30a32619931b4048a9f9854d82975e955c48017cd72b474344fa6b5a9c9bbe",
+  "list_nested_multiple": "4cc6e0d3d4ac32209334b8667d6b18f37cc5fd1677309eaeec89e7862d98ec5f",
+  "list_nested_single": "160a7e361e3482536479beaf8250f3107436c59e60ca5da573548da60e4b9bcf",
+  "list_single": "d33a881039f9a79cb7d688e547acdc79092b86b9e05fcb65faebbebfb38b3067",
+  "nested_multi": "8a1a37250d6f90caa549b02f8899dcba51ce01e5f6f511896b6c00b9c4a714a0",
+  "nested_simple": "a52569bc853a4cacbc044ba3082bbc10670f9174279e283168115828de979be1",
+  "option_override": "e98d969a3a309b2a43bab46c0fce6a6ea3c8c337a63ecb78c3705b8c281927b5",
+  "option_with_default": "e98d969a3a309b2a43bab46c0fce6a6ea3c8c337a63ecb78c3705b8c281927b5",
+  "str_empty": "bfdf01b69cbed525f27d0a9c1ba1043086ae5705fbc4027f5cf899a394e38bca",
+  "str_simple": "ad42020604bcc3c36bbcef8843de7b7c3af80b4198a109b7768d65bc2f788b1a",
+  "str_unicode": "26972840e4f5f71b2303902e0247aaf1e27d8a14ab6495c433d1f95c32dd40e8",
+  "task_simple": "1ff8ca42cdc94959e1b0c3c019ef4ab1f45b30a4309cc9fef3e42f4ea7da3e86",
+  "task_submitted": "834fae0fffb762b20064e8c648221dab99e81ba6f00622219fccce1bd0a18a17",
+  "task_using_output": "6c51d8124133038482472973a439d785b7ce53e46bac096e047e0e6cf1fc104e",
+  "task_with_config": "b26f8a8f7b1b9f6bda7e9c7e334071097b377ac48caec9d7da7fe98bc8c97c84",
+  "task_with_init": "285697abd5eaef36264f640ef790880f076daea4aff1814f1a518aa014ba4b0d",
+  "task_with_multiple_init": "d0e8610e1312d9a3398b839c691e16c741d4520823763067465a3ddab63acb30",
+  "task_with_output": "9cbaadb16fc6168286703afe35805108c4600abd05380fe56160f50e20b3cbb6"
+}
experimaestro/tests/launchers/bin/sacct
CHANGED
@@ -13,9 +13,13 @@ fi
 find "$XPM_SLURM_DIR/jobs" -name "*.start" | while read name; do
     jobid=${name%.start}
     sf="$jobid.status"
+    timeout_marker="$jobid.timeout"
     if test -f "$sf"; then
         exitcode="$(cat $sf)"
-        if test "$exitcode" == 0; then
+        # Check for timeout marker file
+        if test -f "$timeout_marker"; then
+            status=TIMEOUT
+        elif test "$exitcode" == 0; then
            status=COMPLETED
        else
            status=FAILED
@@ -25,4 +29,4 @@ find "$XPM_SLURM_DIR/jobs" -name "*.start" | while read name; do
    fi
 
    echo "$(basename $jobid)|${status}|$(cat ${jobid}.start)|$(cat ${jobid}.start)|"
-done
+done
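
The mock sbatch/sacct pair (below and above) implements a small file-based protocol under $XPM_SLURM_DIR/jobs: sbatch writes <jobid>.start and <jobid>.status, and a <jobid>.timeout marker makes sacct report TIMEOUT instead of COMPLETED or FAILED. A Python paraphrase of the per-job status logic, with an illustrative function name:

from pathlib import Path

def mock_sacct_status(jobs_dir: Path, job_id: str):
    # Paraphrase of the mock sacct branch for a single job
    status_file = jobs_dir / f"{job_id}.status"
    if not status_file.exists():
        return None  # no exit code yet: the job is still running
    if (jobs_dir / f"{job_id}.timeout").exists():
        return "TIMEOUT"  # the marker takes precedence over the exit code
    return "COMPLETED" if status_file.read_text().strip() == "0" else "FAILED"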
experimaestro/tests/launchers/bin/sbatch
CHANGED
@@ -65,12 +65,14 @@ done < "$1"
 
 cd "$chdir"
 echo "Starting $@ ${args[@]} > $stdout 2> $stderr" >&2
+# Get job ID before forking
+JOBID="$$"
 (
     export PATH="${CURDIR}/bin:$PATH"
+    export SLURM_JOB_ID="$JOBID"
     eval "$@" "${args[@]}"
-    echo $? > "$XPM_SLURM_DIR/jobs
+    echo $? > "$XPM_SLURM_DIR/jobs/$JOBID.status"
 ) > $stdout 2> $stderr &
-JOBID="$$"
 date > "$XPM_SLURM_DIR/jobs/$JOBID.start"
 disown
 
experimaestro/tests/launchers/test_slurm.py
CHANGED
@@ -6,6 +6,8 @@ from experimaestro.connectors.local import LocalConnector
 from experimaestro.launchers.slurm import (
     SlurmLauncher,
 )
+from experimaestro import field, ResumableTask, Param
+from experimaestro.scheduler import JobState
 import shutil
 import pytest
 from .common import waitFromSpec, takeback
@@ -84,3 +86,81 @@ def test_slurm_takeback(slurmlauncher, tmp_path):
     datapath = tmp_path / "data"
 
     takeback(slurmlauncher, datapath, txp1, txp2)
+
+
+class SlurmResumableTask(ResumableTask):
+    """ResumableTask that simulates timeout on first N attempts for SLURM testing"""
+
+    checkpoint: Param[Path]
+    timeout_count: Param[int] = field(ignore_default=2)
+    slurm_jobs_dir: Param[Path]  # Path to mock SLURM jobs directory
+    output_file: Param[Path] = field(ignore_default=None)
+
+    def execute(self):
+        import os
+
+        # Read current attempt count from checkpoint
+        attempt = 1
+        if self.checkpoint.exists():
+            attempt = int(self.checkpoint.read_text()) + 1
+
+        print(f"SlurmResumableTask attempt #{attempt}")
+
+        # Write updated attempt count
+        self.checkpoint.write_text(str(attempt))
+
+        # Simulate timeout for first timeout_count attempts
+        if attempt <= self.timeout_count:
+            print(f"Simulating SLURM TIMEOUT on attempt {attempt}")
+            # Create timeout marker file for mock SLURM
+            # The marker needs to be named <jobid>.timeout in the SLURM jobs directory
+            # Use SLURM_JOB_ID environment variable (set by mock sbatch, like real SLURM)
+            job_id = os.environ.get("SLURM_JOB_ID")
+            if job_id:
+                timeout_marker = self.slurm_jobs_dir / f"{job_id}.timeout"
+                timeout_marker.write_text(f"timeout on attempt {attempt}")
+            # Exit with error to trigger SLURM timeout handling
+            raise RuntimeError(f"Simulated timeout on attempt {attempt}")
+
+        # Success - task completed
+        print(f"Task completed successfully on attempt {attempt}")
+        if self.output_file:
+            self.output_file.write_text(f"Completed after {attempt} attempts")
+
+
+@pytest.mark.timeout(30)
+def test_slurm_resumable_task(tmp_path: Path, slurmlauncher: SlurmLauncher):
+    """Test that ResumableTask retries and resumes after SLURM timeouts"""
+    with TemporaryExperiment("slurm-resumable", workdir=tmp_path / "xp", maxwait=25):
+        checkpoint = tmp_path / "checkpoint.txt"
+        output_file = tmp_path / "output.txt"
+
+        # Get the SLURM jobs directory from the launcher's binpath
+        slurm_jobs_dir = slurmlauncher.binpath.parent / "slurm" / "jobs"
+
+        # Submit task with max_retries to allow multiple timeout retries
+        task = SlurmResumableTask.C(
+            checkpoint=checkpoint,
+            timeout_count=2,  # Timeout on first 2 attempts
+            slurm_jobs_dir=slurm_jobs_dir,
+            output_file=output_file,
+        ).submit(launcher=slurmlauncher, max_retries=5)
+
+        # Wait for the task to complete
+        state = task.__xpm__.job.wait()
+
+        # Verify task completed successfully after retries
+        assert state == JobState.DONE, f"Task did not complete successfully: {state}"
+        assert (
+            task.__xpm__.job.retry_count == 2
+        ), f"Expected 2 retries, got {task.__xpm__.job.retry_count}"
+
+        # Verify checkpoint shows 3 attempts (2 timeouts + 1 success)
+        assert checkpoint.exists(), "Checkpoint file was not created"
+        assert (
+            int(checkpoint.read_text()) == 3
+        ), f"Expected 3 attempts, got {checkpoint.read_text()}"
+
+        # Verify output file was created on success
+        assert output_file.exists(), "Output file was not created"
+        assert "Completed after 3 attempts" in output_file.read_text()
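
For reference, the retry arithmetic the assertions above rely on: with timeout_count=2 the first two attempts raise, the third succeeds, so retries (which exclude the first attempt) come to 2 and the checkpoint ends at 3, within the submitted max_retries=5 budget:

# Bookkeeping behind the assertions (values taken from the test above)
timeout_count = 2
attempts = timeout_count + 1   # two simulated timeouts, then one success
retry_count = attempts - 1     # retries do not count the first attempt
assert (attempts, retry_count) == (3, 2)
assert retry_count < 5         # stays within max_retries=5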
experimaestro/tests/tasks/test_dynamic.py
ADDED
@@ -0,0 +1,231 @@
+# Test for future task outputs handling
+# https://github.com/experimaestro/experimaestro-python/issues/90
+
+from functools import partial
+import json
+import logging
+from pathlib import Path
+import sys
+import time
+from experimaestro import (
+    Config,
+    Param,
+    Task,
+    ResumableTask,
+    DependentMarker,
+    LightweightTask,
+    field,
+    PathGenerator,
+)
+from experimaestro.core.arguments import Meta
+from experimaestro.tests.utils import TemporaryDirectory, TemporaryExperiment
+
+
+class Model(Config):
+    pass
+
+
+class Checkpoint(Config):
+    step: Param[int]
+    model: Param[Model]
+
+
+class CheckpointLoader(LightweightTask):
+    checkpoint: Param[Checkpoint]
+
+    def execute(self):
+        pass
+
+
+class Evaluate(Task):
+    model: Param[Model]
+
+    def execute(self):
+        pass
+
+
+class Validation(Config):
+    model: Param[Model]
+
+    def checkpoint(self, dep: DependentMarker, *, step: int) -> Checkpoint:
+        return dep(Checkpoint.C(model=self.model, step=step))
+
+    def compute(self, step: int):
+        self.register_task_output(self.checkpoint, step=step)
+
+
+class Learn(ResumableTask):
+    model: Param[Model]
+    validation: Param[Validation]
+
+    # Control files for synchronization with tests
+    max_step_file: Meta[Path] = field(default_factory=PathGenerator("max_step"))
+    state_file: Meta[Path] = field(default_factory=PathGenerator("state.json"))
+
+    def execute(self):
+        start_step = 0
+
+        if self.state_file.exists():
+            with self.state_file.open("r") as f:
+                state = json.load(f)
+                start_step = state.get("last_step", 0)
+            logging.info("Resuming from step %d", start_step)
+
+        # Wait for max_step_file to know how far to go
+        while not self.max_step_file.is_file():
+            time.sleep(0.1)
+
+        with self.max_step_file.open("r") as f:
+            max_step = int(f.read().strip())
+        self.max_step_file.unlink()
+
+        # Use absolute value for step comparison
+        # Negative max_step means: produce up to |max_step| then crash (simulate interruption)
+        # Positive max_step means: produce up to max_step then complete normally
+        abs_max = abs(max_step)
+
+        for step in [15, 30, 45]:
+            if step <= start_step:
+                logging.info("Skipping already processed step %d", step)
+                continue
+
+            if step > abs_max:
+                # We're past the limit, stop here
+                break
+
+            self.validation.compute(step)
+
+            # Save state after each checkpoint
+            with self.state_file.open("w") as f:
+                json.dump({"last_step": step}, f)
+
+            # If max_step is negative (e.g. -15), simulate exit after producing |max_step|
+            if max_step < 0 and step >= abs_max:
+                logging.warning("Simulating interruption after step %d", step)
+                sys.exit(1)
+
+
+def evaluate(evaluations, checkpoint: Checkpoint):
+    logging.warning("Evaluating checkpoint %s", checkpoint)
+    task = Evaluate.C(model=checkpoint.model)
+    checkpoint_loader = CheckpointLoader.C(checkpoint=checkpoint)
+    evaluations.append(task.submit(init_tasks=[checkpoint_loader]))
+
+
+def test_task_dynamic_simple():
+    """Test that dynamic task outputs trigger callbacks
+
+    This test verifies that callbacks are guaranteed to complete before
+    the experiment context exits. The callback waits for jobs to complete
+    before submitting evaluations, which validates that the synchronization
+    logic correctly waits for all callbacks to finish.
+    """
+    import asyncio
+
+    evaluations = []
+    xp_ref = [None]  # To access xp from callback
+
+    def collect_checkpoint(checkpoint: Checkpoint):
+        """Callback that waits for jobs to complete before evaluating
+
+        This simulates a real-world scenario where the callback needs to wait
+        for the triggering task to complete before it can proceed (e.g., to
+        read outputs from the task's directory).
+        """
+        logging.info("Received checkpoint %s, waiting for jobs to complete", checkpoint)
+        xp = xp_ref[0]
+
+        # Wait for unfinished jobs to become 0 (all tasks completed)
+        async def wait_for_jobs_done():
+            async with xp.scheduler.exitCondition:
+                while xp.unfinishedJobs > 0:
+                    await xp.scheduler.exitCondition.wait()
+
+        asyncio.run_coroutine_threadsafe(
+            wait_for_jobs_done(), xp.scheduler.loop
+        ).result()
+
+        # Now submit evaluation
+        logging.info("Jobs done, submitting evaluation for checkpoint %s", checkpoint)
+        evaluate(evaluations, checkpoint)
+
+    with TemporaryDirectory() as workdir:
+        with TemporaryExperiment("dynamic", maxwait=10, workdir=workdir) as xp:
+            xp_ref[0] = xp
+            model = Model.C()
+            validation = Validation.C(model=model)
+            learn = Learn.C(model=model, validation=validation)
+            learn.watch_output(validation.checkpoint, collect_checkpoint)
+
+            learn.submit()
+
+            # Allow the task to run up to step 30
+            learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+            with learn.max_step_file.open("w") as f:
+                f.write("30")
+
+            logging.info("Experiment will wait for completion...")
+
+        assert len(evaluations) == 2, f"Expected 2 evaluations, got {len(evaluations)}"
+
+
+def test_task_dynamic_replay():
+    """Test that dynamic outputs are replayed when a task is restarted
+
+    Scenario:
+    1. First run: task produces checkpoint for step 15, then exits (simulated timeout)
+    2. Second run: task should replay the step 15 checkpoint and produce new ones
+    """
+    with TemporaryDirectory() as workdir:
+        # First run: produce one checkpoint then exit
+        evaluations_run1 = []
+        try:
+            with TemporaryExperiment("dynamic_replay", maxwait=5, workdir=workdir):
+                model = Model.C()
+                validation = Validation.C(model=model)
+                learn = Learn.C(model=model, validation=validation)
+                learn.watch_output(
+                    validation.checkpoint, partial(evaluate, evaluations_run1)
+                )
+
+                learn.submit()
+
+                # Allow task to produce step 15 checkpoint, then simulate crash
+                # Negative value means: produce up to |value| then exit with error
+                learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+                with learn.max_step_file.open("w") as f:
+                    f.write("-15")
+
+        except Exception as e:
+            # Expected: the task will fail when trying to go past max_step
+            logging.info("First run ended (expected): %s", e)
+
+        # First run should have produced at least one evaluation (for step 15)
+        assert (
+            len(evaluations_run1) == 1
+        ), f"Run 1: Expected 1 evaluation, got {len(evaluations_run1)}"
+
+        # Second run: restart and continue
+        evaluations_run2 = []
+        with TemporaryExperiment("dynamic_replay", maxwait=30, workdir=workdir):
+            model = Model.C()
+            validation = Validation.C(model=model)
+            learn = Learn.C(model=model, validation=validation)
+            learn.watch_output(
+                validation.checkpoint, partial(evaluate, evaluations_run2)
+            )
+
+            learn.submit()
+
+            # Allow task to run to completion (step 45)
+            learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
+            with learn.max_step_file.open("w") as f:
+                f.write("45")
+
+        # Second run should have:
+        # - Replayed the step 15 checkpoint (from first run)
+        # - Produced step 30 and 45 checkpoints
+        # Total: 3 evaluations (but step 15 was replayed, not re-produced)
+        assert (
+            len(evaluations_run2) == 3
+        ), f"Run 2: Expected 3 evaluations, got {len(evaluations_run2)}"