experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro has been flagged as potentially problematic; see the registry's advisory page for details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +140 -16
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/progress.py +269 -0
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +22 -3
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +192 -37
- experimaestro/core/identifier.py +127 -12
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +702 -285
- experimaestro/core/objects/config_walk.py +24 -6
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +198 -83
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launcherfinder/registry.py +3 -3
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +75 -16
- experimaestro/progress.py +404 -0
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +504 -959
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +582 -0
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +485 -0
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +153 -32
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +47 -6
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/common.py +2 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/restart.py +1 -1
- experimaestro/tests/tasks/all.py +7 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_experiment.py +3 -3
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +520 -169
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +16 -21
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +314 -30
- experimaestro/tests/test_outputs.py +8 -8
- experimaestro/tests/test_param.py +22 -26
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +2 -50
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -60
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +151 -15
- experimaestro/tests/test_tasks.py +137 -160
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +25 -19
- experimaestro/tests/test_types.py +133 -11
- experimaestro/tests/test_validation.py +19 -19
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +5 -3
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +8 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/typingutils.py +11 -2
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -225
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-1.11.1.dist-info/RECORD +0 -158
- experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# Test for future task outputs handling
|
|
2
|
+
# https://github.com/experimaestro/experimaestro-python/issues/90
|
|
3
|
+
|
|
4
|
+
from functools import partial
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
from experimaestro import (
|
|
11
|
+
Config,
|
|
12
|
+
Param,
|
|
13
|
+
Task,
|
|
14
|
+
ResumableTask,
|
|
15
|
+
DependentMarker,
|
|
16
|
+
LightweightTask,
|
|
17
|
+
field,
|
|
18
|
+
PathGenerator,
|
|
19
|
+
)
|
|
20
|
+
from experimaestro.core.arguments import Meta
|
|
21
|
+
from experimaestro.tests.utils import TemporaryDirectory, TemporaryExperiment
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Model(Config):
    """Minimal placeholder model configuration.

    Carries no parameters of its own; it exists only to give the tasks in
    these tests a shared object to depend on.
    """

    pass
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Checkpoint(Config):
    """A model checkpoint taken at a given training step."""

    # Training step at which this checkpoint was produced.
    step: Param[int]
    # The model this checkpoint belongs to.
    model: Param[Model]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CheckpointLoader(LightweightTask):
    """Lightweight initialization task wrapping a checkpoint.

    Used as an ``init_tasks`` entry when submitting an evaluation so the
    evaluation depends on the checkpoint being available.
    """

    # Checkpoint to load before the dependent task runs.
    checkpoint: Param[Checkpoint]

    def execute(self):
        # No actual loading is required for these tests.
        pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Evaluate(Task):
    """Evaluation task for a model; its body is a no-op in these tests."""

    # Model under evaluation.
    model: Param[Model]

    def execute(self):
        # The tests only count submissions; no real evaluation is performed.
        pass
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Validation(Config):
    """Validation configuration that emits checkpoints as dynamic task outputs."""

    # Model being validated; shared with the Learn task.
    model: Param[Model]

    def checkpoint(self, dep: DependentMarker, *, step: int) -> Checkpoint:
        """Build a checkpoint for *step*, marked as dependent on the running task."""
        snapshot = Checkpoint.C(model=self.model, step=step)
        return dep(snapshot)

    def compute(self, step: int):
        """Register a dynamic checkpoint output for the given step."""
        self.register_task_output(self.checkpoint, step=step)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Learn(ResumableTask):
    """Resumable training task that emits validation checkpoints at fixed steps.

    The task is driven from the outside through two control files: the test
    writes the maximum step into ``max_step_file`` (a negative value requests
    a simulated crash after reaching ``|value|``), and the task persists its
    progress in ``state_file`` so a restarted run can skip steps it already
    produced.
    """

    model: Param[Model]
    validation: Param[Validation]

    # Control files for synchronization with tests
    max_step_file: Meta[Path] = field(default_factory=PathGenerator("max_step"))
    state_file: Meta[Path] = field(default_factory=PathGenerator("state.json"))

    def execute(self):
        # Recover the last completed step from a previous (interrupted) run.
        resume_from = 0
        if self.state_file.exists():
            with self.state_file.open("r") as fh:
                saved = json.load(fh)
            resume_from = saved.get("last_step", 0)
            logging.info("Resuming from step %d", resume_from)

        # Block until the test tells us how far to go.
        while not self.max_step_file.is_file():
            time.sleep(0.1)
        with self.max_step_file.open("r") as fh:
            limit = int(fh.read().strip())
        self.max_step_file.unlink()

        # A negative limit means: produce checkpoints up to |limit| then crash
        # (simulated interruption); a positive limit means run to completion.
        limit_abs = abs(limit)

        for step in (15, 30, 45):
            if step <= resume_from:
                logging.info("Skipping already processed step %d", step)
                continue
            if step > limit_abs:
                # Past the requested limit: stop here.
                break

            self.validation.compute(step)

            # Persist progress so a restarted run can resume after this step.
            with self.state_file.open("w") as fh:
                json.dump({"last_step": step}, fh)

            # Simulate an interruption once |limit| has been reached.
            if limit < 0 and step >= limit_abs:
                logging.warning("Simulating interruption after step %d", step)
                sys.exit(1)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def evaluate(evaluations, checkpoint: Checkpoint):
    """Submit an Evaluate task for *checkpoint* and record its output.

    A CheckpointLoader init task is attached so the evaluation depends on the
    checkpoint; the submitted task's output is appended to *evaluations* so
    the tests can count how many evaluations were triggered.
    """
    logging.warning("Evaluating checkpoint %s", checkpoint)
    loader = CheckpointLoader.C(checkpoint=checkpoint)
    eval_task = Evaluate.C(model=checkpoint.model)
    evaluations.append(eval_task.submit(init_tasks=[loader]))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def test_task_dynamic_simple():
    """Test that dynamic task outputs trigger callbacks

    This test verifies that callbacks are guaranteed to complete before
    the experiment context exits. The callback waits for jobs to complete
    before submitting evaluations, which validates that the synchronization
    logic correctly waits for all callbacks to finish.
    """
    import asyncio

    evaluations = []
    xp_ref = [None]  # To access xp from callback (set once the experiment opens)

    def collect_checkpoint(checkpoint: Checkpoint):
        """Callback that waits for jobs to complete before evaluating

        This simulates a real-world scenario where the callback needs to wait
        for the triggering task to complete before it can proceed (e.g., to
        read outputs from the task's directory).
        """
        logging.info("Received checkpoint %s, waiting for jobs to complete", checkpoint)
        xp = xp_ref[0]

        # Wait for unfinished jobs to become 0 (all tasks completed)
        async def wait_for_jobs_done():
            async with xp.scheduler.exitCondition:
                while xp.unfinishedJobs > 0:
                    await xp.scheduler.exitCondition.wait()

        # The callback runs outside the scheduler's event loop, so the wait is
        # scheduled on that loop and blocked on synchronously via .result().
        asyncio.run_coroutine_threadsafe(
            wait_for_jobs_done(), xp.scheduler.loop
        ).result()

        # Now submit evaluation
        logging.info("Jobs done, submitting evaluation for checkpoint %s", checkpoint)
        evaluate(evaluations, checkpoint)

    with TemporaryDirectory() as workdir:
        with TemporaryExperiment("dynamic", maxwait=10, workdir=workdir) as xp:
            xp_ref[0] = xp
            model = Model.C()
            validation = Validation.C(model=model)
            learn = Learn.C(model=model, validation=validation)
            learn.watch_output(validation.checkpoint, collect_checkpoint)

            learn.submit()

            # Allow the task to run up to step 30
            # NOTE(review): the control file is written after submit(); Learn
            # polls for it, so ordering here is deliberate.
            learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
            with learn.max_step_file.open("w") as f:
                f.write("30")

            logging.info("Experiment will wait for completion...")

    # With max_step=30, Learn produces checkpoints for steps 15 and 30,
    # hence exactly two evaluations once all callbacks have run.
    assert len(evaluations) == 2, f"Expected 2 evaluations, got {len(evaluations)}"
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def test_task_dynamic_replay():
    """Test that dynamic outputs are replayed when a task is restarted

    Scenario:
    1. First run: task produces checkpoint for step 15, then exits (simulated timeout)
    2. Second run: task should replay the step 15 checkpoint and produce new ones
    """
    with TemporaryDirectory() as workdir:
        # First run: produce one checkpoint then exit
        evaluations_run1 = []
        try:
            with TemporaryExperiment("dynamic_replay", maxwait=5, workdir=workdir):
                model = Model.C()
                validation = Validation.C(model=model)
                learn = Learn.C(model=model, validation=validation)
                learn.watch_output(
                    validation.checkpoint, partial(evaluate, evaluations_run1)
                )

                learn.submit()

                # Allow task to produce step 15 checkpoint, then simulate crash
                # Negative value means: produce up to |value| then exit with error
                learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
                with learn.max_step_file.open("w") as f:
                    f.write("-15")

        except Exception as e:
            # Expected: the task will fail when trying to go past max_step
            logging.info("First run ended (expected): %s", e)

        # First run should have produced at least one evaluation (for step 15)
        assert (
            len(evaluations_run1) == 1
        ), f"Run 1: Expected 1 evaluation, got {len(evaluations_run1)}"

        # Second run: restart and continue
        # Same workdir, so the Learn job resumes from its persisted state.
        evaluations_run2 = []
        with TemporaryExperiment("dynamic_replay", maxwait=30, workdir=workdir):
            model = Model.C()
            validation = Validation.C(model=model)
            learn = Learn.C(model=model, validation=validation)
            learn.watch_output(
                validation.checkpoint, partial(evaluate, evaluations_run2)
            )

            learn.submit()

            # Allow task to run to completion (step 45)
            learn.max_step_file.parent.mkdir(parents=True, exist_ok=True)
            with learn.max_step_file.open("w") as f:
                f.write("45")

        # Second run should have:
        # - Replayed the step 15 checkpoint (from first run)
        # - Produced step 30 and 45 checkpoints
        # Total: 3 evaluations (but step 15 was replayed, not re-produced)
        assert (
            len(evaluations_run2) == 3
        ), f"Run 2: Expected 3 evaluations, got {len(evaluations_run2)}"
|
|
@@ -10,7 +10,7 @@ def test_choices():
|
|
|
10
10
|
class TestChoices(Config):
|
|
11
11
|
a: Annotated[str, Choices(["a", "b"])]
|
|
12
12
|
|
|
13
|
-
TestChoices(a="a").__xpm__.validate()
|
|
13
|
+
TestChoices.C(a="a").__xpm__.validate()
|
|
14
14
|
|
|
15
15
|
with pytest.raises((ValueError, KeyError)):
|
|
16
|
-
TestChoices(a="c").__xpm__.validate()
|
|
16
|
+
TestChoices.C(a="c").__xpm__.validate()
|