experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/experiments/cli.py
CHANGED
|
@@ -5,7 +5,7 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import sys
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Any, List, Optional, Protocol, Tuple
|
|
8
|
+
from typing import TYPE_CHECKING, Any, List, Optional, Protocol, Tuple
|
|
9
9
|
|
|
10
10
|
import click
|
|
11
11
|
import omegaconf
|
|
@@ -13,16 +13,21 @@ import yaml
|
|
|
13
13
|
from omegaconf import OmegaConf, SCMode
|
|
14
14
|
from termcolor import cprint
|
|
15
15
|
|
|
16
|
-
from experimaestro import LauncherRegistry, RunMode, experiment
|
|
17
16
|
from experimaestro.exceptions import HandledException
|
|
18
17
|
from experimaestro.experiments.configuration import ConfigurationBase
|
|
18
|
+
from experimaestro.launcherfinder.registry import LauncherRegistry
|
|
19
|
+
from experimaestro.scheduler.workspace import RunMode
|
|
19
20
|
from experimaestro.settings import find_workspace
|
|
21
|
+
from experimaestro.utils.logging import setup_logging
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from experimaestro.scheduler.experiment import experiment
|
|
20
25
|
|
|
21
26
|
|
|
22
27
|
class ExperimentHelper:
|
|
23
28
|
"""Helper for experiments"""
|
|
24
29
|
|
|
25
|
-
xp: experiment
|
|
30
|
+
xp: "experiment"
|
|
26
31
|
"""The experiment object"""
|
|
27
32
|
|
|
28
33
|
#: Run function
|
|
@@ -52,7 +57,7 @@ class ExperimentHelper:
|
|
|
52
57
|
class ExperimentCallable(Protocol):
|
|
53
58
|
"""Protocol for the run function"""
|
|
54
59
|
|
|
55
|
-
def __call__(self, helper: ExperimentHelper, configuration: Any): ...
|
|
60
|
+
def __call__(self, helper: ExperimentHelper, configuration: Any): ...
|
|
56
61
|
|
|
57
62
|
|
|
58
63
|
class ConfigurationLoader:
|
|
@@ -74,6 +79,11 @@ class ConfigurationLoader:
|
|
|
74
79
|
if not path.is_absolute():
|
|
75
80
|
_data["file"] = str((yaml_file.parent / path).resolve())
|
|
76
81
|
|
|
82
|
+
if "pre_experiment" in _data:
|
|
83
|
+
path = Path(_data["pre_experiment"])
|
|
84
|
+
if not path.is_absolute():
|
|
85
|
+
_data["pre_experiment"] = str((yaml_file.parent / path).resolve())
|
|
86
|
+
|
|
77
87
|
if "module" in _data:
|
|
78
88
|
# Keeps track of the YAML file where the module was defined
|
|
79
89
|
self.yaml_module_file = yaml_file
|
|
@@ -93,6 +103,18 @@ class ConfigurationLoader:
|
|
|
93
103
|
|
|
94
104
|
@click.option("--debug", is_flag=True, help="Print debug information")
|
|
95
105
|
@click.option("--show", is_flag=True, help="Print configuration and exits")
|
|
106
|
+
@click.option(
|
|
107
|
+
"--watcher",
|
|
108
|
+
type=click.Choice(["auto", "polling", "inotify", "fsevents", "kqueue", "windows"]),
|
|
109
|
+
default="auto",
|
|
110
|
+
help="Filesystem watcher type (auto=platform default, polling=network mounts)",
|
|
111
|
+
)
|
|
112
|
+
@click.option(
|
|
113
|
+
"--polling-interval",
|
|
114
|
+
type=float,
|
|
115
|
+
default=1.0,
|
|
116
|
+
help="Polling interval in seconds (only for --watcher=polling)",
|
|
117
|
+
)
|
|
96
118
|
@click.option(
|
|
97
119
|
"--env",
|
|
98
120
|
help="Define one environment variable",
|
|
@@ -103,8 +125,8 @@ class ConfigurationLoader:
|
|
|
103
125
|
"--host",
|
|
104
126
|
type=str,
|
|
105
127
|
default=None,
|
|
106
|
-
help="Server hostname (
|
|
107
|
-
|
|
128
|
+
help="[DEPRECATED] Server hostname (use --web instead)",
|
|
129
|
+
hidden=True,
|
|
108
130
|
)
|
|
109
131
|
@click.option(
|
|
110
132
|
"--run-mode",
|
|
@@ -116,20 +138,31 @@ class ConfigurationLoader:
|
|
|
116
138
|
"--xpm-config-dir",
|
|
117
139
|
type=Path,
|
|
118
140
|
default=None,
|
|
119
|
-
help="Path for the experimaestro config directory "
|
|
120
|
-
"(if not specified, use $HOME/.config/experimaestro)",
|
|
141
|
+
help="Path for the experimaestro config directory (if not specified, use $HOME/.config/experimaestro)",
|
|
121
142
|
)
|
|
122
143
|
@click.option(
|
|
123
144
|
"--port",
|
|
124
145
|
type=int,
|
|
125
146
|
default=None,
|
|
126
|
-
help="Port for monitoring (
|
|
147
|
+
help="[DEPRECATED] Port for monitoring (use --web instead)",
|
|
148
|
+
hidden=True,
|
|
149
|
+
)
|
|
150
|
+
@click.option(
|
|
151
|
+
"--web",
|
|
152
|
+
is_flag=True,
|
|
153
|
+
help="Start web server for monitoring (use settings.yaml for host/port config)",
|
|
127
154
|
)
|
|
128
155
|
@click.option(
|
|
129
156
|
"--console",
|
|
130
157
|
is_flag=True,
|
|
131
158
|
help="Launch Textual console UI for monitoring with logs",
|
|
132
159
|
)
|
|
160
|
+
@click.option(
|
|
161
|
+
"--no-db",
|
|
162
|
+
"no_db",
|
|
163
|
+
is_flag=True,
|
|
164
|
+
help="Disable database state tracking for this experiment",
|
|
165
|
+
)
|
|
133
166
|
@click.option(
|
|
134
167
|
"--file",
|
|
135
168
|
"xp_file",
|
|
@@ -166,7 +199,9 @@ def experiments_cli( # noqa: C901
|
|
|
166
199
|
xp_file: str,
|
|
167
200
|
host: str,
|
|
168
201
|
port: int,
|
|
202
|
+
web: bool,
|
|
169
203
|
console: bool,
|
|
204
|
+
no_db: bool,
|
|
170
205
|
xpm_config_dir: Path,
|
|
171
206
|
workdir: Optional[Path],
|
|
172
207
|
workspace: Optional[str],
|
|
@@ -178,13 +213,40 @@ def experiments_cli( # noqa: C901
|
|
|
178
213
|
module_name: Optional[str],
|
|
179
214
|
args: List[str],
|
|
180
215
|
show: bool,
|
|
216
|
+
watcher: str,
|
|
217
|
+
polling_interval: float,
|
|
181
218
|
debug: bool,
|
|
182
219
|
):
|
|
183
220
|
"""Run an experiment"""
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
221
|
+
import warnings
|
|
222
|
+
|
|
223
|
+
# --- Set the logger with colors if outputting to terminal
|
|
224
|
+
setup_logging(debug=debug)
|
|
225
|
+
|
|
226
|
+
# --- Configure filesystem watcher type
|
|
227
|
+
from experimaestro.ipc import IPCom, WatcherType
|
|
228
|
+
|
|
229
|
+
if watcher != "auto":
|
|
230
|
+
IPCom.set_watcher_type(WatcherType(watcher), polling_interval)
|
|
231
|
+
elif polling_interval != 1.0:
|
|
232
|
+
# If polling interval is specified but watcher is auto, use polling
|
|
233
|
+
IPCom.set_watcher_type(WatcherType.POLLING, polling_interval)
|
|
234
|
+
|
|
235
|
+
# --- Warn about deprecated options
|
|
236
|
+
if host is not None:
|
|
237
|
+
warnings.warn(
|
|
238
|
+
"The '--host' option is deprecated. Use '--web' flag instead. "
|
|
239
|
+
"Configure host in settings.yaml.",
|
|
240
|
+
DeprecationWarning,
|
|
241
|
+
stacklevel=2,
|
|
242
|
+
)
|
|
243
|
+
if port is not None:
|
|
244
|
+
warnings.warn(
|
|
245
|
+
"The '--port' option is deprecated. Use '--web' flag instead. "
|
|
246
|
+
"Configure port in settings.yaml.",
|
|
247
|
+
DeprecationWarning,
|
|
248
|
+
stacklevel=2,
|
|
249
|
+
)
|
|
188
250
|
|
|
189
251
|
# --- Loads the YAML
|
|
190
252
|
conf_loader = ConfigurationLoader()
|
|
@@ -207,9 +269,9 @@ def experiments_cli( # noqa: C901
|
|
|
207
269
|
if xp_file is None:
|
|
208
270
|
xp_file = configuration.get("file", None)
|
|
209
271
|
if xp_file:
|
|
210
|
-
assert (
|
|
211
|
-
|
|
212
|
-
)
|
|
272
|
+
assert not module_name, (
|
|
273
|
+
"Module name and experiment file are mutually exclusive options"
|
|
274
|
+
)
|
|
213
275
|
xp_file = Path(xp_file)
|
|
214
276
|
if not python_path:
|
|
215
277
|
python_path.append(xp_file.parent.absolute())
|
|
@@ -217,9 +279,9 @@ def experiments_cli( # noqa: C901
|
|
|
217
279
|
"Using python path: %s", ", ".join(str(s) for s in python_path)
|
|
218
280
|
)
|
|
219
281
|
|
|
220
|
-
assert (
|
|
221
|
-
|
|
222
|
-
)
|
|
282
|
+
assert module_name or xp_file, (
|
|
283
|
+
"Either the module name or experiment file should be given"
|
|
284
|
+
)
|
|
223
285
|
|
|
224
286
|
# --- Set some options
|
|
225
287
|
|
|
@@ -233,6 +295,27 @@ def experiments_cli( # noqa: C901
|
|
|
233
295
|
for path in python_path:
|
|
234
296
|
sys.path.append(str(path))
|
|
235
297
|
|
|
298
|
+
# --- Execute pre-experiment script if specified
|
|
299
|
+
pre_experiment = configuration.get("pre_experiment", None)
|
|
300
|
+
if pre_experiment:
|
|
301
|
+
pre_exp_path = Path(pre_experiment)
|
|
302
|
+
if pre_exp_path.exists():
|
|
303
|
+
logging.info("Executing pre-experiment script: %s", pre_exp_path)
|
|
304
|
+
try:
|
|
305
|
+
spec = importlib.util.spec_from_file_location(
|
|
306
|
+
"pre_experiment", str(pre_exp_path.absolute())
|
|
307
|
+
)
|
|
308
|
+
pre_mod = importlib.util.module_from_spec(spec)
|
|
309
|
+
spec.loader.exec_module(pre_mod)
|
|
310
|
+
except Exception as e:
|
|
311
|
+
raise click.ClickException(
|
|
312
|
+
f"Failed to execute pre-experiment script '{pre_exp_path}': {e}"
|
|
313
|
+
)
|
|
314
|
+
else:
|
|
315
|
+
raise click.ClickException(
|
|
316
|
+
f"Pre-experiment script not found: {pre_exp_path}"
|
|
317
|
+
)
|
|
318
|
+
|
|
236
319
|
# --- Adds automatically the experiment module if not found
|
|
237
320
|
if module_name and conf_loader.yaml_module_file:
|
|
238
321
|
try:
|
|
@@ -285,6 +368,14 @@ def experiments_cli( # noqa: C901
|
|
|
285
368
|
)
|
|
286
369
|
|
|
287
370
|
schema = list_parameters[1].annotation
|
|
371
|
+
if isinstance(schema, str):
|
|
372
|
+
# Get the schema from the module
|
|
373
|
+
schema = getattr(mod.__dict__, schema, None)
|
|
374
|
+
assert schema is not None, (
|
|
375
|
+
f"Could not find schema {list_parameters[1].annotation} "
|
|
376
|
+
f"in module {module_name}"
|
|
377
|
+
)
|
|
378
|
+
|
|
288
379
|
omegaconf_schema = OmegaConf.structured(schema())
|
|
289
380
|
|
|
290
381
|
if omegaconf_schema is not None:
|
|
@@ -323,20 +414,48 @@ def experiments_cli( # noqa: C901
|
|
|
323
414
|
str(workdir.resolve()),
|
|
324
415
|
)
|
|
325
416
|
|
|
417
|
+
# Determine project path for git info
|
|
418
|
+
project_paths = []
|
|
419
|
+
if xp_file:
|
|
420
|
+
project_paths.append(xp_file.resolve().parent)
|
|
421
|
+
elif hasattr(mod, "__file__") and mod.__file__:
|
|
422
|
+
project_paths.append(Path(mod.__file__).resolve().parent)
|
|
423
|
+
|
|
326
424
|
# Define the experiment execution function
|
|
327
|
-
def run_experiment_code(
|
|
425
|
+
def run_experiment_code(
|
|
426
|
+
xp_holder=None, xp_ready_event=None, register_signals=True, in_thread=False
|
|
427
|
+
):
|
|
328
428
|
"""Run the experiment code - optionally storing xp in xp_holder"""
|
|
329
429
|
try:
|
|
430
|
+
from experimaestro.scheduler.experiment import experiment
|
|
431
|
+
from experimaestro.settings import get_settings
|
|
432
|
+
|
|
330
433
|
with experiment(
|
|
331
434
|
ws_env,
|
|
332
435
|
experiment_id,
|
|
333
|
-
host=host,
|
|
334
|
-
port=port,
|
|
335
436
|
run_mode=run_mode,
|
|
336
437
|
register_signals=register_signals,
|
|
438
|
+
project_paths=project_paths,
|
|
439
|
+
dirty_git=xp_configuration.dirty_git,
|
|
440
|
+
no_db=no_db,
|
|
337
441
|
) as xp:
|
|
338
442
|
if xp_holder is not None:
|
|
339
443
|
xp_holder["xp"] = xp
|
|
444
|
+
|
|
445
|
+
# Start web server if requested
|
|
446
|
+
if web and run_mode == RunMode.NORMAL:
|
|
447
|
+
settings = get_settings()
|
|
448
|
+
xp.scheduler.start_server(
|
|
449
|
+
settings.server,
|
|
450
|
+
workspace=xp.workspace,
|
|
451
|
+
wait_for_quit=False,
|
|
452
|
+
)
|
|
453
|
+
logging.info(
|
|
454
|
+
"Web server started at http://%s:%d",
|
|
455
|
+
settings.server.host or "localhost",
|
|
456
|
+
settings.server.port or 12345,
|
|
457
|
+
)
|
|
458
|
+
|
|
340
459
|
if xp_ready_event is not None:
|
|
341
460
|
xp_ready_event.set() # Signal that xp is ready
|
|
342
461
|
|
|
@@ -357,7 +476,11 @@ def experiments_cli( # noqa: C901
|
|
|
357
476
|
# ... and wait
|
|
358
477
|
xp.wait()
|
|
359
478
|
|
|
360
|
-
except HandledException:
|
|
479
|
+
except HandledException as e:
|
|
480
|
+
if in_thread:
|
|
481
|
+
# Re-raise to preserve exception info for the main thread
|
|
482
|
+
raise
|
|
483
|
+
cprint(f"Experiment failed: {e}", "red", file=sys.stderr)
|
|
361
484
|
sys.exit(1)
|
|
362
485
|
|
|
363
486
|
# Console mode is only available in NORMAL run mode
|
|
@@ -366,47 +489,91 @@ def experiments_cli( # noqa: C901
|
|
|
366
489
|
logging.warning("--console is ignored when run_mode is not NORMAL")
|
|
367
490
|
|
|
368
491
|
if use_console:
|
|
369
|
-
#
|
|
492
|
+
# Start TUI first, then run experiment in background thread
|
|
493
|
+
# This ensures all logs (including startup) are captured
|
|
370
494
|
import threading
|
|
371
495
|
from experimaestro.tui import ExperimentTUI
|
|
372
496
|
|
|
497
|
+
# Initialize multiprocessing resource tracker before Textual takes over
|
|
498
|
+
# terminal file descriptors. This prevents "bad value(s) in fds_to_keep"
|
|
499
|
+
# errors when code in the background thread (e.g., tqdm in torchvision)
|
|
500
|
+
# tries to use multiprocessing.
|
|
501
|
+
try:
|
|
502
|
+
from multiprocessing import resource_tracker
|
|
503
|
+
|
|
504
|
+
resource_tracker.ensure_running()
|
|
505
|
+
except Exception:
|
|
506
|
+
pass # Best effort - may not be needed on all systems
|
|
507
|
+
|
|
373
508
|
xp_holder = {"xp": None}
|
|
374
509
|
exception_holder = {"exception": None}
|
|
375
|
-
xp_ready = threading.Event()
|
|
376
510
|
|
|
377
|
-
|
|
511
|
+
# Create TUI first in deferred mode (no state_provider yet)
|
|
512
|
+
# This allows capturing all logs from experiment startup
|
|
513
|
+
tui_app = ExperimentTUI(show_logs=True)
|
|
514
|
+
|
|
515
|
+
def run_experiment_in_thread():
|
|
516
|
+
"""Run experiment and connect state provider to TUI when ready"""
|
|
378
517
|
try:
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
518
|
+
from experimaestro.scheduler.experiment import experiment as exp_context
|
|
519
|
+
from experimaestro.settings import get_settings
|
|
520
|
+
|
|
521
|
+
with exp_context(
|
|
522
|
+
ws_env,
|
|
523
|
+
experiment_id,
|
|
524
|
+
run_mode=run_mode,
|
|
525
|
+
register_signals=False, # TUI handles signals
|
|
526
|
+
project_paths=project_paths,
|
|
527
|
+
dirty_git=xp_configuration.dirty_git,
|
|
528
|
+
no_db=no_db,
|
|
529
|
+
) as xp:
|
|
530
|
+
xp_holder["xp"] = xp
|
|
531
|
+
|
|
532
|
+
# Connect TUI to the experiment's scheduler
|
|
533
|
+
tui_app.call_from_thread(tui_app.set_state_provider, xp.scheduler)
|
|
534
|
+
|
|
535
|
+
# Start web server if requested
|
|
536
|
+
if web:
|
|
537
|
+
settings = get_settings()
|
|
538
|
+
xp.scheduler.start_server(
|
|
539
|
+
settings.server,
|
|
540
|
+
workspace=xp.workspace,
|
|
541
|
+
wait_for_quit=False,
|
|
542
|
+
)
|
|
543
|
+
logging.info(
|
|
544
|
+
"Web server started at http://%s:%d",
|
|
545
|
+
settings.server.host or "localhost",
|
|
546
|
+
settings.server.port or 12345,
|
|
547
|
+
)
|
|
548
|
+
|
|
549
|
+
logging.info("Experiment started")
|
|
550
|
+
|
|
551
|
+
# Set up the environment
|
|
552
|
+
for key, value in env:
|
|
553
|
+
xp.setenv(key, value)
|
|
554
|
+
|
|
555
|
+
# Sets the python path
|
|
556
|
+
xp.workspace.python_path.extend(python_path)
|
|
557
|
+
|
|
558
|
+
# Run the experiment
|
|
559
|
+
helper.xp = xp
|
|
560
|
+
helper.run(list(args), xp_configuration)
|
|
561
|
+
|
|
562
|
+
# ... and wait
|
|
563
|
+
xp.wait()
|
|
564
|
+
|
|
382
565
|
logging.info("Experiment thread completed")
|
|
383
|
-
|
|
566
|
+
|
|
567
|
+
except BaseException as e:
|
|
568
|
+
# Use BaseException to also catch SystemExit from sys.exit()
|
|
384
569
|
exception_holder["exception"] = e
|
|
385
|
-
xp_ready.set() # Signal even on error
|
|
386
570
|
|
|
387
571
|
# Start experiment in background thread
|
|
388
|
-
exp_thread = threading.Thread(target=
|
|
572
|
+
exp_thread = threading.Thread(target=run_experiment_in_thread, daemon=True)
|
|
389
573
|
exp_thread.start()
|
|
390
574
|
|
|
391
|
-
# Wait for experiment to start (up to 30 seconds)
|
|
392
|
-
if not xp_ready.wait(timeout=30.0):
|
|
393
|
-
cprint("Timeout waiting for experiment to start", "red", file=sys.stderr)
|
|
394
|
-
sys.exit(1)
|
|
395
|
-
|
|
396
|
-
if xp_holder["xp"] is None:
|
|
397
|
-
cprint("Failed to start experiment", "red", file=sys.stderr)
|
|
398
|
-
if exception_holder["exception"]:
|
|
399
|
-
raise exception_holder["exception"]
|
|
400
|
-
sys.exit(1)
|
|
401
|
-
|
|
402
|
-
# Run TUI in main thread (handles signals via Textual)
|
|
403
|
-
tui_app = ExperimentTUI(
|
|
404
|
-
workdir=workdir,
|
|
405
|
-
state_provider=xp_holder["xp"].state_provider,
|
|
406
|
-
show_logs=True,
|
|
407
|
-
)
|
|
408
|
-
|
|
409
575
|
try:
|
|
576
|
+
# Run TUI in main thread (handles signals via Textual)
|
|
410
577
|
# Textual automatically captures stdout/stderr via Print events
|
|
411
578
|
tui_app.run()
|
|
412
579
|
finally:
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from enum import Enum
|
|
1
2
|
from omegaconf import MISSING
|
|
2
3
|
from typing import Optional, List
|
|
3
4
|
import attr
|
|
@@ -8,6 +9,19 @@ except ImportError:
|
|
|
8
9
|
from typing_extensions import dataclass_transform
|
|
9
10
|
|
|
10
11
|
|
|
12
|
+
class DirtyGitAction(str, Enum):
|
|
13
|
+
"""Action to take when the git repository has uncommitted changes"""
|
|
14
|
+
|
|
15
|
+
IGNORE = "ignore"
|
|
16
|
+
"""Don't check or warn about dirty git state"""
|
|
17
|
+
|
|
18
|
+
WARN = "warn"
|
|
19
|
+
"""Warn about dirty git state (default)"""
|
|
20
|
+
|
|
21
|
+
ERROR = "error"
|
|
22
|
+
"""Raise an error if git is dirty"""
|
|
23
|
+
|
|
24
|
+
|
|
11
25
|
@dataclass_transform(kw_only_default=True)
|
|
12
26
|
def configuration(*args, **kwargs):
|
|
13
27
|
"""Method to define keyword only dataclasses
|
|
@@ -40,6 +54,13 @@ class ConfigurationBase:
|
|
|
40
54
|
parent: Optional[str] = None
|
|
41
55
|
"""Relative path of a YAML file that should be merged"""
|
|
42
56
|
|
|
57
|
+
pre_experiment: Optional[str] = None
|
|
58
|
+
"""Relative path to a Python file to execute before importing the experiment.
|
|
59
|
+
|
|
60
|
+
This is useful for setting environment variables or mocking modules to speed up
|
|
61
|
+
the experiment setup phase (e.g., mocking torch.compile or torch.nn).
|
|
62
|
+
The actual job execution will use real modules."""
|
|
63
|
+
|
|
43
64
|
title: str = ""
|
|
44
65
|
"""Short description of the experiment"""
|
|
45
66
|
|
|
@@ -54,3 +75,6 @@ class ConfigurationBase:
|
|
|
54
75
|
|
|
55
76
|
add_timestamp: bool = False
|
|
56
77
|
"""Adds a timestamp YYYY_MM_DD-HH_MM to the experiment ID"""
|
|
78
|
+
|
|
79
|
+
dirty_git: DirtyGitAction = DirtyGitAction.WARN
|
|
80
|
+
"""Action when git repository has uncommitted changes: ignore, warn (default), error"""
|
experimaestro/generators.py
CHANGED
|
@@ -6,7 +6,7 @@ from experimaestro.core.arguments import ArgumentOptions, TypeAnnotation
|
|
|
6
6
|
from experimaestro.core.objects import ConfigWalkContext, Config
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
|
-
from experimaestro.core.
|
|
9
|
+
from experimaestro.core.partial import Partial
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class Generator(ABC):
|
|
@@ -33,7 +33,7 @@ class PathGenerator(Generator):
|
|
|
33
33
|
output: Meta[Path] = field(default_factory=PathGenerator("results.json"))
|
|
34
34
|
model: Meta[Path] = field(default_factory=PathGenerator("model.pt"))
|
|
35
35
|
|
|
36
|
-
For shared directories across related tasks, use with
|
|
36
|
+
For shared directories across related tasks, use with partial::
|
|
37
37
|
|
|
38
38
|
training_group = param_group("training")
|
|
39
39
|
|
|
@@ -42,13 +42,13 @@ class PathGenerator(Generator):
|
|
|
42
42
|
checkpoint: Meta[Path] = field(
|
|
43
43
|
default_factory=PathGenerator(
|
|
44
44
|
"model.pt",
|
|
45
|
-
|
|
45
|
+
partial=partial(exclude=[training_group])
|
|
46
46
|
)
|
|
47
47
|
)
|
|
48
48
|
|
|
49
49
|
:param path: Relative path within the task directory. Can be a string,
|
|
50
50
|
Path, or callable that takes (context, config) and returns a Path.
|
|
51
|
-
:param
|
|
51
|
+
:param partial: Optional partial for partial directory sharing.
|
|
52
52
|
When provided, the path is generated in a shared partial directory.
|
|
53
53
|
"""
|
|
54
54
|
|
|
@@ -56,7 +56,7 @@ class PathGenerator(Generator):
|
|
|
56
56
|
self,
|
|
57
57
|
path: Union[str, Path, Callable[[ConfigWalkContext, Config], Path]] = "",
|
|
58
58
|
*,
|
|
59
|
-
partial: "
|
|
59
|
+
partial: "Partial" = None,
|
|
60
60
|
):
|
|
61
61
|
self.path = path
|
|
62
62
|
self.partial = partial
|
experimaestro/ipc.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""IPC utilities"""
|
|
2
2
|
|
|
3
|
+
from enum import Enum
|
|
3
4
|
from typing import Optional
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
import os
|
|
@@ -11,16 +12,132 @@ from watchdog.observers.api import ObservedWatch
|
|
|
11
12
|
from watchdog.events import FileSystemEventHandler
|
|
12
13
|
|
|
13
14
|
|
|
15
|
+
class WatcherType(str, Enum):
|
|
16
|
+
"""Available filesystem watcher types"""
|
|
17
|
+
|
|
18
|
+
AUTO = "auto"
|
|
19
|
+
"""Use the best available watcher for the platform (default)"""
|
|
20
|
+
|
|
21
|
+
POLLING = "polling"
|
|
22
|
+
"""Platform-independent polling (works on network mounts)"""
|
|
23
|
+
|
|
24
|
+
INOTIFY = "inotify"
|
|
25
|
+
"""Linux inotify (Linux 2.6.13+ only)"""
|
|
26
|
+
|
|
27
|
+
FSEVENTS = "fsevents"
|
|
28
|
+
"""macOS FSEvents (macOS only)"""
|
|
29
|
+
|
|
30
|
+
KQUEUE = "kqueue"
|
|
31
|
+
"""BSD/macOS kqueue (less scalable for deep directories)"""
|
|
32
|
+
|
|
33
|
+
WINDOWS = "windows"
|
|
34
|
+
"""Windows API (Windows only)"""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _create_observer(watcher_type: WatcherType, polling_interval: float = 1.0):
|
|
38
|
+
"""Create an observer of the specified type
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
watcher_type: The type of filesystem watcher to use
|
|
42
|
+
polling_interval: Polling interval in seconds (for polling watcher)
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
An observer instance
|
|
46
|
+
|
|
47
|
+
Raises:
|
|
48
|
+
ImportError: If the requested watcher type is not available on this platform
|
|
49
|
+
ValueError: If watcher_type is invalid
|
|
50
|
+
"""
|
|
51
|
+
match watcher_type:
|
|
52
|
+
case WatcherType.AUTO:
|
|
53
|
+
return Observer()
|
|
54
|
+
|
|
55
|
+
case WatcherType.POLLING:
|
|
56
|
+
from watchdog.observers.polling import PollingObserver
|
|
57
|
+
|
|
58
|
+
return PollingObserver(timeout=polling_interval)
|
|
59
|
+
|
|
60
|
+
case WatcherType.INOTIFY:
|
|
61
|
+
from watchdog.observers.inotify import InotifyObserver
|
|
62
|
+
|
|
63
|
+
return InotifyObserver()
|
|
64
|
+
|
|
65
|
+
case WatcherType.FSEVENTS:
|
|
66
|
+
from watchdog.observers.fsevents import FSEventsObserver
|
|
67
|
+
|
|
68
|
+
return FSEventsObserver()
|
|
69
|
+
|
|
70
|
+
case WatcherType.KQUEUE:
|
|
71
|
+
from watchdog.observers.kqueue import KqueueObserver
|
|
72
|
+
|
|
73
|
+
return KqueueObserver()
|
|
74
|
+
|
|
75
|
+
case WatcherType.WINDOWS:
|
|
76
|
+
from watchdog.observers.read_directory_changes import WindowsApiObserver
|
|
77
|
+
|
|
78
|
+
return WindowsApiObserver()
|
|
79
|
+
|
|
80
|
+
case _:
|
|
81
|
+
raise ValueError(f"Unknown watcher type: {watcher_type}")
|
|
82
|
+
|
|
83
|
+
|
|
14
84
|
class IPCom:
|
|
15
85
|
"""IPC async thread"""
|
|
16
86
|
|
|
17
87
|
INSTANCE: Optional["IPCom"] = None
|
|
88
|
+
# Testing mode: use polling observer with small interval
|
|
89
|
+
TESTING_MODE: bool = False
|
|
90
|
+
POLLING_INTERVAL: float = 0.01
|
|
91
|
+
# Watcher type configuration
|
|
92
|
+
WATCHER_TYPE: WatcherType = WatcherType.AUTO
|
|
18
93
|
|
|
19
94
|
def __init__(self):
|
|
20
|
-
|
|
95
|
+
if IPCom.TESTING_MODE:
|
|
96
|
+
from watchdog.observers.polling import PollingObserver
|
|
97
|
+
|
|
98
|
+
self.observer = PollingObserver(timeout=IPCom.POLLING_INTERVAL)
|
|
99
|
+
else:
|
|
100
|
+
self.observer = _create_observer(IPCom.WATCHER_TYPE, IPCom.POLLING_INTERVAL)
|
|
21
101
|
self.observer.start()
|
|
22
102
|
self.pid = os.getpid()
|
|
23
103
|
|
|
104
|
+
@classmethod
|
|
105
|
+
def set_watcher_type(cls, watcher_type: WatcherType, polling_interval: float = 1.0):
|
|
106
|
+
"""Set the filesystem watcher type
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
watcher_type: The type of watcher to use
|
|
110
|
+
polling_interval: Polling interval in seconds (for polling watcher)
|
|
111
|
+
|
|
112
|
+
Note:
|
|
113
|
+
This must be called before the first IPCom instance is created.
|
|
114
|
+
If an instance already exists, it will be reset.
|
|
115
|
+
"""
|
|
116
|
+
cls.WATCHER_TYPE = watcher_type
|
|
117
|
+
cls.POLLING_INTERVAL = polling_interval
|
|
118
|
+
# Reset instance to apply new settings
|
|
119
|
+
if cls.INSTANCE is not None:
|
|
120
|
+
cls.INSTANCE.observer.stop()
|
|
121
|
+
cls.INSTANCE.observer.join(timeout=5)
|
|
122
|
+
cls.INSTANCE = None
|
|
123
|
+
logger.info("Set watcher type to %s", watcher_type.value)
|
|
124
|
+
|
|
125
|
+
@classmethod
|
|
126
|
+
def set_testing_mode(cls, enabled: bool = True, polling_interval: float = 0.01):
|
|
127
|
+
"""Enable testing mode with polling observer
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
enabled: Whether to enable testing mode
|
|
131
|
+
polling_interval: Polling interval in seconds (default 0.01)
|
|
132
|
+
"""
|
|
133
|
+
cls.TESTING_MODE = enabled
|
|
134
|
+
cls.POLLING_INTERVAL = polling_interval
|
|
135
|
+
# Reset instance to apply new settings
|
|
136
|
+
if cls.INSTANCE is not None:
|
|
137
|
+
cls.INSTANCE.observer.stop()
|
|
138
|
+
cls.INSTANCE.observer.join(timeout=5)
|
|
139
|
+
cls.INSTANCE = None
|
|
140
|
+
|
|
24
141
|
def fswatch(
|
|
25
142
|
self, watcher: FileSystemEventHandler, path: Path, recursive=False
|
|
26
143
|
) -> ObservedWatch:
|
|
@@ -59,9 +59,9 @@ class LauncherRegistry:
|
|
|
59
59
|
).expanduser()
|
|
60
60
|
|
|
61
61
|
if LauncherRegistry.CURRENT_CONFIG_DIR not in LauncherRegistry.INSTANCES:
|
|
62
|
-
LauncherRegistry.INSTANCES[
|
|
63
|
-
LauncherRegistry.CURRENT_CONFIG_DIR
|
|
64
|
-
|
|
62
|
+
LauncherRegistry.INSTANCES[LauncherRegistry.CURRENT_CONFIG_DIR] = (
|
|
63
|
+
LauncherRegistry(LauncherRegistry.CURRENT_CONFIG_DIR)
|
|
64
|
+
)
|
|
65
65
|
|
|
66
66
|
return LauncherRegistry.INSTANCES[LauncherRegistry.CURRENT_CONFIG_DIR]
|
|
67
67
|
|
|
@@ -83,7 +83,6 @@ class LauncherRegistry:
|
|
|
83
83
|
|
|
84
84
|
# Register the find launcher function if it exists
|
|
85
85
|
launchers_py = basepath / "launchers.py"
|
|
86
|
-
print(f"basepath {launchers_py}")
|
|
87
86
|
if launchers_py.is_file():
|
|
88
87
|
logger.info("Loading %s", launchers_py)
|
|
89
88
|
|
|
@@ -163,9 +162,9 @@ class LauncherRegistry:
|
|
|
163
162
|
if self.find_launcher_fn is not None:
|
|
164
163
|
for spec in specs.requirements:
|
|
165
164
|
if launcher := self.find_launcher_fn(spec, tags):
|
|
166
|
-
assert isinstance(
|
|
167
|
-
|
|
168
|
-
)
|
|
165
|
+
assert isinstance(launcher, Launcher), (
|
|
166
|
+
"f{self.find_launcher_fn} did not return a Launcher but {type(launcher)}"
|
|
167
|
+
)
|
|
169
168
|
return launcher
|
|
170
169
|
|
|
171
170
|
return None
|