experimaestro-1.5.6-py3-none-any.whl → experimaestro-1.5.8-py3-none-any.whl
This diff compares the contents of two publicly available versions of the package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__main__.py +3 -423
- experimaestro/cli/__init__.py +312 -0
- experimaestro/{filter.py → cli/filter.py} +4 -4
- experimaestro/cli/jobs.py +251 -0
- experimaestro/connectors/ssh.py +2 -2
- experimaestro/core/objects.py +6 -3
- experimaestro/core/types.py +8 -3
- experimaestro/experiments/cli.py +20 -42
- experimaestro/launcherfinder/__init__.py +1 -1
- experimaestro/launcherfinder/base.py +2 -18
- experimaestro/launcherfinder/registry.py +22 -129
- experimaestro/launchers/direct.py +0 -47
- experimaestro/launchers/slurm/base.py +3 -1
- experimaestro/notifications.py +24 -8
- experimaestro/run.py +0 -1
- experimaestro/scheduler/base.py +0 -5
- experimaestro/settings.py +29 -1
- experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
- experimaestro/tests/test_findlauncher.py +1 -1
- experimaestro/tests/test_tags.py +35 -0
- experimaestro/tokens.py +8 -8
- experimaestro/utils/resources.py +5 -1
- {experimaestro-1.5.6.dist-info → experimaestro-1.5.8.dist-info}/METADATA +1 -1
- {experimaestro-1.5.6.dist-info → experimaestro-1.5.8.dist-info}/RECORD +27 -28
- {experimaestro-1.5.6.dist-info → experimaestro-1.5.8.dist-info}/entry_points.txt +0 -4
- experimaestro/launchers/slurm/cli.py +0 -29
- experimaestro/launchers/slurm/configuration.py +0 -597
- experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
- experimaestro/utils/yaml.py +0 -202
- {experimaestro-1.5.6.dist-info → experimaestro-1.5.8.dist-info}/LICENSE +0 -0
- {experimaestro-1.5.6.dist-info → experimaestro-1.5.8.dist-info}/WHEEL +0 -0
experimaestro/experiments/cli.py
CHANGED
|
@@ -1,26 +1,28 @@
|
|
|
1
|
+
import imp
|
|
1
2
|
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import logging
|
|
4
5
|
import sys
|
|
5
6
|
from pathlib import Path
|
|
6
|
-
from typing import Any, List, Optional, Protocol, Tuple
|
|
7
|
+
from typing import Any, List, Optional, Protocol, Tuple
|
|
7
8
|
|
|
8
9
|
import click
|
|
9
10
|
import omegaconf
|
|
10
11
|
import yaml
|
|
11
|
-
from experimaestro import LauncherRegistry, RunMode, experiment
|
|
12
|
-
from experimaestro.experiments.configuration import ConfigurationBase
|
|
13
|
-
from experimaestro.exceptions import HandledException
|
|
14
|
-
from experimaestro.settings import get_workspace
|
|
15
12
|
from omegaconf import OmegaConf, SCMode
|
|
16
13
|
from termcolor import cprint
|
|
17
14
|
|
|
15
|
+
from experimaestro import LauncherRegistry, RunMode, experiment
|
|
16
|
+
from experimaestro.exceptions import HandledException
|
|
17
|
+
from experimaestro.experiments.configuration import ConfigurationBase
|
|
18
|
+
from experimaestro.settings import find_workspace
|
|
19
|
+
|
|
18
20
|
|
|
19
21
|
class ExperimentHelper:
|
|
20
22
|
"""Helper for experiments"""
|
|
21
23
|
|
|
22
|
-
# The experiment
|
|
23
24
|
xp: experiment
|
|
25
|
+
"""The experiment object"""
|
|
24
26
|
|
|
25
27
|
#: Run function
|
|
26
28
|
callable: "ExperimentCallable"
|
|
@@ -175,28 +177,23 @@ def experiments_cli( # noqa: C901
|
|
|
175
177
|
xp_file = Path(xp_file)
|
|
176
178
|
if not xp_file.exists() and xp_file.suffix != ".py":
|
|
177
179
|
xp_file = xp_file.with_suffix(".py")
|
|
178
|
-
xp_file = Path(yaml_file).parent / xp_file
|
|
179
|
-
|
|
180
|
-
with open(xp_file, "r") as f:
|
|
181
|
-
source = f.read()
|
|
182
|
-
if sys.version_info < (3, 9):
|
|
183
|
-
the__file__ = str(xp_file)
|
|
184
|
-
else:
|
|
185
|
-
the__file__ = str(xp_file.absolute())
|
|
186
|
-
|
|
187
|
-
code = compile(source, filename=the__file__, mode="exec")
|
|
188
|
-
_locals: Dict[str, Any] = {}
|
|
180
|
+
xp_file: Path = Path(yaml_file).parent / xp_file
|
|
189
181
|
|
|
190
|
-
|
|
182
|
+
# --- Finds the "run" function
|
|
191
183
|
try:
|
|
192
|
-
|
|
184
|
+
sys.path.append(str(xp_file.parent.absolute()))
|
|
185
|
+
with open(xp_file) as src:
|
|
186
|
+
module_name = xp_file.with_suffix("").name
|
|
187
|
+
mod = imp.load_module(
|
|
188
|
+
module_name, src, str(xp_file.absolute()), (".py", "r", imp.PY_SOURCE)
|
|
189
|
+
)
|
|
190
|
+
helper = getattr(mod, "run", None)
|
|
193
191
|
finally:
|
|
194
192
|
sys.path.pop()
|
|
195
193
|
|
|
196
194
|
# --- ... and runs it
|
|
197
|
-
helper = _locals.get("run", None)
|
|
198
195
|
if helper is None:
|
|
199
|
-
raise ValueError(f"Could not find run function in {
|
|
196
|
+
raise ValueError(f"Could not find run function in {xp_file}")
|
|
200
197
|
|
|
201
198
|
if not isinstance(helper, ExperimentHelper):
|
|
202
199
|
helper = ExperimentHelper(helper)
|
|
@@ -231,27 +228,8 @@ def experiments_cli( # noqa: C901
|
|
|
231
228
|
)
|
|
232
229
|
|
|
233
230
|
# Define the workspace
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
if workspace:
|
|
237
|
-
ws_env = get_workspace(workspace)
|
|
238
|
-
if ws_env is None:
|
|
239
|
-
raise RuntimeError("No workspace named %s", workspace)
|
|
240
|
-
|
|
241
|
-
logging.info("Using workspace %s", ws_env.id)
|
|
242
|
-
if workdir:
|
|
243
|
-
# Overrides working directory
|
|
244
|
-
logging.info(" override working directory: %s", workdir)
|
|
245
|
-
ws_env.path = workdir
|
|
246
|
-
else:
|
|
247
|
-
workdir = ws_env.path
|
|
248
|
-
elif workdir:
|
|
249
|
-
logging.info("Using workdir %s", workdir)
|
|
250
|
-
ws_env = workdir
|
|
251
|
-
else:
|
|
252
|
-
ws_env = get_workspace()
|
|
253
|
-
assert ws_env is not None, "No workdir or workspace defined, and no default"
|
|
254
|
-
logging.info("Using default workspace %s", ws_env.id)
|
|
231
|
+
ws_env = find_workspace(workdir=workdir, workspace=workspace)
|
|
232
|
+
workdir = ws_env.path
|
|
255
233
|
|
|
256
234
|
logging.info("Using working directory %s", str(workdir.resolve()))
|
|
257
235
|
|
|
@@ -1,33 +1,17 @@
|
|
|
1
|
-
from
|
|
2
|
-
from typing import TYPE_CHECKING, List, Optional
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
3
2
|
|
|
4
|
-
from experimaestro.utils.yaml import YAMLDataClass
|
|
5
|
-
from .specs import HostRequirement
|
|
6
3
|
|
|
7
4
|
if TYPE_CHECKING:
|
|
8
|
-
from experimaestro.launchers import Launcher
|
|
9
5
|
from experimaestro.connectors import Connector
|
|
10
6
|
from experimaestro.tokens import Token
|
|
11
7
|
from .registry import LauncherRegistry
|
|
12
8
|
|
|
13
9
|
|
|
14
|
-
class LauncherConfiguration:
|
|
15
|
-
tags: List[str]
|
|
16
|
-
weight: int
|
|
17
|
-
|
|
18
|
-
"""Generic class for a launcher configuration"""
|
|
19
|
-
|
|
20
|
-
def get(
|
|
21
|
-
self, registry: "LauncherRegistry", requirement: HostRequirement
|
|
22
|
-
) -> Optional["Launcher"]:
|
|
23
|
-
raise NotImplementedError(f"For {self.__class__}")
|
|
24
|
-
|
|
25
|
-
|
|
26
10
|
class ConnectorConfiguration:
|
|
27
11
|
def create(self, registry: "LauncherRegistry") -> "Connector":
|
|
28
12
|
raise NotImplementedError(f"For {self.__class__}")
|
|
29
13
|
|
|
30
14
|
|
|
31
|
-
class TokenConfiguration
|
|
15
|
+
class TokenConfiguration:
|
|
32
16
|
def create(self, registry: "LauncherRegistry", identifier: str) -> "Token":
|
|
33
17
|
raise NotImplementedError(f"For {self.__class__}")
|
|
@@ -1,27 +1,15 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Configuration registers
|
|
2
|
+
|
|
3
|
+
from typing import ClassVar, Dict, Optional, Set, Type, Union
|
|
2
4
|
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
import itertools
|
|
5
|
-
from types import new_class
|
|
6
|
-
from typing import ClassVar, Dict, List, Optional, Set, Type, Union
|
|
7
|
-
from experimaestro import Annotated
|
|
8
5
|
from pathlib import Path
|
|
9
6
|
import typing
|
|
7
|
+
from omegaconf import DictConfig, OmegaConf, SCMode
|
|
10
8
|
import pkg_resources
|
|
11
|
-
import humanfriendly
|
|
12
|
-
import yaml
|
|
13
|
-
from yaml import Loader, Dumper
|
|
14
9
|
from experimaestro.utils import logger
|
|
15
|
-
from experimaestro.utils.yaml import (
|
|
16
|
-
Initialize,
|
|
17
|
-
YAMLDataClass,
|
|
18
|
-
YAMLException,
|
|
19
|
-
YAMLList,
|
|
20
|
-
add_path_resolvers,
|
|
21
|
-
)
|
|
22
10
|
|
|
23
|
-
from .base import
|
|
24
|
-
from .specs import
|
|
11
|
+
from .base import ConnectorConfiguration, TokenConfiguration
|
|
12
|
+
from .specs import HostRequirement
|
|
25
13
|
|
|
26
14
|
if typing.TYPE_CHECKING:
|
|
27
15
|
from experimaestro.launchers import Launcher
|
|
@@ -32,80 +20,20 @@ class LauncherNotFoundError(Exception):
|
|
|
32
20
|
pass
|
|
33
21
|
|
|
34
22
|
|
|
35
|
-
@dataclass
|
|
36
|
-
class GPU(YAMLDataClass):
|
|
37
|
-
"""Represents a GPU"""
|
|
38
|
-
|
|
39
|
-
model: str
|
|
40
|
-
count: int
|
|
41
|
-
memory: Annotated[int, Initialize(humanfriendly.parse_size)]
|
|
42
|
-
|
|
43
|
-
def to_spec(self):
|
|
44
|
-
return [CudaSpecification(self.memory, self.model) for _ in range(self.count)]
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class GPUList(YAMLList[GPU]):
|
|
48
|
-
"""Represents a list of GPUs"""
|
|
49
|
-
|
|
50
|
-
def __repr__(self):
|
|
51
|
-
return f"GPUs({super().__repr__()})"
|
|
52
|
-
|
|
53
|
-
def to_spec(self) -> List[CudaSpecification]:
|
|
54
|
-
return list(itertools.chain(*[gpu.to_spec() for gpu in self]))
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
@dataclass
|
|
58
|
-
class CPU(YAMLDataClass):
|
|
59
|
-
"""Represents a CPU"""
|
|
60
|
-
|
|
61
|
-
memory: Annotated[int, Initialize(humanfriendly.parse_size)] = 0
|
|
62
|
-
cores: int = 1
|
|
63
|
-
|
|
64
|
-
def to_spec(self):
|
|
65
|
-
return CPUSpecification(self.memory, self.cores)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@dataclass
|
|
69
|
-
class Host(YAMLDataClass):
|
|
70
|
-
name: str
|
|
71
|
-
gpus: List[GPU]
|
|
72
|
-
launchers: List[str]
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
Launchers = Dict[str, List[LauncherConfiguration]]
|
|
76
23
|
Connectors = Dict[str, Dict[str, ConnectorConfiguration]]
|
|
77
24
|
Tokens = Dict[str, Dict[str, TokenConfiguration]]
|
|
78
25
|
|
|
79
26
|
|
|
80
|
-
def
|
|
81
|
-
return new_class("LauncherLoader", (yaml.FullLoader,)) # type: ignore
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def load_yaml(loader_cls: Type[Loader], path: Path):
|
|
27
|
+
def load_yaml(schema, path: Path):
|
|
85
28
|
if not path.is_file():
|
|
86
|
-
return
|
|
29
|
+
return {}
|
|
87
30
|
|
|
88
|
-
logger.warning(
|
|
89
|
-
"Using YAML file to configure launchers is deprecated. Please remove %s using launchers.py",
|
|
90
|
-
path,
|
|
91
|
-
)
|
|
92
31
|
logger.debug("Loading %s", path)
|
|
93
32
|
with path.open("rt") as fp:
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
loader.dispose()
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def unknown_error(loader: Loader, node):
|
|
102
|
-
raise YAMLException(
|
|
103
|
-
"",
|
|
104
|
-
node.start_mark.name,
|
|
105
|
-
node.start_mark.line,
|
|
106
|
-
node.start_mark.column,
|
|
107
|
-
f"No handler defined for key {node}",
|
|
108
|
-
)
|
|
33
|
+
cfg = OmegaConf.load(fp)
|
|
34
|
+
return OmegaConf.to_container(
|
|
35
|
+
OmegaConf.merge(cfg, schema), structured_config_mode=SCMode.INSTANTIATE
|
|
36
|
+
)
|
|
109
37
|
|
|
110
38
|
|
|
111
39
|
class LauncherRegistry:
|
|
@@ -132,27 +60,14 @@ class LauncherRegistry:
|
|
|
132
60
|
LauncherRegistry.CURRENT_CONFIG_DIR = config_dir
|
|
133
61
|
|
|
134
62
|
def __init__(self, basepath: Path):
|
|
135
|
-
self.
|
|
136
|
-
self.
|
|
137
|
-
self.TokenLoader: Type[Loader] = new_loader("TokenLoader")
|
|
138
|
-
self.Dumper: Type[Dumper] = new_class("CustomDumper", (Dumper,), {})
|
|
63
|
+
self.connectors_schema = DictConfig({})
|
|
64
|
+
self.tokens_schema = DictConfig({})
|
|
139
65
|
self.find_launcher_fn = None
|
|
140
66
|
|
|
141
|
-
# Add safeguards
|
|
142
|
-
add_path_resolvers(
|
|
143
|
-
self.LauncherLoader,
|
|
144
|
-
[],
|
|
145
|
-
Dict[str, LauncherConfiguration],
|
|
146
|
-
dumper=self.Dumper,
|
|
147
|
-
)
|
|
148
|
-
|
|
149
67
|
# Use entry points for connectors and launchers
|
|
150
68
|
for entry_point in pkg_resources.iter_entry_points("experimaestro.connectors"):
|
|
151
69
|
entry_point.load().init_registry(self)
|
|
152
70
|
|
|
153
|
-
for entry_point in pkg_resources.iter_entry_points("experimaestro.launchers"):
|
|
154
|
-
entry_point.load().init_registry(self)
|
|
155
|
-
|
|
156
71
|
for entry_point in pkg_resources.iter_entry_points("experimaestro.tokens"):
|
|
157
72
|
entry_point.load().init_registry(self)
|
|
158
73
|
|
|
@@ -172,32 +87,16 @@ class LauncherRegistry:
|
|
|
172
87
|
logger.warn("No find_launcher() function was found in %s", launchers_py)
|
|
173
88
|
|
|
174
89
|
# Read the configuration file
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
)
|
|
178
|
-
self.launchers = sorted(
|
|
179
|
-
itertools.chain(*launchers.values()), key=lambda launcher: -launcher.weight
|
|
180
|
-
)
|
|
181
|
-
|
|
182
|
-
self.connectors: Connectors = (
|
|
183
|
-
load_yaml(self.ConnectorLoader, basepath / "connectors.yaml") or {}
|
|
184
|
-
)
|
|
185
|
-
self.tokens: Tokens = (
|
|
186
|
-
load_yaml(self.TokenLoader, basepath / "tokens.yaml") or {}
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
def register_launcher(self, identifier: str, cls: Type[YAMLDataClass]):
|
|
190
|
-
add_path_resolvers(
|
|
191
|
-
self.LauncherLoader, [identifier, None], cls, dumper=self.Dumper
|
|
90
|
+
self.connectors = load_yaml(
|
|
91
|
+
self.connectors_schema, basepath / "connectors.yaml"
|
|
192
92
|
)
|
|
93
|
+
self.tokens = load_yaml(self.tokens_schema, basepath / "tokens.yaml")
|
|
193
94
|
|
|
194
|
-
def register_connector(self, identifier: str, cls: Type
|
|
195
|
-
|
|
196
|
-
self.ConnectorLoader, [identifier, None], cls, dumper=self.Dumper
|
|
197
|
-
)
|
|
95
|
+
def register_connector(self, identifier: str, cls: Type):
|
|
96
|
+
self.connectors_schema.merge_with({identifier: cls})
|
|
198
97
|
|
|
199
|
-
def register_token(self, identifier: str, cls: Type
|
|
200
|
-
|
|
98
|
+
def register_token(self, identifier: str, cls: Type):
|
|
99
|
+
self.tokens_schema.merge_with({identifier: cls})
|
|
201
100
|
|
|
202
101
|
def getToken(self, identifier: str) -> "Token":
|
|
203
102
|
for tokens in self.tokens.values():
|
|
@@ -227,7 +126,7 @@ class LauncherRegistry:
|
|
|
227
126
|
tags: Restrict the launchers to those containing one of the specified tags
|
|
228
127
|
"""
|
|
229
128
|
|
|
230
|
-
if
|
|
129
|
+
if self.find_launcher_fn is None:
|
|
231
130
|
logger.info("No launchers.yaml file: using local host ")
|
|
232
131
|
from experimaestro.launchers.direct import DirectLauncher
|
|
233
132
|
from experimaestro.connectors.local import LocalConnector
|
|
@@ -250,12 +149,6 @@ class LauncherRegistry:
|
|
|
250
149
|
if launcher := self.find_launcher_fn(spec, tags):
|
|
251
150
|
return launcher
|
|
252
151
|
|
|
253
|
-
# We have registered launchers
|
|
254
|
-
for spec in specs:
|
|
255
|
-
for handler in self.launchers:
|
|
256
|
-
if (not tags) or any((tag in tags) for tag in handler.tags):
|
|
257
|
-
if launcher := handler.get(self, spec):
|
|
258
|
-
return launcher
|
|
259
152
|
return None
|
|
260
153
|
|
|
261
154
|
|
|
@@ -1,15 +1,3 @@
|
|
|
1
|
-
from dataclasses import dataclass, field
|
|
2
|
-
from functools import cached_property
|
|
3
|
-
from typing import Dict, List, Optional
|
|
4
|
-
from experimaestro.launcherfinder import (
|
|
5
|
-
LauncherConfiguration,
|
|
6
|
-
LauncherRegistry,
|
|
7
|
-
HostRequirement,
|
|
8
|
-
)
|
|
9
|
-
from experimaestro.launcherfinder.registry import CPU, GPUList, YAMLDataClass
|
|
10
|
-
from experimaestro.launcherfinder.specs import (
|
|
11
|
-
HostSpecification,
|
|
12
|
-
)
|
|
13
1
|
from experimaestro.scriptbuilder import PythonScriptBuilder
|
|
14
2
|
from . import Launcher
|
|
15
3
|
|
|
@@ -18,40 +6,5 @@ class DirectLauncher(Launcher):
|
|
|
18
6
|
def scriptbuilder(self):
|
|
19
7
|
return PythonScriptBuilder()
|
|
20
8
|
|
|
21
|
-
@staticmethod
|
|
22
|
-
def init_registry(registry: LauncherRegistry):
|
|
23
|
-
registry.register_launcher("local", DirectLauncherConfiguration)
|
|
24
|
-
|
|
25
9
|
def __str__(self):
|
|
26
10
|
return f"DirectLauncher({self.connector})"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
@dataclass
|
|
30
|
-
class DirectLauncherConfiguration(YAMLDataClass, LauncherConfiguration):
|
|
31
|
-
connector: str = "connector"
|
|
32
|
-
cpu: CPU = field(default_factory=CPU)
|
|
33
|
-
gpus: GPUList = field(default_factory=GPUList)
|
|
34
|
-
tokens: Optional[Dict[str, int]] = None
|
|
35
|
-
tags: List[str] = field(default_factory=lambda: [])
|
|
36
|
-
weight: int = 0
|
|
37
|
-
disable: bool = False
|
|
38
|
-
|
|
39
|
-
@cached_property
|
|
40
|
-
def spec(self) -> HostSpecification:
|
|
41
|
-
return HostSpecification(cpu=self.cpu.to_spec(), cuda=self.gpus.to_spec())
|
|
42
|
-
|
|
43
|
-
def get(
|
|
44
|
-
self, registry: LauncherRegistry, requirement: "HostRequirement"
|
|
45
|
-
) -> Optional[Launcher]:
|
|
46
|
-
if requirement.match(self.spec):
|
|
47
|
-
launcher = DirectLauncher(connector=registry.getConnector(self.connector))
|
|
48
|
-
if self.tokens:
|
|
49
|
-
for token_identifier, count in self.tokens.items():
|
|
50
|
-
token = registry.getToken(token_identifier)
|
|
51
|
-
# TODO: handle the case where this is not a CounterToken
|
|
52
|
-
launcher.addListener(
|
|
53
|
-
lambda job: job.dependencies.add(token.dependency(count))
|
|
54
|
-
)
|
|
55
|
-
return launcher
|
|
56
|
-
|
|
57
|
-
return None
|
|
@@ -262,7 +262,9 @@ class SlurmProcessBuilder(ProcessBuilder):
|
|
|
262
262
|
addstream(builder.command, "-i", self.stdin)
|
|
263
263
|
|
|
264
264
|
builder.command.extend(self.command)
|
|
265
|
-
logger.info(
|
|
265
|
+
logger.info(
|
|
266
|
+
"slurm sbatch command: %s", " ".join(f'"{s}"' for s in builder.command)
|
|
267
|
+
)
|
|
266
268
|
handler = OutputCaptureHandler()
|
|
267
269
|
builder.stdout = Redirect.pipe(handler)
|
|
268
270
|
builder.stderr = Redirect.inherit()
|
experimaestro/notifications.py
CHANGED
|
@@ -44,6 +44,12 @@ class LevelInformation:
|
|
|
44
44
|
return f"[{self.level}] {self.desc} {int(self.progress*1000)/10}%"
|
|
45
45
|
|
|
46
46
|
|
|
47
|
+
class ListenerInformation:
|
|
48
|
+
def __init__(self, url: str):
|
|
49
|
+
self.url = url
|
|
50
|
+
self.error_count = 0
|
|
51
|
+
|
|
52
|
+
|
|
47
53
|
class Reporter(threading.Thread):
|
|
48
54
|
NOTIFICATION_FOLDER = ".notifications"
|
|
49
55
|
|
|
@@ -59,7 +65,7 @@ class Reporter(threading.Thread):
|
|
|
59
65
|
super().__init__(daemon=True)
|
|
60
66
|
self.path = path / Reporter.NOTIFICATION_FOLDER
|
|
61
67
|
self.path.mkdir(exist_ok=True)
|
|
62
|
-
self.urls: Dict[str,
|
|
68
|
+
self.urls: Dict[str, ListenerInformation] = {}
|
|
63
69
|
|
|
64
70
|
# Last check of notification URLs
|
|
65
71
|
self.lastcheck = 0
|
|
@@ -80,7 +86,7 @@ class Reporter(threading.Thread):
|
|
|
80
86
|
self.cv.notifyAll()
|
|
81
87
|
|
|
82
88
|
@staticmethod
|
|
83
|
-
def isfatal_httperror(e: Exception) -> bool:
|
|
89
|
+
def isfatal_httperror(e: Exception, info: ListenerInformation) -> bool:
|
|
84
90
|
"""Returns True if this HTTP error indicates that the server won't recover"""
|
|
85
91
|
if isinstance(e, HTTPError):
|
|
86
92
|
if e.code >= 400 and e.code < 500:
|
|
@@ -90,6 +96,13 @@ class Reporter(threading.Thread):
|
|
|
90
96
|
return True
|
|
91
97
|
if isinstance(e.reason, socket.gaierror) and e.reason.errno == -2:
|
|
92
98
|
return True
|
|
99
|
+
if isinstance(e.reason, TimeoutError):
|
|
100
|
+
info.error_count += 1
|
|
101
|
+
|
|
102
|
+
# Too many errors
|
|
103
|
+
if info.error_count > 3:
|
|
104
|
+
logger.info("Too many errors with %s", info.error_count)
|
|
105
|
+
return True
|
|
93
106
|
|
|
94
107
|
return False
|
|
95
108
|
|
|
@@ -100,8 +113,8 @@ class Reporter(threading.Thread):
|
|
|
100
113
|
mtime = os.path.getmtime(self.path)
|
|
101
114
|
if mtime > self.lastcheck:
|
|
102
115
|
for f in self.path.iterdir():
|
|
103
|
-
self.urls[f.name] = f.read_text().strip()
|
|
104
|
-
logger.info("Added new notification URL: %s", self.urls[f.name])
|
|
116
|
+
self.urls[f.name] = ListenerInformation(f.read_text().strip())
|
|
117
|
+
logger.info("Added new notification URL: %s", self.urls[f.name].url)
|
|
105
118
|
f.unlink()
|
|
106
119
|
|
|
107
120
|
self.lastcheck = os.path.getmtime(self.path)
|
|
@@ -128,7 +141,9 @@ class Reporter(threading.Thread):
|
|
|
128
141
|
params = level.report()
|
|
129
142
|
|
|
130
143
|
# Go over all URLs
|
|
131
|
-
for key,
|
|
144
|
+
for key, info in self.urls.items():
|
|
145
|
+
baseurl = info.url
|
|
146
|
+
|
|
132
147
|
url = "{}/progress?{}".format(
|
|
133
148
|
baseurl, urllib.parse.urlencode(params)
|
|
134
149
|
)
|
|
@@ -147,7 +162,7 @@ class Reporter(threading.Thread):
|
|
|
147
162
|
url,
|
|
148
163
|
e,
|
|
149
164
|
)
|
|
150
|
-
if Reporter.isfatal_httperror(e):
|
|
165
|
+
if Reporter.isfatal_httperror(e, info):
|
|
151
166
|
toremove.append(key)
|
|
152
167
|
|
|
153
168
|
# Removes unvalid URLs
|
|
@@ -165,7 +180,8 @@ class Reporter(threading.Thread):
|
|
|
165
180
|
self.check_urls()
|
|
166
181
|
if self.urls:
|
|
167
182
|
# Go over all URLs
|
|
168
|
-
for key,
|
|
183
|
+
for key, info in self.urls.items():
|
|
184
|
+
baseurl = info.url
|
|
169
185
|
url = "{}?status=eoj".format(baseurl)
|
|
170
186
|
try:
|
|
171
187
|
with urlopen(url) as _:
|
|
@@ -243,7 +259,7 @@ class xpm_tqdm(std_tqdm):
|
|
|
243
259
|
|
|
244
260
|
def update(self, n=1):
|
|
245
261
|
result = super().update(n)
|
|
246
|
-
if self.total is not None:
|
|
262
|
+
if self.total is not None and self.total > 0:
|
|
247
263
|
progress(self.n / self.total, level=self.pos, console=False)
|
|
248
264
|
return result
|
|
249
265
|
|
experimaestro/run.py
CHANGED
experimaestro/scheduler/base.py
CHANGED
|
@@ -803,11 +803,6 @@ class experiment:
|
|
|
803
803
|
else None
|
|
804
804
|
)
|
|
805
805
|
|
|
806
|
-
# Copy environment variable from main (but do not
|
|
807
|
-
# override)
|
|
808
|
-
for key, value in settings.env.items():
|
|
809
|
-
self.setenv(key, value, override=False)
|
|
810
|
-
|
|
811
806
|
if os.environ.get("XPM_ENABLEFAULTHANDLER", "0") == "1":
|
|
812
807
|
import faulthandler
|
|
813
808
|
|
experimaestro/settings.py
CHANGED
|
@@ -4,6 +4,7 @@ from dataclasses import field, dataclass
|
|
|
4
4
|
from functools import lru_cache
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Dict, Optional, List
|
|
7
|
+
import logging
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
@dataclass
|
|
@@ -59,7 +60,9 @@ def get_settings(path: Optional[Path] = None) -> Settings:
|
|
|
59
60
|
|
|
60
61
|
path = path or Path("~/.config/experimaestro/settings.yaml").expanduser()
|
|
61
62
|
if not path.is_file():
|
|
62
|
-
return
|
|
63
|
+
return OmegaConf.to_container(
|
|
64
|
+
schema, structured_config_mode=SCMode.INSTANTIATE
|
|
65
|
+
)
|
|
63
66
|
|
|
64
67
|
conf = OmegaConf.load(path)
|
|
65
68
|
return OmegaConf.to_container(
|
|
@@ -78,3 +81,28 @@ def get_workspace(id: Optional[str] = None) -> Optional[WorkspaceSettings]:
|
|
|
78
81
|
return workspace
|
|
79
82
|
|
|
80
83
|
return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def find_workspace(*, workspace: Optional[str] = None, workdir: Optional[Path] = None):
|
|
87
|
+
"""Find workspace"""
|
|
88
|
+
workdir = Path(workdir) if workdir else None
|
|
89
|
+
|
|
90
|
+
if workspace:
|
|
91
|
+
ws_env = get_workspace(workspace)
|
|
92
|
+
if ws_env is None:
|
|
93
|
+
raise RuntimeError("No workspace named %s", workspace)
|
|
94
|
+
|
|
95
|
+
logging.info("Using workspace %s", ws_env.id)
|
|
96
|
+
if workdir:
|
|
97
|
+
# Overrides working directory
|
|
98
|
+
logging.info(" override working directory: %s", workdir)
|
|
99
|
+
ws_env.path = workdir
|
|
100
|
+
elif workdir:
|
|
101
|
+
logging.info("Using workdir %s", workdir)
|
|
102
|
+
ws_env = WorkspaceSettings("", workdir)
|
|
103
|
+
else:
|
|
104
|
+
ws_env = get_workspace()
|
|
105
|
+
assert ws_env is not None, "No workdir or workspace defined, and no default"
|
|
106
|
+
logging.info("Using default workspace %s", ws_env.id)
|
|
107
|
+
|
|
108
|
+
return ws_env
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from experimaestro.launcherfinder.specs import (
|
|
2
|
+
CPUSpecification,
|
|
3
|
+
CudaSpecification,
|
|
4
|
+
HostRequirement,
|
|
5
|
+
HostSpecification,
|
|
6
|
+
)
|
|
7
|
+
from experimaestro.launchers.slurm.base import SlurmLauncher, SlurmOptions
|
|
8
|
+
|
|
9
|
+
GIGA = 1024**3
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_launcher(requirements: HostRequirement, tags: set[str] = set()):
|
|
13
|
+
host = HostSpecification(
|
|
14
|
+
cpu=CPUSpecification(cores=16, memory=32 * GIGA),
|
|
15
|
+
max_duration=3600 * 24 * 10,
|
|
16
|
+
cuda=[CudaSpecification(memory=32 * GIGA) for _ in range(4)],
|
|
17
|
+
)
|
|
18
|
+
if match := requirements.match(host):
|
|
19
|
+
return SlurmLauncher(
|
|
20
|
+
options=SlurmOptions(
|
|
21
|
+
gpus_per_node=len(match.requirement.cuda_gpus),
|
|
22
|
+
partition="hard,electronic",
|
|
23
|
+
constraint="(A6000&GPU2&GPUM48G)|(A6000&GPU3&GPUM48G)|(RTX&GPU4&GPUM48G)",
|
|
24
|
+
)
|
|
25
|
+
)
|
|
@@ -81,7 +81,7 @@ def slurm_constraint_split(constraint: str):
|
|
|
81
81
|
def test_findlauncher_slurm():
|
|
82
82
|
path = ResourcePathWrapper.create(f"{__package__ }.launchers", "config_slurm")
|
|
83
83
|
|
|
84
|
-
assert (path / "launchers.
|
|
84
|
+
assert (path / "launchers.py").is_file()
|
|
85
85
|
|
|
86
86
|
registry = LauncherRegistry(path)
|
|
87
87
|
launcher = registry.find("""duration=4 days & cuda(mem=24G) * 2""")
|
experimaestro/tests/test_tags.py
CHANGED
|
@@ -2,6 +2,7 @@ from typing import Dict
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from experimaestro import (
|
|
4
4
|
tag,
|
|
5
|
+
LightweightTask,
|
|
5
6
|
config,
|
|
6
7
|
argument,
|
|
7
8
|
Config,
|
|
@@ -69,6 +70,40 @@ def test_inneroutput():
|
|
|
69
70
|
assert evaluate.__xpm__.tags() == {"hello": "world"}
|
|
70
71
|
|
|
71
72
|
|
|
73
|
+
def test_tags_init_tasks():
|
|
74
|
+
"""Test tags within init tasks"""
|
|
75
|
+
|
|
76
|
+
class MyTask(Task):
|
|
77
|
+
pass
|
|
78
|
+
|
|
79
|
+
class InitTask(LightweightTask):
|
|
80
|
+
pass
|
|
81
|
+
|
|
82
|
+
class MyConfig(Config):
|
|
83
|
+
pass
|
|
84
|
+
|
|
85
|
+
class TaskWithOutput(Task):
|
|
86
|
+
x: Param[MyConfig]
|
|
87
|
+
|
|
88
|
+
def task_outputs(self, dep) -> MyConfig:
|
|
89
|
+
return dep(MyConfig())
|
|
90
|
+
|
|
91
|
+
init_task = InitTask().tag("hello", "world")
|
|
92
|
+
task = MyTask()
|
|
93
|
+
result = task.submit(run_mode=RunMode.DRY_RUN, init_tasks=[init_task])
|
|
94
|
+
assert result.tags() == {"hello": "world"}
|
|
95
|
+
|
|
96
|
+
other_task = TaskWithOutput(x=MyConfig().tag("hello", "world"))
|
|
97
|
+
assert other_task.tags() == {"hello": "world"}
|
|
98
|
+
|
|
99
|
+
result = other_task.submit(run_mode=RunMode.DRY_RUN)
|
|
100
|
+
assert isinstance(result, MyConfig)
|
|
101
|
+
assert result.tags() == {"hello": "world"}
|
|
102
|
+
|
|
103
|
+
result = MyTask().submit(run_mode=RunMode.DRY_RUN, init_tasks=[result])
|
|
104
|
+
assert result.tags() == {"hello": "world"}
|
|
105
|
+
|
|
106
|
+
|
|
72
107
|
class TaskDirectoryContext(DirectoryContext):
|
|
73
108
|
def __init__(self, task, path):
|
|
74
109
|
super().__init__(path)
|