experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +14 -4
- experimaestro/__main__.py +3 -423
- experimaestro/annotations.py +14 -4
- experimaestro/cli/__init__.py +311 -0
- experimaestro/{filter.py → cli/filter.py} +23 -9
- experimaestro/cli/jobs.py +268 -0
- experimaestro/cli/progress.py +269 -0
- experimaestro/click.py +0 -35
- experimaestro/commandline.py +3 -7
- experimaestro/connectors/__init__.py +29 -14
- experimaestro/connectors/local.py +19 -10
- experimaestro/connectors/ssh.py +27 -8
- experimaestro/core/arguments.py +45 -3
- experimaestro/core/callbacks.py +52 -0
- experimaestro/core/context.py +8 -9
- experimaestro/core/identifier.py +310 -0
- experimaestro/core/objects/__init__.py +44 -0
- experimaestro/core/{objects.py → objects/config.py} +399 -772
- experimaestro/core/objects/config_utils.py +58 -0
- experimaestro/core/objects/config_walk.py +151 -0
- experimaestro/core/objects.pyi +15 -45
- experimaestro/core/serialization.py +63 -9
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/types.py +104 -66
- experimaestro/experiments/cli.py +154 -72
- experimaestro/experiments/configuration.py +10 -1
- experimaestro/generators.py +6 -1
- experimaestro/ipc.py +4 -1
- experimaestro/launcherfinder/__init__.py +1 -1
- experimaestro/launcherfinder/base.py +2 -18
- experimaestro/launcherfinder/parser.py +8 -3
- experimaestro/launcherfinder/registry.py +52 -140
- experimaestro/launcherfinder/specs.py +49 -10
- experimaestro/launchers/direct.py +0 -47
- experimaestro/launchers/slurm/base.py +54 -14
- experimaestro/mkdocs/__init__.py +1 -1
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/notifications.py +38 -12
- experimaestro/progress.py +406 -0
- experimaestro/run.py +24 -3
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +108 -808
- experimaestro/scheduler/dynamic_outputs.py +184 -0
- experimaestro/scheduler/experiment.py +387 -0
- experimaestro/scheduler/jobs.py +475 -0
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +75 -0
- experimaestro/scheduler/workspace.py +27 -8
- experimaestro/scriptbuilder.py +18 -3
- experimaestro/server/__init__.py +36 -5
- experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
- experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
- experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
- experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
- experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
- experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
- experimaestro/server/data/index.css +5187 -5068
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +68887 -68064
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +45 -5
- experimaestro/sphinx/__init__.py +7 -17
- experimaestro/taskglobals.py +7 -2
- experimaestro/tests/core/__init__.py +0 -0
- experimaestro/tests/core/test_generics.py +206 -0
- experimaestro/tests/definitions_types.py +5 -3
- experimaestro/tests/launchers/bin/sbatch +34 -7
- experimaestro/tests/launchers/bin/srun +5 -0
- experimaestro/tests/launchers/common.py +17 -5
- experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
- experimaestro/tests/restart.py +10 -5
- experimaestro/tests/tasks/all.py +23 -10
- experimaestro/tests/tasks/foreign.py +2 -4
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_experiment.py +73 -0
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_findlauncher.py +12 -5
- experimaestro/tests/test_forward.py +5 -5
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +182 -158
- experimaestro/tests/test_instance.py +19 -27
- experimaestro/tests/test_objects.py +13 -20
- experimaestro/tests/test_outputs.py +6 -6
- experimaestro/tests/test_param.py +68 -30
- experimaestro/tests/test_progress.py +4 -4
- experimaestro/tests/test_serializers.py +24 -64
- experimaestro/tests/test_ssh.py +7 -0
- experimaestro/tests/test_tags.py +50 -21
- experimaestro/tests/test_tasks.py +42 -51
- experimaestro/tests/test_tokens.py +11 -8
- experimaestro/tests/test_types.py +24 -21
- experimaestro/tests/test_validation.py +67 -110
- experimaestro/tests/token_reschedule.py +1 -1
- experimaestro/tokens.py +24 -13
- experimaestro/tools/diff.py +8 -1
- experimaestro/typingutils.py +20 -11
- experimaestro/utils/asyncio.py +6 -2
- experimaestro/utils/multiprocessing.py +44 -0
- experimaestro/utils/resources.py +11 -3
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
- experimaestro-2.0.0a8.dist-info/RECORD +166 -0
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
- experimaestro/launchers/slurm/cli.py +0 -29
- experimaestro/launchers/slurm/configuration.py +0 -597
- experimaestro/scheduler/environment.py +0 -94
- experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
- experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
- experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
- experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
- experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
- experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
- experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
- experimaestro/utils/yaml.py +0 -202
- experimaestro-1.5.1.dist-info/RECORD +0 -148
- {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
experimaestro/notifications.py
CHANGED
|
@@ -12,6 +12,7 @@ from tqdm.auto import tqdm as std_tqdm
|
|
|
12
12
|
|
|
13
13
|
from .utils import logger
|
|
14
14
|
from experimaestro.taskglobals import Env as TaskEnv
|
|
15
|
+
from .progress import FileBasedProgressReporter
|
|
15
16
|
|
|
16
17
|
# --- Progress and other notifications
|
|
17
18
|
|
|
@@ -41,7 +42,13 @@ class LevelInformation:
|
|
|
41
42
|
return result
|
|
42
43
|
|
|
43
44
|
def __repr__(self) -> str:
|
|
44
|
-
return f"[{self.level}] {self.desc} {int(self.progress*1000)/10}%"
|
|
45
|
+
return f"[{self.level}] {self.desc} {int(self.progress * 1000) / 10}%"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ListenerInformation:
    """Bookkeeping for one notification listener (one URL to notify).

    Args:
        url: Base URL of the listener, stored unchanged.
    """

    def __init__(self, url: str) -> None:
        # Base URL used to build the notification requests
        self.url: str = url
        # Number of errors recorded for this listener; starts at zero and is
        # incremented by the code that reports to the URL
        self.error_count: int = 0
|
|
45
52
|
|
|
46
53
|
|
|
47
54
|
class Reporter(threading.Thread):
|
|
@@ -59,7 +66,7 @@ class Reporter(threading.Thread):
|
|
|
59
66
|
super().__init__(daemon=True)
|
|
60
67
|
self.path = path / Reporter.NOTIFICATION_FOLDER
|
|
61
68
|
self.path.mkdir(exist_ok=True)
|
|
62
|
-
self.urls: Dict[str,
|
|
69
|
+
self.urls: Dict[str, ListenerInformation] = {}
|
|
63
70
|
|
|
64
71
|
# Last check of notification URLs
|
|
65
72
|
self.lastcheck = 0
|
|
@@ -72,15 +79,18 @@ class Reporter(threading.Thread):
|
|
|
72
79
|
|
|
73
80
|
self.progress_threshold = 0.01
|
|
74
81
|
self.cv = threading.Condition()
|
|
75
|
-
|
|
82
|
+
|
|
83
|
+
# File-based progress reporter
|
|
84
|
+
self.file_reporter = FileBasedProgressReporter(task_path=path)
|
|
76
85
|
|
|
77
86
|
def stop(self):
|
|
78
87
|
self.stopping = True
|
|
79
88
|
with self.cv:
|
|
80
|
-
self.cv.notifyAll()
|
|
89
|
+
# self.cv.notifyAll()
|
|
90
|
+
self.cv.notify_all()
|
|
81
91
|
|
|
82
92
|
@staticmethod
|
|
83
|
-
def isfatal_httperror(e: Exception) -> bool:
|
|
93
|
+
def isfatal_httperror(e: Exception, info: ListenerInformation) -> bool:
|
|
84
94
|
"""Returns True if this HTTP error indicates that the server won't recover"""
|
|
85
95
|
if isinstance(e, HTTPError):
|
|
86
96
|
if e.code >= 400 and e.code < 500:
|
|
@@ -90,6 +100,13 @@ class Reporter(threading.Thread):
|
|
|
90
100
|
return True
|
|
91
101
|
if isinstance(e.reason, socket.gaierror) and e.reason.errno == -2:
|
|
92
102
|
return True
|
|
103
|
+
if isinstance(e.reason, TimeoutError):
|
|
104
|
+
info.error_count += 1
|
|
105
|
+
|
|
106
|
+
# Too many errors
|
|
107
|
+
if info.error_count > 3:
|
|
108
|
+
logger.info("Too many errors with %s", info.error_count)
|
|
109
|
+
return True
|
|
93
110
|
|
|
94
111
|
return False
|
|
95
112
|
|
|
@@ -97,11 +114,12 @@ class Reporter(threading.Thread):
|
|
|
97
114
|
return any(level.modified(self) for level in self.levels)
|
|
98
115
|
|
|
99
116
|
def check_urls(self):
|
|
117
|
+
"""Check whether we have new schedulers to notify"""
|
|
100
118
|
mtime = os.path.getmtime(self.path)
|
|
101
119
|
if mtime > self.lastcheck:
|
|
102
120
|
for f in self.path.iterdir():
|
|
103
|
-
self.urls[f.name] = f.read_text().strip()
|
|
104
|
-
logger.info("Added new notification URL: %s", self.urls[f.name])
|
|
121
|
+
self.urls[f.name] = ListenerInformation(f.read_text().strip())
|
|
122
|
+
logger.info("Added new notification URL: %s", self.urls[f.name].url)
|
|
105
123
|
f.unlink()
|
|
106
124
|
|
|
107
125
|
self.lastcheck = os.path.getmtime(self.path)
|
|
@@ -128,7 +146,9 @@ class Reporter(threading.Thread):
|
|
|
128
146
|
params = level.report()
|
|
129
147
|
|
|
130
148
|
# Go over all URLs
|
|
131
|
-
for key,
|
|
149
|
+
for key, info in self.urls.items():
|
|
150
|
+
baseurl = info.url
|
|
151
|
+
|
|
132
152
|
url = "{}/progress?{}".format(
|
|
133
153
|
baseurl, urllib.parse.urlencode(params)
|
|
134
154
|
)
|
|
@@ -147,7 +167,7 @@ class Reporter(threading.Thread):
|
|
|
147
167
|
url,
|
|
148
168
|
e,
|
|
149
169
|
)
|
|
150
|
-
if Reporter.isfatal_httperror(e):
|
|
170
|
+
if Reporter.isfatal_httperror(e, info):
|
|
151
171
|
toremove.append(key)
|
|
152
172
|
|
|
153
173
|
# Removes unvalid URLs
|
|
@@ -165,12 +185,13 @@ class Reporter(threading.Thread):
|
|
|
165
185
|
self.check_urls()
|
|
166
186
|
if self.urls:
|
|
167
187
|
# Go over all URLs
|
|
168
|
-
for key,
|
|
188
|
+
for key, info in self.urls.items():
|
|
189
|
+
baseurl = info.url
|
|
169
190
|
url = "{}?status=eoj".format(baseurl)
|
|
170
191
|
try:
|
|
171
192
|
with urlopen(url) as _:
|
|
172
193
|
logger.debug(
|
|
173
|
-
"EOJ
|
|
194
|
+
"EOJ notification sent for %s",
|
|
174
195
|
baseurl,
|
|
175
196
|
)
|
|
176
197
|
except Exception:
|
|
@@ -178,6 +199,8 @@ class Reporter(threading.Thread):
|
|
|
178
199
|
"Could not report EOJ",
|
|
179
200
|
)
|
|
180
201
|
|
|
202
|
+
self.file_reporter.eoj()
|
|
203
|
+
|
|
181
204
|
def set_progress(
|
|
182
205
|
self, progress: float, level: int, desc: Optional[str], console=False
|
|
183
206
|
):
|
|
@@ -196,6 +219,8 @@ class Reporter(threading.Thread):
|
|
|
196
219
|
self.levels[level].desc = desc
|
|
197
220
|
self.levels[level].progress = progress
|
|
198
221
|
|
|
222
|
+
self.file_reporter.set_progress(progress, level, desc)
|
|
223
|
+
|
|
199
224
|
self.cv.notify_all()
|
|
200
225
|
|
|
201
226
|
INSTANCE: ClassVar[Optional["Reporter"]] = None
|
|
@@ -206,6 +231,7 @@ class Reporter(threading.Thread):
|
|
|
206
231
|
taskpath = TaskEnv.instance().taskpath
|
|
207
232
|
assert taskpath is not None, "Task path is not defined"
|
|
208
233
|
Reporter.INSTANCE = Reporter(taskpath)
|
|
234
|
+
Reporter.INSTANCE.start()
|
|
209
235
|
return Reporter.INSTANCE
|
|
210
236
|
|
|
211
237
|
|
|
@@ -243,7 +269,7 @@ class xpm_tqdm(std_tqdm):
|
|
|
243
269
|
|
|
244
270
|
def update(self, n=1):
|
|
245
271
|
result = super().update(n)
|
|
246
|
-
if self.total is not None:
|
|
272
|
+
if self.total is not None and self.total > 0:
|
|
247
273
|
progress(self.n / self.total, level=self.pos, console=False)
|
|
248
274
|
return result
|
|
249
275
|
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
"""File-based progress tracking system for experimaestro tasks."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass, asdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, List, Iterator, Dict, Any
|
|
9
|
+
from datetime import datetime, timedelta
|
|
10
|
+
import fcntl
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
from .utils import logger
|
|
14
|
+
|
|
15
|
+
DEFAULT_MAX_ENTRIES_PER_FILE = 10_000
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ProgressEntry:
    """A single progress entry in the JSONL file"""

    # Time of the report (the writer in this module uses ``time.time()``)
    timestamp: float
    # Progress level (0 is the top level)
    level: int
    # Progress value
    progress: float
    # Optional free-text description
    desc: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization"""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ProgressEntry":
        """Create an entry from a dictionary.

        Keys that are not fields of this dataclass are ignored, so that a
        file containing additional keys can still be read.

        Raises:
            TypeError: if a required field (timestamp, level, progress) is
                missing from ``data``.
        """
        from dataclasses import fields

        known = {field.name for field in fields(cls)}
        return cls(**{key: value for key, value in data.items() if key in known})
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class StateFile:
    """Latest progress entry per level, persisted as a single JSON file.

    Writes are throttled (see ``_allow_write``) unless forced. The file is
    replaced atomically so that a concurrent reader never observes a
    truncated document, and an instance only flushes on destruction when it
    holds changes that were not written yet (a read-only instance never
    overwrites the file).
    """

    def __init__(self, filename: Path):
        """Create the parent directory if needed and load any existing state.

        Args:
            filename: Path of the JSON state file
        """
        self.filename = filename
        self.state: Dict[int, "ProgressEntry"] = {}

        # Write threshold to avoid too frequent writes
        self._time_threshold = timedelta(seconds=1.0)
        self._last_write_time: datetime = datetime.now()
        # Minimum progress change to trigger write
        self._progress_threshold = 0.01
        self._last_write_progress: Optional[Dict[int, float]] = None
        # True when update() changed the state since the last write
        self._dirty = False

        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self.load()

    def _allow_write(self) -> bool:
        """Check if the state should be written based on time and progress.

        Returns True when either:

        - every tracked entry has ``progress >= 1.0`` (this is also true
          when no entry is tracked), or
        - more than ``_time_threshold`` elapsed since the last write AND
          (nothing was written yet OR at least one level moved by more than
          ``_progress_threshold`` since the last write).
        """
        time_check = datetime.now() - self._last_write_time > self._time_threshold
        progress_check = self._last_write_progress is None or any(
            abs(entry.progress - self._last_write_progress.get(entry.level, 0.0))
            > self._progress_threshold
            for entry in self.state.values()
        )
        all_entries_done = all(entry.progress >= 1.0 for entry in self.state.values())
        return all_entries_done or (time_check and progress_check)

    def write(self, force: bool = False):
        """Write the current state to the file.

        Args:
            force: Write even if the throttling rules would skip this write
        """
        if not (force or self._allow_write()):
            return

        payload = {k: v.to_dict() for k, v in self.state.items()}

        # Write next to the target and rename: os.replace is atomic on POSIX,
        # so readers see either the previous or the new document.
        tmp_path = self.filename.with_name(
            f"{self.filename.name}.{os.getpid()}.{threading.get_ident()}.tmp"
        )
        try:
            with open(tmp_path, "w") as f:
                json.dump(payload, f)
            os.replace(tmp_path, self.filename)
        except BaseException:
            # Do not leave a partial temporary file behind
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

        self._last_write_time = datetime.now()
        self._last_write_progress = {k: v.progress for k, v in self.state.items()}
        self._dirty = False

    def update(self, entry: "ProgressEntry"):
        """Record ``entry`` as the latest entry of its level (in memory only)"""
        self.state[entry.level] = entry
        self._dirty = True

    def load(self):
        """Load the state from the file.

        A missing file is ignored. If the file cannot be read or does not
        have the expected structure, a warning is logged and the in-memory
        state is left unchanged.
        """
        if not self.filename.exists():
            return

        try:
            with self.filename.open("r") as f:
                data = json.load(f)
            self.state = {int(k): ProgressEntry.from_dict(v) for k, v in data.items()}
        except (json.JSONDecodeError, IOError, TypeError, ValueError, AttributeError):
            logger.warning(f"Failed to load state from {self.filename}")

    def read(self) -> Dict[int, "ProgressEntry"]:
        """Reload the state from the file and return it"""
        self.load()
        return self.state

    # flush on exit
    def __del__(self):
        """Flush pending changes on destruction.

        Nothing is written when there are no unwritten updates: a read-only
        instance must not replace the file with its own (possibly stale)
        snapshot.
        """
        # getattr: __init__ may have failed before the attribute was set
        if not getattr(self, "_dirty", False):
            return
        try:
            self.write(force=True)
        except Exception as e:
            logger.error(f"Failed to write state on exit: {e}")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class ProgressFileWriter:
    """Appends progress entries to rotating JSONL files.

    Files are named ``progress-NNNN.jsonl`` inside ``<task_path>/.experimaestro``;
    a ``progress-latest.jsonl`` symlink points to the file currently written,
    and a ``progress_state.json`` state file keeps the latest entry per level.
    """

    # TODO: Implement buffering and flushing

    def __init__(
        self, task_path: Path, max_entries_per_file: int = DEFAULT_MAX_ENTRIES_PER_FILE
    ):
        """Initialize the writer and resume after the latest existing file.

        Args:
            task_path: Path to the task directory
            max_entries_per_file: Number of entries after which a new file
                is started
        """
        self.task_path = task_path
        self.progress_dir = task_path / ".experimaestro"
        self.max_entries_per_file = max_entries_per_file
        self.current_file_index = 0
        self.current_file_entries = 0
        self.lock = threading.Lock()

        # Ensure directory exists
        self.progress_dir.mkdir(exist_ok=True)

        # State is the latest entry per level
        self.state = StateFile(self.progress_dir / "progress_state.json")

        # Find the latest file index
        self._find_latest_file()

    def _find_latest_file(self):
        """Find the latest progress file and entry count"""
        indices = []
        for path in self.progress_dir.glob("progress-*.jsonl"):
            try:
                indices.append(int(path.stem.split("-")[1]))
            except (ValueError, IndexError):
                # Not an indexed file (e.g. the progress-latest.jsonl symlink)
                continue

        self.current_file_entries = 0
        if not indices:
            self.current_file_index = 0
            return

        self.current_file_index = max(indices)

        # Count entries in current file (streamed, one entry per line)
        current_file = self._get_current_file_path()
        if current_file.exists():
            with current_file.open("r") as f:
                self.current_file_entries = sum(1 for _ in f)

    def _get_current_file_path(self) -> Path:
        """Get path to current progress file"""
        return self.progress_dir / f"progress-{self.current_file_index:04d}.jsonl"

    def _get_latest_symlink_path(self) -> Path:
        """Get path to latest progress symlink"""
        return self.progress_dir / "progress-latest.jsonl"

    def _rotate_file_if_needed(self):
        """Create new file if current one is full"""
        if self.current_file_entries >= self.max_entries_per_file:
            self.current_file_index += 1
            self.current_file_entries = 0
            logger.debug(f"Rotating to new progress file: {self.current_file_index}")

    def _update_latest_symlink(self):
        """Make the latest symlink point to the current file.

        Nothing is done when the link already has the right target (the
        common case, since this is called after every entry). Otherwise the
        link is swapped atomically, so readers never observe a missing link.
        """
        target = self._get_current_file_path().name
        latest_symlink = self._get_latest_symlink_path()

        try:
            if os.readlink(latest_symlink) == target:
                return
        except OSError:
            # Missing, or exists but is not a symlink: (re)create it below
            pass

        tmp_link = latest_symlink.with_name(
            f"{latest_symlink.name}.{os.getpid()}.tmp"
        )
        try:
            # Remove a stale temporary link left by an interrupted run
            os.unlink(tmp_link)
        except FileNotFoundError:
            pass
        os.symlink(target, tmp_link)
        os.replace(tmp_link, latest_symlink)

    def write_progress(self, level: int, progress: float, desc: Optional[str] = None):
        """Write a progress entry to the file

        Args:
            level: Progress level (0 is top level)
            progress: Progress value between 0.0 and 1.0
            desc: Optional description
        """
        with self.lock:
            # Eventually rotate internal state if needed
            self._rotate_file_if_needed()

            entry = ProgressEntry(
                timestamp=time.time(), level=level, progress=progress, desc=desc
            )
            self.state.update(entry)
            self.state.write(force=level == -1)  # Force write on EOJ

            current_file = self._get_current_file_path()

            # Write with file locking for concurrent access
            with current_file.open("a") as f:
                # Acquire before the try: unlocking is only meaningful once
                # the lock is actually held
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                try:
                    f.write(json.dumps(entry.to_dict()) + "\n")
                    f.flush()  # Flush the file buffer
                    os.fsync(f.fileno())  # Ensure data is written to disk
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)

            self.current_file_entries += 1
            self._update_latest_symlink()

            logger.debug(
                f"Progress written: level={level}, progress={progress}, desc={desc}"
            )

    def __del__(self):
        """Ensure state is written on exit"""
        # __init__ may have failed before the state file was created
        state = getattr(self, "state", None)
        if state is None:
            return
        try:
            state.write(force=True)
        except Exception as e:
            logger.error(f"Failed to write state on exit: {e}")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class ProgressFileReader:
    """Reads progress entries from JSONL files"""

    def __init__(self, task_path: Path):
        """Initialize progress file reader

        Args:
            task_path: Path to the task directory
        """
        self.task_path = task_path
        self.progress_dir = task_path / ".experimaestro"
        self.max_entries_per_file: Optional[int] = None
        self.state = StateFile(self.progress_dir / "progress_state.json")

    def get_progress_files(self) -> List[Path]:
        """Get all progress files sorted by index"""
        if not self.progress_dir.exists():
            return []

        # Filter out symlinks to avoid duplicates
        progress_files = [
            f for f in self.progress_dir.glob("progress-*.jsonl") if not f.is_symlink()
        ]

        # Sort by file index
        # Alternatively, we could simply sort by filename
        def get_index(path: Path) -> int:
            try:
                return int(path.stem.split("-")[1])
            except (ValueError, IndexError):
                return 0

        return sorted(progress_files, key=get_index)

    def get_latest_file(self) -> Optional[Path]:
        """Get the latest progress file via symlink"""
        latest_symlink = self.progress_dir / "progress-latest.jsonl"
        if latest_symlink.exists() and latest_symlink.is_symlink():
            return latest_symlink.resolve()

        # Fallback to finding latest manually
        files = self.get_progress_files()
        return files[-1] if files else None

    def read_entries(self, file_path: Path) -> Iterator["ProgressEntry"]:
        """Read progress entries from a file

        Lines that are not valid JSON, or that do not describe a progress
        entry, are skipped with a warning instead of aborting the iteration.
        A shared lock is held on the file while entries are being yielded.

        Args:
            file_path: Path to progress file

        Yields:
            ProgressEntry objects
        """
        if not file_path.exists():
            return

        try:
            with file_path.open("r") as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_SH)
                try:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            entry = ProgressEntry.from_dict(json.loads(line))
                        except json.JSONDecodeError as e:
                            logger.warning(
                                f"Invalid JSON in progress file {file_path}: {e}"
                            )
                            continue
                        except (TypeError, AttributeError) as e:
                            # Valid JSON, but not an object with the expected keys
                            logger.warning(
                                f"Invalid progress entry in file {file_path}: {e}"
                            )
                            continue
                        yield entry
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except IOError as e:
            logger.warning(f"Could not read progress file {file_path}: {e}")

    def read_all_entries(self) -> Iterator["ProgressEntry"]:
        """Read all progress entries from all files in order

        Yields:
            ProgressEntry objects, file by file in index order
        """
        logger.warning("Reading all progress entries, this may be slow for large jobs.")
        for file_path in self.get_progress_files():
            yield from self.read_entries(file_path)

    def read_latest_entries(self, count: Optional[int] = None) -> List["ProgressEntry"]:
        """Read the latest N progress entries

        Args:
            count: Maximum number of entries to return. When None, the
                number of entries of the first file is used (or
                DEFAULT_MAX_ENTRIES_PER_FILE when there is no file).

        Returns:
            The latest entries, oldest first
        """
        entries = []

        # Read files in reverse order to get latest entries first
        files = self.get_progress_files()
        # Fetch the max length of files, in lines
        if files and count is None:
            # Fetch the number of entries in the first file
            # This is the most likely to be the longest file
            count = sum(1 for _ in self.read_entries(files[0]))
        if count is None:
            count = DEFAULT_MAX_ENTRIES_PER_FILE

        for file_path in reversed(files):
            file_entries = list(self.read_entries(file_path))
            entries.extend(reversed(file_entries))

            if len(entries) >= count:
                break

        # Return latest entries in chronological order
        return list(reversed(entries[:count]))

    def get_current_progress(
        self, count: Optional[int] = None
    ) -> Dict[int, "ProgressEntry"]:
        """Get the current progress for each level"""
        logger.warning(
            "Reading current progress from progress logs, this may be slow for large jobs."
        )
        # Entries are in chronological order: the last one of a level wins
        return {entry.level: entry for entry in self.read_latest_entries(count)}

    def get_current_state(self) -> Optional[Dict[int, "ProgressEntry"]]:
        """Fetch the latest progress entry from the state file.

        Falls back to scanning the progress logs when the state is empty."""
        current_state = self.state.read()
        return current_state or self.get_current_progress()

    def is_done(self) -> bool:
        """Check if the task is done by looking for a special 'done' file.
        Fallback to checking for end-of-job (EOJ) entries."""

        task_dir = self.task_path.parent
        # The task name is taken as the last dotted component of the parent
        # directory name. The previous computation used ``stem``, which drops
        # that last component first; it is kept as a second candidate so that
        # any file found before is still found.
        # NOTE(review): confirm the naming of the ".done" file against the
        # code that creates it.
        candidates = []
        for task_name in (task_dir.name.split(".")[-1], task_dir.stem.split(".")[-1]):
            if task_name not in candidates:
                candidates.append(task_name)

        for task_name in candidates:
            job_done_file = self.task_path / f"{task_name}.done"
            if job_done_file.exists() and job_done_file.is_file():
                return True

        # Check if any progress file has a level -1 entry indicating EOJ
        return any(entry.level == -1 for entry in self.read_all_entries())
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
class FileBasedProgressReporter:
    """Progress reporter that records progress through a ProgressFileWriter.

    Updates are throttled per level: an update is written when it is the
    first one for its level, when progress moved by more than
    ``progress_threshold``, when the description changed, or when progress
    reaches completion (>= 1.0) for the first time.
    """

    def __init__(self, task_path: Path, progress_threshold: float = 0.01):
        """Initialize file-based progress reporter

        Args:
            task_path: Path to the task directory
            progress_threshold: Minimum change of progress (for a given
                level) that triggers a write
        """
        self.task_path = task_path
        self.writer = ProgressFileWriter(task_path)
        self.current_progress = {}  # level -> (progress, desc)
        self.progress_threshold = progress_threshold
        self.lock = threading.Lock()

    def set_progress(self, progress: float, level: int = 0, desc: Optional[str] = None):
        """Set progress for a specific level

        Args:
            progress: Progress value between 0.0 and 1.0
            level: Progress level (0 is top level)
            desc: Optional description
        """
        with self.lock:
            last_progress, last_desc = self.current_progress.get(level, (None, None))

            if last_progress is None:
                changed = True
            elif desc != last_desc:
                changed = True
            elif abs(progress - last_progress) > self.progress_threshold:
                changed = True
            else:
                # Always record completion, even when the last recorded value
                # was within the threshold (e.g. 0.995 -> 1.0)
                changed = progress >= 1.0 and last_progress < 1.0

            if changed:
                self.current_progress[level] = (progress, desc)
                self.writer.write_progress(level, progress, desc)

    def eoj(self):
        """End of job notification"""
        with self.lock:
            # Write a special end-of-job marker (level -1)
            self.writer.write_progress(-1, 1.0, "EOJ")
|
experimaestro/run.py
CHANGED
|
@@ -8,6 +8,7 @@ import json
|
|
|
8
8
|
from typing import List
|
|
9
9
|
import fasteners
|
|
10
10
|
from experimaestro.notifications import progress, report_eoj
|
|
11
|
+
from experimaestro.utils.multiprocessing import delayed_shutdown
|
|
11
12
|
from .core.types import ObjectType
|
|
12
13
|
from experimaestro.utils import logger
|
|
13
14
|
from experimaestro.core.objects import ConfigInformation
|
|
@@ -92,15 +93,27 @@ class TaskRunner:
|
|
|
92
93
|
logger.info("Finished cleanup")
|
|
93
94
|
|
|
94
95
|
def handle_error(self, code, frame_type):
|
|
95
|
-
logger.info("
|
|
96
|
+
logger.info("Error handler: finished with code %d", code)
|
|
96
97
|
self.failedpath.write_text(str(code))
|
|
97
98
|
self.cleanup()
|
|
99
|
+
logger.info("Exiting")
|
|
100
|
+
delayed_shutdown(60, exit_code=code)
|
|
98
101
|
sys.exit(1)
|
|
99
102
|
|
|
100
103
|
def run(self):
|
|
101
104
|
atexit.register(self.cleanup)
|
|
102
|
-
signal.signal(signal.SIGTERM, self.handle_error)
|
|
103
|
-
signal.signal(signal.SIGINT, self.handle_error)
|
|
105
|
+
sigterm_handler = signal.signal(signal.SIGTERM, self.handle_error)
|
|
106
|
+
sigint_handler = signal.signal(signal.SIGINT, self.handle_error)
|
|
107
|
+
|
|
108
|
+
def remove_signal_handlers(remove_cleanup=True):
    """Restore the previous SIGTERM/SIGINT handlers.

    Args:
        remove_cleanup: When True (the default, used when this function is
            run in a forked child), also unregister the atexit cleanup.
            When False, the atexit cleanup stays registered.
    """
    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGINT, sigint_handler)
    # The flag was previously ignored, so the cleanup was always unregistered
    if remove_cleanup:
        atexit.unregister(self.cleanup)
|
|
113
|
+
|
|
114
|
+
if sys.platform != "win32":
|
|
115
|
+
os.register_at_fork(after_in_child=remove_signal_handlers)
|
|
116
|
+
|
|
104
117
|
try:
|
|
105
118
|
workdir = self.scriptpath.parent
|
|
106
119
|
os.chdir(workdir)
|
|
@@ -128,7 +141,11 @@ class TaskRunner:
|
|
|
128
141
|
self.started = True
|
|
129
142
|
run(workdir / "params.json")
|
|
130
143
|
|
|
144
|
+
# ... remove the handlers
|
|
145
|
+
remove_signal_handlers(remove_cleanup=False)
|
|
146
|
+
|
|
131
147
|
# Everything went OK
|
|
148
|
+
logger.info("Task ended successfully")
|
|
132
149
|
sys.exit(0)
|
|
133
150
|
except Exception:
|
|
134
151
|
logger.exception("Got exception while running")
|
|
@@ -136,6 +153,10 @@ class TaskRunner:
|
|
|
136
153
|
|
|
137
154
|
except SystemExit as e:
|
|
138
155
|
if e.code == 0:
|
|
156
|
+
# Normal exit, just create the ".done" file
|
|
139
157
|
self.donepath.touch()
|
|
158
|
+
|
|
159
|
+
# ... and finish the exit process
|
|
160
|
+
raise
|
|
140
161
|
else:
|
|
141
162
|
self.handle_error(e.code, None)
|
|
@@ -1 +1,18 @@
|
|
|
1
|
-
from .base import
|
|
1
|
+
from .base import Scheduler, Listener
|
|
2
|
+
from .workspace import Workspace, RunMode
|
|
3
|
+
from .experiment import experiment, FailedExperiment
|
|
4
|
+
from .jobs import Job, JobState, JobFailureStatus, JobDependency, JobContext
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"Scheduler",
|
|
8
|
+
"Listener",
|
|
9
|
+
"Workspace",
|
|
10
|
+
"RunMode",
|
|
11
|
+
"experiment",
|
|
12
|
+
"FailedExperiment",
|
|
13
|
+
"Job",
|
|
14
|
+
"JobState",
|
|
15
|
+
"JobFailureStatus",
|
|
16
|
+
"JobDependency",
|
|
17
|
+
"JobContext",
|
|
18
|
+
]
|