experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +278 -7
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +20 -1
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +182 -46
- experimaestro/core/identifier.py +107 -6
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +542 -25
- experimaestro/core/objects/config_walk.py +20 -0
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +175 -38
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +111 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +63 -13
- experimaestro/progress.py +0 -2
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/base.py +510 -125
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +256 -31
- experimaestro/scheduler/interfaces.py +501 -0
- experimaestro/scheduler/jobs.py +216 -206
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +323 -23
- experimaestro/scheduler/state_db.py +437 -0
- experimaestro/scheduler/state_provider.py +2766 -0
- experimaestro/scheduler/state_sync.py +891 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +147 -57
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +44 -5
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_file_progress_integration.py +1 -1
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_identifier.py +372 -41
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +3 -3
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +312 -5
- experimaestro/tests/test_outputs.py +2 -2
- experimaestro/tests/test_param.py +8 -12
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +0 -48
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -1
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +136 -0
- experimaestro/tests/test_tasks.py +107 -121
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +17 -13
- experimaestro/tests/test_types.py +123 -1
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +4 -2
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +1 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2395 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
- experimaestro-2.0.0b8.dist-info/RECORD +187 -0
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -221
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-2.0.0a8.dist-info/RECORD +0 -166
- experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
- {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
"""Base interfaces for job and experiment data
|
|
2
|
+
|
|
3
|
+
This module defines abstract interfaces that represent job and experiment information.
|
|
4
|
+
These interfaces provide a common API between live jobs/experiments and those
|
|
5
|
+
loaded from the database.
|
|
6
|
+
|
|
7
|
+
- JobState: Base class for job states with singleton instances
|
|
8
|
+
- JobFailureStatus: Enum for failure reasons
|
|
9
|
+
- BaseJob: Interface defining job attributes and metadata operations
|
|
10
|
+
- BaseExperiment: Interface defining experiment attributes
|
|
11
|
+
|
|
12
|
+
The existing Job and experiment classes should provide these same attributes
|
|
13
|
+
to enable unified access in the TUI and other monitoring tools.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import enum
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
from datetime import datetime
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Dict, List, Optional
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger("xpm.interfaces")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# Job State Classes
|
|
28
|
+
# =============================================================================
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class JobState:
    """Base class for job states.

    Job states are represented as instances of JobState subclasses. Each
    subclass provides a readable ``name`` and a numeric ``value``; the
    predicates and the equality/hash protocol below rely on ``value`` only.

    Singleton instances are attached as class attributes after the subclasses
    are defined (e.g., JobState.DONE) for backward compatibility.
    """

    name: str  # Readable name
    value: int  # Numeric value for ordering comparisons

    def notstarted(self):
        """Returns True if the job hasn't started yet"""
        return self.value <= 2  # up to and including READY

    def running(self):
        """Returns True if the job is currently running or scheduled"""
        return self.value == 4 or self.value == 3  # RUNNING or SCHEDULED

    def finished(self):
        """Returns True if the job has finished (success or error)"""
        return self.value >= 5  # DONE or ERROR

    def is_error(self):
        """Returns True for error states (always False in the base class)"""
        return False

    def __eq__(self, other):
        """Compare job states by their numeric value"""
        if isinstance(other, JobState):
            return self.value == other.value
        return False

    def __hash__(self):
        """Allow JobState instances to be used as dict keys"""
        return hash(self.value)

    def __repr__(self):
        """String representation of the job state"""
        return f"{self.__class__.__name__}()"

    @staticmethod
    def _failure_from_name(raw) -> "Optional[JobStateError]":
        """Translate a failure name read from a .failed file into an error state

        Args:
            raw: Value of a ``failure_status``/``reason`` JSON field

        Returns:
            A JobStateError carrying the matching JobFailureStatus, or None
            when ``raw`` is not a non-empty string naming an enum member.
        """
        # Non-string values (numbers, lists, ...) cannot name an enum member;
        # treat them as unrecognized rather than failing on .upper()
        if not isinstance(raw, str) or not raw:
            return None
        try:
            return JobStateError(JobFailureStatus[raw.upper()])
        except KeyError:
            return None

    @staticmethod
    def from_path(basepath: Path, scriptname: str) -> "Optional[JobState]":
        """Read job state from .done or .failed files

        Args:
            basepath: The job directory path
            scriptname: The script name (used for file naming)

        Returns:
            JobState.DONE if .done exists, JobStateError with details if .failed exists,
            or None if neither exists.
        """
        donepath = basepath / f"{scriptname}.done"
        failedpath = basepath / f"{scriptname}.failed"

        # The .done marker takes precedence over the .failed marker
        if donepath.is_file():
            return JobState.DONE

        if not failedpath.is_file():
            return None

        content = failedpath.read_text().strip()

        # Try JSON first
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            data = None

        if isinstance(data, dict):
            # New format ("failure_status") is tried before the legacy
            # format ("reason"); unknown names fall back to a generic failure
            for key in ("failure_status", "reason"):
                state = JobState._failure_from_name(data.get(key))
                if state is not None:
                    return state
            return JobStateError(JobFailureStatus.FAILED)

        # Fall back to legacy integer format (the file holds an exit code)
        try:
            code = int(content)
        except ValueError:
            logger.warning("Could not parse failed file %s: %s", failedpath, content)
            return JobStateError(JobFailureStatus.FAILED)

        if code == 0:
            return JobState.DONE
        return JobStateError(JobFailureStatus.FAILED)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class JobStateUnscheduled(JobState):
    """State of a job that has not been scheduled yet"""

    value = 0
    name = "unscheduled"
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class JobStateWaiting(JobState):
    """State of a job that waits for its dependencies to be done"""

    value = 1
    name = "waiting"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class JobStateReady(JobState):
    """State of a job that is ready to run"""

    value = 2
    name = "ready"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class JobStateScheduled(JobState):
    """State of a job that is scheduled (e.g., in SLURM queue)"""

    value = 3
    name = "scheduled"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class JobStateRunning(JobState):
    """State of a job that is currently running"""

    value = 4
    name = "running"
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class JobStateDone(JobState):
    """State of a job that has completed successfully"""

    value = 5
    name = "done"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class JobFailureStatus(enum.Enum):
    """Enumeration of the reasons for a job failure"""

    DEPENDENCY = 0  #: Job dependency failed
    FAILED = 1  #: Job failed
    MEMORY = 2  #: Memory
    TIMEOUT = 3  #: Timeout (can retry for resumable tasks)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class JobStateError(JobState):
    """Job has failed

    This state carries information about the failure reason via JobFailureStatus enum.
    The failure reason is informational only: it takes no part in equality
    or hashing, which both rely on the numeric ``value``.
    """

    name = "error"
    value = 6

    def __init__(self, failure_reason: Optional[JobFailureStatus] = None):
        """Create an error state, optionally with failure details

        Args:
            failure_reason: Optional reason for the failure (JobFailureStatus enum value)
        """
        self.failure_reason = failure_reason

    def __repr__(self):
        """String representation, including the failure reason when set"""
        if self.failure_reason:
            return f"JobStateError(failure_reason={self.failure_reason})"
        return "JobStateError()"

    def __eq__(self, other):
        """Error states are equal if they have the same value

        Note: We intentionally ignore failure_reason in equality comparison
        to maintain backward compatibility with code that does:
            if job.state == JobState.ERROR: ...
        """
        if isinstance(other, JobState):
            return self.value == other.value
        return False

    def __hash__(self):
        """Hash on the numeric value, consistently with __eq__

        A class that overrides __eq__ without defining __hash__ has its
        __hash__ implicitly set to None, which would make error states
        unusable as dict keys or set members. Defining it here keeps error
        states hashable.
        """
        return hash(self.value)

    def is_error(self):
        """Returns True: this state always denotes a failure"""
        return True
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# FIXME: Get rid of those
# Singleton instances kept for backward compatibility, so that existing code
# can keep writing comparisons such as: if state == JobState.DONE: ...
JobState.UNSCHEDULED = JobStateUnscheduled()
JobState.WAITING = JobStateWaiting()
JobState.READY = JobStateReady()
JobState.SCHEDULED = JobStateScheduled()
JobState.RUNNING = JobStateRunning()
JobState.DONE = JobStateDone()
JobState.ERROR = JobStateError()  # default error without failure details


# Mapping from state name string to JobState singleton; each key is the
# ``name`` attribute of the corresponding singleton
STATE_NAME_TO_JOBSTATE = {
    singleton.name: singleton
    for singleton in (
        JobState.UNSCHEDULED,
        JobState.WAITING,
        JobState.READY,
        JobState.SCHEDULED,
        JobState.RUNNING,
        JobState.DONE,
        JobState.ERROR,
    )
}
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# =============================================================================
|
|
253
|
+
# Base Job Interface
|
|
254
|
+
# =============================================================================
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class BaseJob:
    """Base interface for job information and metadata operations

    This class defines the interface for job data and provides methods for
    reading/writing job metadata files. Both live Job instances and
    database-loaded MockJob instances should provide these attributes.

    Attributes:
        identifier: Unique identifier for the job (hash)
        task_id: Task class identifier (string)
        locator: Full task locator (identifier)
        path: Path to job directory
        state: Current job state (JobState object or compatible)
        submittime: When job was submitted (Unix timestamp or None)
        starttime: When job started running (Unix timestamp or None)
        endtime: When job finished (Unix timestamp or None)
        progress: List of progress updates
        tags: Dictionary of tag key-value pairs
        exit_code: Process exit code (optional)
        retry_count: Number of retries
    """

    identifier: str
    task_id: str
    locator: str
    path: Path
    state: "JobState"
    submittime: Optional[float]
    starttime: Optional[float]
    endtime: Optional[float]
    progress: List[Dict]
    tags: Dict[str, str]
    exit_code: Optional[int]
    retry_count: int

    # -------------------------------------------------------------------------
    # Static path computation (for use without a job instance)
    # -------------------------------------------------------------------------

    @staticmethod
    def get_scriptname(task_id: str) -> str:
        """Extract script name from task_id (last component after '.')"""
        return task_id.rsplit(".", 1)[-1]

    @staticmethod
    def get_xpm_dir(job_path: Path) -> Path:
        """Get .experimaestro directory path for a job path"""
        return job_path / ".experimaestro"

    @staticmethod
    def get_metadata_path(job_path: Path) -> Path:
        """Get metadata file path for a job path"""
        return job_path / ".experimaestro" / "information.json"

    @staticmethod
    def get_pidfile(job_path: Path, scriptname: str) -> Path:
        """Get PID file path"""
        return job_path / f"{scriptname}.pid"

    @staticmethod
    def get_donefile(job_path: Path, scriptname: str) -> Path:
        """Get done marker file path"""
        return job_path / f"{scriptname}.done"

    @staticmethod
    def get_failedfile(job_path: Path, scriptname: str) -> Path:
        """Get failed marker file path"""
        return job_path / f"{scriptname}.failed"

    # -------------------------------------------------------------------------
    # Instance properties (using static methods for consistency)
    # -------------------------------------------------------------------------

    @property
    def scriptname(self) -> str:
        """The script name derived from task_id"""
        return BaseJob.get_scriptname(self.task_id)

    @property
    def xpm_dir(self) -> Path:
        """Path to the .experimaestro directory within job path"""
        return BaseJob.get_xpm_dir(self.path)

    @property
    def metadata_path(self) -> Path:
        """Path to the job metadata file"""
        return BaseJob.get_metadata_path(self.path)

    @property
    def pidfile(self) -> Path:
        """Path to the .pid file"""
        return BaseJob.get_pidfile(self.path, self.scriptname)

    @property
    def donefile(self) -> Path:
        """Path to the .done file"""
        return BaseJob.get_donefile(self.path, self.scriptname)

    @property
    def failedfile(self) -> Path:
        """Path to the .failed file"""
        return BaseJob.get_failedfile(self.path, self.scriptname)

    # -------------------------------------------------------------------------
    # Metadata I/O
    # -------------------------------------------------------------------------

    def write_metadata(self, **extra_fields) -> None:
        """Write or update job metadata in .experimaestro/information.json file

        Automatically extracts metadata from job attributes (identifier, task_id,
        state, submittime, starttime, endtime, exit_code, retry_count) and writes
        them to the metadata file.

        The content is written to a temporary file which then replaces the
        metadata file. If metadata exists, new fields are merged with existing
        ones. Updates last_updated timestamp.

        Args:
            **extra_fields: Optional extra fields (e.g., launcher, launcher_job_id, exit_code);
                they override the values extracted from the job attributes

        Raises:
            Exception: Any error raised while writing the file is logged and
                re-raised after the temporary file has been removed
        """
        # Ensure .experimaestro directory exists
        self.xpm_dir.mkdir(parents=True, exist_ok=True)
        metadata_path = self.metadata_path

        # Read existing metadata (best effort: an unreadable file is ignored)
        existing = {}
        if metadata_path.exists():
            try:
                with metadata_path.open("r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except Exception as e:
                logger.warning(
                    "Failed to read existing metadata from %s: %s", metadata_path, e
                )
            else:
                if isinstance(loaded, dict):
                    existing = loaded
                else:
                    # Valid JSON that is not an object cannot be merged with
                    # new fields: start again from an empty mapping
                    logger.warning(
                        "Ignoring existing metadata in %s: expected a JSON object",
                        metadata_path,
                    )

        # Build metadata from job attributes
        fields = {
            "job_id": self.identifier,
            "task_id": self.task_id,
            "state": self.state.name if self.state else None,
        }

        # Add timing information if available
        if self.submittime is not None:
            fields["submitted_time"] = self.submittime
        if self.starttime is not None:
            fields["started_time"] = self.starttime
        if self.endtime is not None:
            fields["ended_time"] = self.endtime

        # Add exit code if available (the attribute itself may be missing,
        # handled the same way as retry_count below)
        exit_code = getattr(self, "exit_code", None)
        if exit_code is not None:
            fields["exit_code"] = exit_code

        # Add retry count
        if hasattr(self, "retry_count"):
            fields["retry_count"] = self.retry_count

        # Merge with extra fields (for launcher info, exit_code, etc.)
        fields.update(extra_fields)

        # Merge with existing and update timestamp
        existing.update(fields)
        existing["last_updated"] = datetime.now().timestamp()

        # Write to a temporary file first, then replace the metadata file so
        # that readers do not observe a partially written file
        temp_path = metadata_path.with_suffix(".json.tmp")
        try:
            with temp_path.open("w", encoding="utf-8") as f:
                json.dump(existing, f, indent=2)
            temp_path.replace(metadata_path)
            logger.debug("Wrote metadata to %s: %s", metadata_path, list(fields.keys()))
        except Exception as e:
            logger.error("Failed to write metadata to %s: %s", metadata_path, e)
            if temp_path.exists():
                temp_path.unlink()
            raise

    def read_metadata(self) -> Optional[dict]:
        """Read job metadata from .experimaestro/information.json file

        Returns:
            The parsed content of the metadata file, or None if the file
            doesn't exist or cannot be read (a warning is logged in the
            latter case)
        """
        metadata_path = self.metadata_path
        if not metadata_path.exists():
            return None

        try:
            with metadata_path.open("r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            logger.warning("Failed to read metadata from %s: %s", metadata_path, e)
            return None
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
# =============================================================================
|
|
453
|
+
# Base Experiment Interface
|
|
454
|
+
# =============================================================================
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
class BaseExperiment:
    """Base interface for experiment information

    Describes the data an experiment exposes. Both live experiment instances
    and database-loaded MockExperiment instances should provide these attributes.

    Attributes:
        workdir: Path to experiment directory
        current_run_id: Current/latest run ID (or None)
    """

    workdir: Path
    current_run_id: Optional[str]

    @property
    def experiment_id(self) -> str:
        """Experiment identifier, i.e. the last component of workdir"""
        experiment_dir = self.workdir
        return experiment_dir.name
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
class BaseService:
    """Base interface for service information

    Describes the data a service exposes. Both live Service instances and
    MockService instances should provide these attributes and methods; the
    members defined here only raise NotImplementedError.

    Attributes:
        id: Unique identifier for the service
        state: Current service state (ServiceState enum or compatible)
    """

    id: str

    @property
    def state(self):
        """Current service state (not implemented in the base class)"""
        raise NotImplementedError()

    def description(self) -> str:
        """Human-readable description of the service (not implemented in the base class)"""
        raise NotImplementedError()

    def state_dict(self) -> dict:
        """Dictionary representation for serialization (not implemented in the base class)"""
        raise NotImplementedError()
|