experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +393 -134
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +223 -52
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +650 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +764 -169
- experimaestro/scheduler/interfaces.py +338 -96
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +928 -0
- experimaestro/scheduler/remote/protocol.py +282 -0
- experimaestro/scheduler/remote/server.py +447 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +186 -35
- experimaestro/scheduler/state_provider.py +811 -2157
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +1132 -0
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +459 -1895
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -388
- experimaestro/scheduler/state_sync.py +0 -834
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b4.dist-info/RECORD +0 -181
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
|
@@ -16,13 +16,91 @@ to enable unified access in the TUI and other monitoring tools.
|
|
|
16
16
|
import enum
|
|
17
17
|
import json
|
|
18
18
|
import logging
|
|
19
|
+
from dataclasses import dataclass, field
|
|
19
20
|
from datetime import datetime
|
|
20
21
|
from pathlib import Path
|
|
21
|
-
from typing import Dict, List, Optional
|
|
22
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from experimaestro.scheduler.transient import TransientMode
|
|
22
26
|
|
|
23
27
|
logger = logging.getLogger("xpm.interfaces")
|
|
24
28
|
|
|
25
29
|
|
|
30
|
+
@dataclass
|
|
31
|
+
class ExperimentJobInformation:
|
|
32
|
+
"""Lightweight job information for experiment state serialization
|
|
33
|
+
|
|
34
|
+
This class contains the minimal job metadata stored in status.json and jobs.jsonl.
|
|
35
|
+
Full job state (progress, state changes, etc.) comes from events.jsonl replay
|
|
36
|
+
or from the state provider.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
job_id: str
|
|
40
|
+
task_id: str
|
|
41
|
+
tags: Dict[str, str] = field(default_factory=dict)
|
|
42
|
+
timestamp: Optional[float] = None
|
|
43
|
+
|
|
44
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
45
|
+
"""Serialize to dictionary for JSON"""
|
|
46
|
+
return {
|
|
47
|
+
"job_id": self.job_id,
|
|
48
|
+
"task_id": self.task_id,
|
|
49
|
+
"tags": self.tags,
|
|
50
|
+
"timestamp": self.timestamp,
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
@classmethod
|
|
54
|
+
def from_dict(cls, d: Dict[str, Any]) -> "ExperimentJobInformation":
|
|
55
|
+
"""Create from dictionary"""
|
|
56
|
+
return cls(
|
|
57
|
+
job_id=d["job_id"],
|
|
58
|
+
task_id=d["task_id"],
|
|
59
|
+
tags=d.get("tags", {}),
|
|
60
|
+
timestamp=d.get("timestamp"),
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def serialize_timestamp(ts: Optional[Union[float, datetime, str]]) -> Optional[str]:
|
|
65
|
+
"""Serialize timestamp to ISO format string for DB/network storage
|
|
66
|
+
|
|
67
|
+
Handles:
|
|
68
|
+
- None: returns None
|
|
69
|
+
- float/int: Unix timestamp, converts to ISO format
|
|
70
|
+
- datetime: converts to ISO format
|
|
71
|
+
- str: returns as-is (already serialized)
|
|
72
|
+
"""
|
|
73
|
+
if ts is None:
|
|
74
|
+
return None
|
|
75
|
+
if isinstance(ts, str):
|
|
76
|
+
return ts # Already serialized
|
|
77
|
+
if isinstance(ts, (int, float)):
|
|
78
|
+
return datetime.fromtimestamp(ts).isoformat()
|
|
79
|
+
if isinstance(ts, datetime):
|
|
80
|
+
return ts.isoformat()
|
|
81
|
+
return str(ts)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def deserialize_timestamp(ts: Optional[Union[float, str]]) -> Optional[float]:
|
|
85
|
+
"""Deserialize timestamp from ISO format string to Unix timestamp
|
|
86
|
+
|
|
87
|
+
Handles:
|
|
88
|
+
- None: returns None
|
|
89
|
+
- float/int: returns as-is (already a Unix timestamp)
|
|
90
|
+
- str: parses ISO format and converts to Unix timestamp
|
|
91
|
+
"""
|
|
92
|
+
if ts is None:
|
|
93
|
+
return None
|
|
94
|
+
if isinstance(ts, (int, float)):
|
|
95
|
+
return float(ts)
|
|
96
|
+
if isinstance(ts, str):
|
|
97
|
+
try:
|
|
98
|
+
return datetime.fromisoformat(ts).timestamp()
|
|
99
|
+
except ValueError:
|
|
100
|
+
return None
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
|
|
26
104
|
# =============================================================================
|
|
27
105
|
# Job State Classes
|
|
28
106
|
# =============================================================================
|
|
@@ -188,6 +266,19 @@ class JobFailureStatus(enum.Enum):
|
|
|
188
266
|
TIMEOUT = 3
|
|
189
267
|
|
|
190
268
|
|
|
269
|
+
class ExperimentStatus(enum.Enum):
|
|
270
|
+
"""Status of an experiment run"""
|
|
271
|
+
|
|
272
|
+
#: Experiment is currently running
|
|
273
|
+
RUNNING = "running"
|
|
274
|
+
|
|
275
|
+
#: Experiment completed successfully
|
|
276
|
+
DONE = "done"
|
|
277
|
+
|
|
278
|
+
#: Experiment failed
|
|
279
|
+
FAILED = "failed"
|
|
280
|
+
|
|
281
|
+
|
|
191
282
|
class JobStateError(JobState):
|
|
192
283
|
"""Job has failed
|
|
193
284
|
|
|
@@ -225,7 +316,7 @@ class JobStateError(JobState):
|
|
|
225
316
|
return True
|
|
226
317
|
|
|
227
318
|
|
|
228
|
-
#
|
|
319
|
+
# NOTE: Consider removing these singleton instances in a future refactor
|
|
229
320
|
# Create singleton instances for backward compatibility
|
|
230
321
|
# These can be used in comparisons: if state == JobState.DONE: ...
|
|
231
322
|
JobState.UNSCHEDULED = JobStateUnscheduled()
|
|
@@ -264,30 +355,33 @@ class BaseJob:
|
|
|
264
355
|
Attributes:
|
|
265
356
|
identifier: Unique identifier for the job (hash)
|
|
266
357
|
task_id: Task class identifier (string)
|
|
267
|
-
locator: Full task locator (identifier)
|
|
268
358
|
path: Path to job directory
|
|
269
359
|
state: Current job state (JobState object or compatible)
|
|
270
360
|
submittime: When job was submitted (Unix timestamp or None)
|
|
271
361
|
starttime: When job started running (Unix timestamp or None)
|
|
272
362
|
endtime: When job finished (Unix timestamp or None)
|
|
273
363
|
progress: List of progress updates
|
|
274
|
-
tags: Dictionary of tag key-value pairs
|
|
275
364
|
exit_code: Process exit code (optional)
|
|
276
365
|
retry_count: Number of retries
|
|
366
|
+
transient: Transient mode (NONE, TRANSIENT, or REMOVE)
|
|
277
367
|
"""
|
|
278
368
|
|
|
279
369
|
identifier: str
|
|
280
370
|
task_id: str
|
|
281
|
-
locator: str
|
|
282
371
|
path: Path
|
|
283
372
|
state: JobState
|
|
284
373
|
submittime: Optional[float]
|
|
285
374
|
starttime: Optional[float]
|
|
286
375
|
endtime: Optional[float]
|
|
287
376
|
progress: List[Dict]
|
|
288
|
-
tags: Dict[str, str]
|
|
289
377
|
exit_code: Optional[int]
|
|
290
378
|
retry_count: int
|
|
379
|
+
transient: "TransientMode"
|
|
380
|
+
|
|
381
|
+
@property
|
|
382
|
+
def locator(self) -> str:
|
|
383
|
+
"""Full task locator (identifier): {task_id}/{identifier}"""
|
|
384
|
+
return f"{self.task_id}/{self.identifier}"
|
|
291
385
|
|
|
292
386
|
# -------------------------------------------------------------------------
|
|
293
387
|
# Static path computation (for use without a job instance)
|
|
@@ -304,9 +398,9 @@ class BaseJob:
|
|
|
304
398
|
return job_path / ".experimaestro"
|
|
305
399
|
|
|
306
400
|
@staticmethod
|
|
307
|
-
def
|
|
308
|
-
"""Get
|
|
309
|
-
return job_path / ".experimaestro" / "
|
|
401
|
+
def get_status_path(job_path: Path) -> Path:
|
|
402
|
+
"""Get status file path for a job path"""
|
|
403
|
+
return job_path / ".experimaestro" / "status.json"
|
|
310
404
|
|
|
311
405
|
@staticmethod
|
|
312
406
|
def get_pidfile(job_path: Path, scriptname: str) -> Path:
|
|
@@ -338,9 +432,9 @@ class BaseJob:
|
|
|
338
432
|
return BaseJob.get_xpm_dir(self.path)
|
|
339
433
|
|
|
340
434
|
@property
|
|
341
|
-
def
|
|
342
|
-
"""Path to the job
|
|
343
|
-
return BaseJob.
|
|
435
|
+
def status_path(self) -> Path:
|
|
436
|
+
"""Path to the job status file"""
|
|
437
|
+
return BaseJob.get_status_path(self.path)
|
|
344
438
|
|
|
345
439
|
@property
|
|
346
440
|
def pidfile(self) -> Path:
|
|
@@ -358,117 +452,265 @@ class BaseJob:
|
|
|
358
452
|
return BaseJob.get_failedfile(self.path, self.scriptname)
|
|
359
453
|
|
|
360
454
|
# -------------------------------------------------------------------------
|
|
361
|
-
#
|
|
455
|
+
# State I/O (unified state_dict pattern)
|
|
362
456
|
# -------------------------------------------------------------------------
|
|
363
457
|
|
|
364
|
-
def
|
|
365
|
-
"""
|
|
366
|
-
|
|
367
|
-
Automatically extracts metadata from job attributes (identifier, state,
|
|
368
|
-
submittime, starttime, endtime, retry_count) and writes to the metadata file.
|
|
458
|
+
def state_dict(self) -> Dict[str, Any]:
|
|
459
|
+
"""Get job state as dictionary (single source of truth)
|
|
369
460
|
|
|
370
|
-
|
|
371
|
-
|
|
461
|
+
This is the canonical representation of job state used for both
|
|
462
|
+
serialization to status files and network communication.
|
|
372
463
|
|
|
373
|
-
|
|
374
|
-
|
|
464
|
+
Returns:
|
|
465
|
+
Dictionary with all job state fields
|
|
375
466
|
"""
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
logger.warning(
|
|
388
|
-
"Failed to read existing metadata from %s: %s", metadata_path, e
|
|
389
|
-
)
|
|
390
|
-
|
|
391
|
-
# Build metadata from job attributes
|
|
392
|
-
fields = {
|
|
467
|
+
failure_reason = None
|
|
468
|
+
if (
|
|
469
|
+
self.state
|
|
470
|
+
and self.state.is_error()
|
|
471
|
+
and hasattr(self.state, "failure_reason")
|
|
472
|
+
):
|
|
473
|
+
fr = self.state.failure_reason
|
|
474
|
+
if fr is not None:
|
|
475
|
+
failure_reason = fr.name
|
|
476
|
+
|
|
477
|
+
return {
|
|
393
478
|
"job_id": self.identifier,
|
|
394
479
|
"task_id": self.task_id,
|
|
480
|
+
"path": str(self.path) if self.path else None,
|
|
395
481
|
"state": self.state.name if self.state else None,
|
|
482
|
+
"failure_reason": failure_reason,
|
|
483
|
+
"submitted_time": self.submittime,
|
|
484
|
+
"started_time": self.starttime,
|
|
485
|
+
"ended_time": self.endtime,
|
|
486
|
+
"exit_code": self.exit_code,
|
|
487
|
+
"retry_count": self.retry_count,
|
|
488
|
+
"progress": [
|
|
489
|
+
p.to_dict() if hasattr(p, "to_dict") else p
|
|
490
|
+
for p in (self.progress or [])
|
|
491
|
+
],
|
|
492
|
+
"process": self.process_state_dict(),
|
|
396
493
|
}
|
|
397
494
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
495
|
+
def process_state_dict(self) -> dict | None:
|
|
496
|
+
"""Get process state as dictionary. Override in subclasses."""
|
|
497
|
+
return None
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
# =============================================================================
|
|
501
|
+
# Base Experiment Interface
|
|
502
|
+
# =============================================================================
|
|
405
503
|
|
|
406
|
-
# Add exit code if available
|
|
407
|
-
if self.exit_code is not None:
|
|
408
|
-
fields["exit_code"] = self.exit_code
|
|
409
504
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
fields["retry_count"] = self.retry_count
|
|
505
|
+
class BaseExperiment:
|
|
506
|
+
"""Base interface for experiment information
|
|
413
507
|
|
|
414
|
-
|
|
415
|
-
|
|
508
|
+
This class defines the interface for experiment data. Both live experiment
|
|
509
|
+
instances and MockExperiment instances should provide these attributes.
|
|
510
|
+
|
|
511
|
+
Core attributes:
|
|
512
|
+
workdir: Path to run directory (experiments/{exp-id}/{run-id}/)
|
|
513
|
+
run_id: Run identifier
|
|
514
|
+
|
|
515
|
+
State tracking (replaces StatusData):
|
|
516
|
+
jobs: Dict mapping job_id to BaseJob
|
|
517
|
+
services: Dict mapping service_id to BaseService
|
|
518
|
+
tags: Dict mapping job_id to tag dict
|
|
519
|
+
dependencies: Dict mapping job_id to list of dependency job_ids
|
|
520
|
+
events_count: Number of events processed
|
|
521
|
+
hostname: Hostname where experiment runs
|
|
522
|
+
started_at: Start timestamp
|
|
523
|
+
ended_at: End timestamp (None if running)
|
|
524
|
+
"""
|
|
416
525
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
existing["last_updated"] = datetime.now().timestamp()
|
|
526
|
+
# Status file version
|
|
527
|
+
STATUS_VERSION = 1
|
|
420
528
|
|
|
421
|
-
|
|
422
|
-
|
|
529
|
+
workdir: Path
|
|
530
|
+
run_id: str
|
|
531
|
+
|
|
532
|
+
@property
|
|
533
|
+
def experiment_id(self) -> str:
|
|
534
|
+
"""Experiment identifier derived from workdir structure"""
|
|
535
|
+
# workdir is experiments/{exp-id}/{run-id}, so parent.name is exp-id
|
|
536
|
+
return self.workdir.parent.name
|
|
537
|
+
|
|
538
|
+
@property
|
|
539
|
+
def run_dir(self) -> Path:
|
|
540
|
+
"""Path to run directory (same as workdir)"""
|
|
541
|
+
return self.workdir
|
|
542
|
+
|
|
543
|
+
@property
|
|
544
|
+
def status(self) -> "ExperimentStatus":
|
|
545
|
+
"""Experiment status - override in subclasses"""
|
|
546
|
+
raise NotImplementedError
|
|
547
|
+
|
|
548
|
+
# State tracking properties (abstract - must be implemented by subclasses)
|
|
549
|
+
|
|
550
|
+
@property
|
|
551
|
+
def jobs(self) -> Dict[str, "BaseJob"]:
|
|
552
|
+
"""Jobs in this experiment"""
|
|
553
|
+
raise NotImplementedError
|
|
554
|
+
|
|
555
|
+
@property
|
|
556
|
+
def services(self) -> Dict[str, "BaseService"]:
|
|
557
|
+
"""Services in this experiment"""
|
|
558
|
+
raise NotImplementedError
|
|
559
|
+
|
|
560
|
+
@property
|
|
561
|
+
def tags(self) -> Dict[str, Dict[str, str]]:
|
|
562
|
+
"""Tags for jobs"""
|
|
563
|
+
raise NotImplementedError
|
|
564
|
+
|
|
565
|
+
@property
|
|
566
|
+
def dependencies(self) -> Dict[str, List[str]]:
|
|
567
|
+
"""Job dependencies"""
|
|
568
|
+
raise NotImplementedError
|
|
569
|
+
|
|
570
|
+
@property
|
|
571
|
+
def events_count(self) -> int:
|
|
572
|
+
"""Number of events processed"""
|
|
573
|
+
raise NotImplementedError
|
|
574
|
+
|
|
575
|
+
@property
|
|
576
|
+
def hostname(self) -> Optional[str]:
|
|
577
|
+
"""Hostname where experiment runs"""
|
|
578
|
+
raise NotImplementedError
|
|
579
|
+
|
|
580
|
+
@property
|
|
581
|
+
def started_at(self) -> Optional[float]:
|
|
582
|
+
"""Start timestamp"""
|
|
583
|
+
raise NotImplementedError
|
|
584
|
+
|
|
585
|
+
@property
|
|
586
|
+
def ended_at(self) -> Optional[float]:
|
|
587
|
+
"""End timestamp (None if running)"""
|
|
588
|
+
raise NotImplementedError
|
|
589
|
+
|
|
590
|
+
# Computed properties
|
|
591
|
+
|
|
592
|
+
@property
|
|
593
|
+
def total_jobs(self) -> int:
|
|
594
|
+
"""Total number of jobs"""
|
|
595
|
+
return len(self.jobs)
|
|
596
|
+
|
|
597
|
+
@property
|
|
598
|
+
def finished_jobs(self) -> int:
|
|
599
|
+
"""Number of finished jobs"""
|
|
600
|
+
return sum(1 for j in self.jobs.values() if j.state == JobState.DONE)
|
|
601
|
+
|
|
602
|
+
@property
|
|
603
|
+
def failed_jobs(self) -> int:
|
|
604
|
+
"""Number of failed jobs"""
|
|
605
|
+
return sum(1 for j in self.jobs.values() if j.state.is_error())
|
|
606
|
+
|
|
607
|
+
def get_services(self) -> List["BaseService"]:
|
|
608
|
+
"""Get services for this experiment as a list"""
|
|
609
|
+
return list(self.services.values())
|
|
610
|
+
|
|
611
|
+
@staticmethod
|
|
612
|
+
def get_status_path(run_dir: Path) -> Path:
|
|
613
|
+
"""Get status file path for a run directory"""
|
|
614
|
+
return run_dir / "status.json"
|
|
615
|
+
|
|
616
|
+
def state_dict(self) -> Dict[str, Any]:
|
|
617
|
+
"""Get experiment state as dictionary (single source of truth)
|
|
618
|
+
|
|
619
|
+
This is the canonical representation of experiment state used for both
|
|
620
|
+
serialization to status files and network communication.
|
|
621
|
+
|
|
622
|
+
Note: Jobs are not included here - they are stored in jobs.jsonl.
|
|
623
|
+
"""
|
|
423
624
|
try:
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
625
|
+
status_value = self.status.value
|
|
626
|
+
except NotImplementedError:
|
|
627
|
+
status_value = None
|
|
628
|
+
|
|
629
|
+
return {
|
|
630
|
+
"version": self.STATUS_VERSION,
|
|
631
|
+
"experiment_id": self.experiment_id,
|
|
632
|
+
"run_id": self.run_id,
|
|
633
|
+
"status": status_value,
|
|
634
|
+
"events_count": self.events_count,
|
|
635
|
+
"hostname": self.hostname,
|
|
636
|
+
"started_at": self.started_at,
|
|
637
|
+
"ended_at": self.ended_at,
|
|
638
|
+
"finished_jobs": self.finished_jobs,
|
|
639
|
+
"failed_jobs": self.failed_jobs,
|
|
640
|
+
"services": {k: v.full_state_dict() for k, v in self.services.items()},
|
|
641
|
+
}
|
|
436
642
|
|
|
437
|
-
|
|
438
|
-
|
|
643
|
+
def write_status(self) -> None:
|
|
644
|
+
"""Write status.json to disk (calls state_dict internally)
|
|
645
|
+
|
|
646
|
+
Uses file locking to ensure atomic writes across processes.
|
|
439
647
|
"""
|
|
440
|
-
|
|
441
|
-
if not metadata_path.exists():
|
|
442
|
-
return None
|
|
648
|
+
import fasteners
|
|
443
649
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
except Exception as e:
|
|
448
|
-
logger.warning("Failed to read metadata from %s: %s", metadata_path, e)
|
|
449
|
-
return None
|
|
650
|
+
run_dir = self.run_dir
|
|
651
|
+
if run_dir is None:
|
|
652
|
+
return
|
|
450
653
|
|
|
654
|
+
status_path = run_dir / "status.json"
|
|
655
|
+
status_path.parent.mkdir(parents=True, exist_ok=True)
|
|
656
|
+
lock_path = status_path.parent / f".{status_path.name}.lock"
|
|
657
|
+
lock = fasteners.InterProcessLock(str(lock_path))
|
|
451
658
|
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
# =============================================================================
|
|
659
|
+
data = self.state_dict()
|
|
660
|
+
data["last_updated"] = datetime.now().isoformat()
|
|
455
661
|
|
|
662
|
+
with lock:
|
|
663
|
+
temp_path = status_path.with_suffix(".json.tmp")
|
|
664
|
+
with temp_path.open("w") as f:
|
|
665
|
+
json.dump(data, f, indent=2)
|
|
666
|
+
temp_path.replace(status_path)
|
|
456
667
|
|
|
457
|
-
class BaseExperiment:
|
|
458
|
-
"""Base interface for experiment information
|
|
459
668
|
|
|
460
|
-
|
|
461
|
-
|
|
669
|
+
class BaseService:
|
|
670
|
+
"""Base interface for service information
|
|
671
|
+
|
|
672
|
+
This class defines the interface for service data. Both live Service instances
|
|
673
|
+
and MockService instances should provide these attributes and methods.
|
|
462
674
|
|
|
463
675
|
Attributes:
|
|
464
|
-
|
|
465
|
-
|
|
676
|
+
id: Unique identifier for the service
|
|
677
|
+
state: Current service state (ServiceState enum or compatible)
|
|
466
678
|
"""
|
|
467
679
|
|
|
468
|
-
|
|
469
|
-
current_run_id: Optional[str]
|
|
680
|
+
id: str
|
|
470
681
|
|
|
471
682
|
@property
|
|
472
|
-
def
|
|
473
|
-
"""
|
|
474
|
-
|
|
683
|
+
def state(self):
|
|
684
|
+
"""Current service state"""
|
|
685
|
+
raise NotImplementedError
|
|
686
|
+
|
|
687
|
+
def description(self) -> str:
|
|
688
|
+
"""Human-readable description of the service"""
|
|
689
|
+
raise NotImplementedError
|
|
690
|
+
|
|
691
|
+
def state_dict(self) -> dict:
|
|
692
|
+
"""Return service state for serialization/recreation"""
|
|
693
|
+
return {}
|
|
694
|
+
|
|
695
|
+
def full_state_dict(self) -> Dict[str, Any]:
|
|
696
|
+
"""Get service state as dictionary for JSON serialization.
|
|
697
|
+
|
|
698
|
+
This method properly serializes Path objects and other non-JSON types.
|
|
699
|
+
"""
|
|
700
|
+
return {
|
|
701
|
+
"service_id": self.id,
|
|
702
|
+
"description": self.description(),
|
|
703
|
+
"class": f"{self.__class__.__module__}.{self.__class__.__name__}",
|
|
704
|
+
"state_dict": self.state_dict(),
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
def to_service(self) -> "BaseService":
|
|
708
|
+
"""Convert to a live Service instance.
|
|
709
|
+
|
|
710
|
+
For live Service instances, returns self.
|
|
711
|
+
For MockService instances, tries to recreate the service from config.
|
|
712
|
+
|
|
713
|
+
Returns:
|
|
714
|
+
A live Service instance, or self if conversion is not possible
|
|
715
|
+
"""
|
|
716
|
+
return self # Default: return self (for live services)
|