experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (154) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -16,13 +16,91 @@ to enable unified access in the TUI and other monitoring tools.
16
16
  import enum
17
17
  import json
18
18
  import logging
19
+ from dataclasses import dataclass, field
19
20
  from datetime import datetime
20
21
  from pathlib import Path
21
- from typing import Dict, List, Optional
22
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
23
+
24
+ if TYPE_CHECKING:
25
+ from experimaestro.scheduler.transient import TransientMode
22
26
 
23
27
  logger = logging.getLogger("xpm.interfaces")
24
28
 
25
29
 
30
@dataclass
class ExperimentJobInformation:
    """Lightweight job information for experiment state serialization

    This class contains the minimal job metadata stored in status.json and
    jobs.jsonl. Full job state (progress, state changes, etc.) comes from
    events.jsonl replay or from the state provider.

    Attributes:
        job_id: Job identifier
        task_id: Identifier of the task the job belongs to
        tags: Tag key/value pairs (defaults to an empty dict)
        timestamp: Optional float timestamp, stored and restored unchanged
    """

    job_id: str
    task_id: str
    tags: Dict[str, str] = field(default_factory=dict)
    timestamp: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to dictionary for JSON

        Returns:
            A new dictionary with the keys ``job_id``, ``task_id``, ``tags``
            and ``timestamp``. ``tags`` is a copy, so editing the returned
            payload never changes this object.
        """
        return {
            "job_id": self.job_id,
            "task_id": self.task_id,
            # Copy: the payload must not alias the instance's own dict
            "tags": dict(self.tags or {}),
            "timestamp": self.timestamp,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "ExperimentJobInformation":
        """Create from dictionary

        Args:
            d: Mapping with mandatory ``job_id`` and ``task_id`` keys and
                optional ``tags`` and ``timestamp`` keys.

        Returns:
            A new instance owning its own ``tags`` dict.

        Raises:
            KeyError: If ``job_id`` or ``task_id`` is missing.
        """
        return cls(
            job_id=d["job_id"],
            task_id=d["task_id"],
            # A missing key and an explicit JSON null both mean "no tags";
            # copy so that the instance does not share state with ``d``
            tags=dict(d.get("tags") or {}),
            timestamp=d.get("timestamp"),
        )
62
+
63
+
64
def serialize_timestamp(ts: Optional[Union[float, datetime, str]]) -> Optional[str]:
    """Turn a timestamp into an ISO format string for DB/network storage

    Accepted inputs:
    - None: passed through as None
    - str: passed through unchanged (treated as already serialized)
    - float/int: read as a Unix timestamp via ``datetime.fromtimestamp``
    - datetime: formatted with ``isoformat()``
    - anything else: converted with ``str()``
    """
    # None and strings need no conversion at all
    if ts is None or isinstance(ts, str):
        return ts

    # Numbers are first normalised to a datetime, then share the datetime path
    if isinstance(ts, (int, float)):
        ts = datetime.fromtimestamp(ts)

    if isinstance(ts, datetime):
        return ts.isoformat()

    return str(ts)
82
+
83
+
84
def deserialize_timestamp(ts: Optional[Union[float, str]]) -> Optional[float]:
    """Turn a stored timestamp back into a Unix timestamp

    Accepted inputs:
    - float/int: returned as a float (already a Unix timestamp)
    - str: parsed with ``datetime.fromisoformat``; None when a ValueError
      is raised while converting
    - None or any other type: None
    """
    if isinstance(ts, (int, float)):
        return float(ts)

    if not isinstance(ts, str):
        # Covers None as well as unsupported types
        return None

    try:
        return datetime.fromisoformat(ts).timestamp()
    except ValueError:
        return None
102
+
103
+
26
104
  # =============================================================================
27
105
  # Job State Classes
28
106
  # =============================================================================
@@ -188,6 +266,19 @@ class JobFailureStatus(enum.Enum):
188
266
  TIMEOUT = 3
189
267
 
190
268
 
269
class ExperimentStatus(enum.Enum):
    """Status of an experiment run

    Each member's value is the lowercase string form of its name.
    """

    RUNNING = "running"  #: Experiment is currently running
    DONE = "done"  #: Experiment completed successfully
    FAILED = "failed"  #: Experiment failed
280
+
281
+
191
282
  class JobStateError(JobState):
192
283
  """Job has failed
193
284
 
@@ -225,7 +316,7 @@ class JobStateError(JobState):
225
316
  return True
226
317
 
227
318
 
228
- # FIXME: Get rid of those
319
+ # NOTE: Consider removing these singleton instances in a future refactor
229
320
  # Create singleton instances for backward compatibility
230
321
  # These can be used in comparisons: if state == JobState.DONE: ...
231
322
  JobState.UNSCHEDULED = JobStateUnscheduled()
@@ -264,30 +355,33 @@ class BaseJob:
264
355
  Attributes:
265
356
  identifier: Unique identifier for the job (hash)
266
357
  task_id: Task class identifier (string)
267
- locator: Full task locator (identifier)
268
358
  path: Path to job directory
269
359
  state: Current job state (JobState object or compatible)
270
360
  submittime: When job was submitted (Unix timestamp or None)
271
361
  starttime: When job started running (Unix timestamp or None)
272
362
  endtime: When job finished (Unix timestamp or None)
273
363
  progress: List of progress updates
274
- tags: Dictionary of tag key-value pairs
275
364
  exit_code: Process exit code (optional)
276
365
  retry_count: Number of retries
366
+ transient: Transient mode (NONE, TRANSIENT, or REMOVE)
277
367
  """
278
368
 
279
369
  identifier: str
280
370
  task_id: str
281
- locator: str
282
371
  path: Path
283
372
  state: JobState
284
373
  submittime: Optional[float]
285
374
  starttime: Optional[float]
286
375
  endtime: Optional[float]
287
376
  progress: List[Dict]
288
- tags: Dict[str, str]
289
377
  exit_code: Optional[int]
290
378
  retry_count: int
379
+ transient: "TransientMode"
380
+
381
@property
def locator(self) -> str:
    """Full task locator, assembled as ``{task_id}/{identifier}``"""
    return "{}/{}".format(self.task_id, self.identifier)
291
385
 
292
386
  # -------------------------------------------------------------------------
293
387
  # Static path computation (for use without a job instance)
@@ -304,9 +398,9 @@ class BaseJob:
304
398
  return job_path / ".experimaestro"
305
399
 
306
400
  @staticmethod
307
- def get_metadata_path(job_path: Path) -> Path:
308
- """Get metadata file path for a job path"""
309
- return job_path / ".experimaestro" / "information.json"
401
def get_status_path(job_path: Path) -> Path:
    """Return the status file location for a job directory

    The file is ``status.json`` inside the ``.experimaestro`` folder of
    ``job_path``.
    """
    return job_path.joinpath(".experimaestro", "status.json")
310
404
 
311
405
  @staticmethod
312
406
  def get_pidfile(job_path: Path, scriptname: str) -> Path:
@@ -338,9 +432,9 @@ class BaseJob:
338
432
  return BaseJob.get_xpm_dir(self.path)
339
433
 
340
434
  @property
341
- def metadata_path(self) -> Path:
342
- """Path to the job metadata file"""
343
- return BaseJob.get_metadata_path(self.path)
435
def status_path(self) -> Path:
    """Location of this job's status file, derived from ``self.path``"""
    job_dir = self.path
    return BaseJob.get_status_path(job_dir)
344
438
 
345
439
  @property
346
440
  def pidfile(self) -> Path:
@@ -358,117 +452,265 @@ class BaseJob:
358
452
  return BaseJob.get_failedfile(self.path, self.scriptname)
359
453
 
360
454
  # -------------------------------------------------------------------------
361
- # Metadata I/O
455
+ # State I/O (unified state_dict pattern)
362
456
  # -------------------------------------------------------------------------
363
457
 
364
- def write_metadata(self, **extra_fields) -> None:
365
- """Write or update job metadata in .experimaestro/information.json file
366
-
367
- Automatically extracts metadata from job attributes (identifier, state,
368
- submittime, starttime, endtime, retry_count) and writes to the metadata file.
458
def state_dict(self) -> Dict[str, Any]:
    """Build the dictionary form of this job's state

    The result is meant to be the single representation used both when
    writing status files and when sending job state over the network.

    Returns:
        Dictionary with the keys ``job_id``, ``task_id``, ``path``,
        ``state``, ``failure_reason``, ``submitted_time``, ``started_time``,
        ``ended_time``, ``exit_code``, ``retry_count``, ``progress`` and
        ``process``.
    """
    # The failure reason is only reported for error states that carry one
    failure_reason = None
    if self.state and self.state.is_error():
        reason = getattr(self.state, "failure_reason", None)
        if reason is not None:
            failure_reason = reason.name

    # Progress entries may be objects exposing to_dict() or plain values
    progress_entries = []
    for entry in self.progress or []:
        if hasattr(entry, "to_dict"):
            progress_entries.append(entry.to_dict())
        else:
            progress_entries.append(entry)

    path_repr = str(self.path) if self.path else None
    state_name = self.state.name if self.state else None

    return {
        "job_id": self.identifier,
        "task_id": self.task_id,
        "path": path_repr,
        "state": state_name,
        "failure_reason": failure_reason,
        "submitted_time": self.submittime,
        "started_time": self.starttime,
        "ended_time": self.endtime,
        "exit_code": self.exit_code,
        "retry_count": self.retry_count,
        "progress": progress_entries,
        "process": self.process_state_dict(),
    }
397
494
 
398
- # Add timing information if available
399
- if self.submittime is not None:
400
- fields["submitted_time"] = self.submittime
401
- if self.starttime is not None:
402
- fields["started_time"] = self.starttime
403
- if self.endtime is not None:
404
- fields["ended_time"] = self.endtime
495
def process_state_dict(self) -> Optional[Dict[str, Any]]:
    """Get process state as dictionary. Override in subclasses.

    Returns:
        None in this base implementation; the value is stored under the
        ``process`` key of ``state_dict()``.
    """
    # Annotated with Optional[...] like the rest of the module, so the
    # signature does not rely on ``X | None`` being valid at definition time
    return None
498
+
499
+
500
+ # =============================================================================
501
+ # Base Experiment Interface
502
+ # =============================================================================
405
503
 
406
- # Add exit code if available
407
- if self.exit_code is not None:
408
- fields["exit_code"] = self.exit_code
409
504
 
410
- # Add retry count
411
- if hasattr(self, "retry_count"):
412
- fields["retry_count"] = self.retry_count
505
class BaseExperiment:
    """Base interface for experiment information

    This class defines the interface for experiment data. Both live experiment
    instances and MockExperiment instances should provide these attributes.

    Core attributes:
        workdir: Path to run directory (experiments/{exp-id}/{run-id}/)
        run_id: Run identifier

    State tracking (replaces StatusData):
        jobs: Dict mapping job_id to BaseJob
        services: Dict mapping service_id to BaseService
        tags: Dict mapping job_id to tag dict
        dependencies: Dict mapping job_id to list of dependency job_ids
        events_count: Number of events processed
        hostname: Hostname where experiment runs
        started_at: Start timestamp
        ended_at: End timestamp (None if running)
    """

    # Status file version, written under the "version" key of state_dict()
    STATUS_VERSION = 1

    workdir: Path
    run_id: str

    @property
    def experiment_id(self) -> str:
        """Experiment identifier derived from workdir structure"""
        # workdir is experiments/{exp-id}/{run-id}, so parent.name is exp-id
        return self.workdir.parent.name

    @property
    def run_dir(self) -> Path:
        """Path to run directory (same as workdir)"""
        return self.workdir

    @property
    def status(self) -> "ExperimentStatus":
        """Experiment status - override in subclasses"""
        raise NotImplementedError

    # State tracking properties (abstract - must be implemented by subclasses)

    @property
    def jobs(self) -> Dict[str, "BaseJob"]:
        """Jobs in this experiment"""
        raise NotImplementedError

    @property
    def services(self) -> Dict[str, "BaseService"]:
        """Services in this experiment"""
        raise NotImplementedError

    @property
    def tags(self) -> Dict[str, Dict[str, str]]:
        """Tags for jobs"""
        raise NotImplementedError

    @property
    def dependencies(self) -> Dict[str, List[str]]:
        """Job dependencies"""
        raise NotImplementedError

    @property
    def events_count(self) -> int:
        """Number of events processed"""
        raise NotImplementedError

    @property
    def hostname(self) -> Optional[str]:
        """Hostname where experiment runs"""
        raise NotImplementedError

    @property
    def started_at(self) -> Optional[float]:
        """Start timestamp"""
        raise NotImplementedError

    @property
    def ended_at(self) -> Optional[float]:
        """End timestamp (None if running)"""
        raise NotImplementedError

    # Computed properties

    @property
    def total_jobs(self) -> int:
        """Total number of jobs"""
        return len(self.jobs)

    @property
    def finished_jobs(self) -> int:
        """Number of jobs whose state equals JobState.DONE"""
        return sum(1 for j in self.jobs.values() if j.state == JobState.DONE)

    @property
    def failed_jobs(self) -> int:
        """Number of jobs whose state reports is_error()"""
        return sum(1 for j in self.jobs.values() if j.state.is_error())

    def get_services(self) -> List["BaseService"]:
        """Get services for this experiment as a list"""
        return list(self.services.values())

    @staticmethod
    def get_status_path(run_dir: Path) -> Path:
        """Get status file path for a run directory"""
        return run_dir / "status.json"

    def state_dict(self) -> Dict[str, Any]:
        """Get experiment state as dictionary (single source of truth)

        This is the canonical representation of experiment state used for both
        serialization to status files and network communication.

        Note: Jobs are not included here - they are stored in jobs.jsonl.

        Returns:
            Dictionary with version, identifiers, status value, counters,
            timestamps and the ``full_state_dict()`` of every service.
        """
        # The base class has no status: report None instead of failing
        try:
            status_value = self.status.value
        except NotImplementedError:
            status_value = None

        return {
            "version": self.STATUS_VERSION,
            "experiment_id": self.experiment_id,
            "run_id": self.run_id,
            "status": status_value,
            "events_count": self.events_count,
            "hostname": self.hostname,
            "started_at": self.started_at,
            "ended_at": self.ended_at,
            "finished_jobs": self.finished_jobs,
            "failed_jobs": self.failed_jobs,
            "services": {k: v.full_state_dict() for k, v in self.services.items()},
        }

    def write_status(self) -> None:
        """Write status.json to disk (calls state_dict internally)

        The payload is ``state_dict()`` plus a ``last_updated`` ISO timestamp.
        It is written to a temporary file that then replaces ``status.json``,
        all while holding an inter-process lock on ``.status.json.lock`` in
        the same directory. Does nothing when ``run_dir`` is None.

        Raises:
            Exception: Any error raised while writing or replacing the file
                is re-raised after the temporary file has been removed.
        """
        import fasteners

        run_dir = self.run_dir
        if run_dir is None:
            return

        # Same location as the static helper, so the two cannot drift apart
        status_path = BaseExperiment.get_status_path(run_dir)
        status_path.parent.mkdir(parents=True, exist_ok=True)
        lock_path = status_path.parent / f".{status_path.name}.lock"
        lock = fasteners.InterProcessLock(str(lock_path))

        data = self.state_dict()
        data["last_updated"] = datetime.now().isoformat()

        with lock:
            temp_path = status_path.with_suffix(".json.tmp")
            try:
                with temp_path.open("w") as f:
                    json.dump(data, f, indent=2)
                temp_path.replace(status_path)
            except Exception:
                # Never leave a half-written temporary file behind
                if temp_path.exists():
                    temp_path.unlink()
                raise
456
667
 
457
- class BaseExperiment:
458
- """Base interface for experiment information
459
668
 
460
- This class defines the interface for experiment data. Both live experiment
461
- instances and database-loaded MockExperiment instances should provide these attributes.
669
class BaseService:
    """Common interface for service information

    Live Service instances and MockService instances are both expected to
    expose the attributes and methods declared here.

    Attributes:
        id: Unique identifier for the service
        state: Current service state (ServiceState enum or compatible)
    """

    id: str

    @property
    def state(self):
        """Current service state; not provided by this base class"""
        raise NotImplementedError

    def description(self) -> str:
        """Human-readable description; not provided by this base class"""
        raise NotImplementedError

    def state_dict(self) -> dict:
        """Service state for serialization/recreation (empty by default)"""
        return {}

    def full_state_dict(self) -> Dict[str, Any]:
        """Assemble the complete serialized form of this service

        Returns:
            Dictionary holding the service id, its description, the dotted
            ``module.ClassName`` of the instance and the value returned by
            ``state_dict()``, which this base implementation includes
            without any conversion.
        """
        klass = self.__class__
        payload: Dict[str, Any] = {}
        payload["service_id"] = self.id
        payload["description"] = self.description()
        payload["class"] = f"{klass.__module__}.{klass.__name__}"
        payload["state_dict"] = self.state_dict()
        return payload

    def to_service(self) -> "BaseService":
        """Return a live Service instance for this object

        Live Service instances return themselves. MockService instances are
        expected to try to recreate the service from its configuration.

        Returns:
            A live Service instance, or self if conversion is not possible
        """
        # Default behavior, suitable for live services
        return self