experimaestro-2.0.0b4-py3-none-any.whl → experimaestro-2.0.0b17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. See the release notice on the package registry for more details.

Files changed (154)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -1,242 +1,452 @@
1
- """Unified workspace state provider for accessing experiment and job information
1
+ """State provider interfaces for accessing experiment and job information
2
2
 
3
- This module provides a single WorkspaceStateProvider class that accesses state
4
- from the workspace-level database (.experimaestro/workspace.db). This replaces
5
- the previous multi-provider architecture with a unified approach.
3
+ This module provides the abstract StateProvider interface and related data classes.
4
+ The concrete implementations are in db_state_provider.py (DbStateProvider) and
5
+ remote/client.py (SSHStateProviderClient).
6
6
 
7
7
  Key features:
8
- - Single .experimaestro/workspace.db database shared across all experiments
9
- - Support for multiple runs per experiment
10
- - Run-scoped tags (fixes GH #128)
11
- - Thread-safe database access via thread-local connections
12
- - Real-time updates via scheduler listener interface
13
- - Push notifications via listener callbacks (for reactive UI)
8
+ - StateProvider ABC: Abstract base class for all state providers
9
+ - Mock classes: Concrete implementations for database-loaded state objects
10
+ - StateListener: Type alias for listener callbacks
11
+
12
+ Note: Event classes are defined in state_status.py (EventBase and subclasses).
14
13
  """
15
14
 
16
15
  import json
17
16
  import logging
18
17
  import threading
19
- import time
20
18
  from dataclasses import dataclass
21
19
  from datetime import datetime
22
- from enum import Enum, auto
23
20
  from pathlib import Path
24
- from typing import Callable, Dict, List, Optional, Set, TYPE_CHECKING
25
-
26
- from watchdog.events import FileSystemEventHandler
27
- from watchdog.observers.api import ObservedWatch
28
-
29
- from experimaestro.scheduler.state_db import (
30
- ExperimentModel,
31
- ExperimentRunModel,
32
- JobModel,
33
- JobTagModel,
34
- ServiceModel,
35
- PartialModel,
36
- JobPartialModel,
37
- ALL_MODELS,
38
- )
21
+ from abc import ABC, abstractmethod
22
+ from typing import Callable, Dict, List, Optional, Set, Tuple
23
+
39
24
  from experimaestro.scheduler.interfaces import (
40
25
  BaseJob,
41
26
  BaseExperiment,
27
+ BaseService,
28
+ ExperimentJobInformation,
29
+ ExperimentStatus,
42
30
  JobState,
43
31
  JobFailureStatus,
44
32
  STATE_NAME_TO_JOBSTATE,
33
+ deserialize_timestamp,
45
34
  )
46
-
47
- if TYPE_CHECKING:
48
- from experimaestro.scheduler.jobs import Job
49
- from experimaestro.scheduler.services import Service
35
+ from experimaestro.scheduler.transient import TransientMode
36
+ from experimaestro.notifications import (
37
+ ProgressInformation,
38
+ get_progress_information_from_dict,
39
+ )
40
+ from experimaestro.scheduler.state_status import EventBase
50
41
 
51
42
  logger = logging.getLogger("xpm.state")
52
43
 
53
44
 
54
- # Event types for state provider notifications
55
- class StateEventType(Enum):
56
- """Types of state change events"""
57
-
58
- EXPERIMENT_UPDATED = auto()
59
- RUN_UPDATED = auto()
60
- JOB_UPDATED = auto()
61
- SERVICE_UPDATED = auto()
45
+ # =============================================================================
46
+ # Process Information
47
+ # =============================================================================
62
48
 
63
49
 
64
50
  @dataclass
65
- class StateEvent:
66
- """Base class for state change events
51
+ class ProcessInfo:
52
+ """Information about a running or completed process"""
67
53
 
68
- Attributes:
69
- event_type: Type of the event
70
- data: Event-specific data dictionary
71
- """
54
+ pid: int
55
+ """Process ID"""
72
56
 
73
- event_type: StateEventType
74
- data: Dict
57
+ type: str
58
+ """Process type (e.g., 'local', 'slurm', 'oar')"""
75
59
 
60
+ running: bool = False
61
+ """Whether the process is currently running"""
76
62
 
77
- # Type alias for listener callbacks
78
- StateListener = Callable[[StateEvent], None]
63
+ cpu_percent: Optional[float] = None
64
+ """CPU usage percentage (if available)"""
79
65
 
66
+ memory_mb: Optional[float] = None
67
+ """Memory usage in MB (if available)"""
80
68
 
81
- class _DatabaseChangeDetector:
82
- """Background thread that detects database changes and notifies listeners
69
+ num_threads: Optional[int] = None
70
+ """Number of threads (if available)"""
83
71
 
84
- Uses a semaphore pattern so that the watchdog event handler never blocks.
85
- The watchdog just signals the semaphore, and this thread does the actual
86
- database queries and listener notifications.
87
72
 
88
- Thread safety:
89
- - Uses a lock to protect start/stop transitions
90
- - Once stop() is called, the stop event cannot be cleared by start()
91
- - Uses a Condition for atomic wait-and-clear of change notifications
92
- """
73
+ # Type alias for listener callbacks (uses EventBase from state_status)
74
+ StateListener = Callable[[EventBase], None]
93
75
 
94
- def __init__(self, state_provider: "WorkspaceStateProvider"):
95
- self.state_provider = state_provider
96
- self._last_check_time: Optional[datetime] = None
97
- self._change_condition = threading.Condition()
98
- self._change_pending = False # Protected by _change_condition
99
- self._thread: Optional[threading.Thread] = None
100
- self._debounce_seconds = 0.5 # Wait before processing to batch rapid changes
101
- self._state_lock = threading.Lock() # Protects start/stop transitions
102
- self._stopped = False # Once True, cannot be restarted
103
-
104
- def start(self) -> None:
105
- """Start the change detection thread"""
106
- with self._state_lock:
107
- # Once stopped, cannot restart
108
- if self._stopped:
109
- logger.debug("Cannot start change detector - already stopped")
110
- return
111
-
112
- if self._thread is not None and self._thread.is_alive():
113
- return # Already running
114
-
115
- self._thread = threading.Thread(
116
- target=self._run,
117
- daemon=True,
118
- name="DBChangeDetector",
119
- )
120
- self._thread.start()
121
- logger.debug("Started database change detector thread")
122
-
123
- def stop(self) -> None:
124
- """Stop the change detection thread"""
125
- with self._state_lock:
126
- self._stopped = True # Mark as permanently stopped
127
-
128
- # Wake up the thread so it can exit
129
- with self._change_condition:
130
- self._change_condition.notify_all()
131
-
132
- # Join outside the lock to avoid deadlock
133
- if self._thread is not None:
134
- self._thread.join(timeout=2.0)
135
- self._thread = None
136
- logger.debug("Stopped database change detector thread")
137
-
138
- def signal_change(self) -> None:
139
- """Signal that a database change was detected (non-blocking)"""
140
- with self._change_condition:
141
- self._change_pending = True
142
- self._change_condition.notify()
143
-
144
- def _run(self) -> None:
145
- """Main loop: wait for changes and process them"""
146
- while not self._stopped:
147
- # Wait for a change signal and clear it atomically
148
- with self._change_condition:
149
- while not self._change_pending and not self._stopped:
150
- self._change_condition.wait()
151
-
152
- if self._stopped:
153
- break
154
-
155
- # Clear the pending flag atomically while holding the lock
156
- self._change_pending = False
157
-
158
- # Debounce - wait a bit for more changes to accumulate
159
- time.sleep(self._debounce_seconds)
160
-
161
- # Process all accumulated changes
162
- self._detect_and_notify_changes()
163
-
164
- def _detect_and_notify_changes(self) -> None:
165
- """Query the database to detect what changed and send events"""
166
- try:
167
- since = self._last_check_time
168
- self._last_check_time = datetime.now()
169
-
170
- # Query for changed experiments
171
- with self.state_provider.workspace_db.bind_ctx([ExperimentModel]):
172
- query = ExperimentModel.select()
173
- if since:
174
- query = query.where(ExperimentModel.updated_at > since)
175
-
176
- for exp in query:
177
- self.state_provider._notify_listeners(
178
- StateEvent(
179
- event_type=StateEventType.EXPERIMENT_UPDATED,
180
- data={
181
- "experiment_id": exp.experiment_id,
182
- },
183
- )
184
- )
185
-
186
- # Query for changed jobs
187
- with self.state_provider.workspace_db.bind_ctx([JobModel]):
188
- query = JobModel.select()
189
- if since:
190
- query = query.where(JobModel.updated_at > since)
191
-
192
- for job in query:
193
- self.state_provider._notify_listeners(
194
- StateEvent(
195
- event_type=StateEventType.JOB_UPDATED,
196
- data={
197
- "jobId": job.job_id,
198
- "experimentId": job.experiment_id,
199
- "runId": job.run_id,
200
- "status": job.state,
201
- },
202
- )
203
- )
204
76
 
205
- except Exception as e:
206
- logger.warning("Error detecting database changes: %s", e)
77
+ # =============================================================================
78
+ # State Provider ABC
79
+ # =============================================================================
80
+
207
81
 
82
+ class StateProvider(ABC):
83
+ """Abstract base class for state providers
208
84
 
209
- class _DatabaseFileHandler(FileSystemEventHandler):
210
- """Watchdog handler for SQLite database file changes
85
+ Defines the interface that all state providers must implement.
86
+ This enables both local (DbStateProvider), remote (SSHStateProviderClient),
87
+ and live (Scheduler) providers to be used interchangeably.
211
88
 
212
- Simply signals the change detector when database files are modified.
213
- Does not block - all processing happens in the detector thread.
89
+ Concrete implementations:
90
+ - Scheduler: Live in-memory state from running experiments
91
+ - OfflineStateProvider: Base for cached/persistent state (in db_state_provider.py)
92
+ - DbStateProvider: SQLite database-backed state
93
+ - SSHStateProviderClient: Remote SSH-based state
94
+
95
+ State listener management is provided by the base class with default implementations.
214
96
  """
215
97
 
216
- def __init__(self, change_detector: _DatabaseChangeDetector):
217
- super().__init__()
218
- self.change_detector = change_detector
98
+ #: Whether this provider is connected to a live scheduler
99
+ is_live: bool = False
100
+
101
+ def __init__(self) -> None:
102
+ """Initialize state listener management"""
103
+ self._state_listeners: Set[StateListener] = set()
104
+ self._state_listener_lock = threading.Lock()
105
+
106
+ def add_listener(self, listener: StateListener) -> None:
107
+ """Register a listener for state change events
108
+
109
+ Args:
110
+ listener: Callback function that receives StateEvent objects
111
+ """
112
+ with self._state_listener_lock:
113
+ self._state_listeners.add(listener)
114
+
115
+ def remove_listener(self, listener: StateListener) -> None:
116
+ """Unregister a listener
117
+
118
+ Args:
119
+ listener: Previously registered callback function
120
+ """
121
+ with self._state_listener_lock:
122
+ self._state_listeners.discard(listener)
123
+
124
+ def _notify_state_listeners(self, event: EventBase) -> None:
125
+ """Notify all state listeners of an event
126
+
127
+ Args:
128
+ event: State change event to broadcast
129
+ """
130
+ with self._state_listener_lock:
131
+ listeners = list(self._state_listeners)
132
+
133
+ logger.debug(
134
+ "Notifying %d listeners of %s", len(listeners), type(event).__name__
135
+ )
136
+ for listener in listeners:
137
+ try:
138
+ listener(event)
139
+ except Exception as e:
140
+ logger.exception("Error in state listener: %s", e)
219
141
 
220
- def on_any_event(self, event) -> None:
221
- """Handle all file system events"""
222
- # Only handle modification-like events
223
- if event.event_type not in ("modified", "created", "moved"):
224
- return
142
+ def service_state_changed(self, service) -> None:
143
+ """Called when a service's state changes - emit event to listeners
225
144
 
226
- if event.is_directory:
227
- return
145
+ StateProvider registers itself as a listener on services it returns,
146
+ so this method is called when those services' states change.
147
+ """
148
+ from experimaestro.scheduler.state_status import ServiceStateChangedEvent
228
149
 
229
- # Only react to database files
230
- path = Path(event.src_path)
231
- if path.name not in ("workspace.db", "workspace.db-wal"):
232
- return
150
+ experiment_id = getattr(service, "_experiment_id", "") or ""
151
+ run_id = getattr(service, "_run_id", "") or ""
152
+ state_name = service.state.name if hasattr(service.state, "name") else "UNKNOWN"
233
153
 
234
154
  logger.debug(
235
- "Database file changed: %s (event: %s)", path.name, event.event_type
155
+ "Service %s state changed to %s (experiment=%s)",
156
+ service.id,
157
+ state_name,
158
+ experiment_id,
236
159
  )
237
160
 
238
- # Signal the detector thread (non-blocking)
239
- self.change_detector.signal_change()
161
+ event = ServiceStateChangedEvent(
162
+ experiment_id=experiment_id,
163
+ run_id=run_id,
164
+ service_id=service.id,
165
+ state=state_name,
166
+ )
167
+ self._notify_state_listeners(event)
168
+
169
+ @abstractmethod
170
+ def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
171
+ """Get list of all experiments"""
172
+ ...
173
+
174
+ @abstractmethod
175
+ def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
176
+ """Get a specific experiment by ID"""
177
+ ...
178
+
179
+ @abstractmethod
180
+ def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
181
+ """Get all runs for an experiment
182
+
183
+ Returns:
184
+ List of BaseExperiment instances (MockExperiment for past runs,
185
+ or live experiment for the current run in Scheduler)
186
+ """
187
+ ...
188
+
189
+ @abstractmethod
190
+ def get_current_run(self, experiment_id: str) -> Optional[str]:
191
+ """Get the current run ID for an experiment"""
192
+ ...
193
+
194
+ @abstractmethod
195
+ def get_jobs(
196
+ self,
197
+ experiment_id: Optional[str] = None,
198
+ run_id: Optional[str] = None,
199
+ task_id: Optional[str] = None,
200
+ state: Optional[str] = None,
201
+ tags: Optional[Dict[str, str]] = None,
202
+ since: Optional[datetime] = None,
203
+ ) -> List[BaseJob]:
204
+ """Query jobs with optional filters"""
205
+ ...
206
+
207
+ @abstractmethod
208
+ def get_job(
209
+ self, job_id: str, experiment_id: str, run_id: Optional[str] = None
210
+ ) -> Optional[BaseJob]:
211
+ """Get a specific job"""
212
+ ...
213
+
214
+ @abstractmethod
215
+ def get_all_jobs(
216
+ self,
217
+ state: Optional[str] = None,
218
+ tags: Optional[Dict[str, str]] = None,
219
+ since: Optional[datetime] = None,
220
+ ) -> List[BaseJob]:
221
+ """Get all jobs across all experiments"""
222
+ ...
223
+
224
+ @abstractmethod
225
+ def get_tags_map(
226
+ self,
227
+ experiment_id: str,
228
+ run_id: Optional[str] = None,
229
+ ) -> Dict[str, Dict[str, str]]:
230
+ """Get tags map for jobs in an experiment/run
231
+
232
+ Tags are stored per (job_id, experiment_id, run_id) in JobTagModel.
233
+ This method returns a map from job_id to {tag_key: tag_value}.
234
+
235
+ Args:
236
+ experiment_id: Experiment identifier
237
+ run_id: Run identifier (None = current run)
238
+
239
+ Returns:
240
+ Dictionary mapping job identifiers to their tags dict
241
+ """
242
+ ...
243
+
244
+ @abstractmethod
245
+ def get_dependencies_map(
246
+ self,
247
+ experiment_id: str,
248
+ run_id: Optional[str] = None,
249
+ ) -> Dict[str, List[str]]:
250
+ """Get dependencies map for jobs in an experiment/run
251
+
252
+ Dependencies are stored per (job_id, experiment_id, run_id) in JobDependenciesModel.
253
+ This method returns a map from job_id to list of job_ids it depends on.
254
+
255
+ Args:
256
+ experiment_id: Experiment identifier
257
+ run_id: Run identifier (None = current run)
258
+
259
+ Returns:
260
+ Dictionary mapping job identifiers to list of job IDs they depend on
261
+ """
262
+ ...
263
+
264
+ @abstractmethod
265
+ def get_services(
266
+ self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
267
+ ) -> List[BaseService]:
268
+ """Get services for an experiment"""
269
+ ...
270
+
271
+ # add_listener and remove_listener are implemented in base class
272
+
273
+ @abstractmethod
274
+ def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
275
+ """Kill a running job"""
276
+ ...
277
+
278
+ @abstractmethod
279
+ def clean_job(self, job: BaseJob, perform: bool = False) -> bool:
280
+ """Clean a finished job"""
281
+ ...
282
+
283
+ @abstractmethod
284
+ def close(self) -> None:
285
+ """Close the state provider and release resources"""
286
+ ...
287
+
288
+ # Optional methods with default implementations
289
+
290
+ def sync_path(self, path: str) -> Optional[Path]:
291
+ """Sync a specific path from remote (remote providers only)
292
+
293
+ Returns None for local providers or if sync fails.
294
+ """
295
+ return None
296
+
297
+ def get_orphan_jobs(self) -> List[BaseJob]:
298
+ """Get orphan jobs (jobs not associated with any experiment run)"""
299
+ return []
300
+
301
+ def get_stray_jobs(self) -> List[BaseJob]:
302
+ """Get stray jobs (running jobs not associated with any active experiment)
303
+
304
+ Stray jobs are a subset of orphan jobs - they are orphan jobs that are
305
+ currently running or scheduled. These represent jobs where the experimental
306
+ plan changed but the job process is still running.
307
+
308
+ Returns:
309
+ List of running/scheduled jobs not in any active experiment
310
+ """
311
+ # Default implementation: filter orphan jobs to running ones
312
+ return [j for j in self.get_orphan_jobs() if j.state and j.state.running()]
313
+
314
+ def delete_job_safely(self, job: BaseJob, perform: bool = True) -> Tuple[bool, str]:
315
+ """Safely delete a job and its data"""
316
+ return False, "Not implemented"
317
+
318
+ def delete_experiment(
319
+ self, experiment_id: str, perform: bool = True
320
+ ) -> Tuple[bool, str]:
321
+ """Delete an experiment and all its data"""
322
+ return False, "Not implemented"
323
+
324
+ def cleanup_orphan_partials(self, perform: bool = False) -> List[str]:
325
+ """Clean up orphan partial directories"""
326
+ return []
327
+
328
+ def get_process_info(self, job: BaseJob) -> Optional[ProcessInfo]:
329
+ """Get process information for a job
330
+
331
+ Returns a ProcessInfo dataclass or None if not available.
332
+ """
333
+ return None
334
+
335
+ def get_last_sync_time(self) -> Optional[datetime]:
336
+ """Get the last sync time (for incremental updates)"""
337
+ return None
338
+
339
+ @property
340
+ def read_only(self) -> bool:
341
+ """Whether this provider is read-only"""
342
+ return True
343
+
344
+ @property
345
+ def is_remote(self) -> bool:
346
+ """Whether this is a remote provider (e.g., SSH)
347
+
348
+ Remote providers use periodic refresh instead of push notifications
349
+ and support sync_path for on-demand file synchronization.
350
+ """
351
+ return False
352
+
353
+
354
+ # =============================================================================
355
+ # Offline State Provider (with service caching)
356
+ # =============================================================================
357
+
358
+
359
+ class OfflineStateProvider(StateProvider):
360
+ """State provider for offline/cached state access
361
+
362
+ Provides state listener management and service caching shared by
363
+ WorkspaceStateProvider and SSHStateProviderClient.
364
+
365
+ This is an intermediate class between StateProvider (the ABC) and concrete
366
+ implementations that need state listener support and service caching.
367
+ """
368
+
369
+ def __init__(self):
370
+ """Initialize offline state provider with service cache and listener management"""
371
+ super().__init__() # Initialize state listener management
372
+ self._init_service_cache()
373
+
374
+ # =========================================================================
375
+ # Service caching methods
376
+ # =========================================================================
377
+
378
+ def _init_service_cache(self) -> None:
379
+ """Initialize service cache - call from subclass __init__"""
380
+ self._service_cache: Dict[tuple[str, str], Dict[str, "BaseService"]] = {}
381
+ self._service_cache_lock = threading.Lock()
382
+
383
+ def _clear_service_cache(self) -> None:
384
+ """Clear the service cache"""
385
+ with self._service_cache_lock:
386
+ self._service_cache.clear()
387
+
388
+ def get_services(
389
+ self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
390
+ ) -> List["BaseService"]:
391
+ """Get services for an experiment
392
+
393
+ Uses caching to preserve service instances (and their URLs) across calls.
394
+ Subclasses can override _get_live_services() for live service support
395
+ and must implement _fetch_services_from_storage() for persistent storage.
396
+ """
397
+ # Resolve run_id if needed
398
+ if experiment_id is not None and run_id is None:
399
+ run_id = self.get_current_run(experiment_id)
400
+ if run_id is None:
401
+ return []
402
+
403
+ cache_key = (experiment_id or "", run_id or "")
404
+
405
+ with self._service_cache_lock:
406
+ # Try to get live services (scheduler, etc.) - may return None
407
+ live_services = self._get_live_services(experiment_id, run_id)
408
+ if live_services is not None:
409
+ # Cache and return live services
410
+ self._service_cache[cache_key] = {s.id: s for s in live_services}
411
+ return live_services
412
+
413
+ # Check cache
414
+ cached = self._service_cache.get(cache_key)
415
+ if cached is not None:
416
+ return list(cached.values())
417
+
418
+ # Fetch from persistent storage (filesystem or remote)
419
+ services = self._fetch_services_from_storage(experiment_id, run_id)
420
+ self._service_cache[cache_key] = {s.id: s for s in services}
421
+ return services
422
+
423
+ def _get_live_services(
424
+ self, experiment_id: Optional[str], run_id: Optional[str]
425
+ ) -> Optional[List["BaseService"]]:
426
+ """Get live services if available (e.g., from scheduler).
427
+
428
+ Returns None if no live services are available (default).
429
+ Subclasses may override to check for live services.
430
+ """
431
+ return None
432
+
433
+ @abstractmethod
434
+ def _fetch_services_from_storage(
435
+ self, experiment_id: Optional[str], run_id: Optional[str]
436
+ ) -> List["BaseService"]:
437
+ """Fetch services from persistent storage (filesystem or remote).
438
+
439
+ Called when no live services and cache is empty.
440
+ """
441
+ ...
442
+
443
+ # State listener methods (add_listener, remove_listener, _notify_state_listeners)
444
+ # are inherited from StateProvider base class
445
+
446
+
447
+ # =============================================================================
448
+ # Mock Classes for Database-Loaded State
449
+ # =============================================================================
240
450
 
241
451
 
242
452
  class MockJob(BaseJob):
@@ -246,28 +456,69 @@ class MockJob(BaseJob):
246
456
  as opposed to live Job instances which are created during experiment runs.
247
457
  """
248
458
 
459
+ def apply_event(self, event: "EventBase") -> None:
460
+ """Apply a job event to update this job's state"""
461
+ from experimaestro.scheduler.state_status import (
462
+ JobStateChangedEvent,
463
+ JobProgressEvent,
464
+ )
465
+ from experimaestro.notifications import LevelInformation
466
+
467
+ if isinstance(event, JobStateChangedEvent):
468
+ self.state = STATE_NAME_TO_JOBSTATE.get(event.state, self.state)
469
+ if event.failure_reason:
470
+ try:
471
+ self.failure_reason = JobFailureStatus[event.failure_reason]
472
+ except KeyError:
473
+ pass
474
+ if event.submitted_time is not None:
475
+ self.submittime = event.submitted_time
476
+ if event.started_time is not None:
477
+ self.starttime = event.started_time
478
+ if event.ended_time is not None:
479
+ self.endtime = event.ended_time
480
+ if event.exit_code is not None:
481
+ self.exit_code = event.exit_code
482
+ if event.retry_count:
483
+ self.retry_count = event.retry_count
484
+ logger.debug(
485
+ "Applied state change to job %s: %s", self.identifier, self.state
486
+ )
487
+
488
+ elif isinstance(event, JobProgressEvent):
489
+ level = event.level
490
+ # Truncate to level + 1 entries
491
+ self.progress = self.progress[: (level + 1)]
492
+ # Extend if needed
493
+ while len(self.progress) <= level:
494
+ self.progress.append(LevelInformation(len(self.progress), None, 0.0))
495
+ # Update the level's progress and description
496
+ if event.desc:
497
+ self.progress[-1].desc = event.desc
498
+ self.progress[-1].progress = event.progress
499
+ logger.debug(
500
+ "Applied progress to job %s: %s", self.identifier, self.progress
501
+ )
502
+
249
503
  def __init__(
250
504
  self,
251
505
  identifier: str,
252
506
  task_id: str,
253
- locator: str,
254
507
  path: Path,
255
508
  state: str, # State name string from DB
256
509
  submittime: Optional[float],
257
510
  starttime: Optional[float],
258
511
  endtime: Optional[float],
259
- progress: List[Dict],
260
- tags: Dict[str, str],
261
- experiment_id: str,
262
- run_id: str,
512
+ progress: ProgressInformation,
263
513
  updated_at: str,
264
514
  exit_code: Optional[int] = None,
265
515
  retry_count: int = 0,
266
516
  failure_reason: Optional[JobFailureStatus] = None,
517
+ transient: TransientMode = TransientMode.NONE,
518
+ process: dict | None = None,
267
519
  ):
268
520
  self.identifier = identifier
269
521
  self.task_id = task_id
270
- self.locator = locator
271
522
  self.path = path
272
523
  # Convert state name to JobState instance
273
524
  self.state = STATE_NAME_TO_JOBSTATE.get(state, JobState.UNSCHEDULED)
@@ -275,56 +526,16 @@ class MockJob(BaseJob):
275
526
  self.starttime = starttime
276
527
  self.endtime = endtime
277
528
  self.progress = progress
278
- self.tags = tags
279
- self.experiment_id = experiment_id
280
- self.run_id = run_id
281
529
  self.updated_at = updated_at
282
530
  self.exit_code = exit_code
283
531
  self.retry_count = retry_count
284
532
  self.failure_reason = failure_reason
533
+ self.transient = transient
534
+ self._process_dict = process
285
535
 
286
- @classmethod
287
- def from_disk(cls, path: Path) -> Optional["MockJob"]:
288
- """Create a MockJob by reading metadata from disk
289
-
290
- Args:
291
- path: Path to the job directory
292
-
293
- Returns:
294
- MockJob instance if metadata exists, None otherwise
295
- """
296
- metadata_path = path / ".xpm_metadata.json"
297
- if not metadata_path.exists():
298
- return None
299
-
300
- try:
301
- import json
302
-
303
- with metadata_path.open("r") as f:
304
- metadata = json.load(f)
305
-
306
- return cls(
307
- identifier=metadata.get("job_id", path.name),
308
- task_id=metadata.get(
309
- "task_id", path.parent.name if path.parent else "unknown"
310
- ),
311
- locator=metadata.get("job_id", path.name),
312
- path=path,
313
- state=metadata.get("state", "unscheduled"),
314
- submittime=metadata.get("submitted_time"),
315
- starttime=metadata.get("started_time"),
316
- endtime=metadata.get("ended_time"),
317
- progress=[], # Progress not stored in metadata
318
- tags={}, # Tags come from jobs.jsonl, not metadata
319
- experiment_id="", # Not stored in job metadata
320
- run_id="", # Not stored in job metadata
321
- updated_at=str(metadata.get("last_updated", "")),
322
- exit_code=metadata.get("exit_code"),
323
- retry_count=metadata.get("retry_count", 0),
324
- )
325
- except Exception as e:
326
- logger.warning("Failed to read job metadata from %s: %s", path, e)
327
- return None
536
+ def process_state_dict(self) -> dict | None:
537
+ """Get process state as dictionary."""
538
+ return self._process_dict
328
539
 
329
540
  def getprocess(self):
330
541
  """Get process handle for running job
@@ -353,1993 +564,436 @@ class MockJob(BaseJob):
353
564
  logger.warning("Could not get process for job at %s: %s", self.path, e)
354
565
  return None
355
566
 
567
+ @classmethod
568
+ def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockJob":
569
+ """Create MockJob from state dictionary
570
+
571
+ Args:
572
+ d: Dictionary from state_dict()
573
+ workspace_path: Workspace path to compute job path if not provided
574
+
575
+ Returns:
576
+ MockJob instance
577
+ """
578
+ task_id = d["task_id"]
579
+ identifier = d["job_id"]
580
+
581
+ # Use path from dict if it's already a Path, otherwise compute it
582
+ path = d.get("path")
583
+ if path is None:
584
+ path = workspace_path / "jobs" / task_id / identifier
585
+ elif isinstance(path, str):
586
+ path = Path(path)
587
+
588
+ failure_reason = None
589
+ if d.get("failure_reason"):
590
+ failure_reason = JobFailureStatus[d["failure_reason"]]
591
+
592
+ # Convert progress dicts to LevelInformation objects
593
+ progress_list = get_progress_information_from_dict(d.get("progress", []))
594
+
595
+ return cls(
596
+ identifier=identifier,
597
+ task_id=task_id,
598
+ path=path,
599
+ state=d["state"],
600
+ submittime=deserialize_timestamp(d.get("submitted_time")),
601
+ starttime=deserialize_timestamp(d.get("started_time")),
602
+ endtime=deserialize_timestamp(d.get("ended_time")),
603
+ progress=progress_list,
604
+ updated_at=d.get("updated_at", ""),
605
+ exit_code=d.get("exit_code"),
606
+ retry_count=d.get("retry_count", 0),
607
+ failure_reason=failure_reason,
608
+ process=d.get("process"),
609
+ )
610
+
356
611
 
357
612
  class MockExperiment(BaseExperiment):
358
- """Concrete implementation of BaseExperiment for database-loaded experiments
613
+ """Concrete implementation of BaseExperiment for loaded experiments
359
614
 
360
- This class is used when loading experiment information from the database,
615
+ This class is used when loading experiment information from disk,
361
616
  as opposed to live experiment instances which are created during runs.
617
+
618
+ It stores all experiment state including jobs, services, tags,
619
+ dependencies, and event tracking (replaces StatusData).
362
620
  """
363
621
 
364
622
  def __init__(
365
623
  self,
366
624
  workdir: Path,
367
- current_run_id: Optional[str],
368
- total_jobs: int,
369
- finished_jobs: int,
370
- failed_jobs: int,
371
- updated_at: str,
625
+ run_id: str,
626
+ *,
627
+ status: ExperimentStatus = ExperimentStatus.RUNNING,
628
+ events_count: int = 0,
629
+ hostname: Optional[str] = None,
372
630
  started_at: Optional[float] = None,
373
631
  ended_at: Optional[float] = None,
632
+ job_infos: Optional[Dict[str, "ExperimentJobInformation"]] = None,
633
+ services: Optional[Dict[str, "MockService"]] = None,
634
+ dependencies: Optional[Dict[str, List[str]]] = None,
635
+ experiment_id_override: Optional[str] = None,
636
+ finished_jobs: int = 0,
637
+ failed_jobs: int = 0,
374
638
  ):
375
639
  self.workdir = workdir
376
- self.current_run_id = current_run_id
377
- self.total_jobs = total_jobs
378
- self.finished_jobs = finished_jobs
379
- self.failed_jobs = failed_jobs
380
- self.updated_at = updated_at
381
- self.started_at = started_at
382
- self.ended_at = ended_at
640
+ self.run_id = run_id
641
+ self._status = status
642
+ self._events_count = events_count
643
+ self._hostname = hostname
644
+ self._started_at = started_at
645
+ self._ended_at = ended_at
646
+ self._job_infos = job_infos or {}
647
+ self._services = services or {}
648
+ self._dependencies = dependencies or {}
649
+ self._experiment_id_override = experiment_id_override
650
+ self._finished_jobs = finished_jobs
651
+ self._failed_jobs = failed_jobs
383
652
 
384
653
  @property
385
654
  def experiment_id(self) -> str:
386
- """Experiment identifier derived from workdir name"""
387
- return self.workdir.name
655
+ """Return experiment_id (overriding base class if needed for v1 layout)"""
656
+ if self._experiment_id_override:
657
+ return self._experiment_id_override
658
+ return super().experiment_id
659
+
660
+ # Implement abstract properties from BaseExperiment
388
661
 
662
+ @property
663
+ def status(self) -> ExperimentStatus:
664
+ return self._status
389
665
 
390
- def _with_db_context(func):
391
- """Decorator to wrap method in database bind context
666
+ @property
667
+ def job_infos(self) -> Dict[str, "ExperimentJobInformation"]:
668
+ """Lightweight job info from jobs.jsonl (job_id, task_id, tags, timestamp)"""
669
+ return self._job_infos
392
670
 
393
- This ensures all database queries have the models bound to the database.
394
- """
395
- from functools import wraps
671
+ @property
672
+ def services(self) -> Dict[str, "BaseService"]:
673
+ return self._services
396
674
 
397
- @wraps(func)
398
- def wrapper(self, *args, **kwargs):
399
- try:
400
- with self.workspace_db.bind_ctx(ALL_MODELS):
401
- return func(self, *args, **kwargs)
402
- except Exception as e:
403
- logger.exception("Error in %s with database context: %s", func.__name__, e)
404
- raise
675
+ @property
676
+ def tags(self) -> Dict[str, Dict[str, str]]:
677
+ """Build tags dict from job_infos"""
678
+ return {
679
+ job_id: job_info.tags
680
+ for job_id, job_info in self._job_infos.items()
681
+ if job_info.tags
682
+ }
405
683
 
406
- return wrapper
684
+ @property
685
+ def dependencies(self) -> Dict[str, List[str]]:
686
+ return self._dependencies
407
687
 
688
+ @property
689
+ def events_count(self) -> int:
690
+ return self._events_count
408
691
 
409
- class WorkspaceStateProvider:
410
- """Unified state provider for workspace-level database (singleton per workspace path)
692
+ @property
693
+ def hostname(self) -> Optional[str]:
694
+ return self._hostname
411
695
 
412
- Provides access to experiment and job state from a single workspace database.
413
- Supports both read-only (monitoring) and read-write (scheduler) modes.
696
+ @property
697
+ def started_at(self) -> Optional[float]:
698
+ return self._started_at
414
699
 
415
- Only one WorkspaceStateProvider instance exists per workspace path. Subsequent
416
- requests for the same path return the existing instance.
700
+ @property
701
+ def ended_at(self) -> Optional[float]:
702
+ return self._ended_at
417
703
 
418
- Thread safety:
419
- - Database connections are thread-local (managed by state_db module)
420
- - Singleton registry is protected by a lock
421
- - Each thread gets its own database connection
704
+ @property
705
+ def total_jobs(self) -> int:
706
+ return len(self._job_infos)
422
707
 
423
- Run tracking:
424
- - Each experiment can have multiple runs
425
- - Jobs/services are scoped to (experiment_id, run_id)
426
- - Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
427
- """
708
+ @property
709
+ def finished_jobs(self) -> int:
710
+ return self._finished_jobs
428
711
 
429
- # Registry of state provider instances by absolute path
430
- _instances: Dict[Path, "WorkspaceStateProvider"] = {}
431
- _lock = threading.Lock()
712
+ @property
713
+ def failed_jobs(self) -> int:
714
+ return self._failed_jobs
715
+
716
+ # state_dict() is inherited from BaseExperiment
432
717
 
433
718
  @classmethod
434
- def get_instance(
435
- cls,
436
- workspace_path: Path,
437
- read_only: bool = False,
438
- sync_on_start: bool = False,
439
- sync_interval_minutes: int = 5,
440
- ) -> "WorkspaceStateProvider":
441
- """Get or create WorkspaceStateProvider instance for a workspace path
719
+ def from_disk(
720
+ cls, run_dir: Path, workspace_path: Path
721
+ ) -> Optional["MockExperiment"]:
722
+ """Load MockExperiment from status.json and jobs.jsonl on disk
442
723
 
443
724
  Args:
444
- workspace_path: Root workspace directory
445
- read_only: If True, database is in read-only mode
446
- sync_on_start: If True, sync from disk on initialization
447
- sync_interval_minutes: Minimum interval between syncs (default: 5)
725
+ run_dir: Path to the run directory containing status.json
726
+ workspace_path: Workspace path for resolving relative paths
448
727
 
449
728
  Returns:
450
- WorkspaceStateProvider instance (singleton per path)
729
+ MockExperiment instance or None if status.json doesn't exist
451
730
  """
452
- # Normalize path
453
- if isinstance(workspace_path, Path):
454
- workspace_path = workspace_path.absolute()
455
- else:
456
- workspace_path = Path(workspace_path).absolute()
457
-
458
- # Check if instance already exists
459
- with cls._lock:
460
- if workspace_path in cls._instances:
461
- existing = cls._instances[workspace_path]
462
- # Fail if requesting different read_only mode than cached instance
463
- if existing.read_only != read_only:
464
- raise RuntimeError(
465
- f"WorkspaceStateProvider for {workspace_path} already exists "
466
- f"with read_only={existing.read_only}, cannot open with "
467
- f"read_only={read_only}. Close the existing instance first."
468
- )
469
- return existing
470
-
471
- # Create new instance - register BEFORE __init__ to handle
472
- # nested get_instance calls during sync_on_start
473
- instance = object.__new__(cls)
474
- cls._instances[workspace_path] = instance
475
-
476
- # Initialize outside the lock to avoid deadlock during sync
477
- try:
478
- instance.__init__(
479
- workspace_path, read_only, sync_on_start, sync_interval_minutes
480
- )
481
- except Exception:
482
- # Remove from registry if initialization fails
483
- with cls._lock:
484
- cls._instances.pop(workspace_path, None)
485
- raise
486
- return instance
731
+ import fasteners
487
732
 
488
- def __init__(
489
- self,
490
- workspace_path: Path,
491
- read_only: bool = False,
492
- sync_on_start: bool = False,
493
- sync_interval_minutes: int = 5,
494
- ):
495
- """Initialize workspace state provider (called by get_instance())
733
+ status_path = run_dir / "status.json"
734
+ if not status_path.exists():
735
+ return None
496
736
 
497
- Args:
498
- workspace_path: Root workspace directory
499
- read_only: If True, database is in read-only mode
500
- sync_on_start: If True, sync from disk on initialization
501
- sync_interval_minutes: Minimum interval between syncs (default: 5)
502
- """
503
- # Normalize path
504
- if isinstance(workspace_path, Path):
505
- workspace_path = workspace_path.absolute()
506
- else:
507
- workspace_path = Path(workspace_path).absolute()
737
+ lock_path = status_path.parent / f".{status_path.name}.lock"
738
+ lock = fasteners.InterProcessLock(str(lock_path))
739
+ with lock:
740
+ try:
741
+ with status_path.open("r") as f:
742
+ data = json.load(f)
743
+ except (json.JSONDecodeError, OSError) as e:
744
+ logger.warning("Failed to read %s: %s", status_path, e)
745
+ return None
508
746
 
509
- self.workspace_path = workspace_path
510
- self.read_only = read_only
511
- self.sync_interval_minutes = sync_interval_minutes
747
+ # Create experiment from status.json
748
+ exp = cls.from_state_dict(data, workspace_path)
512
749
 
513
- # Listeners for push notifications
514
- self._listeners: Set[StateListener] = set()
515
- self._listeners_lock = threading.Lock()
750
+ # Load jobs from jobs.jsonl
751
+ jobs_jsonl_path = run_dir / "jobs.jsonl"
752
+ if jobs_jsonl_path.exists():
753
+ try:
754
+ with jobs_jsonl_path.open("r") as f:
755
+ for line in f:
756
+ line = line.strip()
757
+ if not line:
758
+ continue
759
+ try:
760
+ record = json.loads(line)
761
+ job_info = ExperimentJobInformation.from_dict(record)
762
+ exp._job_infos[job_info.job_id] = job_info
763
+ except (json.JSONDecodeError, KeyError):
764
+ continue
765
+ except OSError as e:
766
+ logger.warning("Failed to read %s: %s", jobs_jsonl_path, e)
767
+
768
+ return exp
769
+
770
+ @classmethod
771
+ def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockExperiment":
772
+ """Create MockExperiment from state dictionary
516
773
 
517
- # File watcher for database changes (started when listeners are added)
518
- self._change_detector: Optional[_DatabaseChangeDetector] = None
519
- self._db_file_handler: Optional[_DatabaseFileHandler] = None
520
- self._db_file_watch: Optional[ObservedWatch] = None
774
+ Args:
775
+ d: Dictionary from state_dict()
776
+ workspace_path: Workspace path to compute experiment path if not provided
521
777
 
522
- # Check and update workspace version
523
- from .workspace import WORKSPACE_VERSION
778
+ Returns:
779
+ MockExperiment instance
780
+ """
781
+ experiment_id = d.get("experiment_id", "")
782
+ run_id = d.get("run_id", "")
524
783
 
525
- version_file = self.workspace_path / ".__experimaestro__"
784
+ # Use workdir from dict if provided, otherwise compute it
785
+ workdir = d.get("workdir")
786
+ if workdir is None:
787
+ # New layout: experiments/{experiment_id}/{run_id}/
788
+ workdir = workspace_path / "experiments" / experiment_id / run_id
789
+ elif isinstance(workdir, str):
790
+ workdir = Path(workdir)
526
791
 
527
- if version_file.exists():
528
- # Read existing version
529
- content = version_file.read_text().strip()
530
- if content == "":
531
- # Empty file = v0
532
- workspace_version = 0
792
+ # Parse status from string to enum
793
+ status_str = d.get("status", "running")
794
+ try:
795
+ status = ExperimentStatus(status_str)
796
+ except ValueError:
797
+ # Handle legacy status values
798
+ if status_str in ("active", "running"):
799
+ status = ExperimentStatus.RUNNING
800
+ elif status_str in ("completed", "done"):
801
+ status = ExperimentStatus.DONE
802
+ elif status_str == "failed":
803
+ status = ExperimentStatus.FAILED
533
804
  else:
534
- try:
535
- workspace_version = int(content)
536
- except ValueError:
537
- raise RuntimeError(
538
- f"Invalid workspace version file at {version_file}: "
539
- f"expected integer, got '{content}'"
540
- )
541
-
542
- # Check if workspace version is supported
543
- if workspace_version > WORKSPACE_VERSION:
544
- raise RuntimeError(
545
- f"Workspace version {workspace_version} is not supported by "
546
- f"this version of experimaestro (supports up to version "
547
- f"{WORKSPACE_VERSION}). Please upgrade experimaestro."
548
- )
549
- if workspace_version < WORKSPACE_VERSION:
550
- raise RuntimeError(
551
- f"Workspace version {workspace_version} is not supported by "
552
- "this version of experimaestro (please upgrade the experimaestro "
553
- "workspace)"
554
- )
805
+ status = ExperimentStatus.RUNNING
806
+
807
+ # Parse services from dict (can be list or dict)
808
+ services_data = d.get("services", {})
809
+ if isinstance(services_data, list):
810
+ services = {
811
+ s.get("service_id", ""): MockService.from_full_state_dict(s)
812
+ for s in services_data
813
+ }
555
814
  else:
556
- # New workspace - create the file
557
- workspace_version = WORKSPACE_VERSION
558
-
559
- # Write current version to file (update empty v0 workspaces)
560
- if not read_only and (
561
- not version_file.exists() or version_file.read_text().strip() == ""
562
- ):
563
- version_file.write_text(str(WORKSPACE_VERSION))
564
-
565
- # Initialize workspace database in hidden .experimaestro directory
566
- from .state_db import initialize_workspace_database
567
-
568
- experimaestro_dir = self.workspace_path / ".experimaestro"
569
- if not read_only:
570
- experimaestro_dir.mkdir(parents=True, exist_ok=True)
571
-
572
- db_path = experimaestro_dir / "workspace.db"
573
- self.workspace_db = initialize_workspace_database(db_path, read_only=read_only)
574
- self._db_dir = experimaestro_dir # Store for file watcher
575
-
576
- # Optionally sync from disk on start (only in write mode)
577
- # Syncing requires write access to update the database and sync timestamp
578
- if sync_on_start and not read_only:
579
- from .state_sync import sync_workspace_from_disk
580
-
581
- sync_workspace_from_disk(
582
- self.workspace_path,
583
- write_mode=True,
584
- force=False,
585
- sync_interval_minutes=sync_interval_minutes,
586
- )
815
+ services = {
816
+ k: MockService.from_full_state_dict(v) for k, v in services_data.items()
817
+ }
587
818
 
588
- logger.info(
589
- "WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
590
- read_only,
591
- workspace_path,
819
+ return cls(
820
+ workdir=workdir,
821
+ run_id=run_id,
822
+ status=status,
823
+ events_count=d.get("events_count", 0),
824
+ hostname=d.get("hostname"),
825
+ started_at=d.get("started_at"),
826
+ ended_at=d.get("ended_at"),
827
+ services=services,
828
+ dependencies=d.get("dependencies", {}),
829
+ finished_jobs=d.get("finished_jobs", 0),
830
+ failed_jobs=d.get("failed_jobs", 0),
592
831
  )
593
832
 
594
- # Experiment management methods
595
-
596
- @_with_db_context
597
- def ensure_experiment(self, experiment_id: str):
598
- """Create or update experiment record
833
+ def apply_event(self, event: "EventBase") -> None:
834
+ """Apply an event to update experiment state
599
835
 
600
836
  Args:
601
- experiment_id: Unique identifier for the experiment
837
+ event: Event to apply
602
838
  """
603
- if self.read_only:
604
- raise RuntimeError("Cannot modify experiments in read-only mode")
605
-
606
- now = datetime.now()
607
- ExperimentModel.insert(
608
- experiment_id=experiment_id,
609
- created_at=now,
610
- updated_at=now,
611
- ).on_conflict(
612
- conflict_target=[ExperimentModel.experiment_id],
613
- update={
614
- ExperimentModel.updated_at: now,
615
- },
616
- ).execute()
617
-
618
- logger.debug("Ensured experiment: %s", experiment_id)
619
-
620
- # Notify listeners
621
- exp_path = str(self.workspace_path / "xp" / experiment_id)
622
- self._notify_listeners(
623
- StateEvent(
624
- event_type=StateEventType.EXPERIMENT_UPDATED,
625
- data={
626
- "experiment_id": experiment_id,
627
- "workdir_path": exp_path,
628
- "updated_at": now.isoformat(),
629
- },
630
- )
839
+ from experimaestro.scheduler.state_status import (
840
+ JobSubmittedEvent,
841
+ JobStateChangedEvent,
842
+ ServiceAddedEvent,
843
+ RunCompletedEvent,
631
844
  )
632
845
 
633
- @_with_db_context
634
- def create_run(self, experiment_id: str, run_id: Optional[str] = None) -> str:
635
- """Create a new run for an experiment
846
+ if isinstance(event, JobSubmittedEvent):
847
+ # Add lightweight job info (tags are stored in ExperimentJobInformation)
848
+ self._job_infos[event.job_id] = ExperimentJobInformation(
849
+ job_id=event.job_id,
850
+ task_id=event.task_id,
851
+ tags=event.tags or {},
852
+ timestamp=event.timestamp,
853
+ )
854
+ if event.depends_on:
855
+ self._dependencies[event.job_id] = event.depends_on
856
+
857
+ elif isinstance(event, ServiceAddedEvent):
858
+ self._services[event.service_id] = MockService(
859
+ service_id=event.service_id,
860
+ description_text=event.description,
861
+ state_dict_data=event.state_dict,
862
+ service_class=event.service_class,
863
+ experiment_id=self.experiment_id,
864
+ run_id=self.run_id,
865
+ )
866
+
867
+ elif isinstance(event, JobStateChangedEvent):
868
+ # Update finished/failed counters when jobs complete
869
+ if event.state == "done":
870
+ self._finished_jobs += 1
871
+ elif event.state == "error":
872
+ self._failed_jobs += 1
873
+
874
+ elif isinstance(event, RunCompletedEvent):
875
+ # Map status string to ExperimentStatus
876
+ if event.status in ("completed", "done"):
877
+ self._status = ExperimentStatus.DONE
878
+ elif event.status == "failed":
879
+ self._status = ExperimentStatus.FAILED
880
+ else:
881
+ self._status = ExperimentStatus.RUNNING
882
+ self._ended_at = event.ended_at
636
883
 
637
- Args:
638
- experiment_id: Experiment identifier
639
- run_id: Optional run ID (auto-generated from timestamp if not provided)
640
-
641
- Returns:
642
- The run_id that was created
643
-
644
- Raises:
645
- RuntimeError: If in read-only mode
646
- """
647
- if self.read_only:
648
- raise RuntimeError("Cannot create runs in read-only mode")
649
-
650
- # Auto-generate run_id from timestamp if not provided
651
- if run_id is None:
652
- now = datetime.now()
653
- run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
654
-
655
- # Create run record
656
- ExperimentRunModel.insert(
657
- experiment_id=experiment_id,
658
- run_id=run_id,
659
- started_at=datetime.now(),
660
- status="active",
661
- ).execute()
662
-
663
- # Update experiment's current_run_id and updated_at
664
- now = datetime.now()
665
- ExperimentModel.update(
666
- current_run_id=run_id,
667
- updated_at=now,
668
- ).where(ExperimentModel.experiment_id == experiment_id).execute()
669
-
670
- logger.info("Created run %s for experiment %s", run_id, experiment_id)
671
-
672
- # Notify listeners
673
- self._notify_listeners(
674
- StateEvent(
675
- event_type=StateEventType.RUN_UPDATED,
676
- data={
677
- "experiment_id": experiment_id,
678
- "run_id": run_id,
679
- "status": "active",
680
- "started_at": now.isoformat(),
681
- },
682
- )
683
- )
684
-
685
- return run_id
686
-
687
- @_with_db_context
688
- def get_current_run(self, experiment_id: str) -> Optional[str]:
689
- """Get the current/latest run_id for an experiment
690
-
691
- Args:
692
- experiment_id: Experiment identifier
693
-
694
- Returns:
695
- Current run_id or None if no runs exist
696
- """
697
- try:
698
- experiment = ExperimentModel.get(
699
- ExperimentModel.experiment_id == experiment_id
700
- )
701
- return experiment.current_run_id
702
- except ExperimentModel.DoesNotExist:
703
- return None
704
-
705
- @_with_db_context
706
- def get_experiments(self, since: Optional[datetime] = None) -> List[MockExperiment]:
707
- """Get list of all experiments
708
-
709
- Args:
710
- since: If provided, only return experiments updated after this timestamp
711
-
712
- Returns:
713
- List of MockExperiment objects with attributes:
714
- - workdir: Path to experiment directory
715
- - experiment_id: Unique identifier (property derived from workdir.name)
716
- - current_run_id: Current/latest run ID
717
- - total_jobs: Total number of jobs (for current run)
718
- - finished_jobs: Number of completed jobs (for current run)
719
- - failed_jobs: Number of failed jobs (for current run)
720
- - updated_at: When experiment was last modified
721
- """
722
- experiments = []
723
-
724
- query = ExperimentModel.select()
725
- if since is not None:
726
- query = query.where(ExperimentModel.updated_at > since)
727
-
728
- for exp_model in query:
729
- # Count jobs for current run
730
- total_jobs = 0
731
- finished_jobs = 0
732
- failed_jobs = 0
733
-
734
- started_at = None
735
- ended_at = None
736
-
737
- if exp_model.current_run_id:
738
- total_jobs = (
739
- JobModel.select()
740
- .where(
741
- (JobModel.experiment_id == exp_model.experiment_id)
742
- & (JobModel.run_id == exp_model.current_run_id)
743
- )
744
- .count()
745
- )
746
- finished_jobs = (
747
- JobModel.select()
748
- .where(
749
- (JobModel.experiment_id == exp_model.experiment_id)
750
- & (JobModel.run_id == exp_model.current_run_id)
751
- & (JobModel.state == "done")
752
- )
753
- .count()
754
- )
755
- failed_jobs = (
756
- JobModel.select()
757
- .where(
758
- (JobModel.experiment_id == exp_model.experiment_id)
759
- & (JobModel.run_id == exp_model.current_run_id)
760
- & (JobModel.state == "error")
761
- )
762
- .count()
763
- )
764
-
765
- # Get run timestamps
766
- try:
767
- run_model = ExperimentRunModel.get(
768
- (ExperimentRunModel.experiment_id == exp_model.experiment_id)
769
- & (ExperimentRunModel.run_id == exp_model.current_run_id)
770
- )
771
- if run_model.started_at:
772
- started_at = run_model.started_at.timestamp()
773
- if run_model.ended_at:
774
- ended_at = run_model.ended_at.timestamp()
775
- except ExperimentRunModel.DoesNotExist:
776
- pass
777
-
778
- # Compute experiment path from workspace_path and experiment_id
779
- exp_path = self.workspace_path / "xp" / exp_model.experiment_id
780
-
781
- experiments.append(
782
- MockExperiment(
783
- workdir=exp_path,
784
- current_run_id=exp_model.current_run_id,
785
- total_jobs=total_jobs,
786
- finished_jobs=finished_jobs,
787
- failed_jobs=failed_jobs,
788
- updated_at=exp_model.updated_at.isoformat(),
789
- started_at=started_at,
790
- ended_at=ended_at,
791
- )
792
- )
793
-
794
- return experiments
795
-
796
- @_with_db_context
797
- def get_experiment(self, experiment_id: str) -> Optional[MockExperiment]:
798
- """Get a specific experiment by ID
799
-
800
- Args:
801
- experiment_id: Experiment identifier
802
-
803
- Returns:
804
- MockExperiment object or None if not found
805
- """
806
- try:
807
- exp_model = ExperimentModel.get(
808
- ExperimentModel.experiment_id == experiment_id
809
- )
810
- except ExperimentModel.DoesNotExist:
811
- return None
812
-
813
- # Count jobs for current run
814
- total_jobs = 0
815
- finished_jobs = 0
816
- failed_jobs = 0
817
-
818
- if exp_model.current_run_id:
819
- total_jobs = (
820
- JobModel.select()
821
- .where(
822
- (JobModel.experiment_id == exp_model.experiment_id)
823
- & (JobModel.run_id == exp_model.current_run_id)
824
- )
825
- .count()
826
- )
827
- finished_jobs = (
828
- JobModel.select()
829
- .where(
830
- (JobModel.experiment_id == exp_model.experiment_id)
831
- & (JobModel.run_id == exp_model.current_run_id)
832
- & (JobModel.state == "done")
833
- )
834
- .count()
835
- )
836
- failed_jobs = (
837
- JobModel.select()
838
- .where(
839
- (JobModel.experiment_id == exp_model.experiment_id)
840
- & (JobModel.run_id == exp_model.current_run_id)
841
- & (JobModel.state == "error")
842
- )
843
- .count()
844
- )
845
-
846
- # Compute experiment path from workspace_path and experiment_id
847
- exp_path = self.workspace_path / "xp" / exp_model.experiment_id
848
-
849
- return MockExperiment(
850
- workdir=exp_path,
851
- current_run_id=exp_model.current_run_id,
852
- total_jobs=total_jobs,
853
- finished_jobs=finished_jobs,
854
- failed_jobs=failed_jobs,
855
- updated_at=exp_model.updated_at.isoformat(),
856
- )
857
-
858
- @_with_db_context
859
- def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
860
- """Get all runs for an experiment
861
-
862
- Args:
863
- experiment_id: Experiment identifier
864
-
865
- Returns:
866
- List of run dictionaries with keys:
867
- - experiment_id: Experiment ID
868
- - run_id: Run ID
869
- - started_at: When run started
870
- - ended_at: When run completed (None if active)
871
- - status: Run status (active, completed, failed, abandoned)
872
- """
873
- runs = []
874
- for run_model in (
875
- ExperimentRunModel.select()
876
- .where(ExperimentRunModel.experiment_id == experiment_id)
877
- .order_by(ExperimentRunModel.started_at.desc())
878
- ):
879
- runs.append(
880
- {
881
- "experiment_id": run_model.experiment_id,
882
- "run_id": run_model.run_id,
883
- "started_at": run_model.started_at.isoformat(),
884
- "ended_at": (
885
- run_model.ended_at.isoformat() if run_model.ended_at else None
886
- ),
887
- "status": run_model.status,
888
- }
889
- )
890
- return runs
891
-
892
- @_with_db_context
893
- def complete_run(self, experiment_id: str, run_id: str, status: str = "completed"):
894
- """Mark a run as completed
895
-
896
- Args:
897
- experiment_id: Experiment identifier
898
- run_id: Run identifier
899
- status: Final status (completed, failed, abandoned)
900
-
901
- Raises:
902
- RuntimeError: If in read-only mode
903
- """
904
- if self.read_only:
905
- raise RuntimeError("Cannot modify runs in read-only mode")
906
884
 
907
- ExperimentRunModel.update(ended_at=datetime.now(), status=status).where(
908
- (ExperimentRunModel.experiment_id == experiment_id)
909
- & (ExperimentRunModel.run_id == run_id)
910
- ).execute()
885
+ class MockService(BaseService):
886
+ """Mock service object for remote monitoring
911
887
 
912
- logger.info("Marked run %s/%s as %s", experiment_id, run_id, status)
913
-
914
- # Job operations
888
+ This class provides a service-like interface for services loaded from
889
+ the remote server. It mimics the Service class interface sufficiently
890
+ for display in the TUI ServicesList widget.
891
+ """
915
892
 
916
- @_with_db_context
917
- def get_jobs(
893
+ def __init__(
918
894
  self,
895
+ service_id: str,
896
+ description_text: str,
897
+ state_dict_data: dict,
898
+ service_class: Optional[str] = None,
919
899
  experiment_id: Optional[str] = None,
920
900
  run_id: Optional[str] = None,
921
- task_id: Optional[str] = None,
922
- state: Optional[str] = None,
923
- tags: Optional[Dict[str, str]] = None,
924
- since: Optional[datetime] = None,
925
- ) -> List[MockJob]:
926
- """Query jobs with optional filters
927
-
928
- Args:
929
- experiment_id: Filter by experiment (None = all experiments)
930
- run_id: Filter by run (None = current run if experiment_id provided)
931
- task_id: Filter by task class identifier
932
- state: Filter by job state
933
- tags: Filter by tags (all tags must match)
934
- since: If provided, only return jobs updated after this timestamp
935
-
936
- Returns:
937
- List of MockJob objects
938
- """
939
- # Build base query
940
- query = JobModel.select()
941
-
942
- # Apply since filter for incremental updates
943
- if since is not None:
944
- query = query.where(JobModel.updated_at > since)
945
-
946
- # Apply experiment filter
947
- if experiment_id is not None:
948
- # If experiment_id provided but not run_id, use current run
949
- if run_id is None:
950
- current_run = self.get_current_run(experiment_id)
951
- if current_run is None:
952
- return [] # No runs exist for this experiment
953
- run_id = current_run
954
-
955
- query = query.where(
956
- (JobModel.experiment_id == experiment_id) & (JobModel.run_id == run_id)
957
- )
958
-
959
- # Apply task_id filter
960
- if task_id is not None:
961
- query = query.where(JobModel.task_id == task_id)
962
-
963
- # Apply state filter
964
- if state is not None:
965
- query = query.where(JobModel.state == state)
966
-
967
- # Apply tag filters
968
- if tags:
969
- for tag_key, tag_value in tags.items():
970
- # Join with JobTagModel for each tag filter
971
- query = query.join(
972
- JobTagModel,
973
- on=(
974
- (JobTagModel.job_id == JobModel.job_id)
975
- & (JobTagModel.experiment_id == JobModel.experiment_id)
976
- & (JobTagModel.run_id == JobModel.run_id)
977
- & (JobTagModel.tag_key == tag_key)
978
- & (JobTagModel.tag_value == tag_value)
979
- ),
980
- )
981
-
982
- # Execute query and convert to dictionaries
983
- jobs = []
984
- for job_model in query:
985
- # Get tags for this job
986
- job_tags = self._get_job_tags(
987
- job_model.job_id, job_model.experiment_id, job_model.run_id
988
- )
989
-
990
- jobs.append(self._job_model_to_dict(job_model, job_tags))
991
-
992
- return jobs
993
-
994
- @_with_db_context
995
- def get_job(
996
- self, job_id: str, experiment_id: str, run_id: Optional[str] = None
997
- ) -> Optional[MockJob]:
998
- """Get a specific job
999
-
1000
- Args:
1001
- job_id: Job identifier
1002
- experiment_id: Experiment identifier
1003
- run_id: Run identifier (None = current run)
1004
-
1005
- Returns:
1006
- MockJob object or None if not found
1007
- """
1008
- # Use current run if not specified
1009
- if run_id is None:
1010
- run_id = self.get_current_run(experiment_id)
1011
- if run_id is None:
1012
- return None
1013
-
1014
- try:
1015
- job_model = JobModel.get(
1016
- (JobModel.job_id == job_id)
1017
- & (JobModel.experiment_id == experiment_id)
1018
- & (JobModel.run_id == run_id)
1019
- )
1020
- except JobModel.DoesNotExist:
1021
- return None
1022
-
1023
- # Get tags for this job
1024
- job_tags = self._get_job_tags(job_id, experiment_id, run_id)
1025
-
1026
- return self._job_model_to_dict(job_model, job_tags)
1027
-
1028
- @_with_db_context
1029
- def update_job_submitted(self, job: "Job", experiment_id: str, run_id: str):
1030
- """Record that a job has been submitted
1031
-
1032
- Args:
1033
- job: Job instance
1034
- experiment_id: Experiment identifier
1035
- run_id: Run identifier
1036
-
1037
- Raises:
1038
- RuntimeError: If in read-only mode
1039
- """
1040
- if self.read_only:
1041
- raise RuntimeError("Cannot update jobs in read-only mode")
1042
-
1043
- task_id = str(job.type.identifier)
1044
-
1045
- # Create or update job record
1046
- now = datetime.now()
1047
- JobModel.insert(
1048
- job_id=job.identifier,
1049
- experiment_id=experiment_id,
1050
- run_id=run_id,
1051
- task_id=task_id,
1052
- locator=job.identifier,
1053
- state=job.state.name,
1054
- submitted_time=job.submittime,
1055
- updated_at=now,
1056
- ).on_conflict(
1057
- conflict_target=[JobModel.job_id, JobModel.experiment_id, JobModel.run_id],
1058
- update={
1059
- JobModel.state: job.state.name,
1060
- JobModel.submitted_time: job.submittime,
1061
- JobModel.updated_at: now,
1062
- JobModel.failure_reason: None, # Clear old failure reason on resubmit
1063
- },
1064
- ).execute()
1065
-
1066
- # Update tags (run-scoped)
1067
- self.update_job_tags(job.identifier, experiment_id, run_id, job.tags)
1068
-
1069
- # Register partials for all declared subparameters
1070
- subparameters = job.type._subparameters
1071
- for name, sp in subparameters.items():
1072
- partial_id = job.config.__xpm__.get_partial_identifier(sp)
1073
- partial_id_hex = partial_id.all.hex()
1074
-
1075
- # Register the partial directory
1076
- self.register_partial(partial_id_hex, task_id, name)
1077
-
1078
- # Link job to partial
1079
- self.register_job_partial(
1080
- job.identifier, experiment_id, run_id, partial_id_hex
1081
- )
1082
-
1083
- logger.debug(
1084
- "Recorded job submission: %s (experiment=%s, run=%s)",
1085
- job.identifier,
1086
- experiment_id,
1087
- run_id,
1088
- )
1089
-
1090
- # Notify listeners
1091
- job_path = str(
1092
- self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
1093
- )
1094
- self._notify_listeners(
1095
- StateEvent(
1096
- event_type=StateEventType.JOB_UPDATED,
1097
- data={
1098
- "jobId": job.identifier,
1099
- "taskId": str(job.type.identifier),
1100
- "experimentId": experiment_id,
1101
- "runId": run_id,
1102
- "status": job.state.name,
1103
- "path": job_path,
1104
- "updatedAt": now.isoformat(),
1105
- },
1106
- )
1107
- )
1108
-
1109
- @_with_db_context
1110
- def update_job_state(self, job: "Job", experiment_id: str, run_id: str):
1111
- """Update the state of a job
1112
-
1113
- Args:
1114
- job: Job instance
1115
- experiment_id: Experiment identifier
1116
- run_id: Run identifier
1117
-
1118
- Raises:
1119
- RuntimeError: If in read-only mode
1120
- """
1121
- if self.read_only:
1122
- raise RuntimeError("Cannot update jobs in read-only mode")
1123
-
1124
- # Build update dict with updated_at timestamp
1125
- now = datetime.now()
1126
- update_data = {
1127
- JobModel.state: job.state.name,
1128
- JobModel.updated_at: now,
1129
- }
1130
-
1131
- # Add or clear failure reason based on state
1132
- from experimaestro.scheduler.jobs import JobStateError
1133
-
1134
- if isinstance(job.state, JobStateError) and job.state.failure_reason:
1135
- update_data[JobModel.failure_reason] = job.state.failure_reason.name
1136
- else:
1137
- # Clear failure reason when job is not in error state
1138
- update_data[JobModel.failure_reason] = None
1139
-
1140
- # Add timing information
1141
- if job.starttime:
1142
- update_data[JobModel.started_time] = job.starttime
1143
- if job.endtime:
1144
- update_data[JobModel.ended_time] = job.endtime
1145
-
1146
- # Add progress information
1147
- if job._progress:
1148
- update_data[JobModel.progress] = json.dumps(
1149
- [
1150
- {"level": p.level, "progress": p.progress, "desc": p.desc}
1151
- for p in job._progress
1152
- ]
1153
- )
1154
-
1155
- # Update the job record
1156
- JobModel.update(update_data).where(
1157
- (JobModel.job_id == job.identifier)
1158
- & (JobModel.experiment_id == experiment_id)
1159
- & (JobModel.run_id == run_id)
1160
- ).execute()
1161
-
1162
- logger.debug(
1163
- "Updated job state: %s -> %s (experiment=%s, run=%s)",
1164
- job.identifier,
1165
- job.state.name,
1166
- experiment_id,
1167
- run_id,
1168
- )
1169
-
1170
- # Notify listeners
1171
- job_path = str(
1172
- self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
1173
- )
1174
- self._notify_listeners(
1175
- StateEvent(
1176
- event_type=StateEventType.JOB_UPDATED,
1177
- data={
1178
- "jobId": job.identifier,
1179
- "taskId": str(job.type.identifier),
1180
- "experimentId": experiment_id,
1181
- "runId": run_id,
1182
- "status": job.state.name,
1183
- "path": job_path,
1184
- "updatedAt": now.isoformat(),
1185
- },
1186
- )
1187
- )
1188
-
1189
- @_with_db_context
1190
- def update_job_tags(
1191
- self, job_id: str, experiment_id: str, run_id: str, tags_dict: Dict[str, str]
901
+ url: Optional[str] = None,
1192
902
  ):
1193
- """Update tags for a job (run-scoped - fixes GH #128)
1194
-
1195
- Deletes existing tags for this (job_id, experiment_id, run_id) combination
1196
- and inserts new tags. This ensures that the same job in different runs can
1197
- have different tags.
1198
-
1199
- Args:
1200
- job_id: Job identifier
1201
- experiment_id: Experiment identifier
1202
- run_id: Run identifier
1203
- tags_dict: Dictionary of tag key-value pairs
1204
-
1205
- Raises:
1206
- RuntimeError: If in read-only mode
1207
- """
1208
- if self.read_only:
1209
- raise RuntimeError("Cannot update tags in read-only mode")
1210
-
1211
- # Delete existing tags for this job/experiment/run
1212
- JobTagModel.delete().where(
1213
- (JobTagModel.job_id == job_id)
1214
- & (JobTagModel.experiment_id == experiment_id)
1215
- & (JobTagModel.run_id == run_id)
1216
- ).execute()
1217
-
1218
- # Insert new tags
1219
- if tags_dict:
1220
- tag_records = [
1221
- {
1222
- "job_id": job_id,
1223
- "experiment_id": experiment_id,
1224
- "run_id": run_id,
1225
- "tag_key": key,
1226
- "tag_value": value,
1227
- }
1228
- for key, value in tags_dict.items()
1229
- ]
1230
- JobTagModel.insert_many(tag_records).execute()
1231
-
1232
- logger.debug(
1233
- "Updated tags for job %s (experiment=%s, run=%s): %s",
1234
- job_id,
1235
- experiment_id,
1236
- run_id,
1237
- tags_dict,
1238
- )
1239
-
1240
- @_with_db_context
1241
- def delete_job(self, job_id: str, experiment_id: str, run_id: str):
1242
- """Remove a job, its tags, and partial references
1243
-
1244
- Args:
1245
- job_id: Job identifier
1246
- experiment_id: Experiment identifier
1247
- run_id: Run identifier
1248
-
1249
- Raises:
1250
- RuntimeError: If in read-only mode
1251
- """
1252
- if self.read_only:
1253
- raise RuntimeError("Cannot delete jobs in read-only mode")
1254
-
1255
- # Delete tags first (foreign key constraint)
1256
- JobTagModel.delete().where(
1257
- (JobTagModel.job_id == job_id)
1258
- & (JobTagModel.experiment_id == experiment_id)
1259
- & (JobTagModel.run_id == run_id)
1260
- ).execute()
1261
-
1262
- # Delete partial references
1263
- JobPartialModel.delete().where(
1264
- (JobPartialModel.job_id == job_id)
1265
- & (JobPartialModel.experiment_id == experiment_id)
1266
- & (JobPartialModel.run_id == run_id)
1267
- ).execute()
1268
-
1269
- # Delete job
1270
- JobModel.delete().where(
1271
- (JobModel.job_id == job_id)
1272
- & (JobModel.experiment_id == experiment_id)
1273
- & (JobModel.run_id == run_id)
1274
- ).execute()
1275
-
1276
- logger.debug(
1277
- "Deleted job %s (experiment=%s, run=%s)", job_id, experiment_id, run_id
1278
- )
1279
-
1280
- # CLI utility methods for job management
1281
-
1282
- @_with_db_context
1283
- def get_all_jobs(
1284
- self,
1285
- state: Optional[str] = None,
1286
- tags: Optional[Dict[str, str]] = None,
1287
- since: Optional[datetime] = None,
1288
- ) -> List[MockJob]:
1289
- """Query all jobs across all experiments/runs
1290
-
1291
- This method is designed for CLI tools that need to list or manage jobs
1292
- across the entire workspace, regardless of experiment or run.
1293
-
1294
- Args:
1295
- state: Filter by job state (e.g., "done", "error", "running")
1296
- tags: Filter by tags (all tags must match)
1297
- since: If provided, only return jobs updated after this timestamp
1298
-
1299
- Returns:
1300
- List of MockJob objects
1301
- """
1302
- # Build base query
1303
- query = JobModel.select()
1304
-
1305
- # Apply since filter for incremental updates
1306
- if since is not None:
1307
- query = query.where(JobModel.updated_at > since)
1308
-
1309
- # Apply state filter
1310
- if state is not None:
1311
- query = query.where(JobModel.state == state)
1312
-
1313
- # Apply tag filters
1314
- if tags:
1315
- for tag_key, tag_value in tags.items():
1316
- query = query.join(
1317
- JobTagModel,
1318
- on=(
1319
- (JobTagModel.job_id == JobModel.job_id)
1320
- & (JobTagModel.experiment_id == JobModel.experiment_id)
1321
- & (JobTagModel.run_id == JobModel.run_id)
1322
- & (JobTagModel.tag_key == tag_key)
1323
- & (JobTagModel.tag_value == tag_value)
1324
- ),
1325
- )
1326
-
1327
- # Execute query and convert to MockJob objects
1328
- jobs = []
1329
- for job_model in query:
1330
- # Get tags for this job
1331
- job_tags = self._get_job_tags(
1332
- job_model.job_id, job_model.experiment_id, job_model.run_id
1333
- )
1334
- jobs.append(self._job_model_to_dict(job_model, job_tags))
1335
-
1336
- return jobs
1337
-
1338
- def kill_job(self, job: MockJob, perform: bool = False) -> bool:
1339
- """Kill a running job process
1340
-
1341
- This method finds the process associated with a running job and kills it.
1342
- It also updates the job state in the database to ERROR.
1343
-
1344
- Args:
1345
- job: MockJob instance to kill
1346
- perform: If True, actually kill the process. If False, just check
1347
- if the job can be killed (dry run).
1348
-
1349
- Returns:
1350
- True if job was killed (or would be killed in dry run),
1351
- False if job is not running or process not found
1352
- """
1353
- # Check if job is in a running state
1354
- if not job.state.running():
1355
- logger.debug("Job %s is not running (state=%s)", job.identifier, job.state)
1356
- return False
1357
-
1358
- # Get process from job
1359
- process = job.getprocess()
1360
- if process is None:
1361
- logger.warning("Could not get process for job %s", job.identifier)
1362
- return False
1363
-
1364
- if perform:
1365
- try:
1366
- logger.info("Killing job %s (process: %s)", job.identifier, process)
1367
- process.kill()
1368
-
1369
- # Update job state in database
1370
- if not self.read_only:
1371
- self._update_job_state_to_error(job, "killed")
1372
- except Exception as e:
1373
- logger.error("Error killing job %s: %s", job.identifier, e)
1374
- return False
1375
-
1376
- return True
1377
-
1378
- def _update_job_state_to_error(self, job: MockJob, reason: str):
1379
- """Update job state to ERROR in database
1380
-
1381
- Args:
1382
- job: MockJob instance
1383
- reason: Failure reason
1384
- """
1385
- if self.read_only:
1386
- return
1387
-
1388
- now = datetime.now()
1389
- with self.workspace_db.bind_ctx([JobModel]):
1390
- JobModel.update(
1391
- state="error",
1392
- failure_reason=reason,
1393
- ended_time=now.timestamp(),
1394
- updated_at=now,
1395
- ).where(
1396
- (JobModel.job_id == job.identifier)
1397
- & (JobModel.experiment_id == job.experiment_id)
1398
- & (JobModel.run_id == job.run_id)
1399
- ).execute()
1400
-
1401
- logger.debug(
1402
- "Updated job %s state to error (reason=%s)", job.identifier, reason
1403
- )
1404
-
1405
- def clean_job(self, job: MockJob, perform: bool = False) -> bool:
1406
- """Clean a finished job (delete directory and DB entry)
1407
-
1408
- This method removes the job's working directory and its database entry.
1409
- Only finished jobs (DONE or ERROR state) can be cleaned.
1410
-
1411
- Args:
1412
- job: MockJob instance to clean
1413
- perform: If True, actually delete the job. If False, just check
1414
- if the job can be cleaned (dry run).
1415
-
1416
- Returns:
1417
- True if job was cleaned (or would be cleaned in dry run),
1418
- False if job is not finished or cannot be cleaned
1419
- """
1420
- from shutil import rmtree
1421
-
1422
- # Check if job is in a finished state
1423
- if not job.state.finished():
1424
- logger.debug(
1425
- "Job %s is not finished (state=%s), cannot clean",
1426
- job.identifier,
1427
- job.state,
1428
- )
1429
- return False
1430
-
1431
- if perform:
1432
- # Delete job directory
1433
- if job.path.exists():
1434
- logger.info("Cleaning job %s: removing %s", job.identifier, job.path)
1435
- rmtree(job.path)
1436
- else:
1437
- logger.warning("Job directory does not exist: %s", job.path)
1438
-
1439
- # Delete from database
1440
- if not self.read_only:
1441
- self.delete_job(job.identifier, job.experiment_id, job.run_id)
1442
-
1443
- return True
1444
-
1445
- def kill_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
1446
- """Kill multiple jobs
1447
-
1448
- Args:
1449
- jobs: List of MockJob instances to kill
1450
- perform: If True, actually kill the processes. If False, dry run.
1451
-
1452
- Returns:
1453
- Number of jobs that were killed (or would be killed in dry run)
1454
- """
1455
- count = 0
1456
- for job in jobs:
1457
- if self.kill_job(job, perform=perform):
1458
- count += 1
1459
- return count
1460
-
1461
- def clean_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
1462
- """Clean multiple finished jobs
1463
-
1464
- Args:
1465
- jobs: List of MockJob instances to clean
1466
- perform: If True, actually delete the jobs. If False, dry run.
1467
-
1468
- Returns:
1469
- Number of jobs that were cleaned (or would be cleaned in dry run)
1470
- """
1471
- count = 0
1472
- for job in jobs:
1473
- if self.clean_job(job, perform=perform):
1474
- count += 1
1475
- return count
1476
-
1477
- def delete_job_safely(
1478
- self, job: MockJob, cascade_orphans: bool = True
1479
- ) -> tuple[bool, str]:
1480
- """Delete a job with proper locking and orphan cleanup
1481
-
1482
- This method is designed for TUI/UI use. It acquires a lock on the job
1483
- to prevent race conditions, then deletes the job directory and DB entry.
1484
-
1485
- Args:
1486
- job: MockJob instance to delete
1487
- cascade_orphans: If True, clean up orphan partials after deletion
1488
-
1489
- Returns:
1490
- Tuple of (success: bool, message: str)
1491
- """
1492
- import fasteners
1493
- from shutil import rmtree
1494
-
1495
- # Check if job is running
1496
- if job.state.running():
1497
- return False, "Cannot delete a running job"
1498
-
1499
- # Check if path exists
1500
- if not job.path or not job.path.exists():
1501
- # Just delete from database if path doesn't exist
1502
- if not self.read_only:
1503
- self.delete_job(job.identifier, job.experiment_id, job.run_id)
1504
- if cascade_orphans:
1505
- self.cleanup_orphan_partials(perform=True)
1506
- return True, f"Job {job.identifier} deleted (directory already gone)"
1507
-
1508
- # Try to acquire job lock (non-blocking)
1509
- # Lock file is typically {script_name}.lock, but we use .lock for general locking
1510
- lock_path = job.path / ".lock"
1511
- lock = fasteners.InterProcessLock(str(lock_path))
903
+ self.id = service_id
904
+ self._description = description_text
905
+ self._state_name = "MOCK" # MockService always has MOCK state
906
+ self._state_dict_data = state_dict_data
907
+ self._service_class = service_class
908
+ self.experiment_id = experiment_id
909
+ self.run_id = run_id
910
+ self.url = url
1512
911
 
1513
- if not lock.acquire(blocking=False):
1514
- return False, "Job is currently locked (possibly running)"
912
+ @property
913
+ def state(self):
914
+ """Return state as a ServiceState-like object with a name attribute"""
915
+ from experimaestro.scheduler.services import ServiceState
1515
916
 
917
+ # Convert state name to ServiceState enum
1516
918
  try:
1517
- # Delete all files except the lock file
1518
- for item in job.path.iterdir():
1519
- if item.name != ".lock":
1520
- if item.is_dir():
1521
- rmtree(item)
1522
- else:
1523
- item.unlink()
1524
-
1525
- # Mark job as "phantom" in database (don't delete - keep as phantom)
1526
- if not self.read_only:
1527
- from datetime import datetime
1528
-
1529
- JobModel.update(
1530
- state="phantom",
1531
- updated_at=datetime.now(),
1532
- ).where(
1533
- (JobModel.job_id == job.identifier)
1534
- & (JobModel.experiment_id == job.experiment_id)
1535
- & (JobModel.run_id == job.run_id)
1536
- ).execute()
1537
-
1538
- finally:
1539
- lock.release()
1540
- # Now delete the lock file and directory
1541
- try:
1542
- lock_path.unlink(missing_ok=True)
1543
- if job.path.exists() and not any(job.path.iterdir()):
1544
- job.path.rmdir()
1545
- except Exception as e:
1546
- logger.warning("Could not clean up lock file: %s", e)
1547
-
1548
- # Clean up orphan partials if requested
1549
- if cascade_orphans:
1550
- self.cleanup_orphan_partials(perform=True)
1551
-
1552
- return True, f"Job {job.identifier} deleted successfully"
1553
-
1554
- @_with_db_context
1555
- def delete_experiment(
1556
- self, experiment_id: str, delete_jobs: bool = False
1557
- ) -> tuple[bool, str]:
1558
- """Delete an experiment from the database
1559
-
1560
- Args:
1561
- experiment_id: Experiment identifier
1562
- delete_jobs: If True, also delete associated jobs (default: False)
1563
-
1564
- Returns:
1565
- Tuple of (success: bool, message: str)
1566
- """
1567
- from shutil import rmtree
1568
-
1569
- if self.read_only:
1570
- return False, "Cannot delete in read-only mode"
1571
-
1572
- # Get all jobs for this experiment
1573
- jobs = self.get_jobs(experiment_id)
1574
- running_jobs = [j for j in jobs if j.state.running()]
1575
-
1576
- if running_jobs:
1577
- return (
1578
- False,
1579
- f"Cannot delete experiment with {len(running_jobs)} running job(s)",
1580
- )
1581
-
1582
- # Delete jobs if requested
1583
- if delete_jobs:
1584
- for job in jobs:
1585
- success, msg = self.delete_job_safely(job, cascade_orphans=False)
1586
- if not success:
1587
- logger.warning("Failed to delete job %s: %s", job.identifier, msg)
1588
-
1589
- # Delete experiment runs
1590
- ExperimentRunModel.delete().where(
1591
- ExperimentRunModel.experiment_id == experiment_id
1592
- ).execute()
1593
-
1594
- # Delete experiment
1595
- ExperimentModel.delete().where(
1596
- ExperimentModel.experiment_id == experiment_id
1597
- ).execute()
1598
-
1599
- # Optionally delete experiment directory
1600
- exp_path = self.workspace_path / "xp" / experiment_id
1601
- if exp_path.exists():
1602
- try:
1603
- rmtree(exp_path)
1604
- except Exception as e:
1605
- logger.warning("Could not delete experiment directory: %s", e)
1606
-
1607
- # Clean up orphan partials
1608
- self.cleanup_orphan_partials(perform=True)
1609
-
1610
- return True, f"Experiment {experiment_id} deleted successfully"
1611
-
1612
- @_with_db_context
1613
- def get_orphan_jobs(self) -> List[MockJob]:
1614
- """Find jobs that have no associated experiment in the database
1615
-
1616
- Returns:
1617
- List of MockJob instances for orphan jobs
1618
- """
1619
- # Get all jobs
1620
- all_jobs = self.get_all_jobs()
1621
-
1622
- # Get all experiment IDs
1623
- experiments = self.get_experiments()
1624
- experiment_ids = {exp.experiment_id for exp in experiments}
1625
-
1626
- # Find jobs with no matching experiment
1627
- orphan_jobs = [
1628
- job for job in all_jobs if job.experiment_id not in experiment_ids
1629
- ]
1630
-
1631
- return orphan_jobs
1632
-
1633
- # Service operations
1634
-
1635
- @_with_db_context
1636
- def update_service(
1637
- self,
1638
- service_id: str,
1639
- experiment_id: str,
1640
- run_id: str,
1641
- description: str,
1642
- state: str,
1643
- state_dict: Optional[str] = None,
1644
- ):
1645
- """Update service information
1646
-
1647
- Args:
1648
- service_id: Service identifier
1649
- experiment_id: Experiment identifier
1650
- run_id: Run identifier
1651
- description: Human-readable description
1652
- state: Service state
1653
- state_dict: JSON serialized state_dict for service recreation
1654
-
1655
- Raises:
1656
- RuntimeError: If in read-only mode
1657
- """
1658
- if self.read_only:
1659
- raise RuntimeError("Cannot update services in read-only mode")
1660
-
1661
- insert_data = {
1662
- "service_id": service_id,
1663
- "experiment_id": experiment_id,
1664
- "run_id": run_id,
1665
- "description": description,
1666
- "state": state,
1667
- "created_at": datetime.now(),
1668
- "updated_at": datetime.now(),
919
+ return ServiceState[self._state_name]
920
+ except KeyError:
921
+ # Return a mock object with name attribute for unknown states
922
+ class MockState:
923
+ def __init__(self, name):
924
+ self.name = name
925
+
926
+ return MockState(self._state_name)
927
+
928
+ def description(self) -> str:
929
+ """Return service description"""
930
+ return self._description
931
+
932
+ def state_dict(self) -> dict:
933
+ """Return service state for recreation"""
934
+ return self._state_dict_data
935
+
936
+ def full_state_dict(self) -> dict:
937
+ """Get full state as dictionary for JSON serialization.
938
+
939
+ Overrides BaseService.full_state_dict() to preserve the original
940
+ service class name instead of using MockService's class name.
941
+ """
942
+ return {
943
+ "service_id": self.id,
944
+ "description": self._description,
945
+ "class": self._service_class,
946
+ "state_dict": self._state_dict_data,
1669
947
  }
1670
- update_data = {
1671
- ServiceModel.description: description,
1672
- ServiceModel.state: state,
1673
- ServiceModel.updated_at: datetime.now(),
1674
- }
1675
-
1676
- if state_dict is not None:
1677
- insert_data["state_dict"] = state_dict
1678
- update_data[ServiceModel.state_dict] = state_dict
1679
-
1680
- ServiceModel.insert(**insert_data).on_conflict(
1681
- conflict_target=[
1682
- ServiceModel.service_id,
1683
- ServiceModel.experiment_id,
1684
- ServiceModel.run_id,
1685
- ],
1686
- update=update_data,
1687
- ).execute()
1688
-
1689
- logger.debug(
1690
- "Updated service %s (experiment=%s, run=%s)",
1691
- service_id,
1692
- experiment_id,
1693
- run_id,
1694
- )
1695
-
1696
- # Notify listeners
1697
- self._notify_listeners(
1698
- StateEvent(
1699
- event_type=StateEventType.SERVICE_UPDATED,
1700
- data={
1701
- "serviceId": service_id,
1702
- "experimentId": experiment_id,
1703
- "runId": run_id,
1704
- "state": state,
1705
- "description": description,
1706
- },
1707
- )
1708
- )
1709
-
1710
- @_with_db_context
1711
- def get_services(
1712
- self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
1713
- ) -> List["Service"]:
1714
- """Get services, optionally filtered by experiment/run
1715
-
1716
- This method abstracts whether services are live (from scheduler) or
1717
- from the database. It returns actual Service objects in both cases:
1718
- - If a live scheduler has the experiment, return live Service objects
1719
- - Otherwise, recreate Service objects from stored state_dict
1720
-
1721
- Args:
1722
- experiment_id: Filter by experiment (None = all)
1723
- run_id: Filter by run (None = current run if experiment_id provided)
1724
-
1725
- Returns:
1726
- List of Service objects
1727
- """
1728
- from experimaestro.scheduler.services import Service
1729
-
1730
- # First, check for live services from the scheduler
1731
- if experiment_id is not None:
1732
- try:
1733
- from experimaestro.scheduler.base import Scheduler
1734
-
1735
- if Scheduler.has_instance():
1736
- scheduler = Scheduler.instance()
1737
- # Check if experiment is registered with scheduler
1738
- if experiment_id in scheduler.experiments:
1739
- exp = scheduler.experiments[experiment_id]
1740
- services = list(exp.services.values())
1741
- logger.debug(
1742
- "Returning %d live services for experiment %s",
1743
- len(services),
1744
- experiment_id,
1745
- )
1746
- return services
1747
- except Exception as e:
1748
- # Scheduler not available or error - fall back to database
1749
- logger.debug("Could not get live services: %s", e)
1750
-
1751
- # Fall back to database
1752
- query = ServiceModel.select()
1753
-
1754
- if experiment_id is not None:
1755
- # Use current run if not specified
1756
- if run_id is None:
1757
- run_id = self.get_current_run(experiment_id)
1758
- if run_id is None:
1759
- return []
1760
-
1761
- query = query.where(
1762
- (ServiceModel.experiment_id == experiment_id)
1763
- & (ServiceModel.run_id == run_id)
1764
- )
1765
-
1766
- services = []
1767
- for service_model in query:
1768
- # Try to recreate service from state_dict
1769
- state_dict_json = service_model.state_dict
1770
- if state_dict_json and state_dict_json != "{}":
1771
- try:
1772
- state_dict = json.loads(state_dict_json)
1773
- if "__class__" in state_dict:
1774
- service = Service.from_state_dict(state_dict)
1775
- # Set the id from the database record
1776
- service.id = service_model.service_id
1777
- services.append(service)
1778
- continue
1779
- except Exception as e:
1780
- logger.warning(
1781
- "Failed to recreate service %s from state_dict: %s",
1782
- service_model.service_id,
1783
- e,
1784
- )
1785
- # If we can't recreate, skip this service (it's not usable)
1786
- logger.debug(
1787
- "Service %s has no state_dict for recreation, skipping",
1788
- service_model.service_id,
1789
- )
1790
-
1791
- return services
1792
-
1793
- def get_live_job_states(self, experiment_id: str) -> Dict[str, str]:
1794
- """Get live job states from the scheduler if available
1795
-
1796
- This is useful for debugging to compare live state vs database state.
1797
-
1798
- Args:
1799
- experiment_id: The experiment ID to get live jobs for
1800
-
1801
- Returns:
1802
- Dict mapping job identifier to live state name, empty if scheduler
1803
- not available or experiment not registered
1804
- """
1805
- try:
1806
- from experimaestro.scheduler.base import Scheduler
1807
-
1808
- if not Scheduler.has_instance():
1809
- logger.debug("No scheduler instance available for live states")
1810
- return {}
1811
-
1812
- scheduler = Scheduler.instance()
1813
- live_states = {}
1814
-
1815
- logger.debug(
1816
- "get_live_job_states: looking for exp=%s, scheduler has %d jobs",
1817
- experiment_id,
1818
- len(scheduler.jobs),
1819
- )
1820
-
1821
- for job_id, job in scheduler.jobs.items():
1822
- # Filter by experiment if needed
1823
- if hasattr(job, "experiment") and job.experiment is not None:
1824
- if hasattr(job.experiment, "workdir"):
1825
- job_exp_id = job.experiment.workdir.name
1826
- if job_exp_id == experiment_id:
1827
- live_states[job_id] = job.state.name
1828
- else:
1829
- logger.debug(
1830
- "Job %s exp_id=%s != requested %s",
1831
- job_id[:8],
1832
- job_exp_id,
1833
- experiment_id,
1834
- )
1835
- else:
1836
- # Job not associated with experiment, include it anyway
1837
- live_states[job_id] = job.state.name
1838
- logger.debug(
1839
- "Job %s has no experiment, including anyway", job_id[:8]
1840
- )
1841
-
1842
- logger.debug("Returning %d live job states", len(live_states))
1843
- return live_states
1844
-
1845
- except Exception as e:
1846
- logger.debug("Could not get live job states: %s", e)
1847
- return {}
1848
-
1849
- # Sync metadata methods
1850
-
1851
- @_with_db_context
1852
- def get_last_sync_time(self) -> Optional[datetime]:
1853
- """Get the timestamp of the last successful sync
1854
-
1855
- Returns:
1856
- datetime of last sync, or None if never synced
1857
- """
1858
- from .state_db import WorkspaceSyncMetadata
1859
-
1860
- metadata = WorkspaceSyncMetadata.get_or_none(
1861
- WorkspaceSyncMetadata.id == "workspace"
1862
- )
1863
- if metadata and metadata.last_sync_time:
1864
- return metadata.last_sync_time
1865
- return None
1866
-
1867
- @_with_db_context
1868
- def update_last_sync_time(self) -> None:
1869
- """Update the last sync timestamp to now
1870
-
1871
- Raises:
1872
- RuntimeError: If in read-only mode
1873
- """
1874
- if self.read_only:
1875
- raise RuntimeError("Cannot update sync time in read-only mode")
1876
-
1877
- from .state_db import WorkspaceSyncMetadata
1878
-
1879
- WorkspaceSyncMetadata.insert(
1880
- id="workspace", last_sync_time=datetime.now()
1881
- ).on_conflict(
1882
- conflict_target=[WorkspaceSyncMetadata.id],
1883
- update={WorkspaceSyncMetadata.last_sync_time: datetime.now()},
1884
- ).execute()
1885
- logger.debug("Updated last sync time")
1886
-
1887
- # Partial management methods
1888
-
1889
- @_with_db_context
1890
- def register_partial(
1891
- self, partial_id: str, task_id: str, subparameters_name: str
1892
- ) -> None:
1893
- """Register a partial directory (creates if not exists)
1894
-
1895
- Args:
1896
- partial_id: Hex hash of the partial identifier
1897
- task_id: Task class identifier
1898
- subparameters_name: Name of the subparameters definition
1899
-
1900
- Raises:
1901
- RuntimeError: If in read-only mode
1902
- """
1903
- if self.read_only:
1904
- raise RuntimeError("Cannot register partials in read-only mode")
1905
-
1906
- PartialModel.insert(
1907
- partial_id=partial_id,
1908
- task_id=task_id,
1909
- subparameters_name=subparameters_name,
1910
- created_at=datetime.now(),
1911
- ).on_conflict_ignore().execute()
1912
-
1913
- logger.debug(
1914
- "Registered partial: %s (task=%s, subparams=%s)",
1915
- partial_id,
1916
- task_id,
1917
- subparameters_name,
1918
- )
1919
-
1920
- @_with_db_context
1921
- def register_job_partial(
1922
- self, job_id: str, experiment_id: str, run_id: str, partial_id: str
1923
- ) -> None:
1924
- """Link a job to a partial directory it uses
1925
-
1926
- Args:
1927
- job_id: Job identifier
1928
- experiment_id: Experiment identifier
1929
- run_id: Run identifier
1930
- partial_id: Partial directory identifier
1931
-
1932
- Raises:
1933
- RuntimeError: If in read-only mode
1934
- """
1935
- if self.read_only:
1936
- raise RuntimeError("Cannot register job partials in read-only mode")
1937
-
1938
- JobPartialModel.insert(
1939
- job_id=job_id,
1940
- experiment_id=experiment_id,
1941
- run_id=run_id,
1942
- partial_id=partial_id,
1943
- ).on_conflict_ignore().execute()
1944
-
1945
- logger.debug(
1946
- "Linked job %s to partial %s (experiment=%s, run=%s)",
1947
- job_id,
1948
- partial_id,
1949
- experiment_id,
1950
- run_id,
1951
- )
1952
-
1953
- @_with_db_context
1954
- def unregister_job_partials(
1955
- self, job_id: str, experiment_id: str, run_id: str
1956
- ) -> None:
1957
- """Remove all partial links for a job
1958
-
1959
- Called when a job is deleted to clean up its partial references.
1960
-
1961
- Args:
1962
- job_id: Job identifier
1963
- experiment_id: Experiment identifier
1964
- run_id: Run identifier
1965
948
 
1966
- Raises:
1967
- RuntimeError: If in read-only mode
1968
- """
1969
- if self.read_only:
1970
- raise RuntimeError("Cannot unregister job partials in read-only mode")
1971
-
1972
- JobPartialModel.delete().where(
1973
- (JobPartialModel.job_id == job_id)
1974
- & (JobPartialModel.experiment_id == experiment_id)
1975
- & (JobPartialModel.run_id == run_id)
1976
- ).execute()
1977
-
1978
- logger.debug(
1979
- "Unregistered partials for job %s (experiment=%s, run=%s)",
1980
- job_id,
1981
- experiment_id,
1982
- run_id,
1983
- )
1984
-
1985
- @_with_db_context
1986
- def get_orphan_partials(self) -> List[Dict]:
1987
- """Find partial directories that are not referenced by any job
1988
-
1989
- Returns:
1990
- List of dictionaries with partial_id, task_id, subparameters_name
1991
- """
1992
- # Find partials that have no job references
1993
- # Using a subquery to find referenced partial_ids
1994
- referenced_partials = JobPartialModel.select(JobPartialModel.partial_id)
1995
-
1996
- orphan_query = PartialModel.select().where(
1997
- PartialModel.partial_id.not_in(referenced_partials)
1998
- )
1999
-
2000
- orphans = []
2001
- for partial in orphan_query:
2002
- orphans.append(
2003
- {
2004
- "partial_id": partial.partial_id,
2005
- "task_id": partial.task_id,
2006
- "subparameters_name": partial.subparameters_name,
2007
- "created_at": partial.created_at.isoformat(),
2008
- }
2009
- )
2010
-
2011
- return orphans
2012
-
2013
- def cleanup_orphan_partials(self, perform: bool = False) -> List[Path]:
2014
- """Clean up orphan partial directories
2015
-
2016
- Finds partial directories not referenced by any job and removes them.
2017
-
2018
- Args:
2019
- perform: If True, actually delete. If False, dry run (list only).
2020
-
2021
- Returns:
2022
- List of paths that were deleted (or would be deleted in dry run)
2023
- """
2024
- from shutil import rmtree
2025
-
2026
- orphans = self.get_orphan_partials()
2027
- deleted_paths = []
2028
-
2029
- for orphan in orphans:
2030
- # Reconstruct path: WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID
2031
- partial_path = (
2032
- self.workspace_path
2033
- / "partials"
2034
- / orphan["task_id"]
2035
- / orphan["subparameters_name"]
2036
- / orphan["partial_id"]
2037
- )
2038
-
2039
- if perform:
2040
- # Delete directory if it exists
2041
- if partial_path.exists():
2042
- logger.info("Cleaning orphan partial: %s", partial_path)
2043
- rmtree(partial_path)
2044
-
2045
- # Delete from database
2046
- if not self.read_only:
2047
- with self.workspace_db.bind_ctx([PartialModel]):
2048
- PartialModel.delete().where(
2049
- PartialModel.partial_id == orphan["partial_id"]
2050
- ).execute()
2051
-
2052
- deleted_paths.append(partial_path)
2053
-
2054
- return deleted_paths
2055
-
2056
- # Utility methods
2057
-
2058
- def close(self):
2059
- """Close the database connection and remove from registry
2060
-
2061
- This should be called when done with the workspace to free resources.
2062
- """
2063
- # Stop file watcher if running
2064
- self._stop_file_watcher()
2065
-
2066
- # Close database connection
2067
- if hasattr(self, "workspace_db") and self.workspace_db is not None:
2068
- from .state_db import close_workspace_database
2069
-
2070
- close_workspace_database(self.workspace_db)
2071
- self.workspace_db = None
2072
-
2073
- # Remove from registry
2074
- with WorkspaceStateProvider._lock:
2075
- if self.workspace_path in WorkspaceStateProvider._instances:
2076
- del WorkspaceStateProvider._instances[self.workspace_path]
2077
-
2078
- logger.debug("WorkspaceStateProvider closed for %s", self.workspace_path)
2079
-
2080
- # Listener methods for push notifications
2081
-
2082
- def add_listener(self, listener: StateListener) -> None:
2083
- """Register a listener for state change notifications
2084
-
2085
- Listeners are called synchronously when state changes occur.
2086
- For UI applications, listeners should queue updates for their
2087
- own event loop to avoid blocking database operations.
2088
-
2089
- When the first listener is added, starts watching the database
2090
- file for changes to enable push notifications.
2091
-
2092
- Args:
2093
- listener: Callback function that receives StateEvent objects
2094
- """
2095
- with self._listeners_lock:
2096
- was_empty = len(self._listeners) == 0
2097
- self._listeners.add(listener)
2098
-
2099
- # Start file watcher when first listener is added
2100
- if was_empty:
2101
- self._start_file_watcher()
2102
-
2103
- logger.info(
2104
- "Added state listener: %s (total: %d)", listener, len(self._listeners)
2105
- )
2106
-
2107
- def remove_listener(self, listener: StateListener) -> None:
2108
- """Unregister a state change listener
2109
-
2110
- When the last listener is removed, stops watching the database file.
2111
-
2112
- Args:
2113
- listener: Previously registered callback function
2114
- """
2115
- with self._listeners_lock:
2116
- self._listeners.discard(listener)
2117
- is_empty = len(self._listeners) == 0
2118
-
2119
- # Stop file watcher when last listener is removed
2120
- if is_empty:
2121
- self._stop_file_watcher()
2122
-
2123
- logger.debug("Removed state listener: %s", listener)
2124
-
2125
- def _start_file_watcher(self) -> None:
2126
- """Start watching the database file for changes"""
2127
- if self._db_file_watch is not None:
2128
- logger.info("File watcher already running for %s", self._db_dir)
2129
- return # Already watching
2130
-
2131
- from experimaestro.ipc import ipcom
2132
-
2133
- # Create and start the change detector thread
2134
- self._change_detector = _DatabaseChangeDetector(self)
2135
- self._change_detector.start()
2136
-
2137
- # Create the file handler that signals the detector
2138
- self._db_file_handler = _DatabaseFileHandler(self._change_detector)
2139
- self._db_file_watch = ipcom().fswatch(
2140
- self._db_file_handler,
2141
- self._db_dir,
2142
- recursive=False,
2143
- )
2144
- logger.info("Started database file watcher for %s", self._db_dir)
2145
-
2146
- def _stop_file_watcher(self) -> None:
2147
- """Stop watching the database file"""
2148
- if self._db_file_watch is None:
2149
- return # Not watching
2150
-
2151
- from experimaestro.ipc import ipcom
2152
-
2153
- # Stop the file watcher first
2154
- ipcom().fsunwatch(self._db_file_watch)
2155
- self._db_file_watch = None
2156
- self._db_file_handler = None
2157
-
2158
- # Stop the change detector thread
2159
- if self._change_detector is not None:
2160
- self._change_detector.stop()
2161
- self._change_detector = None
2162
-
2163
- logger.debug("Stopped database file watcher for %s", self.workspace_path)
2164
-
2165
- def _notify_listeners(self, event: StateEvent) -> None:
2166
- """Notify all registered listeners of a state change
2167
-
2168
- This is called internally by state-modifying methods.
2169
- Listeners are called synchronously - they should be fast.
2170
-
2171
- Args:
2172
- event: State change event to broadcast
2173
- """
2174
- with self._listeners_lock:
2175
- listeners = list(self._listeners)
2176
-
2177
- for listener in listeners:
2178
- try:
2179
- listener(event)
2180
- except Exception as e:
2181
- logger.warning("Listener %s raised exception: %s", listener, e)
2182
-
2183
- # Helper methods
2184
-
2185
- @_with_db_context
2186
- def _get_job_tags(
2187
- self, job_id: str, experiment_id: str, run_id: str
2188
- ) -> Dict[str, str]:
2189
- """Get tags for a job
2190
-
2191
- Args:
2192
- job_id: Job identifier
2193
- experiment_id: Experiment identifier
2194
- run_id: Run identifier
949
+ @property
950
+ def service_class(self) -> Optional[str]:
951
+ """Return service class name"""
952
+ return self._service_class
2195
953
 
2196
- Returns:
2197
- Dictionary of tag key-value pairs
2198
- """
2199
- tags = {}
2200
- for tag_model in JobTagModel.select().where(
2201
- (JobTagModel.job_id == job_id)
2202
- & (JobTagModel.experiment_id == experiment_id)
2203
- & (JobTagModel.run_id == run_id)
2204
- ):
2205
- tags[tag_model.tag_key] = tag_model.tag_value
2206
- return tags
2207
-
2208
- def _job_model_to_dict(self, job_model: JobModel, tags: Dict[str, str]) -> MockJob:
2209
- """Convert a JobModel to a MockJob object
954
+ @classmethod
955
+ def from_full_state_dict(cls, d: Dict) -> "MockService":
956
+ """Create MockService from full state dictionary
2210
957
 
2211
958
  Args:
2212
- job_model: JobModel instance
2213
- tags: Dictionary of tags for this job
959
+ d: Dictionary from full_state_dict()
2214
960
 
2215
961
  Returns:
2216
- MockJob object
2217
- """
2218
- # Parse progress JSON
2219
- progress_list = json.loads(job_model.progress)
2220
-
2221
- # Compute job path from workspace_path, task_id, and job_id
2222
- job_path = self.workspace_path / "jobs" / job_model.task_id / job_model.job_id
2223
-
2224
- # Convert failure_reason string to enum if present
2225
- failure_reason = None
2226
- if job_model.failure_reason:
2227
- try:
2228
- failure_reason = JobFailureStatus[job_model.failure_reason]
2229
- except KeyError:
2230
- pass # Unknown failure reason, leave as None
2231
-
2232
- return MockJob(
2233
- identifier=job_model.job_id,
2234
- task_id=job_model.task_id,
2235
- locator=job_model.locator,
2236
- path=job_path,
2237
- state=job_model.state,
2238
- submittime=job_model.submitted_time,
2239
- starttime=job_model.started_time,
2240
- endtime=job_model.ended_time,
2241
- progress=progress_list,
2242
- tags=tags,
2243
- experiment_id=job_model.experiment_id,
2244
- run_id=job_model.run_id,
2245
- updated_at=job_model.updated_at.isoformat(),
2246
- failure_reason=failure_reason,
962
+ MockService instance (state is always MOCK, not from dict)
963
+ """
964
+ return cls(
965
+ service_id=d["service_id"],
966
+ description_text=d.get("description", ""),
967
+ state_dict_data=d.get("state_dict", {}),
968
+ service_class=d.get("class"),
969
+ experiment_id=d.get("experiment_id"),
970
+ run_id=d.get("run_id"),
971
+ url=d.get("url"),
2247
972
  )
2248
973
 
2249
- def _format_time(self, timestamp: Optional[float]) -> str:
2250
- """Format timestamp for UI
974
+ def to_service(self) -> "BaseService":
975
+ """Try to recreate a live Service instance from this mock.
2251
976
 
2252
- Args:
2253
- timestamp: Unix timestamp or None
977
+ Attempts to recreate the service using the stored configuration.
978
+ If recreation fails, returns self.
2254
979
 
2255
980
  Returns:
2256
- ISO format datetime string or empty string
2257
- """
2258
- if not timestamp:
2259
- return ""
2260
- return datetime.fromtimestamp(timestamp).isoformat()
2261
-
2262
-
2263
- # Scheduler listener adapter
2264
- class SchedulerListener:
2265
- """Adapter to connect scheduler events to WorkspaceStateProvider
2266
-
2267
- This class implements the scheduler listener interface and forwards
2268
- events to the WorkspaceStateProvider. It tracks which experiment/run
2269
- each job belongs to for proper database updates.
2270
- """
2271
-
2272
- def __init__(self, state_provider: WorkspaceStateProvider):
2273
- """Initialize listener
2274
-
2275
- Args:
2276
- state_provider: WorkspaceStateProvider instance to update
2277
- """
2278
- self.state_provider = state_provider
2279
- # Map job_id -> (experiment_id, run_id) for tracking
2280
- self.job_experiments: Dict[str, tuple] = {}
2281
-
2282
- logger.info("SchedulerListener initialized")
2283
-
2284
- @_with_db_context
2285
- def job_submitted(self, job: "Job", experiment_id: str, run_id: str):
2286
- """Called when a job is submitted
2287
-
2288
- Args:
2289
- job: The submitted job
2290
- experiment_id: Experiment this job belongs to
2291
- run_id: Run this job belongs to
2292
- """
2293
- # Track job's experiment/run
2294
- self.job_experiments[job.identifier] = (experiment_id, run_id)
2295
-
2296
- # Update state provider
2297
- try:
2298
- self.state_provider.update_job_submitted(job, experiment_id, run_id)
2299
- except Exception as e:
2300
- logger.exception(
2301
- "Error updating job submission for %s: %s", job.identifier, e
2302
- )
2303
-
2304
- @_with_db_context
2305
- def job_state(self, job: "Job"):
2306
- """Called when a job's state changes
2307
-
2308
- Args:
2309
- job: The job with updated state
2310
- """
2311
- # Look up job's experiment/run
2312
- if job.identifier not in self.job_experiments:
2313
- logger.warning(
2314
- "State change for unknown job %s (not tracked by listener)",
2315
- job.identifier,
2316
- )
2317
- return
2318
-
2319
- experiment_id, run_id = self.job_experiments[job.identifier]
2320
-
2321
- # Update state provider
2322
- try:
2323
- self.state_provider.update_job_state(job, experiment_id, run_id)
2324
- except Exception as e:
2325
- logger.exception("Error updating job state for %s: %s", job.identifier, e)
2326
-
2327
- @_with_db_context
2328
- def service_add(self, service: "Service", experiment_id: str, run_id: str):
2329
- """Called when a service is added
2330
-
2331
- Args:
2332
- service: The added service
2333
- experiment_id: Experiment identifier
2334
- run_id: Run identifier
2335
- """
2336
- try:
2337
- self.state_provider.update_service(
2338
- service.id,
2339
- experiment_id,
2340
- run_id,
2341
- service.description(),
2342
- service.state.name,
2343
- )
2344
- except Exception as e:
2345
- logger.exception("Error updating service %s: %s", service.id, e)
981
+ A live Service instance or self if recreation is not possible
982
+ """
983
+ # Just return self - service recreation from config not implemented
984
+ return self
985
+
986
+
987
+ __all__ = [
988
+ # Data classes
989
+ "ProcessInfo",
990
+ # Listener type alias
991
+ "StateListener",
992
+ # ABC
993
+ "StateProvider",
994
+ "OfflineStateProvider",
995
+ # Mock classes
996
+ "MockJob",
997
+ "MockExperiment",
998
+ "MockService",
999
+ ]