experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -1,164 +1,170 @@
1
- """Unified workspace state provider for accessing experiment and job information
1
+ """State provider interfaces for accessing experiment and job information
2
2
 
3
- This module provides a single WorkspaceStateProvider class that accesses state
4
- from the workspace-level database (.experimaestro/workspace.db). This replaces
5
- the previous multi-provider architecture with a unified approach.
3
+ This module provides the abstract StateProvider interface and related data classes.
4
+ The concrete implementations are in db_state_provider.py (DbStateProvider) and
5
+ remote/client.py (SSHStateProviderClient).
6
6
 
7
7
  Key features:
8
- - Single .experimaestro/workspace.db database shared across all experiments
9
- - Support for multiple runs per experiment
10
- - Run-scoped tags (fixes GH #128)
11
- - Thread-safe database access via thread-local connections
12
- - Real-time updates via scheduler listener interface
13
- - Push notifications via listener callbacks (for reactive UI)
8
+ - StateProvider ABC: Abstract base class for all state providers
9
+ - Mock classes: Concrete implementations for database-loaded state objects
10
+ - StateListener: Type alias for listener callbacks
11
+
12
+ Note: Event classes are defined in state_status.py (EventBase and subclasses).
14
13
  """
15
14
 
16
15
  import json
17
16
  import logging
18
- import socket
19
17
  import threading
20
- import time
21
18
  from dataclasses import dataclass
22
19
  from datetime import datetime
23
- from enum import Enum, auto
24
20
  from pathlib import Path
25
21
  from abc import ABC, abstractmethod
26
- from typing import Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
27
-
28
- from watchdog.events import FileSystemEventHandler
29
- from watchdog.observers.api import ObservedWatch
30
-
31
- from experimaestro.scheduler.state_db import (
32
- ExperimentModel,
33
- ExperimentRunModel,
34
- JobModel,
35
- JobTagModel,
36
- ServiceModel,
37
- PartialModel,
38
- JobPartialModel,
39
- WorkspaceSyncMetadata,
40
- ALL_MODELS,
41
- CURRENT_DB_VERSION,
42
- )
22
+ from typing import Callable, Dict, List, Optional, Set, Tuple
23
+
43
24
  from experimaestro.scheduler.interfaces import (
44
25
  BaseJob,
45
26
  BaseExperiment,
46
27
  BaseService,
28
+ ExperimentJobInformation,
29
+ ExperimentStatus,
47
30
  JobState,
48
31
  JobFailureStatus,
49
32
  STATE_NAME_TO_JOBSTATE,
33
+ deserialize_timestamp,
50
34
  )
51
-
52
- if TYPE_CHECKING:
53
- from experimaestro.scheduler.jobs import Job
54
- from experimaestro.scheduler.services import Service
35
+ from experimaestro.scheduler.transient import TransientMode
36
+ from experimaestro.notifications import (
37
+ ProgressInformation,
38
+ get_progress_information_from_dict,
39
+ )
40
+ from experimaestro.scheduler.state_status import EventBase
55
41
 
56
42
  logger = logging.getLogger("xpm.state")
57
43
 
58
44
 
59
- # Event types for state provider notifications
60
- class StateEventType(Enum):
61
- """Types of state change events"""
62
-
63
- EXPERIMENT_UPDATED = auto()
64
- RUN_UPDATED = auto()
65
- JOB_UPDATED = auto()
66
- SERVICE_UPDATED = auto()
45
+ # =============================================================================
46
+ # Process Information
47
+ # =============================================================================
67
48
 
68
49
 
69
50
  @dataclass
70
- class StateEvent:
71
- """Base class for state change events
51
+ class ProcessInfo:
52
+ """Information about a running or completed process"""
72
53
 
73
- Attributes:
74
- event_type: Type of the event
75
- data: Event-specific data dictionary
76
- """
54
+ pid: int
55
+ """Process ID"""
56
+
57
+ type: str
58
+ """Process type (e.g., 'local', 'slurm', 'oar')"""
77
59
 
78
- event_type: StateEventType
79
- data: Dict
60
+ running: bool = False
61
+ """Whether the process is currently running"""
80
62
 
63
+ cpu_percent: Optional[float] = None
64
+ """CPU usage percentage (if available)"""
81
65
 
82
- # Type alias for listener callbacks
83
- StateListener = Callable[[StateEvent], None]
66
+ memory_mb: Optional[float] = None
67
+ """Memory usage in MB (if available)"""
68
+
69
+ num_threads: Optional[int] = None
70
+ """Number of threads (if available)"""
71
+
72
+
73
+ # Type alias for listener callbacks (uses EventBase from state_status)
74
+ StateListener = Callable[[EventBase], None]
75
+
76
+
77
+ # =============================================================================
78
+ # State Provider ABC
79
+ # =============================================================================
84
80
 
85
81
 
86
82
  class StateProvider(ABC):
87
83
  """Abstract base class for state providers
88
84
 
89
85
  Defines the interface that all state providers must implement.
90
- This enables both local (WorkspaceStateProvider) and remote
91
- (SSHStateProviderClient) providers to be used interchangeably.
86
+ This enables both local (DbStateProvider), remote (SSHStateProviderClient),
87
+ and live (Scheduler) providers to be used interchangeably.
88
+
89
+ Concrete implementations:
90
+ - Scheduler: Live in-memory state from running experiments
91
+ - OfflineStateProvider: Base for cached/persistent state (in db_state_provider.py)
92
+ - DbStateProvider: SQLite database-backed state
93
+ - SSHStateProviderClient: Remote SSH-based state
92
94
 
93
- Provides common service caching logic to preserve service instances
94
- (and their URLs) across calls to get_services(). Subclasses should call
95
- _init_service_cache() in their __init__ and implement _fetch_services_from_storage().
95
+ State listener management is provided by the base class with default implementations.
96
96
  """
97
97
 
98
- def _init_service_cache(self) -> None:
99
- """Initialize service cache - call from subclass __init__"""
100
- self._service_cache: Dict[Tuple[str, str], Dict[str, "BaseService"]] = {}
101
- self._service_cache_lock = threading.Lock()
98
+ #: Whether this provider is connected to a live scheduler
99
+ is_live: bool = False
102
100
 
103
- def _clear_service_cache(self) -> None:
104
- """Clear the service cache"""
105
- with self._service_cache_lock:
106
- self._service_cache.clear()
101
+ def __init__(self) -> None:
102
+ """Initialize state listener management"""
103
+ self._state_listeners: Set[StateListener] = set()
104
+ self._state_listener_lock = threading.Lock()
107
105
 
108
- def get_services(
109
- self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
110
- ) -> List[BaseService]:
111
- """Get services for an experiment
106
+ def add_listener(self, listener: StateListener) -> None:
107
+ """Register a listener for state change events
112
108
 
113
- Uses caching to preserve service instances (and their URLs) across calls.
114
- Subclasses can override _get_live_services() for live service support
115
- and must implement _fetch_services_from_storage() for persistent storage.
109
+ Args:
110
+ listener: Callback function that receives StateEvent objects
116
111
  """
117
- # Resolve run_id if needed
118
- if experiment_id is not None and run_id is None:
119
- run_id = self.get_current_run(experiment_id)
120
- if run_id is None:
121
- return []
112
+ with self._state_listener_lock:
113
+ self._state_listeners.add(listener)
122
114
 
123
- cache_key = (experiment_id or "", run_id or "")
115
+ def remove_listener(self, listener: StateListener) -> None:
116
+ """Unregister a listener
124
117
 
125
- with self._service_cache_lock:
126
- # Try to get live services (scheduler, etc.) - may return None
127
- live_services = self._get_live_services(experiment_id, run_id)
128
- if live_services is not None:
129
- # Cache and return live services
130
- self._service_cache[cache_key] = {s.id: s for s in live_services}
131
- return live_services
118
+ Args:
119
+ listener: Previously registered callback function
120
+ """
121
+ with self._state_listener_lock:
122
+ self._state_listeners.discard(listener)
132
123
 
133
- # Check cache
134
- cached = self._service_cache.get(cache_key)
135
- if cached is not None:
136
- return list(cached.values())
124
+ def _notify_state_listeners(self, event: EventBase) -> None:
125
+ """Notify all state listeners of an event
137
126
 
138
- # Fetch from persistent storage (DB or remote)
139
- services = self._fetch_services_from_storage(experiment_id, run_id)
140
- self._service_cache[cache_key] = {s.id: s for s in services}
141
- return services
127
+ Args:
128
+ event: State change event to broadcast
129
+ """
130
+ with self._state_listener_lock:
131
+ listeners = list(self._state_listeners)
142
132
 
143
- def _get_live_services(
144
- self, experiment_id: Optional[str], run_id: Optional[str]
145
- ) -> Optional[List[BaseService]]:
146
- """Get live services if available (e.g., from scheduler).
133
+ logger.debug(
134
+ "Notifying %d listeners of %s", len(listeners), type(event).__name__
135
+ )
136
+ for listener in listeners:
137
+ try:
138
+ listener(event)
139
+ except Exception as e:
140
+ logger.exception("Error in state listener: %s", e)
147
141
 
148
- Returns None if no live services are available (default).
149
- Subclasses may override to check for live services.
142
+ def service_state_changed(self, service) -> None:
143
+ """Called when a service's state changes - emit event to listeners
144
+
145
+ StateProvider registers itself as a listener on services it returns,
146
+ so this method is called when those services' states change.
150
147
  """
151
- return None
148
+ from experimaestro.scheduler.state_status import ServiceStateChangedEvent
152
149
 
153
- @abstractmethod
154
- def _fetch_services_from_storage(
155
- self, experiment_id: Optional[str], run_id: Optional[str]
156
- ) -> List[BaseService]:
157
- """Fetch services from persistent storage (DB or remote).
150
+ experiment_id = getattr(service, "_experiment_id", "") or ""
151
+ run_id = getattr(service, "_run_id", "") or ""
152
+ state_name = service.state.name if hasattr(service.state, "name") else "UNKNOWN"
158
153
 
159
- Called when no live services and cache is empty.
160
- """
161
- ...
154
+ logger.debug(
155
+ "Service %s state changed to %s (experiment=%s)",
156
+ service.id,
157
+ state_name,
158
+ experiment_id,
159
+ )
160
+
161
+ event = ServiceStateChangedEvent(
162
+ experiment_id=experiment_id,
163
+ run_id=run_id,
164
+ service_id=service.id,
165
+ state=state_name,
166
+ )
167
+ self._notify_state_listeners(event)
162
168
 
163
169
  @abstractmethod
164
170
  def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
@@ -171,8 +177,13 @@ class StateProvider(ABC):
171
177
  ...
172
178
 
173
179
  @abstractmethod
174
- def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
175
- """Get all runs for an experiment"""
180
+ def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
181
+ """Get all runs for an experiment
182
+
183
+ Returns:
184
+ List of BaseExperiment instances (MockExperiment for past runs,
185
+ or live experiment for the current run in Scheduler)
186
+ """
176
187
  ...
177
188
 
178
189
  @abstractmethod
@@ -210,25 +221,55 @@ class StateProvider(ABC):
210
221
  """Get all jobs across all experiments"""
211
222
  ...
212
223
 
213
- # Note: get_services is implemented in base class using _fetch_services_from_storage
214
-
215
224
  @abstractmethod
216
- def get_services_raw(
217
- self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
218
- ) -> List[Dict]:
219
- """Get raw service data as dictionaries (for serialization)"""
225
+ def get_tags_map(
226
+ self,
227
+ experiment_id: str,
228
+ run_id: Optional[str] = None,
229
+ ) -> Dict[str, Dict[str, str]]:
230
+ """Get tags map for jobs in an experiment/run
231
+
232
+ Tags are stored per (job_id, experiment_id, run_id) in JobTagModel.
233
+ This method returns a map from job_id to {tag_key: tag_value}.
234
+
235
+ Args:
236
+ experiment_id: Experiment identifier
237
+ run_id: Run identifier (None = current run)
238
+
239
+ Returns:
240
+ Dictionary mapping job identifiers to their tags dict
241
+ """
220
242
  ...
221
243
 
222
244
  @abstractmethod
223
- def add_listener(self, listener: StateListener) -> None:
224
- """Register a listener for state change events"""
245
+ def get_dependencies_map(
246
+ self,
247
+ experiment_id: str,
248
+ run_id: Optional[str] = None,
249
+ ) -> Dict[str, List[str]]:
250
+ """Get dependencies map for jobs in an experiment/run
251
+
252
+ Dependencies are stored per (job_id, experiment_id, run_id) in JobDependenciesModel.
253
+ This method returns a map from job_id to list of job_ids it depends on.
254
+
255
+ Args:
256
+ experiment_id: Experiment identifier
257
+ run_id: Run identifier (None = current run)
258
+
259
+ Returns:
260
+ Dictionary mapping job identifiers to list of job IDs they depend on
261
+ """
225
262
  ...
226
263
 
227
264
  @abstractmethod
228
- def remove_listener(self, listener: StateListener) -> None:
229
- """Unregister a listener"""
265
+ def get_services(
266
+ self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
267
+ ) -> List[BaseService]:
268
+ """Get services for an experiment"""
230
269
  ...
231
270
 
271
+ # add_listener and remove_listener are implemented in base class
272
+
232
273
  @abstractmethod
233
274
  def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
234
275
  """Kill a running job"""
@@ -257,6 +298,19 @@ class StateProvider(ABC):
257
298
  """Get orphan jobs (jobs not associated with any experiment run)"""
258
299
  return []
259
300
 
301
+ def get_stray_jobs(self) -> List[BaseJob]:
302
+ """Get stray jobs (running jobs not associated with any active experiment)
303
+
304
+ Stray jobs are a subset of orphan jobs - they are orphan jobs that are
305
+ currently running or scheduled. These represent jobs where the experimental
306
+ plan changed but the job process is still running.
307
+
308
+ Returns:
309
+ List of running/scheduled jobs not in any active experiment
310
+ """
311
+ # Default implementation: filter orphan jobs to running ones
312
+ return [j for j in self.get_orphan_jobs() if j.state and j.state.running()]
313
+
260
314
  def delete_job_safely(self, job: BaseJob, perform: bool = True) -> Tuple[bool, str]:
261
315
  """Safely delete a job and its data"""
262
316
  return False, "Not implemented"
@@ -271,6 +325,13 @@ class StateProvider(ABC):
271
325
  """Clean up orphan partial directories"""
272
326
  return []
273
327
 
328
+ def get_process_info(self, job: BaseJob) -> Optional[ProcessInfo]:
329
+ """Get process information for a job
330
+
331
+ Returns a ProcessInfo dataclass or None if not available.
332
+ """
333
+ return None
334
+
274
335
  def get_last_sync_time(self) -> Optional[datetime]:
275
336
  """Get the last sync time (for incremental updates)"""
276
337
  return None
@@ -290,165 +351,102 @@ class StateProvider(ABC):
290
351
  return False
291
352
 
292
353
 
293
- class _DatabaseChangeDetector:
294
- """Background thread that detects database changes and notifies listeners
354
+ # =============================================================================
355
+ # Offline State Provider (with service caching)
356
+ # =============================================================================
295
357
 
296
- Uses a semaphore pattern so that the watchdog event handler never blocks.
297
- The watchdog just signals the semaphore, and this thread does the actual
298
- database queries and listener notifications.
299
358
 
300
- Thread safety:
301
- - Uses a lock to protect start/stop transitions
302
- - Once stop() is called, the stop event cannot be cleared by start()
303
- - Uses a Condition for atomic wait-and-clear of change notifications
359
+ class OfflineStateProvider(StateProvider):
360
+ """State provider for offline/cached state access
361
+
362
+ Provides state listener management and service caching shared by
363
+ WorkspaceStateProvider and SSHStateProviderClient.
364
+
365
+ This is an intermediate class between StateProvider (the ABC) and concrete
366
+ implementations that need state listener support and service caching.
304
367
  """
305
368
 
306
- def __init__(self, state_provider: "WorkspaceStateProvider"):
307
- self.state_provider = state_provider
308
- self._last_check_time: Optional[datetime] = None
309
- self._change_condition = threading.Condition()
310
- self._change_pending = False # Protected by _change_condition
311
- self._thread: Optional[threading.Thread] = None
312
- self._debounce_seconds = 0.5 # Wait before processing to batch rapid changes
313
- self._state_lock = threading.Lock() # Protects start/stop transitions
314
- self._stopped = False # Once True, cannot be restarted
315
-
316
- def start(self) -> None:
317
- """Start the change detection thread"""
318
- with self._state_lock:
319
- # Once stopped, cannot restart
320
- if self._stopped:
321
- logger.debug("Cannot start change detector - already stopped")
322
- return
323
-
324
- if self._thread is not None and self._thread.is_alive():
325
- return # Already running
326
-
327
- self._thread = threading.Thread(
328
- target=self._run,
329
- daemon=True,
330
- name="DBChangeDetector",
331
- )
332
- self._thread.start()
333
- logger.debug("Started database change detector thread")
334
-
335
- def stop(self) -> None:
336
- """Stop the change detection thread"""
337
- with self._state_lock:
338
- self._stopped = True # Mark as permanently stopped
339
-
340
- # Wake up the thread so it can exit
341
- with self._change_condition:
342
- self._change_condition.notify_all()
343
-
344
- # Join outside the lock to avoid deadlock
345
- if self._thread is not None:
346
- self._thread.join(timeout=2.0)
347
- self._thread = None
348
- logger.debug("Stopped database change detector thread")
349
-
350
- def signal_change(self) -> None:
351
- """Signal that a database change was detected (non-blocking)"""
352
- with self._change_condition:
353
- self._change_pending = True
354
- self._change_condition.notify()
355
-
356
- def _run(self) -> None:
357
- """Main loop: wait for changes and process them"""
358
- while not self._stopped:
359
- # Wait for a change signal and clear it atomically
360
- with self._change_condition:
361
- while not self._change_pending and not self._stopped:
362
- self._change_condition.wait()
363
-
364
- if self._stopped:
365
- break
366
-
367
- # Clear the pending flag atomically while holding the lock
368
- self._change_pending = False
369
-
370
- # Debounce - wait a bit for more changes to accumulate
371
- time.sleep(self._debounce_seconds)
372
-
373
- # Process all accumulated changes
374
- self._detect_and_notify_changes()
375
-
376
- def _detect_and_notify_changes(self) -> None:
377
- """Query the database to detect what changed and send events"""
378
- try:
379
- since = self._last_check_time
380
- self._last_check_time = datetime.now()
381
-
382
- # Query for changed experiments
383
- with self.state_provider.workspace_db.bind_ctx([ExperimentModel]):
384
- query = ExperimentModel.select()
385
- if since:
386
- query = query.where(ExperimentModel.updated_at > since)
387
-
388
- for exp in query:
389
- self.state_provider._notify_listeners(
390
- StateEvent(
391
- event_type=StateEventType.EXPERIMENT_UPDATED,
392
- data={
393
- "experiment_id": exp.experiment_id,
394
- },
395
- )
396
- )
397
-
398
- # Query for changed jobs
399
- with self.state_provider.workspace_db.bind_ctx([JobModel]):
400
- query = JobModel.select()
401
- if since:
402
- query = query.where(JobModel.updated_at > since)
403
-
404
- for job in query:
405
- self.state_provider._notify_listeners(
406
- StateEvent(
407
- event_type=StateEventType.JOB_UPDATED,
408
- data={
409
- "jobId": job.job_id,
410
- "experimentId": job.experiment_id,
411
- "runId": job.run_id,
412
- "status": job.state,
413
- },
414
- )
415
- )
369
+ def __init__(self):
370
+ """Initialize offline state provider with service cache and listener management"""
371
+ super().__init__() # Initialize state listener management
372
+ self._init_service_cache()
416
373
 
417
- except Exception as e:
418
- logger.warning("Error detecting database changes: %s", e)
374
+ # =========================================================================
375
+ # Service caching methods
376
+ # =========================================================================
419
377
 
378
+ def _init_service_cache(self) -> None:
379
+ """Initialize service cache - call from subclass __init__"""
380
+ self._service_cache: Dict[tuple[str, str], Dict[str, "BaseService"]] = {}
381
+ self._service_cache_lock = threading.Lock()
420
382
 
421
- class _DatabaseFileHandler(FileSystemEventHandler):
422
- """Watchdog handler for SQLite database file changes
383
+ def _clear_service_cache(self) -> None:
384
+ """Clear the service cache"""
385
+ with self._service_cache_lock:
386
+ self._service_cache.clear()
423
387
 
424
- Simply signals the change detector when database files are modified.
425
- Does not block - all processing happens in the detector thread.
426
- """
388
+ def get_services(
389
+ self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
390
+ ) -> List["BaseService"]:
391
+ """Get services for an experiment
392
+
393
+ Uses caching to preserve service instances (and their URLs) across calls.
394
+ Subclasses can override _get_live_services() for live service support
395
+ and must implement _fetch_services_from_storage() for persistent storage.
396
+ """
397
+ # Resolve run_id if needed
398
+ if experiment_id is not None and run_id is None:
399
+ run_id = self.get_current_run(experiment_id)
400
+ if run_id is None:
401
+ return []
402
+
403
+ cache_key = (experiment_id or "", run_id or "")
404
+
405
+ with self._service_cache_lock:
406
+ # Try to get live services (scheduler, etc.) - may return None
407
+ live_services = self._get_live_services(experiment_id, run_id)
408
+ if live_services is not None:
409
+ # Cache and return live services
410
+ self._service_cache[cache_key] = {s.id: s for s in live_services}
411
+ return live_services
412
+
413
+ # Check cache
414
+ cached = self._service_cache.get(cache_key)
415
+ if cached is not None:
416
+ return list(cached.values())
417
+
418
+ # Fetch from persistent storage (filesystem or remote)
419
+ services = self._fetch_services_from_storage(experiment_id, run_id)
420
+ self._service_cache[cache_key] = {s.id: s for s in services}
421
+ return services
422
+
423
+ def _get_live_services(
424
+ self, experiment_id: Optional[str], run_id: Optional[str]
425
+ ) -> Optional[List["BaseService"]]:
426
+ """Get live services if available (e.g., from scheduler).
427
427
 
428
- def __init__(self, change_detector: _DatabaseChangeDetector):
429
- super().__init__()
430
- self.change_detector = change_detector
428
+ Returns None if no live services are available (default).
429
+ Subclasses may override to check for live services.
430
+ """
431
+ return None
431
432
 
432
- def on_any_event(self, event) -> None:
433
- """Handle all file system events"""
434
- # Only handle modification-like events
435
- if event.event_type not in ("modified", "created", "moved"):
436
- return
433
+ @abstractmethod
434
+ def _fetch_services_from_storage(
435
+ self, experiment_id: Optional[str], run_id: Optional[str]
436
+ ) -> List["BaseService"]:
437
+ """Fetch services from persistent storage (filesystem or remote).
437
438
 
438
- if event.is_directory:
439
- return
439
+ Called when no live services and cache is empty.
440
+ """
441
+ ...
440
442
 
441
- # Only react to database files
442
- path = Path(event.src_path)
443
- if path.name not in ("workspace.db", "workspace.db-wal"):
444
- return
443
+ # State listener methods (add_listener, remove_listener, _notify_state_listeners)
444
+ # are inherited from StateProvider base class
445
445
 
446
- logger.debug(
447
- "Database file changed: %s (event: %s)", path.name, event.event_type
448
- )
449
446
 
450
- # Signal the detector thread (non-blocking)
451
- self.change_detector.signal_change()
447
+ # =============================================================================
448
+ # Mock Classes for Database-Loaded State
449
+ # =============================================================================
452
450
 
453
451
 
454
452
  class MockJob(BaseJob):
@@ -458,28 +456,69 @@ class MockJob(BaseJob):
458
456
  as opposed to live Job instances which are created during experiment runs.
459
457
  """
460
458
 
459
+ def apply_event(self, event: "EventBase") -> None:
460
+ """Apply a job event to update this job's state"""
461
+ from experimaestro.scheduler.state_status import (
462
+ JobStateChangedEvent,
463
+ JobProgressEvent,
464
+ )
465
+ from experimaestro.notifications import LevelInformation
466
+
467
+ if isinstance(event, JobStateChangedEvent):
468
+ self.state = STATE_NAME_TO_JOBSTATE.get(event.state, self.state)
469
+ if event.failure_reason:
470
+ try:
471
+ self.failure_reason = JobFailureStatus[event.failure_reason]
472
+ except KeyError:
473
+ pass
474
+ if event.submitted_time is not None:
475
+ self.submittime = event.submitted_time
476
+ if event.started_time is not None:
477
+ self.starttime = event.started_time
478
+ if event.ended_time is not None:
479
+ self.endtime = event.ended_time
480
+ if event.exit_code is not None:
481
+ self.exit_code = event.exit_code
482
+ if event.retry_count:
483
+ self.retry_count = event.retry_count
484
+ logger.debug(
485
+ "Applied state change to job %s: %s", self.identifier, self.state
486
+ )
487
+
488
+ elif isinstance(event, JobProgressEvent):
489
+ level = event.level
490
+ # Truncate to level + 1 entries
491
+ self.progress = self.progress[: (level + 1)]
492
+ # Extend if needed
493
+ while len(self.progress) <= level:
494
+ self.progress.append(LevelInformation(len(self.progress), None, 0.0))
495
+ # Update the level's progress and description
496
+ if event.desc:
497
+ self.progress[-1].desc = event.desc
498
+ self.progress[-1].progress = event.progress
499
+ logger.debug(
500
+ "Applied progress to job %s: %s", self.identifier, self.progress
501
+ )
502
+
461
503
  def __init__(
462
504
  self,
463
505
  identifier: str,
464
506
  task_id: str,
465
- locator: str,
466
507
  path: Path,
467
508
  state: str, # State name string from DB
468
509
  submittime: Optional[float],
469
510
  starttime: Optional[float],
470
511
  endtime: Optional[float],
471
- progress: List[Dict],
472
- tags: Dict[str, str],
473
- experiment_id: str,
474
- run_id: str,
512
+ progress: ProgressInformation,
475
513
  updated_at: str,
476
514
  exit_code: Optional[int] = None,
477
515
  retry_count: int = 0,
478
516
  failure_reason: Optional[JobFailureStatus] = None,
517
+ transient: TransientMode = TransientMode.NONE,
518
+ process: dict | None = None,
479
519
  ):
480
520
  self.identifier = identifier
481
521
  self.task_id = task_id
482
- self.locator = locator
483
522
  self.path = path
484
523
  # Convert state name to JobState instance
485
524
  self.state = STATE_NAME_TO_JOBSTATE.get(state, JobState.UNSCHEDULED)
@@ -487,56 +526,16 @@ class MockJob(BaseJob):
487
526
  self.starttime = starttime
488
527
  self.endtime = endtime
489
528
  self.progress = progress
490
- self.tags = tags
491
- self.experiment_id = experiment_id
492
- self.run_id = run_id
493
529
  self.updated_at = updated_at
494
530
  self.exit_code = exit_code
495
531
  self.retry_count = retry_count
496
532
  self.failure_reason = failure_reason
533
+ self.transient = transient
534
+ self._process_dict = process
497
535
 
498
- @classmethod
499
- def from_disk(cls, path: Path) -> Optional["MockJob"]:
500
- """Create a MockJob by reading metadata from disk
501
-
502
- Args:
503
- path: Path to the job directory
504
-
505
- Returns:
506
- MockJob instance if metadata exists, None otherwise
507
- """
508
- metadata_path = path / ".xpm_metadata.json"
509
- if not metadata_path.exists():
510
- return None
511
-
512
- try:
513
- import json
514
-
515
- with metadata_path.open("r") as f:
516
- metadata = json.load(f)
517
-
518
- return cls(
519
- identifier=metadata.get("job_id", path.name),
520
- task_id=metadata.get(
521
- "task_id", path.parent.name if path.parent else "unknown"
522
- ),
523
- locator=metadata.get("job_id", path.name),
524
- path=path,
525
- state=metadata.get("state", "unscheduled"),
526
- submittime=metadata.get("submitted_time"),
527
- starttime=metadata.get("started_time"),
528
- endtime=metadata.get("ended_time"),
529
- progress=[], # Progress not stored in metadata
530
- tags={}, # Tags come from jobs.jsonl, not metadata
531
- experiment_id="", # Not stored in job metadata
532
- run_id="", # Not stored in job metadata
533
- updated_at=str(metadata.get("last_updated", "")),
534
- exit_code=metadata.get("exit_code"),
535
- retry_count=metadata.get("retry_count", 0),
536
- )
537
- except Exception as e:
538
- logger.warning("Failed to read job metadata from %s: %s", path, e)
539
- return None
536
+ def process_state_dict(self) -> dict | None:
537
+ """Get process state as dictionary."""
538
+ return self._process_dict
540
539
 
541
540
  def getprocess(self):
542
541
  """Get process handle for running job
@@ -565,2202 +564,436 @@ class MockJob(BaseJob):
565
564
  logger.warning("Could not get process for job at %s: %s", self.path, e)
566
565
  return None
567
566
 
567
+ @classmethod
568
+ def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockJob":
569
+ """Create MockJob from state dictionary
570
+
571
+ Args:
572
+ d: Dictionary from state_dict()
573
+ workspace_path: Workspace path to compute job path if not provided
574
+
575
+ Returns:
576
+ MockJob instance
577
+ """
578
+ task_id = d["task_id"]
579
+ identifier = d["job_id"]
580
+
581
+ # Use path from dict if it's already a Path, otherwise compute it
582
+ path = d.get("path")
583
+ if path is None:
584
+ path = workspace_path / "jobs" / task_id / identifier
585
+ elif isinstance(path, str):
586
+ path = Path(path)
587
+
588
+ failure_reason = None
589
+ if d.get("failure_reason"):
590
+ failure_reason = JobFailureStatus[d["failure_reason"]]
591
+
592
+ # Convert progress dicts to LevelInformation objects
593
+ progress_list = get_progress_information_from_dict(d.get("progress", []))
594
+
595
+ return cls(
596
+ identifier=identifier,
597
+ task_id=task_id,
598
+ path=path,
599
+ state=d["state"],
600
+ submittime=deserialize_timestamp(d.get("submitted_time")),
601
+ starttime=deserialize_timestamp(d.get("started_time")),
602
+ endtime=deserialize_timestamp(d.get("ended_time")),
603
+ progress=progress_list,
604
+ updated_at=d.get("updated_at", ""),
605
+ exit_code=d.get("exit_code"),
606
+ retry_count=d.get("retry_count", 0),
607
+ failure_reason=failure_reason,
608
+ process=d.get("process"),
609
+ )
610
+
568
611
 
569
612
  class MockExperiment(BaseExperiment):
570
- """Concrete implementation of BaseExperiment for database-loaded experiments
613
+ """Concrete implementation of BaseExperiment for loaded experiments
571
614
 
572
- This class is used when loading experiment information from the database,
615
+ This class is used when loading experiment information from disk,
573
616
  as opposed to live experiment instances which are created during runs.
617
+
618
+ It stores all experiment state including jobs, services, tags,
619
+ dependencies, and event tracking (replaces StatusData).
574
620
  """
575
621
 
576
622
  def __init__(
577
623
  self,
578
624
  workdir: Path,
579
- current_run_id: Optional[str],
580
- total_jobs: int,
581
- finished_jobs: int,
582
- failed_jobs: int,
583
- updated_at: str,
625
+ run_id: str,
626
+ *,
627
+ status: ExperimentStatus = ExperimentStatus.RUNNING,
628
+ events_count: int = 0,
629
+ hostname: Optional[str] = None,
584
630
  started_at: Optional[float] = None,
585
631
  ended_at: Optional[float] = None,
586
- hostname: Optional[str] = None,
632
+ job_infos: Optional[Dict[str, "ExperimentJobInformation"]] = None,
633
+ services: Optional[Dict[str, "MockService"]] = None,
634
+ dependencies: Optional[Dict[str, List[str]]] = None,
635
+ experiment_id_override: Optional[str] = None,
636
+ finished_jobs: int = 0,
637
+ failed_jobs: int = 0,
587
638
  ):
588
639
  self.workdir = workdir
589
- self.current_run_id = current_run_id
590
- self.total_jobs = total_jobs
591
- self.finished_jobs = finished_jobs
592
- self.failed_jobs = failed_jobs
593
- self.updated_at = updated_at
594
- self.started_at = started_at
595
- self.ended_at = ended_at
596
- self.hostname = hostname
640
+ self.run_id = run_id
641
+ self._status = status
642
+ self._events_count = events_count
643
+ self._hostname = hostname
644
+ self._started_at = started_at
645
+ self._ended_at = ended_at
646
+ self._job_infos = job_infos or {}
647
+ self._services = services or {}
648
+ self._dependencies = dependencies or {}
649
+ self._experiment_id_override = experiment_id_override
650
+ self._finished_jobs = finished_jobs
651
+ self._failed_jobs = failed_jobs
597
652
 
598
653
  @property
599
654
  def experiment_id(self) -> str:
600
- """Experiment identifier derived from workdir name"""
601
- return self.workdir.name
655
+ """Return experiment_id (overriding base class if needed for v1 layout)"""
656
+ if self._experiment_id_override:
657
+ return self._experiment_id_override
658
+ return super().experiment_id
602
659
 
660
+ # Implement abstract properties from BaseExperiment
603
661
 
604
- class MockService(BaseService):
605
- """Mock service object for remote monitoring
662
+ @property
663
+ def status(self) -> ExperimentStatus:
664
+ return self._status
606
665
 
607
- This class provides a service-like interface for services loaded from
608
- the remote server. It mimics the Service class interface sufficiently
609
- for display in the TUI ServicesList widget.
610
- """
611
-
612
- def __init__(
613
- self,
614
- service_id: str,
615
- description_text: str,
616
- state_dict_data: dict,
617
- experiment_id: Optional[str] = None,
618
- run_id: Optional[str] = None,
619
- url: Optional[str] = None,
620
- state: str = "STOPPED",
621
- ):
622
- self.id = service_id
623
- self._description = description_text
624
- self._state_name = state
625
- self._state_dict_data = state_dict_data
626
- self.experiment_id = experiment_id
627
- self.run_id = run_id
628
- self.url = url
666
+ @property
667
+ def job_infos(self) -> Dict[str, "ExperimentJobInformation"]:
668
+ """Lightweight job info from jobs.jsonl (job_id, task_id, tags, timestamp)"""
669
+ return self._job_infos
629
670
 
630
671
  @property
631
- def state(self):
632
- """Return state as a ServiceState-like object with a name attribute"""
633
- from experimaestro.scheduler.services import ServiceState
634
-
635
- # Convert state name to ServiceState enum
636
- try:
637
- return ServiceState[self._state_name]
638
- except KeyError:
639
- # Return a mock object with name attribute for unknown states
640
- class MockState:
641
- def __init__(self, name):
642
- self.name = name
643
-
644
- return MockState(self._state_name)
645
-
646
- def description(self) -> str:
647
- """Return service description"""
648
- return self._description
649
-
650
- def state_dict(self) -> dict:
651
- """Return state dictionary for service recreation"""
652
- return self._state_dict_data
653
-
654
-
655
- def _with_db_context(func):
656
- """Decorator to wrap method in database bind context
657
-
658
- This ensures all database queries have the models bound to the database.
659
- """
660
- from functools import wraps
661
-
662
- @wraps(func)
663
- def wrapper(self, *args, **kwargs):
664
- try:
665
- with self.workspace_db.bind_ctx(ALL_MODELS):
666
- return func(self, *args, **kwargs)
667
- except Exception as e:
668
- logger.exception("Error in %s with database context: %s", func.__name__, e)
669
- raise
670
-
671
- return wrapper
672
-
673
-
674
- class WorkspaceStateProvider(StateProvider):
675
- """Unified state provider for workspace-level database (singleton per workspace path)
676
-
677
- Provides access to experiment and job state from a single workspace database.
678
- Supports both read-only (monitoring) and read-write (scheduler) modes.
679
-
680
- Only one WorkspaceStateProvider instance exists per workspace path. Subsequent
681
- requests for the same path return the existing instance.
682
-
683
- Thread safety:
684
- - Database connections are thread-local (managed by state_db module)
685
- - Singleton registry is protected by a lock
686
- - Each thread gets its own database connection
687
-
688
- Run tracking:
689
- - Each experiment can have multiple runs
690
- - Jobs/services are scoped to (experiment_id, run_id)
691
- - Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
692
- """
693
-
694
- # Registry of state provider instances by absolute path
695
- _instances: Dict[Path, "WorkspaceStateProvider"] = {}
696
- _lock = threading.Lock()
697
-
698
- @classmethod
699
- def get_instance(
700
- cls,
701
- workspace_path: Path,
702
- read_only: bool = False,
703
- sync_on_start: bool = False,
704
- sync_interval_minutes: int = 5,
705
- ) -> "WorkspaceStateProvider":
706
- """Get or create WorkspaceStateProvider instance for a workspace path
707
-
708
- Args:
709
- workspace_path: Root workspace directory
710
- read_only: If True, database is in read-only mode
711
- sync_on_start: If True, sync from disk on initialization
712
- sync_interval_minutes: Minimum interval between syncs (default: 5)
713
-
714
- Returns:
715
- WorkspaceStateProvider instance (singleton per path)
716
- """
717
- # Normalize path
718
- if isinstance(workspace_path, Path):
719
- workspace_path = workspace_path.absolute()
720
- else:
721
- workspace_path = Path(workspace_path).absolute()
722
-
723
- # Check if instance already exists
724
- with cls._lock:
725
- if workspace_path in cls._instances:
726
- existing = cls._instances[workspace_path]
727
- # Fail if requesting different read_only mode than cached instance
728
- if existing.read_only != read_only:
729
- raise RuntimeError(
730
- f"WorkspaceStateProvider for {workspace_path} already exists "
731
- f"with read_only={existing.read_only}, cannot open with "
732
- f"read_only={read_only}. Close the existing instance first."
733
- )
734
- return existing
735
-
736
- # Create new instance - register BEFORE __init__ to handle
737
- # nested get_instance calls during sync_on_start
738
- instance = object.__new__(cls)
739
- cls._instances[workspace_path] = instance
740
-
741
- # Initialize outside the lock to avoid deadlock during sync
742
- try:
743
- instance.__init__(
744
- workspace_path, read_only, sync_on_start, sync_interval_minutes
745
- )
746
- except Exception:
747
- # Remove from registry if initialization fails
748
- with cls._lock:
749
- cls._instances.pop(workspace_path, None)
750
- raise
751
- return instance
752
-
753
- def __init__(
754
- self,
755
- workspace_path: Path,
756
- read_only: bool = False,
757
- sync_on_start: bool = False,
758
- sync_interval_minutes: int = 5,
759
- ):
760
- """Initialize workspace state provider (called by get_instance())
761
-
762
- Args:
763
- workspace_path: Root workspace directory
764
- read_only: If True, database is in read-only mode
765
- sync_on_start: If True, sync from disk on initialization
766
- sync_interval_minutes: Minimum interval between syncs (default: 5)
767
- """
768
- # Normalize path
769
- if isinstance(workspace_path, Path):
770
- workspace_path = workspace_path.absolute()
771
- else:
772
- workspace_path = Path(workspace_path).absolute()
773
-
774
- self.workspace_path = workspace_path
775
- self._read_only = read_only
776
- self.sync_interval_minutes = sync_interval_minutes
777
-
778
- # Listeners for push notifications
779
- self._listeners: Set[StateListener] = set()
780
- self._listeners_lock = threading.Lock()
781
-
782
- # Service cache (from base class)
783
- self._init_service_cache()
672
+ def services(self) -> Dict[str, "BaseService"]:
673
+ return self._services
784
674
 
785
- # File watcher for database changes (started when listeners are added)
786
- self._change_detector: Optional[_DatabaseChangeDetector] = None
787
- self._db_file_handler: Optional[_DatabaseFileHandler] = None
788
- self._db_file_watch: Optional[ObservedWatch] = None
789
-
790
- # Check and update workspace version
791
- from .workspace import WORKSPACE_VERSION
792
-
793
- version_file = self.workspace_path / ".__experimaestro__"
794
-
795
- if version_file.exists():
796
- # Read existing version
797
- content = version_file.read_text().strip()
798
- if content == "":
799
- # Empty file = v0
800
- workspace_version = 0
801
- else:
802
- try:
803
- workspace_version = int(content)
804
- except ValueError:
805
- raise RuntimeError(
806
- f"Invalid workspace version file at {version_file}: "
807
- f"expected integer, got '{content}'"
808
- )
809
-
810
- # Check if workspace version is supported
811
- if workspace_version > WORKSPACE_VERSION:
812
- raise RuntimeError(
813
- f"Workspace version {workspace_version} is not supported by "
814
- f"this version of experimaestro (supports up to version "
815
- f"{WORKSPACE_VERSION}). Please upgrade experimaestro."
816
- )
817
- if workspace_version < WORKSPACE_VERSION:
818
- raise RuntimeError(
819
- f"Workspace version {workspace_version} is not supported by "
820
- "this version of experimaestro (please upgrade the experimaestro "
821
- "workspace)"
822
- )
823
- else:
824
- # New workspace - create the file
825
- workspace_version = WORKSPACE_VERSION
826
-
827
- # Write current version to file (update empty v0 workspaces)
828
- if not read_only and (
829
- not version_file.exists() or version_file.read_text().strip() == ""
830
- ):
831
- version_file.write_text(str(WORKSPACE_VERSION))
832
-
833
- # Initialize workspace database in hidden .experimaestro directory
834
- from .state_db import initialize_workspace_database
675
+ @property
676
+ def tags(self) -> Dict[str, Dict[str, str]]:
677
+ """Build tags dict from job_infos"""
678
+ return {
679
+ job_id: job_info.tags
680
+ for job_id, job_info in self._job_infos.items()
681
+ if job_info.tags
682
+ }
835
683
 
836
- experimaestro_dir = self.workspace_path / ".experimaestro"
837
- if not read_only:
838
- experimaestro_dir.mkdir(parents=True, exist_ok=True)
684
+ @property
685
+ def dependencies(self) -> Dict[str, List[str]]:
686
+ return self._dependencies
839
687
 
840
- db_path = experimaestro_dir / "workspace.db"
841
- self.workspace_db, needs_resync = initialize_workspace_database(
842
- db_path, read_only=read_only
843
- )
844
- self._db_dir = experimaestro_dir # Store for file watcher
688
+ @property
689
+ def events_count(self) -> int:
690
+ return self._events_count
845
691
 
846
- # Sync from disk if needed due to schema version change
847
- if needs_resync and not read_only:
848
- logger.info(
849
- "Database schema version changed, triggering full resync from disk"
850
- )
851
- sync_on_start = True # Force sync
852
-
853
- # Optionally sync from disk on start (only in write mode)
854
- # Syncing requires write access to update the database and sync timestamp
855
- if sync_on_start and not read_only:
856
- from .state_sync import sync_workspace_from_disk
857
-
858
- sync_workspace_from_disk(
859
- self.workspace_path,
860
- write_mode=True,
861
- force=needs_resync, # Force full sync if schema changed
862
- sync_interval_minutes=sync_interval_minutes,
863
- )
692
+ @property
693
+ def hostname(self) -> Optional[str]:
694
+ return self._hostname
864
695
 
865
- # Update db_version after successful sync
866
- if needs_resync:
867
- with self.workspace_db.bind_ctx([WorkspaceSyncMetadata]):
868
- WorkspaceSyncMetadata.update(db_version=CURRENT_DB_VERSION).where(
869
- WorkspaceSyncMetadata.id == "workspace"
870
- ).execute()
871
- logger.info("Database schema updated to version %d", CURRENT_DB_VERSION)
872
-
873
- logger.info(
874
- "WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
875
- read_only,
876
- workspace_path,
877
- )
696
+ @property
697
+ def started_at(self) -> Optional[float]:
698
+ return self._started_at
878
699
 
879
700
  @property
880
- def read_only(self) -> bool:
881
- """Whether this provider is read-only"""
882
- return self._read_only
701
+ def ended_at(self) -> Optional[float]:
702
+ return self._ended_at
883
703
 
884
- # Experiment management methods
704
+ @property
705
+ def total_jobs(self) -> int:
706
+ return len(self._job_infos)
885
707
 
886
- @_with_db_context
887
- def ensure_experiment(self, experiment_id: str):
888
- """Create or update experiment record
708
+ @property
709
+     def finished_jobs(self) -> int:
+         return self._finished_jobs

-         Args:
-             experiment_id: Unique identifier for the experiment
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot modify experiments in read-only mode")
+     @property
+     def failed_jobs(self) -> int:
+         return self._failed_jobs

-         now = datetime.now()
-         ExperimentModel.insert(
-             experiment_id=experiment_id,
-             created_at=now,
-             updated_at=now,
-         ).on_conflict(
-             conflict_target=[ExperimentModel.experiment_id],
-             update={
-                 ExperimentModel.updated_at: now,
-             },
-         ).execute()
-
-         logger.debug("Ensured experiment: %s", experiment_id)
-
-         # Notify listeners
-         exp_path = str(self.workspace_path / "xp" / experiment_id)
-         self._notify_listeners(
-             StateEvent(
-                 event_type=StateEventType.EXPERIMENT_UPDATED,
-                 data={
-                     "experiment_id": experiment_id,
-                     "workdir_path": exp_path,
-                     "updated_at": now.isoformat(),
-                 },
-             )
-         )
+     # state_dict() is inherited from BaseExperiment

-     @_with_db_context
-     def create_run(self, experiment_id: str, run_id: Optional[str] = None) -> str:
-         """Create a new run for an experiment
+     @classmethod
+     def from_disk(
+         cls, run_dir: Path, workspace_path: Path
+     ) -> Optional["MockExperiment"]:
+         """Load MockExperiment from status.json and jobs.jsonl on disk

          Args:
-             experiment_id: Experiment identifier
-             run_id: Optional run ID (auto-generated from timestamp if not provided)
+             run_dir: Path to the run directory containing status.json
+             workspace_path: Workspace path for resolving relative paths

          Returns:
-             The run_id that was created
-
-         Raises:
-             RuntimeError: If in read-only mode
+             MockExperiment instance or None if status.json doesn't exist
          """
-         if self.read_only:
-             raise RuntimeError("Cannot create runs in read-only mode")
-
-         # Auto-generate run_id from timestamp if not provided
-         if run_id is None:
-             now = datetime.now()
-             run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
+         import fasteners

-         # Capture hostname
-         hostname = socket.gethostname()
-         started_at = datetime.now()
+         status_path = run_dir / "status.json"
+         if not status_path.exists():
+             return None

-         # Create run record with hostname
-         ExperimentRunModel.insert(
-             experiment_id=experiment_id,
-             run_id=run_id,
-             started_at=started_at,
-             status="active",
-             hostname=hostname,
-         ).execute()
-
-         # Persist to disk in experiment folder (informations.json)
-         exp_dir = self.workspace_path / "xp" / experiment_id
-         exp_dir.mkdir(parents=True, exist_ok=True)
-         info_file = exp_dir / "informations.json"
-
-         # Merge with existing data (may have multiple runs)
-         info_data: Dict = {}
-         if info_file.exists():
+         lock_path = status_path.parent / f".{status_path.name}.lock"
+         lock = fasteners.InterProcessLock(str(lock_path))
+         with lock:
              try:
-                 info_data = json.loads(info_file.read_text())
-             except json.JSONDecodeError:
-                 logger.warning("Could not parse existing informations.json")
-
-         if "runs" not in info_data:
-             info_data["runs"] = {}
-         info_data["runs"][run_id] = {
-             "hostname": hostname,
-             "started_at": started_at.isoformat(),
-         }
-         info_file.write_text(json.dumps(info_data, indent=2))
-
-         # Update experiment's current_run_id and updated_at
-         now = datetime.now()
-         ExperimentModel.update(
-             current_run_id=run_id,
-             updated_at=now,
-         ).where(ExperimentModel.experiment_id == experiment_id).execute()
-
-         logger.info(
-             "Created run %s for experiment %s on host %s",
-             run_id,
-             experiment_id,
-             hostname,
-         )
-
-         # Notify listeners
-         self._notify_listeners(
-             StateEvent(
-                 event_type=StateEventType.RUN_UPDATED,
-                 data={
-                     "experiment_id": experiment_id,
-                     "run_id": run_id,
-                     "status": "active",
-                     "started_at": now.isoformat(),
-                     "hostname": hostname,
-                 },
-             )
-         )
-
-         return run_id
-
-     @_with_db_context
-     def get_current_run(self, experiment_id: str) -> Optional[str]:
-         """Get the current/latest run_id for an experiment
+                 with status_path.open("r") as f:
+                     data = json.load(f)
+             except (json.JSONDecodeError, OSError) as e:
+                 logger.warning("Failed to read %s: %s", status_path, e)
+                 return None

-         Args:
-             experiment_id: Experiment identifier
+         # Create experiment from status.json
+         exp = cls.from_state_dict(data, workspace_path)

-         Returns:
-             Current run_id or None if no runs exist
-         """
-         try:
-             experiment = ExperimentModel.get(
-                 ExperimentModel.experiment_id == experiment_id
-             )
-             return experiment.current_run_id
-         except ExperimentModel.DoesNotExist:
-             return None
+         # Load jobs from jobs.jsonl
+         jobs_jsonl_path = run_dir / "jobs.jsonl"
+         if jobs_jsonl_path.exists():
+             try:
+                 with jobs_jsonl_path.open("r") as f:
+                     for line in f:
+                         line = line.strip()
+                         if not line:
+                             continue
+                         try:
+                             record = json.loads(line)
+                             job_info = ExperimentJobInformation.from_dict(record)
+                             exp._job_infos[job_info.job_id] = job_info
+                         except (json.JSONDecodeError, KeyError):
+                             continue
+             except OSError as e:
+                 logger.warning("Failed to read %s: %s", jobs_jsonl_path, e)
+
+         return exp
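
Illustrative usage of the new MockExperiment.from_disk API — a minimal sketch, assuming the run layout this diff introduces (experiments/{experiment_id}/{run_id}/ containing status.json and jobs.jsonl); the workspace path and identifiers below are invented:

    from pathlib import Path

    workspace = Path("/data/xpm-workspace")             # hypothetical workspace root
    run_dir = workspace / "experiments" / "my-exp" / "20240101_120000"

    exp = MockExperiment.from_disk(run_dir, workspace)  # None if status.json is missing
    if exp is not None:
        print(exp.finished_jobs, exp.failed_jobs, len(exp._job_infos))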
 
-     @_with_db_context
-     def get_experiments(self, since: Optional[datetime] = None) -> List[MockExperiment]:
-         """Get list of all experiments
+     @classmethod
+     def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockExperiment":
+         """Create MockExperiment from state dictionary

          Args:
-             since: If provided, only return experiments updated after this timestamp
+             d: Dictionary from state_dict()
+             workspace_path: Workspace path to compute experiment path if not provided

          Returns:
-             List of MockExperiment objects with attributes:
-             - workdir: Path to experiment directory
-             - experiment_id: Unique identifier (property derived from workdir.name)
-             - current_run_id: Current/latest run ID
-             - total_jobs: Total number of jobs (for current run)
-             - finished_jobs: Number of completed jobs (for current run)
-             - failed_jobs: Number of failed jobs (for current run)
-             - updated_at: When experiment was last modified
-             - hostname: Host where the current run was launched
+             MockExperiment instance
          """
-         experiments = []
-
-         query = ExperimentModel.select()
-         if since is not None:
-             query = query.where(ExperimentModel.updated_at > since)
-
-         for exp_model in query:
-             # Count jobs for current run
-             total_jobs = 0
-             finished_jobs = 0
-             failed_jobs = 0
-
-             started_at = None
-             ended_at = None
-             hostname = None
-
-             if exp_model.current_run_id:
-                 total_jobs = (
-                     JobModel.select()
-                     .where(
-                         (JobModel.experiment_id == exp_model.experiment_id)
-                         & (JobModel.run_id == exp_model.current_run_id)
-                     )
-                     .count()
-                 )
-                 finished_jobs = (
-                     JobModel.select()
-                     .where(
-                         (JobModel.experiment_id == exp_model.experiment_id)
-                         & (JobModel.run_id == exp_model.current_run_id)
-                         & (JobModel.state == "done")
-                     )
-                     .count()
-                 )
-                 failed_jobs = (
-                     JobModel.select()
-                     .where(
-                         (JobModel.experiment_id == exp_model.experiment_id)
-                         & (JobModel.run_id == exp_model.current_run_id)
-                         & (JobModel.state == "error")
-                     )
-                     .count()
-                 )
-
-                 # Get run timestamps and hostname
-                 try:
-                     run_model = ExperimentRunModel.get(
-                         (ExperimentRunModel.experiment_id == exp_model.experiment_id)
-                         & (ExperimentRunModel.run_id == exp_model.current_run_id)
-                     )
-                     if run_model.started_at:
-                         started_at = run_model.started_at.timestamp()
-                     if run_model.ended_at:
-                         ended_at = run_model.ended_at.timestamp()
-                     hostname = run_model.hostname
-                 except ExperimentRunModel.DoesNotExist:
-                     pass
-
-             # Compute experiment path from workspace_path and experiment_id
-             exp_path = self.workspace_path / "xp" / exp_model.experiment_id
-
-             experiments.append(
-                 MockExperiment(
-                     workdir=exp_path,
-                     current_run_id=exp_model.current_run_id,
-                     total_jobs=total_jobs,
-                     finished_jobs=finished_jobs,
-                     failed_jobs=failed_jobs,
-                     updated_at=exp_model.updated_at.isoformat(),
-                     started_at=started_at,
-                     ended_at=ended_at,
-                     hostname=hostname,
-                 )
-             )
+         experiment_id = d.get("experiment_id", "")
+         run_id = d.get("run_id", "")

-         return experiments
-
-     @_with_db_context
-     def get_experiment(self, experiment_id: str) -> Optional[MockExperiment]:
-         """Get a specific experiment by ID
-
-         Args:
-             experiment_id: Experiment identifier
+         # Use workdir from dict if provided, otherwise compute it
+         workdir = d.get("workdir")
+         if workdir is None:
+             # New layout: experiments/{experiment_id}/{run_id}/
+             workdir = workspace_path / "experiments" / experiment_id / run_id
+         elif isinstance(workdir, str):
+             workdir = Path(workdir)

-         Returns:
-             MockExperiment object or None if not found
-         """
+         # Parse status from string to enum
+         status_str = d.get("status", "running")
          try:
-             exp_model = ExperimentModel.get(
-                 ExperimentModel.experiment_id == experiment_id
-             )
-         except ExperimentModel.DoesNotExist:
-             return None
-
-         # Count jobs for current run
-         total_jobs = 0
-         finished_jobs = 0
-         failed_jobs = 0
-         hostname = None
-
-         if exp_model.current_run_id:
-             total_jobs = (
-                 JobModel.select()
-                 .where(
-                     (JobModel.experiment_id == exp_model.experiment_id)
-                     & (JobModel.run_id == exp_model.current_run_id)
-                 )
-                 .count()
-             )
-             finished_jobs = (
-                 JobModel.select()
-                 .where(
-                     (JobModel.experiment_id == exp_model.experiment_id)
-                     & (JobModel.run_id == exp_model.current_run_id)
-                     & (JobModel.state == "done")
-                 )
-                 .count()
-             )
-             failed_jobs = (
-                 JobModel.select()
-                 .where(
-                     (JobModel.experiment_id == exp_model.experiment_id)
-                     & (JobModel.run_id == exp_model.current_run_id)
-                     & (JobModel.state == "error")
-                 )
-                 .count()
-             )
+             status = ExperimentStatus(status_str)
+         except ValueError:
+             # Handle legacy status values
+             if status_str in ("active", "running"):
+                 status = ExperimentStatus.RUNNING
+             elif status_str in ("completed", "done"):
+                 status = ExperimentStatus.DONE
+             elif status_str == "failed":
+                 status = ExperimentStatus.FAILED
+             else:
+                 status = ExperimentStatus.RUNNING
+
+         # Parse services from dict (can be list or dict)
+         services_data = d.get("services", {})
+         if isinstance(services_data, list):
+             services = {
+                 s.get("service_id", ""): MockService.from_full_state_dict(s)
+                 for s in services_data
+             }
+         else:
+             services = {
+                 k: MockService.from_full_state_dict(v) for k, v in services_data.items()
+             }

-         # Get hostname from run model
-         try:
-             run_model = ExperimentRunModel.get(
-                 (ExperimentRunModel.experiment_id == exp_model.experiment_id)
-                 & (ExperimentRunModel.run_id == exp_model.current_run_id)
-             )
-             hostname = run_model.hostname
-         except ExperimentRunModel.DoesNotExist:
-             pass
-
-         # Compute experiment path from workspace_path and experiment_id
-         exp_path = self.workspace_path / "xp" / exp_model.experiment_id
-
-         return MockExperiment(
-             workdir=exp_path,
-             current_run_id=exp_model.current_run_id,
-             total_jobs=total_jobs,
-             finished_jobs=finished_jobs,
-             failed_jobs=failed_jobs,
-             updated_at=exp_model.updated_at.isoformat(),
-             hostname=hostname,
+         return cls(
+             workdir=workdir,
+             run_id=run_id,
+             status=status,
+             events_count=d.get("events_count", 0),
+             hostname=d.get("hostname"),
+             started_at=d.get("started_at"),
+             ended_at=d.get("ended_at"),
+             services=services,
+             dependencies=d.get("dependencies", {}),
+             finished_jobs=d.get("finished_jobs", 0),
+             failed_jobs=d.get("failed_jobs", 0),
          )
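
A sketch of the state_dict() round-trip that from_state_dict is written for; the key set mirrors the d.get(...) calls above, and the values are invented:

    from pathlib import Path

    state = {
        "experiment_id": "my-exp",
        "run_id": "20240101_120000",
        "status": "active",      # legacy value: mapped to ExperimentStatus.RUNNING
        "services": [],           # a list or a dict of service state dicts is accepted
        "finished_jobs": 3,
        "failed_jobs": 1,
    }
    exp = MockExperiment.from_state_dict(state, Path("/data/xpm-workspace"))
    # workdir is computed as <workspace>/experiments/my-exp/20240101_120000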
 
-     @_with_db_context
-     def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
-         """Get all runs for an experiment
+     def apply_event(self, event: "EventBase") -> None:
+         """Apply an event to update experiment state

          Args:
-             experiment_id: Experiment identifier
-
-         Returns:
-             List of run dictionaries with keys:
-             - experiment_id: Experiment ID
-             - run_id: Run ID
-             - started_at: When run started
-             - ended_at: When run completed (None if active)
-             - status: Run status (active, completed, failed, abandoned)
+             event: Event to apply
          """
-         runs = []
-         for run_model in (
-             ExperimentRunModel.select()
-             .where(ExperimentRunModel.experiment_id == experiment_id)
-             .order_by(ExperimentRunModel.started_at.desc())
-         ):
-             runs.append(
-                 {
-                     "experiment_id": run_model.experiment_id,
-                     "run_id": run_model.run_id,
-                     "started_at": run_model.started_at.isoformat(),
-                     "ended_at": (
-                         run_model.ended_at.isoformat() if run_model.ended_at else None
-                     ),
-                     "status": run_model.status,
-                 }
-             )
-         return runs
-
-     @_with_db_context
-     def complete_run(self, experiment_id: str, run_id: str, status: str = "completed"):
-         """Mark a run as completed
-
-         Args:
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-             status: Final status (completed, failed, abandoned)
+         from experimaestro.scheduler.state_status import (
+             JobSubmittedEvent,
+             JobStateChangedEvent,
+             ServiceAddedEvent,
+             RunCompletedEvent,
+         )

-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot modify runs in read-only mode")
+         if isinstance(event, JobSubmittedEvent):
+             # Add lightweight job info (tags are stored in ExperimentJobInformation)
+             self._job_infos[event.job_id] = ExperimentJobInformation(
+                 job_id=event.job_id,
+                 task_id=event.task_id,
+                 tags=event.tags or {},
+                 timestamp=event.timestamp,
+             )
+             if event.depends_on:
+                 self._dependencies[event.job_id] = event.depends_on
+
+         elif isinstance(event, ServiceAddedEvent):
+             self._services[event.service_id] = MockService(
+                 service_id=event.service_id,
+                 description_text=event.description,
+                 state_dict_data=event.state_dict,
+                 service_class=event.service_class,
+                 experiment_id=self.experiment_id,
+                 run_id=self.run_id,
+             )
+
+         elif isinstance(event, JobStateChangedEvent):
+             # Update finished/failed counters when jobs complete
+             if event.state == "done":
+                 self._finished_jobs += 1
+             elif event.state == "error":
+                 self._failed_jobs += 1
+
+         elif isinstance(event, RunCompletedEvent):
+             # Map status string to ExperimentStatus
+             if event.status in ("completed", "done"):
+                 self._status = ExperimentStatus.DONE
+             elif event.status == "failed":
+                 self._status = ExperimentStatus.FAILED
+             else:
+                 self._status = ExperimentStatus.RUNNING
+             self._ended_at = event.ended_at

-         ExperimentRunModel.update(ended_at=datetime.now(), status=status).where(
-             (ExperimentRunModel.experiment_id == experiment_id)
-             & (ExperimentRunModel.run_id == run_id)
-         ).execute()

-         logger.info("Marked run %s/%s as %s", experiment_id, run_id, status)
+ class MockService(BaseService):
+     """Mock service object for remote monitoring

-     # Job operations
+     This class provides a service-like interface for services loaded from
+     the remote server. It mimics the Service class interface sufficiently
+     for display in the TUI ServicesList widget.
+     """

-     @_with_db_context
-     def get_jobs(
+     def __init__(
          self,
+         service_id: str,
+         description_text: str,
+         state_dict_data: dict,
+         service_class: Optional[str] = None,
          experiment_id: Optional[str] = None,
          run_id: Optional[str] = None,
-         task_id: Optional[str] = None,
-         state: Optional[str] = None,
-         tags: Optional[Dict[str, str]] = None,
-         since: Optional[datetime] = None,
-     ) -> List[MockJob]:
-         """Query jobs with optional filters
-
-         Args:
-             experiment_id: Filter by experiment (None = all experiments)
-             run_id: Filter by run (None = current run if experiment_id provided)
-             task_id: Filter by task class identifier
-             state: Filter by job state
-             tags: Filter by tags (all tags must match)
-             since: If provided, only return jobs updated after this timestamp
-
-         Returns:
-             List of MockJob objects
-         """
-         # Build base query
-         query = JobModel.select()
-
-         # Apply since filter for incremental updates
-         if since is not None:
-             query = query.where(JobModel.updated_at > since)
-
-         # Apply experiment filter
-         if experiment_id is not None:
-             # If experiment_id provided but not run_id, use current run
-             if run_id is None:
-                 current_run = self.get_current_run(experiment_id)
-                 if current_run is None:
-                     return []  # No runs exist for this experiment
-                 run_id = current_run
-
-             query = query.where(
-                 (JobModel.experiment_id == experiment_id) & (JobModel.run_id == run_id)
-             )
-
-         # Apply task_id filter
-         if task_id is not None:
-             query = query.where(JobModel.task_id == task_id)
-
-         # Apply state filter
-         if state is not None:
-             query = query.where(JobModel.state == state)
-
-         # Apply tag filters
-         if tags:
-             for tag_key, tag_value in tags.items():
-                 # Join with JobTagModel for each tag filter
-                 query = query.join(
-                     JobTagModel,
-                     on=(
-                         (JobTagModel.job_id == JobModel.job_id)
-                         & (JobTagModel.experiment_id == JobModel.experiment_id)
-                         & (JobTagModel.run_id == JobModel.run_id)
-                         & (JobTagModel.tag_key == tag_key)
-                         & (JobTagModel.tag_value == tag_value)
-                     ),
-                 )
-
-         # Execute query and convert to dictionaries
-         jobs = []
-         for job_model in query:
-             # Get tags for this job
-             job_tags = self._get_job_tags(
-                 job_model.job_id, job_model.experiment_id, job_model.run_id
-             )
-
-             jobs.append(self._job_model_to_dict(job_model, job_tags))
-
-         return jobs
-
-     @_with_db_context
-     def get_job(
-         self, job_id: str, experiment_id: str, run_id: Optional[str] = None
-     ) -> Optional[MockJob]:
-         """Get a specific job
-
-         Args:
-             job_id: Job identifier
-             experiment_id: Experiment identifier
-             run_id: Run identifier (None = current run)
+         url: Optional[str] = None,
+     ):
+         self.id = service_id
+         self._description = description_text
+         self._state_name = "MOCK"  # MockService always has MOCK state
+         self._state_dict_data = state_dict_data
+         self._service_class = service_class
+         self.experiment_id = experiment_id
+         self.run_id = run_id
+         self.url = url

-         Returns:
-             MockJob object or None if not found
-         """
-         # Use current run if not specified
-         if run_id is None:
-             run_id = self.get_current_run(experiment_id)
-             if run_id is None:
-                 return None
+     @property
+     def state(self):
+         """Return state as a ServiceState-like object with a name attribute"""
+         from experimaestro.scheduler.services import ServiceState

+         # Convert state name to ServiceState enum
          try:
-             job_model = JobModel.get(
-                 (JobModel.job_id == job_id)
-                 & (JobModel.experiment_id == experiment_id)
-                 & (JobModel.run_id == run_id)
-             )
-         except JobModel.DoesNotExist:
-             return None
-
-         # Get tags for this job
-         job_tags = self._get_job_tags(job_id, experiment_id, run_id)
-
-         return self._job_model_to_dict(job_model, job_tags)
-
-     @_with_db_context
-     def update_job_submitted(self, job: "Job", experiment_id: str, run_id: str):
-         """Record that a job has been submitted
-
-         Args:
-             job: Job instance
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot update jobs in read-only mode")
-
-         task_id = str(job.type.identifier)
-
-         # Create or update job record
-         now = datetime.now()
-         JobModel.insert(
-             job_id=job.identifier,
-             experiment_id=experiment_id,
-             run_id=run_id,
-             task_id=task_id,
-             locator=job.identifier,
-             state=job.state.name,
-             submitted_time=job.submittime,
-             updated_at=now,
-         ).on_conflict(
-             conflict_target=[JobModel.job_id, JobModel.experiment_id, JobModel.run_id],
-             update={
-                 JobModel.state: job.state.name,
-                 JobModel.submitted_time: job.submittime,
-                 JobModel.updated_at: now,
-                 JobModel.failure_reason: None,  # Clear old failure reason on resubmit
-             },
-         ).execute()
-
-         # Update tags (run-scoped)
-         self.update_job_tags(job.identifier, experiment_id, run_id, job.tags)
-
-         # Register partials for all declared subparameters
-         subparameters = job.type._subparameters
-         for name, sp in subparameters.items():
-             partial_id = job.config.__xpm__.get_partial_identifier(sp)
-             partial_id_hex = partial_id.all.hex()
-
-             # Register the partial directory
-             self.register_partial(partial_id_hex, task_id, name)
-
-             # Link job to partial
-             self.register_job_partial(
-                 job.identifier, experiment_id, run_id, partial_id_hex
-             )
+             return ServiceState[self._state_name]
+         except KeyError:
+             # Return a mock object with name attribute for unknown states
+             class MockState:
+                 def __init__(self, name):
+                     self.name = name

-         logger.debug(
-             "Recorded job submission: %s (experiment=%s, run=%s)",
-             job.identifier,
-             experiment_id,
-             run_id,
-         )
+             return MockState(self._state_name)
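
The state property above falls back to a duck-typed object when the stored name is not a ServiceState member; a self-contained sketch of the same pattern with a stand-in enum:

    from enum import Enum
    from types import SimpleNamespace

    class State(Enum):           # stand-in for experimaestro's ServiceState
        RUNNING = 1

    def resolve(name: str):
        try:
            return State[name]
        except KeyError:
            return SimpleNamespace(name=name)  # only .name is needed by the TUI

    assert resolve("RUNNING") is State.RUNNING
    assert resolve("MOCK").name == "MOCK"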
 
-         # Notify listeners
-         job_path = str(
-             self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
-         )
-         self._notify_listeners(
-             StateEvent(
-                 event_type=StateEventType.JOB_UPDATED,
-                 data={
-                     "jobId": job.identifier,
-                     "taskId": str(job.type.identifier),
-                     "experimentId": experiment_id,
-                     "runId": run_id,
-                     "status": job.state.name,
-                     "path": job_path,
-                     "updatedAt": now.isoformat(),
-                 },
-             )
-         )
+     def description(self) -> str:
+         """Return service description"""
+         return self._description

-     @_with_db_context
-     def update_job_state(self, job: "Job", experiment_id: str, run_id: str):
-         """Update the state of a job
+     def state_dict(self) -> dict:
+         """Return service state for recreation"""
+         return self._state_dict_data

-         Args:
-             job: Job instance
-             experiment_id: Experiment identifier
-             run_id: Run identifier
+     def full_state_dict(self) -> dict:
+         """Get full state as dictionary for JSON serialization.

-         Raises:
-             RuntimeError: If in read-only mode
+         Overrides BaseService.full_state_dict() to preserve the original
+         service class name instead of using MockService's class name.
          """
-         if self.read_only:
-             raise RuntimeError("Cannot update jobs in read-only mode")
-
-         # Build update dict with updated_at timestamp
-         now = datetime.now()
-         update_data = {
-             JobModel.state: job.state.name,
-             JobModel.updated_at: now,
+         return {
+             "service_id": self.id,
+             "description": self._description,
+             "class": self._service_class,
+             "state_dict": self._state_dict_data,
          }

-         # Add or clear failure reason based on state
-         from experimaestro.scheduler.jobs import JobStateError
-
-         if isinstance(job.state, JobStateError) and job.state.failure_reason:
-             update_data[JobModel.failure_reason] = job.state.failure_reason.name
-         else:
-             # Clear failure reason when job is not in error state
-             update_data[JobModel.failure_reason] = None
-
-         # Add timing information
-         if job.starttime:
-             update_data[JobModel.started_time] = job.starttime
-         if job.endtime:
-             update_data[JobModel.ended_time] = job.endtime
-
-         # Add progress information
-         if job._progress:
-             update_data[JobModel.progress] = json.dumps(
-                 [
-                     {"level": p.level, "progress": p.progress, "desc": p.desc}
-                     for p in job._progress
-                 ]
-             )
+     @property
+     def service_class(self) -> Optional[str]:
+         """Return service class name"""
+         return self._service_class

-         # Update the job record
-         JobModel.update(update_data).where(
-             (JobModel.job_id == job.identifier)
-             & (JobModel.experiment_id == experiment_id)
-             & (JobModel.run_id == run_id)
-         ).execute()
+     @classmethod
+     def from_full_state_dict(cls, d: Dict) -> "MockService":
+         """Create MockService from full state dictionary

-         logger.debug(
-             "Updated job state: %s -> %s (experiment=%s, run=%s)",
-             job.identifier,
-             job.state.name,
-             experiment_id,
-             run_id,
-         )
          Args:
              d: Dictionary from full_state_dict()

-         # Notify listeners
-         job_path = str(
-             self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
-         )
-         self._notify_listeners(
-             StateEvent(
-                 event_type=StateEventType.JOB_UPDATED,
-                 data={
-                     "jobId": job.identifier,
-                     "taskId": str(job.type.identifier),
-                     "experimentId": experiment_id,
-                     "runId": run_id,
-                     "status": job.state.name,
-                     "path": job_path,
-                     "updatedAt": now.isoformat(),
-                 },
-             )
+         Returns:
+             MockService instance (state is always MOCK, not from dict)
+         """
+         return cls(
+             service_id=d["service_id"],
+             description_text=d.get("description", ""),
+             state_dict_data=d.get("state_dict", {}),
+             service_class=d.get("class"),
+             experiment_id=d.get("experiment_id"),
+             run_id=d.get("run_id"),
+             url=d.get("url"),
          )
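
full_state_dict() and from_full_state_dict() are designed to round-trip, keeping the original service class name rather than MockService. A sketch with invented values:

    svc = MockService(
        service_id="tensorboard",
        description_text="TensorBoard on :6006",
        state_dict_data={"port": 6006},
        service_class="TensorboardService",   # hypothetical original class name
    )
    clone = MockService.from_full_state_dict(svc.full_state_dict())
    assert clone.service_class == "TensorboardService"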
 
-     @_with_db_context
-     def update_job_tags(
-         self, job_id: str, experiment_id: str, run_id: str, tags_dict: Dict[str, str]
-     ):
-         """Update tags for a job (run-scoped - fixes GH #128)
+     def to_service(self) -> "BaseService":
+         """Try to recreate a live Service instance from this mock.

-         Deletes existing tags for this (job_id, experiment_id, run_id) combination
-         and inserts new tags. This ensures that the same job in different runs can
-         have different tags.
-
-         Args:
-             job_id: Job identifier
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-             tags_dict: Dictionary of tag key-value pairs
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot update tags in read-only mode")
-
-         # Delete existing tags for this job/experiment/run
-         JobTagModel.delete().where(
-             (JobTagModel.job_id == job_id)
-             & (JobTagModel.experiment_id == experiment_id)
-             & (JobTagModel.run_id == run_id)
-         ).execute()
-
-         # Insert new tags
-         if tags_dict:
-             tag_records = [
-                 {
-                     "job_id": job_id,
-                     "experiment_id": experiment_id,
-                     "run_id": run_id,
-                     "tag_key": key,
-                     "tag_value": value,
-                 }
-                 for key, value in tags_dict.items()
-             ]
-             JobTagModel.insert_many(tag_records).execute()
-
-         logger.debug(
-             "Updated tags for job %s (experiment=%s, run=%s): %s",
-             job_id,
-             experiment_id,
-             run_id,
-             tags_dict,
-         )
-
-     @_with_db_context
-     def delete_job(self, job_id: str, experiment_id: str, run_id: str):
-         """Remove a job, its tags, and partial references
-
-         Args:
-             job_id: Job identifier
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot delete jobs in read-only mode")
-
-         # Delete tags first (foreign key constraint)
-         JobTagModel.delete().where(
-             (JobTagModel.job_id == job_id)
-             & (JobTagModel.experiment_id == experiment_id)
-             & (JobTagModel.run_id == run_id)
-         ).execute()
-
-         # Delete partial references
-         JobPartialModel.delete().where(
-             (JobPartialModel.job_id == job_id)
-             & (JobPartialModel.experiment_id == experiment_id)
-             & (JobPartialModel.run_id == run_id)
-         ).execute()
-
-         # Delete job
-         JobModel.delete().where(
-             (JobModel.job_id == job_id)
-             & (JobModel.experiment_id == experiment_id)
-             & (JobModel.run_id == run_id)
-         ).execute()
-
-         logger.debug(
-             "Deleted job %s (experiment=%s, run=%s)", job_id, experiment_id, run_id
-         )
-
-     # CLI utility methods for job management
-
-     @_with_db_context
-     def get_all_jobs(
-         self,
-         state: Optional[str] = None,
-         tags: Optional[Dict[str, str]] = None,
-         since: Optional[datetime] = None,
-     ) -> List[MockJob]:
-         """Query all jobs across all experiments/runs
-
-         This method is designed for CLI tools that need to list or manage jobs
-         across the entire workspace, regardless of experiment or run.
-
-         Args:
-             state: Filter by job state (e.g., "done", "error", "running")
-             tags: Filter by tags (all tags must match)
-             since: If provided, only return jobs updated after this timestamp
-
-         Returns:
-             List of MockJob objects
-         """
-         # Build base query
-         query = JobModel.select()
-
-         # Apply since filter for incremental updates
-         if since is not None:
-             query = query.where(JobModel.updated_at > since)
-
-         # Apply state filter
-         if state is not None:
-             query = query.where(JobModel.state == state)
-
-         # Apply tag filters
-         if tags:
-             for tag_key, tag_value in tags.items():
-                 query = query.join(
-                     JobTagModel,
-                     on=(
-                         (JobTagModel.job_id == JobModel.job_id)
-                         & (JobTagModel.experiment_id == JobModel.experiment_id)
-                         & (JobTagModel.run_id == JobModel.run_id)
-                         & (JobTagModel.tag_key == tag_key)
-                         & (JobTagModel.tag_value == tag_value)
-                     ),
-                 )
-
-         # Execute query and convert to MockJob objects
-         jobs = []
-         for job_model in query:
-             # Get tags for this job
-             job_tags = self._get_job_tags(
-                 job_model.job_id, job_model.experiment_id, job_model.run_id
-             )
-             jobs.append(self._job_model_to_dict(job_model, job_tags))
-
-         return jobs
-
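
The removed tag filtering used one JobTagModel join per (key, value) pair, i.e. "all tags must match" semantics. The same predicate in plain Python over already-loaded jobs (data invented):

    def matches(job_tags: dict, wanted: dict) -> bool:
        return all(job_tags.get(k) == v for k, v in wanted.items())

    jobs = [
        {"id": "j1", "tags": {"model": "bm25", "split": "dev"}},
        {"id": "j2", "tags": {"model": "bm25"}},
    ]
    hits = [j["id"] for j in jobs if matches(j["tags"], {"model": "bm25", "split": "dev"})]
    assert hits == ["j1"]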
-     def kill_job(self, job: MockJob, perform: bool = False) -> bool:
-         """Kill a running job process
-
-         This method finds the process associated with a running job and kills it.
-         It also updates the job state in the database to ERROR.
-
-         Args:
-             job: MockJob instance to kill
-             perform: If True, actually kill the process. If False, just check
-                 if the job can be killed (dry run).
+         Attempts to recreate the service using the stored configuration.
+         If recreation fails, returns self.

          Returns:
-             True if job was killed (or would be killed in dry run),
-             False if job is not running or process not found
-         """
-         # Check if job is in a running state
-         if not job.state.running():
-             logger.debug("Job %s is not running (state=%s)", job.identifier, job.state)
-             return False
-
-         # Get process from job
-         process = job.getprocess()
-         if process is None:
-             logger.warning("Could not get process for job %s", job.identifier)
-             return False
-
-         if perform:
-             try:
-                 logger.info("Killing job %s (process: %s)", job.identifier, process)
-                 process.kill()
-
-                 # Update job state in database
-                 if not self.read_only:
-                     self._update_job_state_to_error(job, "killed")
-             except Exception as e:
-                 logger.error("Error killing job %s: %s", job.identifier, e)
-                 return False
-
-         return True
-
-     def _update_job_state_to_error(self, job: MockJob, reason: str):
-         """Update job state to ERROR in database
-
-         Args:
-             job: MockJob instance
-             reason: Failure reason
-         """
-         if self.read_only:
-             return
-
-         now = datetime.now()
-         with self.workspace_db.bind_ctx([JobModel]):
-             JobModel.update(
-                 state="error",
-                 failure_reason=reason,
-                 ended_time=now.timestamp(),
-                 updated_at=now,
-             ).where(
-                 (JobModel.job_id == job.identifier)
-                 & (JobModel.experiment_id == job.experiment_id)
-                 & (JobModel.run_id == job.run_id)
-             ).execute()
-
-         logger.debug(
-             "Updated job %s state to error (reason=%s)", job.identifier, reason
-         )
-
-     def clean_job(self, job: MockJob, perform: bool = False) -> bool:
-         """Clean a finished job (delete directory and DB entry)
-
-         This method removes the job's working directory and its database entry.
-         Only finished jobs (DONE or ERROR state) can be cleaned.
-
-         Args:
-             job: MockJob instance to clean
-             perform: If True, actually delete the job. If False, just check
-                 if the job can be cleaned (dry run).
-
-         Returns:
-             True if job was cleaned (or would be cleaned in dry run),
-             False if job is not finished or cannot be cleaned
-         """
-         from shutil import rmtree
-
-         # Check if job is in a finished state
-         if not job.state.finished():
-             logger.debug(
-                 "Job %s is not finished (state=%s), cannot clean",
-                 job.identifier,
-                 job.state,
-             )
-             return False
-
-         if perform:
-             # Delete job directory
-             if job.path.exists():
-                 logger.info("Cleaning job %s: removing %s", job.identifier, job.path)
-                 rmtree(job.path)
-             else:
-                 logger.warning("Job directory does not exist: %s", job.path)
-
-             # Delete from database
-             if not self.read_only:
-                 self.delete_job(job.identifier, job.experiment_id, job.run_id)
-
-         return True
-
-     def kill_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
-         """Kill multiple jobs
-
-         Args:
-             jobs: List of MockJob instances to kill
-             perform: If True, actually kill the processes. If False, dry run.
-
-         Returns:
-             Number of jobs that were killed (or would be killed in dry run)
-         """
-         count = 0
-         for job in jobs:
-             if self.kill_job(job, perform=perform):
-                 count += 1
-         return count
-
-     def clean_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
-         """Clean multiple finished jobs
-
-         Args:
-             jobs: List of MockJob instances to clean
-             perform: If True, actually delete the jobs. If False, dry run.
-
-         Returns:
-             Number of jobs that were cleaned (or would be cleaned in dry run)
-         """
-         count = 0
-         for job in jobs:
-             if self.clean_job(job, perform=perform):
-                 count += 1
-         return count
-
-     def delete_job_safely(
-         self, job: MockJob, cascade_orphans: bool = True
-     ) -> tuple[bool, str]:
-         """Delete a job with proper locking and orphan cleanup
-
-         This method is designed for TUI/UI use. It acquires a lock on the job
-         to prevent race conditions, then deletes the job directory and DB entry.
-
-         Args:
-             job: MockJob instance to delete
-             cascade_orphans: If True, clean up orphan partials after deletion
-
-         Returns:
-             Tuple of (success: bool, message: str)
-         """
-         import fasteners
-         from shutil import rmtree
-
-         # Check if job is running
-         if job.state.running():
-             return False, "Cannot delete a running job"
-
-         # Check if path exists
-         if not job.path or not job.path.exists():
-             # Just delete from database if path doesn't exist
-             if not self.read_only:
-                 self.delete_job(job.identifier, job.experiment_id, job.run_id)
-             if cascade_orphans:
-                 self.cleanup_orphan_partials(perform=True)
-             return True, f"Job {job.identifier} deleted (directory already gone)"
-
-         # Try to acquire job lock (non-blocking)
-         # Lock file is typically {script_name}.lock, but we use .lock for general locking
-         lock_path = job.path / ".lock"
-         lock = fasteners.InterProcessLock(str(lock_path))
-
-         if not lock.acquire(blocking=False):
-             return False, "Job is currently locked (possibly running)"
-
-         try:
-             # Delete all files except the lock file
-             for item in job.path.iterdir():
-                 if item.name != ".lock":
-                     if item.is_dir():
-                         rmtree(item)
-                     else:
-                         item.unlink()
-
-             # Mark job as "phantom" in database (don't delete - keep as phantom)
-             if not self.read_only:
-                 from datetime import datetime
-
-                 JobModel.update(
-                     state="phantom",
-                     updated_at=datetime.now(),
-                 ).where(
-                     (JobModel.job_id == job.identifier)
-                     & (JobModel.experiment_id == job.experiment_id)
-                     & (JobModel.run_id == job.run_id)
-                 ).execute()
-
-         finally:
-             lock.release()
-             # Now delete the lock file and directory
-             try:
-                 lock_path.unlink(missing_ok=True)
-                 if job.path.exists() and not any(job.path.iterdir()):
-                     job.path.rmdir()
-             except Exception as e:
-                 logger.warning("Could not clean up lock file: %s", e)
-
-         # Clean up orphan partials if requested
-         if cascade_orphans:
-             self.cleanup_orphan_partials(perform=True)
-
-         return True, f"Job {job.identifier} deleted successfully"
-
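
The removed delete_job_safely relied on a non-blocking fasteners.InterProcessLock acquire so a live job is never deleted from under its runner. A minimal sketch of that locking discipline (the path is invented):

    import fasteners

    lock = fasteners.InterProcessLock("/tmp/job-dir/.lock")
    if not lock.acquire(blocking=False):
        print("job is busy; leaving it alone")
    else:
        try:
            pass  # delete everything except the lock file here
        finally:
            lock.release()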
-     @_with_db_context
-     def delete_experiment(
-         self, experiment_id: str, delete_jobs: bool = False
-     ) -> tuple[bool, str]:
-         """Delete an experiment from the database
-
-         Args:
-             experiment_id: Experiment identifier
-             delete_jobs: If True, also delete associated jobs (default: False)
-
-         Returns:
-             Tuple of (success: bool, message: str)
-         """
-         from shutil import rmtree
-
-         if self.read_only:
-             return False, "Cannot delete in read-only mode"
-
-         # Get all jobs for this experiment
-         jobs = self.get_jobs(experiment_id)
-         running_jobs = [j for j in jobs if j.state.running()]
-
-         if running_jobs:
-             return (
-                 False,
-                 f"Cannot delete experiment with {len(running_jobs)} running job(s)",
-             )
-
-         # Delete jobs if requested
-         if delete_jobs:
-             for job in jobs:
-                 success, msg = self.delete_job_safely(job, cascade_orphans=False)
-                 if not success:
-                     logger.warning("Failed to delete job %s: %s", job.identifier, msg)
-
-         # Delete experiment runs
-         ExperimentRunModel.delete().where(
-             ExperimentRunModel.experiment_id == experiment_id
-         ).execute()
-
-         # Delete experiment
-         ExperimentModel.delete().where(
-             ExperimentModel.experiment_id == experiment_id
-         ).execute()
-
-         # Optionally delete experiment directory
-         exp_path = self.workspace_path / "xp" / experiment_id
-         if exp_path.exists():
-             try:
-                 rmtree(exp_path)
-             except Exception as e:
-                 logger.warning("Could not delete experiment directory: %s", e)
-
-         # Clean up orphan partials
-         self.cleanup_orphan_partials(perform=True)
-
-         return True, f"Experiment {experiment_id} deleted successfully"
-
-     @_with_db_context
-     def get_orphan_jobs(self) -> List[MockJob]:
-         """Find jobs that have no associated experiment in the database
-
-         Returns:
-             List of MockJob instances for orphan jobs
-         """
-         # Get all jobs
-         all_jobs = self.get_all_jobs()
-
-         # Get all experiment IDs
-         experiments = self.get_experiments()
-         experiment_ids = {exp.experiment_id for exp in experiments}
-
-         # Find jobs with no matching experiment
-         orphan_jobs = [
-             job for job in all_jobs if job.experiment_id not in experiment_ids
-         ]
-
-         return orphan_jobs
-
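
Orphan detection in the removed get_orphan_jobs is a set difference between job-side and experiment-side identifiers; the same idea with invented data:

    experiment_ids = {"exp-a", "exp-b"}
    jobs = [("j1", "exp-a"), ("j2", "exp-gone")]
    orphans = [job for job, exp in jobs if exp not in experiment_ids]
    assert orphans == ["j2"]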
1971
- # Service operations
1972
-
1973
- @_with_db_context
1974
- def register_service(
1975
- self,
1976
- service_id: str,
1977
- experiment_id: str,
1978
- run_id: str,
1979
- description: str,
1980
- state_dict: Optional[str] = None,
1981
- ):
1982
- """Register a service in the database
1983
-
1984
- Services are only added or removed, not updated. Runtime state
1985
- is managed by the Service object itself.
1986
-
1987
- Args:
1988
- service_id: Service identifier
1989
- experiment_id: Experiment identifier
1990
- run_id: Run identifier
1991
- description: Human-readable description
1992
- state_dict: JSON serialized state_dict for service recreation
1993
-
1994
- Raises:
1995
- RuntimeError: If in read-only mode
1996
- """
1997
- if self.read_only:
1998
- raise RuntimeError("Cannot register services in read-only mode")
1999
-
2000
- insert_data = {
2001
- "service_id": service_id,
2002
- "experiment_id": experiment_id,
2003
- "run_id": run_id,
2004
- "description": description,
2005
- "created_at": datetime.now(),
2006
- }
2007
-
2008
- if state_dict is not None:
2009
- insert_data["state_dict"] = state_dict
2010
-
2011
- # Use INSERT OR IGNORE - services are only added, not updated
2012
- ServiceModel.insert(**insert_data).on_conflict_ignore().execute()
2013
-
2014
- logger.debug(
2015
- "Registered service %s (experiment=%s, run=%s)",
2016
- service_id,
2017
- experiment_id,
2018
- run_id,
2019
- )
2020
-
2021
- # Notify listeners
2022
- self._notify_listeners(
2023
- StateEvent(
2024
- event_type=StateEventType.SERVICE_UPDATED,
2025
- data={
2026
- "serviceId": service_id,
2027
- "experimentId": experiment_id,
2028
- "runId": run_id,
2029
- "description": description,
2030
- },
2031
- )
2032
- )
2033
-
2034
- def _get_live_services(
2035
- self, experiment_id: Optional[str], run_id: Optional[str]
2036
- ) -> Optional[List["Service"]]:
2037
- """Get live services from scheduler if available.
2038
-
2039
- Returns None if no live services (experiment not in scheduler).
2040
- """
2041
- if experiment_id is None:
2042
- return None
2043
-
2044
- try:
2045
- from experimaestro.scheduler.base import Scheduler
2046
-
2047
- if not Scheduler.has_instance():
2048
- return None
2049
-
2050
- scheduler = Scheduler.instance()
2051
- if experiment_id not in scheduler.experiments:
2052
- logger.debug("Experiment %s not in scheduler", experiment_id)
2053
- return None
2054
-
2055
- exp = scheduler.experiments[experiment_id]
2056
- services = list(exp.services.values())
2057
- logger.debug(
2058
- "Returning %d live services for experiment %s",
2059
- len(services),
2060
- experiment_id,
2061
- )
2062
- return services
2063
-
2064
- except Exception as e:
2065
- logger.warning("Could not get live services: %s", e)
2066
- return None
2067
-
2068
- @_with_db_context
2069
- def _fetch_services_from_storage(
2070
- self, experiment_id: Optional[str], run_id: Optional[str]
2071
- ) -> List["Service"]:
2072
- """Fetch services from database.
2073
-
2074
- Called when no live services and cache is empty.
2075
- """
2076
- from experimaestro.scheduler.services import Service
2077
-
2078
- query = ServiceModel.select()
2079
-
2080
- if experiment_id is not None:
2081
- query = query.where(
2082
- (ServiceModel.experiment_id == experiment_id)
2083
- & (ServiceModel.run_id == run_id)
2084
- )
2085
-
2086
- services = []
2087
-
2088
- for service_model in query:
2089
- service_id = service_model.service_id
2090
-
2091
- # Try to recreate service from state_dict
2092
- state_dict_json = service_model.state_dict
2093
- if state_dict_json and state_dict_json != "{}":
2094
- try:
2095
- state_dict = json.loads(state_dict_json)
2096
- if "__class__" in state_dict:
2097
- service = Service.from_state_dict(state_dict)
2098
- except Exception as e:
2099
- service = MockService(
2100
- service_id,
2101
- f"error: {e}",
2102
- {},
2103
- experiment_id=experiment_id,
2104
- run_id=run_id,
2105
- )
2106
-
2107
- logger.warning(
2108
- "Failed to recreate service %s from state_dict: %s",
2109
- service_id,
2110
- e,
2111
- )
2112
- else:
2113
- # If we can't recreate, skip this service (it's not usable)
2114
- logger.debug(
2115
- "Service %s has no state_dict for recreation, skipping",
2116
- service_id,
2117
- )
2118
- service = MockService(
2119
- service_id,
2120
- "error: no state_dict",
2121
- {},
2122
- experiment_id=experiment_id,
2123
- run_id=run_id,
2124
- )
2125
-
2126
- # Add to services
2127
- service.id = service_id
2128
- services.append(service)
2129
- continue
2130
-
2131
- return services
2132
-
2133
- @_with_db_context
2134
- def get_services_raw(
2135
- self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
2136
- ) -> List[Dict]:
2137
- """Get raw service data from database without recreating Service objects
2138
-
2139
- This is useful for remote monitoring where the client may have different
2140
- modules installed than the server. Returns dictionaries with service
2141
- metadata that can be serialized over JSON-RPC.
2142
-
2143
- Args:
2144
- experiment_id: Filter by experiment (None = all)
2145
- run_id: Filter by run (None = current run if experiment_id provided)
2146
-
2147
- Returns:
2148
- List of dictionaries with service data
2149
- """
2150
- query = ServiceModel.select()
2151
-
2152
- if experiment_id is not None:
2153
- # Use current run if not specified
2154
- if run_id is None:
2155
- run_id = self.get_current_run(experiment_id)
2156
- if run_id is None:
2157
- return []
2158
-
2159
- query = query.where(
2160
- (ServiceModel.experiment_id == experiment_id)
2161
- & (ServiceModel.run_id == run_id)
2162
- )
2163
-
2164
- services = []
2165
- for service_model in query:
2166
- state_dict = {}
2167
- if service_model.state_dict and service_model.state_dict != "{}":
2168
- try:
2169
- state_dict = json.loads(service_model.state_dict)
2170
- except json.JSONDecodeError:
2171
- pass
2172
-
2173
- services.append(
2174
- {
2175
- "service_id": service_model.service_id,
2176
- "description": service_model.description,
2177
- "state_dict": state_dict,
2178
- "experiment_id": service_model.experiment_id,
2179
- "run_id": service_model.run_id,
2180
- }
2181
- )
2182
-
2183
- return services
2184
-
2185
- def get_live_job_states(self, experiment_id: str) -> Dict[str, str]:
2186
- """Get live job states from the scheduler if available
2187
-
2188
- This is useful for debugging to compare live state vs database state.
2189
-
2190
- Args:
2191
- experiment_id: The experiment ID to get live jobs for
2192
-
2193
- Returns:
2194
- Dict mapping job identifier to live state name, empty if scheduler
2195
- not available or experiment not registered
2196
- """
2197
- try:
2198
- from experimaestro.scheduler.base import Scheduler
2199
-
2200
- if not Scheduler.has_instance():
2201
- logger.debug("No scheduler instance available for live states")
2202
- return {}
2203
-
2204
- scheduler = Scheduler.instance()
2205
- live_states = {}
2206
-
2207
- logger.debug(
2208
- "get_live_job_states: looking for exp=%s, scheduler has %d jobs",
2209
- experiment_id,
2210
- len(scheduler.jobs),
2211
- )
2212
-
2213
- for job_id, job in scheduler.jobs.items():
2214
- # Filter by experiment if needed
2215
- if hasattr(job, "experiment") and job.experiment is not None:
2216
- if hasattr(job.experiment, "workdir"):
2217
- job_exp_id = job.experiment.workdir.name
2218
- if job_exp_id == experiment_id:
2219
- live_states[job_id] = job.state.name
2220
- else:
2221
- logger.debug(
2222
- "Job %s exp_id=%s != requested %s",
2223
- job_id[:8],
2224
- job_exp_id,
2225
- experiment_id,
2226
- )
2227
- else:
2228
- # Job not associated with experiment, include it anyway
2229
- live_states[job_id] = job.state.name
2230
- logger.debug(
2231
- "Job %s has no experiment, including anyway", job_id[:8]
2232
- )
2233
-
2234
- logger.debug("Returning %d live job states", len(live_states))
2235
- return live_states
2236
-
2237
- except Exception as e:
2238
- logger.debug("Could not get live job states: %s", e)
2239
- return {}
2240
-
2241
- # Sync metadata methods
2242
-
2243
- @_with_db_context
2244
- def get_last_sync_time(self) -> Optional[datetime]:
2245
- """Get the timestamp of the last successful sync
2246
-
2247
- Returns:
2248
- datetime of last sync, or None if never synced
2249
- """
2250
- from peewee import OperationalError
2251
-
2252
- from .state_db import WorkspaceSyncMetadata
2253
-
2254
- try:
2255
- metadata = WorkspaceSyncMetadata.get_or_none(
2256
- WorkspaceSyncMetadata.id == "workspace"
2257
- )
2258
- if metadata and metadata.last_sync_time:
2259
- return metadata.last_sync_time
2260
- except OperationalError:
2261
- # Table might not exist in older workspaces opened in read-only mode
2262
- pass
2263
- return None
2264
-
2265
- @_with_db_context
2266
- def update_last_sync_time(self) -> None:
2267
- """Update the last sync timestamp to now
2268
-
2269
- Raises:
2270
- RuntimeError: If in read-only mode
2271
- """
2272
- if self.read_only:
2273
- raise RuntimeError("Cannot update sync time in read-only mode")
2274
-
2275
- from .state_db import WorkspaceSyncMetadata
2276
-
2277
- WorkspaceSyncMetadata.insert(
2278
- id="workspace", last_sync_time=datetime.now()
2279
- ).on_conflict(
2280
- conflict_target=[WorkspaceSyncMetadata.id],
2281
- update={WorkspaceSyncMetadata.last_sync_time: datetime.now()},
2282
- ).execute()
2283
- logger.debug("Updated last sync time")
-
-     # Partial management methods
-
-     @_with_db_context
-     def register_partial(
-         self, partial_id: str, task_id: str, subparameters_name: str
-     ) -> None:
-         """Register a partial directory (creates if not exists)
-
-         Args:
-             partial_id: Hex hash of the partial identifier
-             task_id: Task class identifier
-             subparameters_name: Name of the subparameters definition
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot register partials in read-only mode")
-
-         PartialModel.insert(
-             partial_id=partial_id,
-             task_id=task_id,
-             subparameters_name=subparameters_name,
-             created_at=datetime.now(),
-         ).on_conflict_ignore().execute()
-
-         logger.debug(
-             "Registered partial: %s (task=%s, subparams=%s)",
-             partial_id,
-             task_id,
-             subparameters_name,
-         )
-
-     @_with_db_context
-     def register_job_partial(
-         self, job_id: str, experiment_id: str, run_id: str, partial_id: str
-     ) -> None:
-         """Link a job to a partial directory it uses
-
-         Args:
-             job_id: Job identifier
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-             partial_id: Partial directory identifier
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot register job partials in read-only mode")
-
-         JobPartialModel.insert(
-             job_id=job_id,
-             experiment_id=experiment_id,
-             run_id=run_id,
-             partial_id=partial_id,
-         ).on_conflict_ignore().execute()
-
-         logger.debug(
-             "Linked job %s to partial %s (experiment=%s, run=%s)",
-             job_id,
-             partial_id,
-             experiment_id,
-             run_id,
-         )
-
-     @_with_db_context
-     def unregister_job_partials(
-         self, job_id: str, experiment_id: str, run_id: str
-     ) -> None:
-         """Remove all partial links for a job
-
-         Called when a job is deleted to clean up its partial references.
-
-         Args:
-             job_id: Job identifier
-             experiment_id: Experiment identifier
-             run_id: Run identifier
-
-         Raises:
-             RuntimeError: If in read-only mode
-         """
-         if self.read_only:
-             raise RuntimeError("Cannot unregister job partials in read-only mode")
-
-         JobPartialModel.delete().where(
-             (JobPartialModel.job_id == job_id)
-             & (JobPartialModel.experiment_id == experiment_id)
-             & (JobPartialModel.run_id == run_id)
-         ).execute()
-
-         logger.debug(
-             "Unregistered partials for job %s (experiment=%s, run=%s)",
-             job_id,
-             experiment_id,
-             run_id,
-         )
-
-     @_with_db_context
-     def get_orphan_partials(self) -> List[Dict]:
-         """Find partial directories that are not referenced by any job
-
-         Returns:
-             List of dictionaries with partial_id, task_id, subparameters_name
-         """
-         # Find partials that have no job references
-         # Using a subquery to find referenced partial_ids
-         referenced_partials = JobPartialModel.select(JobPartialModel.partial_id)
-
-         orphan_query = PartialModel.select().where(
-             PartialModel.partial_id.not_in(referenced_partials)
-         )
-
-         orphans = []
-         for partial in orphan_query:
-             orphans.append(
-                 {
-                     "partial_id": partial.partial_id,
-                     "task_id": partial.task_id,
-                     "subparameters_name": partial.subparameters_name,
-                     "created_at": partial.created_at.isoformat(),
-                 }
-             )
-
-         return orphans
-
-     def cleanup_orphan_partials(self, perform: bool = False) -> List[Path]:
-         """Clean up orphan partial directories
-
-         Finds partial directories not referenced by any job and removes them.
-
-         Args:
-             perform: If True, actually delete. If False, dry run (list only).
-
-         Returns:
-             List of paths that were deleted (or would be deleted in dry run)
-         """
-         from shutil import rmtree
-
-         orphans = self.get_orphan_partials()
-         deleted_paths = []
-
-         for orphan in orphans:
-             # Reconstruct path: WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID
-             partial_path = (
-                 self.workspace_path
-                 / "partials"
-                 / orphan["task_id"]
-                 / orphan["subparameters_name"]
-                 / orphan["partial_id"]
-             )
-
-             if perform:
-                 # Delete directory if it exists
-                 if partial_path.exists():
-                     logger.info("Cleaning orphan partial: %s", partial_path)
-                     rmtree(partial_path)
-
-                 # Delete from database
-                 if not self.read_only:
-                     with self.workspace_db.bind_ctx([PartialModel]):
-                         PartialModel.delete().where(
-                             PartialModel.partial_id == orphan["partial_id"]
-                         ).execute()
-
-             deleted_paths.append(partial_path)
-
-         return deleted_paths
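
All of the removed cleanup helpers share the same perform-flag convention: a dry run returns the would-be deletions, and only perform=True touches the filesystem. A generic, self-contained sketch of that pattern:

    from pathlib import Path
    from shutil import rmtree

    def cleanup(paths, perform: bool = False):
        affected = []
        for p in map(Path, paths):
            if perform and p.exists():
                rmtree(p)            # destructive step is gated on perform
            affected.append(p)
        return affected

    preview = cleanup(["/tmp/partials/abc"], perform=False)  # nothing deleted yet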
-
-     # Utility methods
-
-     def close(self):
-         """Close the database connection and remove from registry
-
-         This should be called when done with the workspace to free resources.
-         """
-         # Stop file watcher if running
-         self._stop_file_watcher()
-
-         # Close database connection
-         if hasattr(self, "workspace_db") and self.workspace_db is not None:
-             from .state_db import close_workspace_database
-
-             close_workspace_database(self.workspace_db)
-             self.workspace_db = None
-
-         # Remove from registry
-         with WorkspaceStateProvider._lock:
-             if self.workspace_path in WorkspaceStateProvider._instances:
-                 del WorkspaceStateProvider._instances[self.workspace_path]
-
-         logger.debug("WorkspaceStateProvider closed for %s", self.workspace_path)
-
-     # Listener methods for push notifications
-
-     def add_listener(self, listener: StateListener) -> None:
-         """Register a listener for state change notifications
-
-         Listeners are called synchronously when state changes occur.
-         For UI applications, listeners should queue updates for their
-         own event loop to avoid blocking database operations.
-
-         When the first listener is added, starts watching the database
-         file for changes to enable push notifications.
-
-         Args:
-             listener: Callback function that receives StateEvent objects
-         """
-         with self._listeners_lock:
-             was_empty = len(self._listeners) == 0
-             self._listeners.add(listener)
-
-             # Start file watcher when first listener is added
-             if was_empty:
-                 self._start_file_watcher()
-
-         logger.info(
-             "Added state listener: %s (total: %d)", listener, len(self._listeners)
-         )
-
-     def remove_listener(self, listener: StateListener) -> None:
-         """Unregister a state change listener
-
-         When the last listener is removed, stops watching the database file.
-
-         Args:
-             listener: Previously registered callback function
-         """
-         with self._listeners_lock:
-             self._listeners.discard(listener)
-             is_empty = len(self._listeners) == 0
-
-             # Stop file watcher when last listener is removed
-             if is_empty:
-                 self._stop_file_watcher()
-
-         logger.debug("Removed state listener: %s", listener)
-
-     def _start_file_watcher(self) -> None:
-         """Start watching the database file for changes"""
-         if self._db_file_watch is not None:
-             logger.info("File watcher already running for %s", self._db_dir)
-             return  # Already watching
-
-         from experimaestro.ipc import ipcom
-
-         # Create and start the change detector thread
-         self._change_detector = _DatabaseChangeDetector(self)
-         self._change_detector.start()
-
-         # Create the file handler that signals the detector
-         self._db_file_handler = _DatabaseFileHandler(self._change_detector)
-         self._db_file_watch = ipcom().fswatch(
-             self._db_file_handler,
-             self._db_dir,
-             recursive=False,
-         )
-         logger.info("Started database file watcher for %s", self._db_dir)
-
-     def _stop_file_watcher(self) -> None:
-         """Stop watching the database file"""
-         if self._db_file_watch is None:
-             return  # Not watching
-
-         from experimaestro.ipc import ipcom
-
-         # Stop the file watcher first
-         ipcom().fsunwatch(self._db_file_watch)
-         self._db_file_watch = None
-         self._db_file_handler = None
-
-         # Stop the change detector thread
-         if self._change_detector is not None:
-             self._change_detector.stop()
-             self._change_detector = None
-
-         logger.debug("Stopped database file watcher for %s", self.workspace_path)
-
-     def _notify_listeners(self, event: StateEvent) -> None:
-         """Notify all registered listeners of a state change
-
-         This is called internally by state-modifying methods.
-         Listeners are called synchronously - they should be fast.
-
-         Args:
-             event: State change event to broadcast
-         """
-         with self._listeners_lock:
-             listeners = list(self._listeners)
-
-         for listener in listeners:
-             try:
-                 listener(event)
-             except Exception as e:
-                 logger.warning("Listener %s raised exception: %s", listener, e)
2580
-
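`_notify_listeners` uses the snapshot-under-lock idiom: the set is copied while the lock is held and the callbacks run after it is released, so a listener that calls `add_listener` or `remove_listener` from inside its callback cannot deadlock or mutate the set mid-iteration. A standalone sketch of the idiom (all names hypothetical):

import threading

_lock = threading.Lock()
_listeners: set = set()

def notify(event) -> None:
    with _lock:
        snapshot = list(_listeners)  # copy while the lock is held
    for callback in snapshot:  # iterate with the lock released
        try:
            callback(event)
        except Exception:
            pass  # one failing listener must not starve the others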
-    # Helper methods
-
-    @_with_db_context
-    def _get_job_tags(
-        self, job_id: str, experiment_id: str, run_id: str
-    ) -> Dict[str, str]:
-        """Get tags for a job
-
-        Args:
-            job_id: Job identifier
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-
-        Returns:
-            Dictionary of tag key-value pairs
-        """
-        tags = {}
-        for tag_model in JobTagModel.select().where(
-            (JobTagModel.job_id == job_id)
-            & (JobTagModel.experiment_id == experiment_id)
-            & (JobTagModel.run_id == run_id)
-        ):
-            tags[tag_model.tag_key] = tag_model.tag_value
-        return tags
-
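In peewee, the chained `&` expressions in `_get_job_tags` compile into a single SQL AND clause. A self-contained sketch of the round trip; the real JobTagModel schema is defined elsewhere in the package, so the fields below are assumptions inferred from the query:

from peewee import CharField, Model, SqliteDatabase

db = SqliteDatabase(":memory:")

class JobTagModel(Model):  # stand-in for the package's actual model
    job_id = CharField()
    experiment_id = CharField()
    run_id = CharField()
    tag_key = CharField()
    tag_value = CharField()

    class Meta:
        database = db

db.connect()
db.create_tables([JobTagModel])
JobTagModel.create(
    job_id="j1", experiment_id="e1", run_id="r1", tag_key="lr", tag_value="0.1"
)

query = JobTagModel.select().where(
    (JobTagModel.job_id == "j1")
    & (JobTagModel.experiment_id == "e1")
    & (JobTagModel.run_id == "r1")
)
print({t.tag_key: t.tag_value for t in query})  # {'lr': '0.1'}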
-    def _job_model_to_dict(self, job_model: JobModel, tags: Dict[str, str]) -> MockJob:
-        """Convert a JobModel to a MockJob object
-
-        Args:
-            job_model: JobModel instance
-            tags: Dictionary of tags for this job
-
-        Returns:
-            MockJob object
-        """
-        # Parse progress JSON
-        progress_list = json.loads(job_model.progress)
-
-        # Compute job path from workspace_path, task_id, and job_id
-        job_path = self.workspace_path / "jobs" / job_model.task_id / job_model.job_id
-
-        # Convert failure_reason string to enum if present
-        failure_reason = None
-        if job_model.failure_reason:
-            try:
-                failure_reason = JobFailureStatus[job_model.failure_reason]
-            except KeyError:
-                pass  # Unknown failure reason, leave as None
-
-        return MockJob(
-            identifier=job_model.job_id,
-            task_id=job_model.task_id,
-            locator=job_model.locator,
-            path=job_path,
-            state=job_model.state,
-            submittime=job_model.submitted_time,
-            starttime=job_model.started_time,
-            endtime=job_model.ended_time,
-            progress=progress_list,
-            tags=tags,
-            experiment_id=job_model.experiment_id,
-            run_id=job_model.run_id,
-            updated_at=job_model.updated_at.isoformat(),
-            failure_reason=failure_reason,
-        )
-
-    def _format_time(self, timestamp: Optional[float]) -> str:
-        """Format timestamp for UI
-
-        Args:
-            timestamp: Unix timestamp or None
-
-        Returns:
-            ISO format datetime string or empty string
-        """
-        if not timestamp:
-            return ""
-        return datetime.fromtimestamp(timestamp).isoformat()
-
-
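Two details in these helpers are easy to misread: `JobFailureStatus[...]` is an Enum lookup by member name, which raises KeyError for unknown names (hence the guard), and `_format_time` produces local-time ISO 8601 strings. A quick illustration with a stand-in enum (the member name is an assumption):

from datetime import datetime
from enum import Enum

class JobFailureStatus(Enum):  # stand-in; the real enum lives elsewhere
    FAILED = "failed"

print(JobFailureStatus["FAILED"])  # lookup by name -> JobFailureStatus.FAILED
try:
    JobFailureStatus["bogus"]
except KeyError:
    print("unknown names raise KeyError; the helper maps them to None")

print(datetime.fromtimestamp(0).isoformat())  # local-time ISO string; value depends on timezone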
-# Scheduler listener adapter
-class SchedulerListener:
-    """Adapter to connect scheduler events to WorkspaceStateProvider
-
-    This class implements the scheduler listener interface and forwards
-    events to the WorkspaceStateProvider. It tracks which experiment/run
-    each job belongs to for proper database updates.
-    """
-
-    def __init__(self, state_provider: WorkspaceStateProvider):
-        """Initialize listener
-
-        Args:
-            state_provider: WorkspaceStateProvider instance to update
-        """
-        self.state_provider = state_provider
-        # Map job_id -> (experiment_id, run_id) for tracking
-        self.job_experiments: Dict[str, tuple] = {}
-
-        logger.info("SchedulerListener initialized")
-
-    @_with_db_context
-    def job_submitted(self, job: "Job", experiment_id: str, run_id: str):
-        """Called when a job is submitted
-
-        Args:
-            job: The submitted job
-            experiment_id: Experiment this job belongs to
-            run_id: Run this job belongs to
-        """
-        # Track job's experiment/run
-        self.job_experiments[job.identifier] = (experiment_id, run_id)
-
-        # Update state provider
-        try:
-            self.state_provider.update_job_submitted(job, experiment_id, run_id)
-        except Exception as e:
-            logger.exception(
-                "Error updating job submission for %s: %s", job.identifier, e
-            )
-
-    @_with_db_context
-    def job_state(self, job: "Job"):
-        """Called when a job's state changes
-
-        Args:
-            job: The job with updated state
-        """
-        # Look up job's experiment/run
-        if job.identifier not in self.job_experiments:
-            logger.warning(
-                "State change for unknown job %s (not tracked by listener)",
-                job.identifier,
-            )
-            return
-
-        experiment_id, run_id = self.job_experiments[job.identifier]
-
-        # Update state provider
-        try:
-            self.state_provider.update_job_state(job, experiment_id, run_id)
-        except Exception as e:
-            logger.exception("Error updating job state for %s: %s", job.identifier, e)
-
-    @_with_db_context
-    def service_add(self, service: "Service", experiment_id: str, run_id: str):
-        """Called when a service is added
-
-        Args:
-            service: The added service
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-        """
-        from experimaestro.scheduler.services import Service
-
-        try:
-            # Get state_dict for service recreation
-            state_dict_json = None
-            try:
-                # _full_state_dict includes __class__ automatically
-                state_dict = service._full_state_dict()
-                # Serialize paths automatically
-                serialized = Service.serialize_state_dict(state_dict)
-                state_dict_json = json.dumps(serialized)
-            except Exception as e:
-                # Service cannot be serialized - store unserializable marker
-                logger.warning(
-                    "Could not get state_dict for service %s: %s", service.id, e
-                )
-                state_dict_json = json.dumps(
-                    {
-                        "__class__": f"{service.__class__.__module__}.{service.__class__.__name__}",
-                        "__unserializable__": True,
-                        "__reason__": f"Cannot serialize: {e}",
-                    }
-                )
-
-            self.state_provider.register_service(
-                service.id,
-                experiment_id,
-                run_id,
-                service.description(),
-                state_dict=state_dict_json,
-            )
-        except Exception as e:
-            logger.exception("Error updating service %s: %s", service.id, e)
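When serialization fails, `service_add` stores a tombstone payload instead of raising, so whatever later restores services must check for the marker before attempting recreation. A sketch of that guard; the function name is hypothetical, but the marker keys are copied verbatim from the code above:

import json

def load_service_state(state_dict_json: str):
    state = json.loads(state_dict_json)
    if state.get("__unserializable__"):
        # Tombstone written on serialization failure: recreation is
        # impossible, but the class name and reason remain reportable.
        return None, state["__class__"], state.get("__reason__")
    return state, state["__class__"], None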
+            A live Service instance or self if recreation is not possible
+        """
+        # Just return self - service recreation from config not implemented
+        return self
+
+
+__all__ = [
+    # Data classes
+    "ProcessInfo",
+    # Listener type alias
+    "StateListener",
+    # ABC
+    "StateProvider",
+    "OfflineStateProvider",
+    # Mock classes
+    "MockJob",
+    "MockExperiment",
+    "MockService",
+]
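The explicit `__all__` pins the module's public surface, so downstream code can import exactly these names; for example (assuming the module path remains experimaestro/scheduler/state_provider.py):

# Only the names listed in __all__ are part of the public surface:
from experimaestro.scheduler.state_provider import MockJob, StateListener, StateProvider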