experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (133) hide show
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,2345 @@
1
+ """Unified workspace state provider for accessing experiment and job information
2
+
3
+ This module provides a single WorkspaceStateProvider class that accesses state
4
+ from the workspace-level database (.experimaestro/workspace.db). This replaces
5
+ the previous multi-provider architecture with a unified approach.
6
+
7
+ Key features:
8
+ - Single .experimaestro/workspace.db database shared across all experiments
9
+ - Support for multiple runs per experiment
10
+ - Run-scoped tags (fixes GH #128)
11
+ - Thread-safe database access via thread-local connections
12
+ - Real-time updates via scheduler listener interface
13
+ - Push notifications via listener callbacks (for reactive UI)
14
+ """
15
+
16
+ import json
17
+ import logging
18
+ import threading
19
+ import time
20
+ from dataclasses import dataclass
21
+ from datetime import datetime
22
+ from enum import Enum, auto
23
+ from pathlib import Path
24
+ from typing import Callable, Dict, List, Optional, Set, TYPE_CHECKING
25
+
26
+ from watchdog.events import FileSystemEventHandler
27
+ from watchdog.observers.api import ObservedWatch
28
+
29
+ from experimaestro.scheduler.state_db import (
30
+ ExperimentModel,
31
+ ExperimentRunModel,
32
+ JobModel,
33
+ JobTagModel,
34
+ ServiceModel,
35
+ PartialModel,
36
+ JobPartialModel,
37
+ ALL_MODELS,
38
+ )
39
+ from experimaestro.scheduler.interfaces import (
40
+ BaseJob,
41
+ BaseExperiment,
42
+ JobState,
43
+ JobFailureStatus,
44
+ STATE_NAME_TO_JOBSTATE,
45
+ )
46
+
47
+ if TYPE_CHECKING:
48
+ from experimaestro.scheduler.jobs import Job
49
+ from experimaestro.scheduler.services import Service
50
+
51
+ logger = logging.getLogger("xpm.state")
52
+
53
+
54
+ # Event types for state provider notifications
55
class StateEventType(Enum):
    """Types of state change events

    Values are auto-numbered; listeners should match on the member,
    never on the numeric value.
    """

    EXPERIMENT_UPDATED = auto()  # An experiment row changed
    RUN_UPDATED = auto()  # A run within an experiment changed
    JOB_UPDATED = auto()  # A job's state/metadata changed
    SERVICE_UPDATED = auto()  # A service record changed
62
+
63
+
64
@dataclass
class StateEvent:
    """Base class for state change events

    Attributes:
        event_type: Type of the event
        data: Event-specific data dictionary (keys depend on event_type;
            e.g. JOB_UPDATED carries jobId/experimentId/runId/status)
    """

    # Which kind of change occurred
    event_type: StateEventType
    # Free-form payload; schema is event-type specific
    data: Dict
75
+
76
+
77
# Type alias for listener callbacks: a listener receives a StateEvent and
# returns nothing; it must not block (called from the detector thread).
StateListener = Callable[[StateEvent], None]
79
+
80
+
81
class _DatabaseChangeDetector:
    """Background thread that detects database changes and notifies listeners

    Uses a semaphore pattern so that the watchdog event handler never blocks.
    The watchdog just signals the semaphore, and this thread does the actual
    database queries and listener notifications.

    Thread safety:
        - Uses a lock to protect start/stop transitions
        - Once stop() is called, the stop event cannot be cleared by start()
        - Uses a Condition for atomic wait-and-clear of change notifications
    """

    def __init__(self, state_provider: "WorkspaceStateProvider"):
        # Provider whose database is scanned and whose listeners are notified
        self.state_provider = state_provider
        # Timestamp of the previous scan; None means "report all rows"
        self._last_check_time: Optional[datetime] = None
        self._change_condition = threading.Condition()
        self._change_pending = False  # Protected by _change_condition
        self._thread: Optional[threading.Thread] = None
        self._debounce_seconds = 0.5  # Wait before processing to batch rapid changes
        self._state_lock = threading.Lock()  # Protects start/stop transitions
        self._stopped = False  # Once True, cannot be restarted

    def start(self) -> None:
        """Start the change detection thread

        Idempotent while running; a no-op after stop() has been called.
        """
        with self._state_lock:
            # Once stopped, cannot restart
            if self._stopped:
                logger.debug("Cannot start change detector - already stopped")
                return

            if self._thread is not None and self._thread.is_alive():
                return  # Already running

            # Daemon thread: never blocks interpreter shutdown
            self._thread = threading.Thread(
                target=self._run,
                daemon=True,
                name="DBChangeDetector",
            )
            self._thread.start()
            logger.debug("Started database change detector thread")

    def stop(self) -> None:
        """Stop the change detection thread

        Permanent: after this call, start() refuses to spawn a new thread.
        """
        with self._state_lock:
            self._stopped = True  # Mark as permanently stopped

        # Wake up the thread so it can exit
        with self._change_condition:
            self._change_condition.notify_all()

        # Join outside the lock to avoid deadlock
        if self._thread is not None:
            self._thread.join(timeout=2.0)
            self._thread = None
            logger.debug("Stopped database change detector thread")

    def signal_change(self) -> None:
        """Signal that a database change was detected (non-blocking)"""
        with self._change_condition:
            self._change_pending = True
            self._change_condition.notify()

    def _run(self) -> None:
        """Main loop: wait for changes and process them"""
        while not self._stopped:
            # Wait for a change signal and clear it atomically
            with self._change_condition:
                while not self._change_pending and not self._stopped:
                    self._change_condition.wait()

                if self._stopped:
                    break

                # Clear the pending flag atomically while holding the lock
                self._change_pending = False

            # Debounce - wait a bit for more changes to accumulate
            time.sleep(self._debounce_seconds)

            # Process all accumulated changes
            self._detect_and_notify_changes()

    def _detect_and_notify_changes(self) -> None:
        """Query the database to detect what changed and send events

        The scan timestamp is recorded *before* querying, so rows updated
        concurrently may be reported twice but are never missed.
        """
        try:
            since = self._last_check_time
            self._last_check_time = datetime.now()

            # Query for changed experiments
            with self.state_provider.workspace_db.bind_ctx([ExperimentModel]):
                query = ExperimentModel.select()
                if since:
                    query = query.where(ExperimentModel.updated_at > since)

                for exp in query:
                    self.state_provider._notify_listeners(
                        StateEvent(
                            event_type=StateEventType.EXPERIMENT_UPDATED,
                            data={
                                "experiment_id": exp.experiment_id,
                            },
                        )
                    )

            # Query for changed jobs
            with self.state_provider.workspace_db.bind_ctx([JobModel]):
                query = JobModel.select()
                if since:
                    query = query.where(JobModel.updated_at > since)

                for job in query:
                    self.state_provider._notify_listeners(
                        StateEvent(
                            event_type=StateEventType.JOB_UPDATED,
                            data={
                                "jobId": job.job_id,
                                "experimentId": job.experiment_id,
                                "runId": job.run_id,
                                "status": job.state,
                            },
                        )
                    )

        except Exception as e:
            # Best-effort scan: log and keep the detector thread alive
            logger.warning("Error detecting database changes: %s", e)
207
+
208
+
209
class _DatabaseFileHandler(FileSystemEventHandler):
    """Watchdog handler for SQLite database file changes

    The handler itself never blocks: it merely signals the change detector,
    which performs all database queries on its own thread.
    """

    def __init__(self, change_detector: _DatabaseChangeDetector):
        super().__init__()
        self.change_detector = change_detector

    def on_any_event(self, event) -> None:
        """Filter file-system events; signal on relevant database changes."""
        # Directories never carry database content
        if event.is_directory:
            return

        # Only modification-like events are of interest
        if event.event_type not in ("modified", "created", "moved"):
            return

        # Ignore everything except the database file and its WAL journal
        filename = Path(event.src_path).name
        if filename not in ("workspace.db", "workspace.db-wal"):
            return

        logger.debug(
            "Database file changed: %s (event: %s)", filename, event.event_type
        )

        # Non-blocking hand-off to the detector thread
        self.change_detector.signal_change()
240
+
241
+
242
class MockJob(BaseJob):
    """Concrete implementation of BaseJob for database-loaded jobs

    This class is used when loading job information from the database,
    as opposed to live Job instances which are created during experiment runs.
    """

    def __init__(
        self,
        identifier: str,
        task_id: str,
        locator: str,
        path: Path,
        state: str,  # State name string from DB
        submittime: Optional[float],
        starttime: Optional[float],
        endtime: Optional[float],
        progress: List[Dict],
        tags: Dict[str, str],
        experiment_id: str,
        run_id: str,
        updated_at: str,
        exit_code: Optional[int] = None,
        retry_count: int = 0,
        failure_reason: Optional[JobFailureStatus] = None,
    ):
        self.identifier = identifier
        self.task_id = task_id
        self.locator = locator
        self.path = path
        # Convert state name to JobState instance; unknown names fall back
        # to UNSCHEDULED instead of raising
        self.state = STATE_NAME_TO_JOBSTATE.get(state, JobState.UNSCHEDULED)
        self.submittime = submittime
        self.starttime = starttime
        self.endtime = endtime
        self.progress = progress
        self.tags = tags
        self.experiment_id = experiment_id
        self.run_id = run_id
        self.updated_at = updated_at
        self.exit_code = exit_code
        self.retry_count = retry_count
        self.failure_reason = failure_reason

    @classmethod
    def from_disk(cls, path: Path) -> Optional["MockJob"]:
        """Create a MockJob by reading metadata from disk

        Args:
            path: Path to the job directory

        Returns:
            MockJob instance if metadata exists, None otherwise
        """
        metadata_path = path / ".xpm_metadata.json"
        if not metadata_path.exists():
            return None

        try:
            # json is imported at module level (see getprocess); the previous
            # redundant function-local import has been removed
            with metadata_path.open("r") as f:
                metadata = json.load(f)

            return cls(
                identifier=metadata.get("job_id", path.name),
                task_id=metadata.get(
                    "task_id", path.parent.name if path.parent else "unknown"
                ),
                locator=metadata.get("job_id", path.name),
                path=path,
                state=metadata.get("state", "unscheduled"),
                submittime=metadata.get("submitted_time"),
                starttime=metadata.get("started_time"),
                endtime=metadata.get("ended_time"),
                progress=[],  # Progress not stored in metadata
                tags={},  # Tags come from jobs.jsonl, not metadata
                experiment_id="",  # Not stored in job metadata
                run_id="",  # Not stored in job metadata
                updated_at=str(metadata.get("last_updated", "")),
                exit_code=metadata.get("exit_code"),
                retry_count=metadata.get("retry_count", 0),
            )
        except Exception as e:
            # Corrupt/partial metadata is treated as "no job here"
            logger.warning("Failed to read job metadata from %s: %s", path, e)
            return None

    def getprocess(self):
        """Get process handle for running job

        This method is used for compatibility with filter expressions and
        for killing running jobs.

        Returns:
            Process instance or None if process info not available
        """
        from experimaestro.connectors import Process
        from experimaestro.connectors.local import LocalConnector

        # Get script name from task_id (last component after the last dot)
        scriptname = self.task_id.rsplit(".", 1)[-1]
        pid_file = self.path / f"{scriptname}.pid"

        if not pid_file.exists():
            return None

        try:
            connector = LocalConnector.instance()
            pinfo = json.loads(pid_file.read_text())
            return Process.fromDefinition(connector, pinfo)
        except Exception as e:
            logger.warning("Could not get process for job at %s: %s", self.path, e)
            return None
355
+
356
+
357
class MockExperiment(BaseExperiment):
    """Database-backed, read-only view of an experiment

    Instances are built from workspace-database rows, in contrast to live
    experiment objects created while an experiment is actually running.
    """

    def __init__(
        self,
        workdir: Path,
        current_run_id: Optional[str],
        total_jobs: int,
        finished_jobs: int,
        failed_jobs: int,
        updated_at: str,
        started_at: Optional[float] = None,
        ended_at: Optional[float] = None,
    ):
        # Plain attribute copies; no validation is performed here
        self.workdir = workdir
        self.current_run_id = current_run_id
        self.updated_at = updated_at
        # Job counters for the current run
        self.total_jobs = total_jobs
        self.finished_jobs = finished_jobs
        self.failed_jobs = failed_jobs
        # Run timestamps (POSIX seconds), when known
        self.started_at = started_at
        self.ended_at = ended_at

    @property
    def experiment_id(self) -> str:
        """Experiment identifier derived from workdir name"""
        return self.workdir.name
388
+
389
+
390
+ def _with_db_context(func):
391
+ """Decorator to wrap method in database bind context
392
+
393
+ This ensures all database queries have the models bound to the database.
394
+ """
395
+ from functools import wraps
396
+
397
+ @wraps(func)
398
+ def wrapper(self, *args, **kwargs):
399
+ try:
400
+ with self.workspace_db.bind_ctx(ALL_MODELS):
401
+ return func(self, *args, **kwargs)
402
+ except Exception as e:
403
+ logger.exception("Error in %s with database context: %s", func.__name__, e)
404
+ raise
405
+
406
+ return wrapper
407
+
408
+
409
+ class WorkspaceStateProvider:
410
+ """Unified state provider for workspace-level database (singleton per workspace path)
411
+
412
+ Provides access to experiment and job state from a single workspace database.
413
+ Supports both read-only (monitoring) and read-write (scheduler) modes.
414
+
415
+ Only one WorkspaceStateProvider instance exists per workspace path. Subsequent
416
+ requests for the same path return the existing instance.
417
+
418
+ Thread safety:
419
+ - Database connections are thread-local (managed by state_db module)
420
+ - Singleton registry is protected by a lock
421
+ - Each thread gets its own database connection
422
+
423
+ Run tracking:
424
+ - Each experiment can have multiple runs
425
+ - Jobs/services are scoped to (experiment_id, run_id)
426
+ - Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
427
+ """
428
+
429
    # Registry of state provider instances by absolute path
    # (class-level singleton map; all access is guarded by ``_lock``)
    _instances: Dict[Path, "WorkspaceStateProvider"] = {}
    _lock = threading.Lock()
432
+
433
    @classmethod
    def get_instance(
        cls,
        workspace_path: Path,
        read_only: bool = False,
        sync_on_start: bool = False,
        sync_interval_minutes: int = 5,
    ) -> "WorkspaceStateProvider":
        """Get or create WorkspaceStateProvider instance for a workspace path

        Args:
            workspace_path: Root workspace directory
            read_only: If True, database is in read-only mode
            sync_on_start: If True, sync from disk on initialization
            sync_interval_minutes: Minimum interval between syncs (default: 5)

        Returns:
            WorkspaceStateProvider instance (singleton per path)

        Raises:
            RuntimeError: If an instance for this path already exists with a
                different ``read_only`` mode
        """
        # Normalize path (accept str or Path; registry keys are absolute)
        if isinstance(workspace_path, Path):
            workspace_path = workspace_path.absolute()
        else:
            workspace_path = Path(workspace_path).absolute()

        # Check if instance already exists
        with cls._lock:
            if workspace_path in cls._instances:
                existing = cls._instances[workspace_path]
                # Fail if requesting different read_only mode than cached instance
                if existing.read_only != read_only:
                    raise RuntimeError(
                        f"WorkspaceStateProvider for {workspace_path} already exists "
                        f"with read_only={existing.read_only}, cannot open with "
                        f"read_only={read_only}. Close the existing instance first."
                    )
                return existing

            # Create new instance - register BEFORE __init__ to handle
            # nested get_instance calls during sync_on_start
            instance = object.__new__(cls)
            cls._instances[workspace_path] = instance

        # Initialize outside the lock to avoid deadlock during sync
        try:
            instance.__init__(
                workspace_path, read_only, sync_on_start, sync_interval_minutes
            )
        except Exception:
            # Remove from registry if initialization fails, so a later call
            # can retry rather than receive a half-initialized instance
            with cls._lock:
                cls._instances.pop(workspace_path, None)
            raise
        return instance
487
+
488
    def __init__(
        self,
        workspace_path: Path,
        read_only: bool = False,
        sync_on_start: bool = False,
        sync_interval_minutes: int = 5,
    ):
        """Initialize workspace state provider (called by get_instance())

        Args:
            workspace_path: Root workspace directory
            read_only: If True, database is in read-only mode
            sync_on_start: If True, sync from disk on initialization
            sync_interval_minutes: Minimum interval between syncs (default: 5)

        Raises:
            RuntimeError: If the workspace version file is invalid or the
                workspace version does not match this experimaestro version
        """
        # Normalize path
        if isinstance(workspace_path, Path):
            workspace_path = workspace_path.absolute()
        else:
            workspace_path = Path(workspace_path).absolute()

        self.workspace_path = workspace_path
        self.read_only = read_only
        self.sync_interval_minutes = sync_interval_minutes

        # Listeners for push notifications
        self._listeners: Set[StateListener] = set()
        self._listeners_lock = threading.Lock()

        # File watcher for database changes (started when listeners are added)
        self._change_detector: Optional[_DatabaseChangeDetector] = None
        self._db_file_handler: Optional[_DatabaseFileHandler] = None
        self._db_file_watch: Optional[ObservedWatch] = None

        # Check and update workspace version
        from .workspace import WORKSPACE_VERSION

        version_file = self.workspace_path / ".__experimaestro__"

        if version_file.exists():
            # Read existing version
            content = version_file.read_text().strip()
            if content == "":
                # Empty file = v0
                workspace_version = 0
            else:
                try:
                    workspace_version = int(content)
                except ValueError:
                    raise RuntimeError(
                        f"Invalid workspace version file at {version_file}: "
                        f"expected integer, got '{content}'"
                    )

            # Check if workspace version is supported
            if workspace_version > WORKSPACE_VERSION:
                raise RuntimeError(
                    f"Workspace version {workspace_version} is not supported by "
                    f"this version of experimaestro (supports up to version "
                    f"{WORKSPACE_VERSION}). Please upgrade experimaestro."
                )
            if workspace_version < WORKSPACE_VERSION:
                raise RuntimeError(
                    f"Workspace version {workspace_version} is not supported by "
                    "this version of experimaestro (please upgrade the experimaestro "
                    "workspace)"
                )
        else:
            # New workspace - create the file
            workspace_version = WORKSPACE_VERSION

        # Write current version to file (update empty v0 workspaces)
        # NOTE(review): an existing empty file is treated as v0 above and, when
        # WORKSPACE_VERSION > 0, already raised before reaching this point — the
        # empty-file half of this condition can only fire when
        # WORKSPACE_VERSION == 0; confirm this is intended.
        if not read_only and (
            not version_file.exists() or version_file.read_text().strip() == ""
        ):
            version_file.write_text(str(WORKSPACE_VERSION))

        # Initialize workspace database in hidden .experimaestro directory
        from .state_db import initialize_workspace_database

        experimaestro_dir = self.workspace_path / ".experimaestro"
        if not read_only:
            experimaestro_dir.mkdir(parents=True, exist_ok=True)

        db_path = experimaestro_dir / "workspace.db"
        self.workspace_db = initialize_workspace_database(db_path, read_only=read_only)
        self._db_dir = experimaestro_dir  # Store for file watcher

        # Optionally sync from disk on start (only in write mode)
        # Syncing requires write access to update the database and sync timestamp
        if sync_on_start and not read_only:
            from .state_sync import sync_workspace_from_disk

            sync_workspace_from_disk(
                self.workspace_path,
                write_mode=True,
                force=False,
                sync_interval_minutes=sync_interval_minutes,
            )

        logger.info(
            "WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
            read_only,
            workspace_path,
        )
593
+
594
+ # Experiment management methods
595
+
596
+ @_with_db_context
597
+ def ensure_experiment(self, experiment_id: str):
598
+ """Create or update experiment record
599
+
600
+ Args:
601
+ experiment_id: Unique identifier for the experiment
602
+ """
603
+ if self.read_only:
604
+ raise RuntimeError("Cannot modify experiments in read-only mode")
605
+
606
+ now = datetime.now()
607
+ ExperimentModel.insert(
608
+ experiment_id=experiment_id,
609
+ created_at=now,
610
+ updated_at=now,
611
+ ).on_conflict(
612
+ conflict_target=[ExperimentModel.experiment_id],
613
+ update={
614
+ ExperimentModel.updated_at: now,
615
+ },
616
+ ).execute()
617
+
618
+ logger.debug("Ensured experiment: %s", experiment_id)
619
+
620
+ # Notify listeners
621
+ exp_path = str(self.workspace_path / "xp" / experiment_id)
622
+ self._notify_listeners(
623
+ StateEvent(
624
+ event_type=StateEventType.EXPERIMENT_UPDATED,
625
+ data={
626
+ "experiment_id": experiment_id,
627
+ "workdir_path": exp_path,
628
+ "updated_at": now.isoformat(),
629
+ },
630
+ )
631
+ )
632
+
633
+ @_with_db_context
634
+ def create_run(self, experiment_id: str, run_id: Optional[str] = None) -> str:
635
+ """Create a new run for an experiment
636
+
637
+ Args:
638
+ experiment_id: Experiment identifier
639
+ run_id: Optional run ID (auto-generated from timestamp if not provided)
640
+
641
+ Returns:
642
+ The run_id that was created
643
+
644
+ Raises:
645
+ RuntimeError: If in read-only mode
646
+ """
647
+ if self.read_only:
648
+ raise RuntimeError("Cannot create runs in read-only mode")
649
+
650
+ # Auto-generate run_id from timestamp if not provided
651
+ if run_id is None:
652
+ now = datetime.now()
653
+ run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
654
+
655
+ # Create run record
656
+ ExperimentRunModel.insert(
657
+ experiment_id=experiment_id,
658
+ run_id=run_id,
659
+ started_at=datetime.now(),
660
+ status="active",
661
+ ).execute()
662
+
663
+ # Update experiment's current_run_id and updated_at
664
+ now = datetime.now()
665
+ ExperimentModel.update(
666
+ current_run_id=run_id,
667
+ updated_at=now,
668
+ ).where(ExperimentModel.experiment_id == experiment_id).execute()
669
+
670
+ logger.info("Created run %s for experiment %s", run_id, experiment_id)
671
+
672
+ # Notify listeners
673
+ self._notify_listeners(
674
+ StateEvent(
675
+ event_type=StateEventType.RUN_UPDATED,
676
+ data={
677
+ "experiment_id": experiment_id,
678
+ "run_id": run_id,
679
+ "status": "active",
680
+ "started_at": now.isoformat(),
681
+ },
682
+ )
683
+ )
684
+
685
+ return run_id
686
+
687
+ @_with_db_context
688
+ def get_current_run(self, experiment_id: str) -> Optional[str]:
689
+ """Get the current/latest run_id for an experiment
690
+
691
+ Args:
692
+ experiment_id: Experiment identifier
693
+
694
+ Returns:
695
+ Current run_id or None if no runs exist
696
+ """
697
+ try:
698
+ experiment = ExperimentModel.get(
699
+ ExperimentModel.experiment_id == experiment_id
700
+ )
701
+ return experiment.current_run_id
702
+ except ExperimentModel.DoesNotExist:
703
+ return None
704
+
705
+ @_with_db_context
706
+ def get_experiments(self, since: Optional[datetime] = None) -> List[MockExperiment]:
707
+ """Get list of all experiments
708
+
709
+ Args:
710
+ since: If provided, only return experiments updated after this timestamp
711
+
712
+ Returns:
713
+ List of MockExperiment objects with attributes:
714
+ - workdir: Path to experiment directory
715
+ - experiment_id: Unique identifier (property derived from workdir.name)
716
+ - current_run_id: Current/latest run ID
717
+ - total_jobs: Total number of jobs (for current run)
718
+ - finished_jobs: Number of completed jobs (for current run)
719
+ - failed_jobs: Number of failed jobs (for current run)
720
+ - updated_at: When experiment was last modified
721
+ """
722
+ experiments = []
723
+
724
+ query = ExperimentModel.select()
725
+ if since is not None:
726
+ query = query.where(ExperimentModel.updated_at > since)
727
+
728
+ for exp_model in query:
729
+ # Count jobs for current run
730
+ total_jobs = 0
731
+ finished_jobs = 0
732
+ failed_jobs = 0
733
+
734
+ started_at = None
735
+ ended_at = None
736
+
737
+ if exp_model.current_run_id:
738
+ total_jobs = (
739
+ JobModel.select()
740
+ .where(
741
+ (JobModel.experiment_id == exp_model.experiment_id)
742
+ & (JobModel.run_id == exp_model.current_run_id)
743
+ )
744
+ .count()
745
+ )
746
+ finished_jobs = (
747
+ JobModel.select()
748
+ .where(
749
+ (JobModel.experiment_id == exp_model.experiment_id)
750
+ & (JobModel.run_id == exp_model.current_run_id)
751
+ & (JobModel.state == "done")
752
+ )
753
+ .count()
754
+ )
755
+ failed_jobs = (
756
+ JobModel.select()
757
+ .where(
758
+ (JobModel.experiment_id == exp_model.experiment_id)
759
+ & (JobModel.run_id == exp_model.current_run_id)
760
+ & (JobModel.state == "error")
761
+ )
762
+ .count()
763
+ )
764
+
765
+ # Get run timestamps
766
+ try:
767
+ run_model = ExperimentRunModel.get(
768
+ (ExperimentRunModel.experiment_id == exp_model.experiment_id)
769
+ & (ExperimentRunModel.run_id == exp_model.current_run_id)
770
+ )
771
+ if run_model.started_at:
772
+ started_at = run_model.started_at.timestamp()
773
+ if run_model.ended_at:
774
+ ended_at = run_model.ended_at.timestamp()
775
+ except ExperimentRunModel.DoesNotExist:
776
+ pass
777
+
778
+ # Compute experiment path from workspace_path and experiment_id
779
+ exp_path = self.workspace_path / "xp" / exp_model.experiment_id
780
+
781
+ experiments.append(
782
+ MockExperiment(
783
+ workdir=exp_path,
784
+ current_run_id=exp_model.current_run_id,
785
+ total_jobs=total_jobs,
786
+ finished_jobs=finished_jobs,
787
+ failed_jobs=failed_jobs,
788
+ updated_at=exp_model.updated_at.isoformat(),
789
+ started_at=started_at,
790
+ ended_at=ended_at,
791
+ )
792
+ )
793
+
794
+ return experiments
795
+
796
+ @_with_db_context
797
+ def get_experiment(self, experiment_id: str) -> Optional[MockExperiment]:
798
+ """Get a specific experiment by ID
799
+
800
+ Args:
801
+ experiment_id: Experiment identifier
802
+
803
+ Returns:
804
+ MockExperiment object or None if not found
805
+ """
806
+ try:
807
+ exp_model = ExperimentModel.get(
808
+ ExperimentModel.experiment_id == experiment_id
809
+ )
810
+ except ExperimentModel.DoesNotExist:
811
+ return None
812
+
813
+ # Count jobs for current run
814
+ total_jobs = 0
815
+ finished_jobs = 0
816
+ failed_jobs = 0
817
+
818
+ if exp_model.current_run_id:
819
+ total_jobs = (
820
+ JobModel.select()
821
+ .where(
822
+ (JobModel.experiment_id == exp_model.experiment_id)
823
+ & (JobModel.run_id == exp_model.current_run_id)
824
+ )
825
+ .count()
826
+ )
827
+ finished_jobs = (
828
+ JobModel.select()
829
+ .where(
830
+ (JobModel.experiment_id == exp_model.experiment_id)
831
+ & (JobModel.run_id == exp_model.current_run_id)
832
+ & (JobModel.state == "done")
833
+ )
834
+ .count()
835
+ )
836
+ failed_jobs = (
837
+ JobModel.select()
838
+ .where(
839
+ (JobModel.experiment_id == exp_model.experiment_id)
840
+ & (JobModel.run_id == exp_model.current_run_id)
841
+ & (JobModel.state == "error")
842
+ )
843
+ .count()
844
+ )
845
+
846
+ # Compute experiment path from workspace_path and experiment_id
847
+ exp_path = self.workspace_path / "xp" / exp_model.experiment_id
848
+
849
+ return MockExperiment(
850
+ workdir=exp_path,
851
+ current_run_id=exp_model.current_run_id,
852
+ total_jobs=total_jobs,
853
+ finished_jobs=finished_jobs,
854
+ failed_jobs=failed_jobs,
855
+ updated_at=exp_model.updated_at.isoformat(),
856
+ )
857
+
858
+ @_with_db_context
859
+ def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
860
+ """Get all runs for an experiment
861
+
862
+ Args:
863
+ experiment_id: Experiment identifier
864
+
865
+ Returns:
866
+ List of run dictionaries with keys:
867
+ - experiment_id: Experiment ID
868
+ - run_id: Run ID
869
+ - started_at: When run started
870
+ - ended_at: When run completed (None if active)
871
+ - status: Run status (active, completed, failed, abandoned)
872
+ """
873
+ runs = []
874
+ for run_model in (
875
+ ExperimentRunModel.select()
876
+ .where(ExperimentRunModel.experiment_id == experiment_id)
877
+ .order_by(ExperimentRunModel.started_at.desc())
878
+ ):
879
+ runs.append(
880
+ {
881
+ "experiment_id": run_model.experiment_id,
882
+ "run_id": run_model.run_id,
883
+ "started_at": run_model.started_at.isoformat(),
884
+ "ended_at": (
885
+ run_model.ended_at.isoformat() if run_model.ended_at else None
886
+ ),
887
+ "status": run_model.status,
888
+ }
889
+ )
890
+ return runs
891
+
892
+ @_with_db_context
893
+ def complete_run(self, experiment_id: str, run_id: str, status: str = "completed"):
894
+ """Mark a run as completed
895
+
896
+ Args:
897
+ experiment_id: Experiment identifier
898
+ run_id: Run identifier
899
+ status: Final status (completed, failed, abandoned)
900
+
901
+ Raises:
902
+ RuntimeError: If in read-only mode
903
+ """
904
+ if self.read_only:
905
+ raise RuntimeError("Cannot modify runs in read-only mode")
906
+
907
+ ExperimentRunModel.update(ended_at=datetime.now(), status=status).where(
908
+ (ExperimentRunModel.experiment_id == experiment_id)
909
+ & (ExperimentRunModel.run_id == run_id)
910
+ ).execute()
911
+
912
+ logger.info("Marked run %s/%s as %s", experiment_id, run_id, status)
913
+
914
+ # Job operations
915
+
916
+ @_with_db_context
917
+ def get_jobs(
918
+ self,
919
+ experiment_id: Optional[str] = None,
920
+ run_id: Optional[str] = None,
921
+ task_id: Optional[str] = None,
922
+ state: Optional[str] = None,
923
+ tags: Optional[Dict[str, str]] = None,
924
+ since: Optional[datetime] = None,
925
+ ) -> List[MockJob]:
926
+ """Query jobs with optional filters
927
+
928
+ Args:
929
+ experiment_id: Filter by experiment (None = all experiments)
930
+ run_id: Filter by run (None = current run if experiment_id provided)
931
+ task_id: Filter by task class identifier
932
+ state: Filter by job state
933
+ tags: Filter by tags (all tags must match)
934
+ since: If provided, only return jobs updated after this timestamp
935
+
936
+ Returns:
937
+ List of MockJob objects
938
+ """
939
+ # Build base query
940
+ query = JobModel.select()
941
+
942
+ # Apply since filter for incremental updates
943
+ if since is not None:
944
+ query = query.where(JobModel.updated_at > since)
945
+
946
+ # Apply experiment filter
947
+ if experiment_id is not None:
948
+ # If experiment_id provided but not run_id, use current run
949
+ if run_id is None:
950
+ current_run = self.get_current_run(experiment_id)
951
+ if current_run is None:
952
+ return [] # No runs exist for this experiment
953
+ run_id = current_run
954
+
955
+ query = query.where(
956
+ (JobModel.experiment_id == experiment_id) & (JobModel.run_id == run_id)
957
+ )
958
+
959
+ # Apply task_id filter
960
+ if task_id is not None:
961
+ query = query.where(JobModel.task_id == task_id)
962
+
963
+ # Apply state filter
964
+ if state is not None:
965
+ query = query.where(JobModel.state == state)
966
+
967
+ # Apply tag filters
968
+ if tags:
969
+ for tag_key, tag_value in tags.items():
970
+ # Join with JobTagModel for each tag filter
971
+ query = query.join(
972
+ JobTagModel,
973
+ on=(
974
+ (JobTagModel.job_id == JobModel.job_id)
975
+ & (JobTagModel.experiment_id == JobModel.experiment_id)
976
+ & (JobTagModel.run_id == JobModel.run_id)
977
+ & (JobTagModel.tag_key == tag_key)
978
+ & (JobTagModel.tag_value == tag_value)
979
+ ),
980
+ )
981
+
982
+ # Execute query and convert to dictionaries
983
+ jobs = []
984
+ for job_model in query:
985
+ # Get tags for this job
986
+ job_tags = self._get_job_tags(
987
+ job_model.job_id, job_model.experiment_id, job_model.run_id
988
+ )
989
+
990
+ jobs.append(self._job_model_to_dict(job_model, job_tags))
991
+
992
+ return jobs
993
+
994
+ @_with_db_context
995
+ def get_job(
996
+ self, job_id: str, experiment_id: str, run_id: Optional[str] = None
997
+ ) -> Optional[MockJob]:
998
+ """Get a specific job
999
+
1000
+ Args:
1001
+ job_id: Job identifier
1002
+ experiment_id: Experiment identifier
1003
+ run_id: Run identifier (None = current run)
1004
+
1005
+ Returns:
1006
+ MockJob object or None if not found
1007
+ """
1008
+ # Use current run if not specified
1009
+ if run_id is None:
1010
+ run_id = self.get_current_run(experiment_id)
1011
+ if run_id is None:
1012
+ return None
1013
+
1014
+ try:
1015
+ job_model = JobModel.get(
1016
+ (JobModel.job_id == job_id)
1017
+ & (JobModel.experiment_id == experiment_id)
1018
+ & (JobModel.run_id == run_id)
1019
+ )
1020
+ except JobModel.DoesNotExist:
1021
+ return None
1022
+
1023
+ # Get tags for this job
1024
+ job_tags = self._get_job_tags(job_id, experiment_id, run_id)
1025
+
1026
+ return self._job_model_to_dict(job_model, job_tags)
1027
+
1028
+ @_with_db_context
1029
    def update_job_submitted(self, job: "Job", experiment_id: str, run_id: str):
        """Record that a job has been submitted

        Upserts the job row, refreshes its run-scoped tags, registers the
        partial directories declared by the task's subparameters, and notifies
        state listeners of the update.

        Args:
            job: Job instance
            experiment_id: Experiment identifier
            run_id: Run identifier

        Raises:
            RuntimeError: If in read-only mode
        """
        if self.read_only:
            raise RuntimeError("Cannot update jobs in read-only mode")

        task_id = str(job.type.identifier)

        # Create or update job record.  Upsert keyed on (job, experiment, run):
        # a resubmission refreshes state/submit time and clears the stale
        # failure reason rather than inserting a duplicate row.
        now = datetime.now()
        JobModel.insert(
            job_id=job.identifier,
            experiment_id=experiment_id,
            run_id=run_id,
            task_id=task_id,
            locator=job.identifier,
            state=job.state.name,
            submitted_time=job.submittime,
            updated_at=now,
        ).on_conflict(
            conflict_target=[JobModel.job_id, JobModel.experiment_id, JobModel.run_id],
            update={
                JobModel.state: job.state.name,
                JobModel.submitted_time: job.submittime,
                JobModel.updated_at: now,
                JobModel.failure_reason: None,  # Clear old failure reason on resubmit
            },
        ).execute()

        # Update tags (run-scoped)
        self.update_job_tags(job.identifier, experiment_id, run_id, job.tags)

        # Register partials for all declared subparameters
        # (partial identifiers come from the job config; presumably each maps
        # to a shared on-disk partial directory — see register_partial)
        subparameters = job.type._subparameters
        for name, sp in subparameters.items():
            partial_id = job.config.__xpm__.get_partial_identifier(sp)
            partial_id_hex = partial_id.all.hex()

            # Register the partial directory
            self.register_partial(partial_id_hex, task_id, name)

            # Link job to partial
            self.register_job_partial(
                job.identifier, experiment_id, run_id, partial_id_hex
            )

        logger.debug(
            "Recorded job submission: %s (experiment=%s, run=%s)",
            job.identifier,
            experiment_id,
            run_id,
        )

        # Notify listeners with the job's on-disk path and the same timestamp
        # that was written to the database row
        job_path = str(
            self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
        )
        self._notify_listeners(
            StateEvent(
                event_type=StateEventType.JOB_UPDATED,
                data={
                    "jobId": job.identifier,
                    "taskId": str(job.type.identifier),
                    "experimentId": experiment_id,
                    "runId": run_id,
                    "status": job.state.name,
                    "path": job_path,
                    "updatedAt": now.isoformat(),
                },
            )
        )
1108
+
1109
+ @_with_db_context
1110
    def update_job_state(self, job: "Job", experiment_id: str, run_id: str):
        """Update the state of a job

        Writes the job's current state, failure reason, timing and progress
        information to the database, then notifies state listeners.

        Args:
            job: Job instance
            experiment_id: Experiment identifier
            run_id: Run identifier

        Raises:
            RuntimeError: If in read-only mode
        """
        if self.read_only:
            raise RuntimeError("Cannot update jobs in read-only mode")

        # Build update dict with updated_at timestamp
        now = datetime.now()
        update_data = {
            JobModel.state: job.state.name,
            JobModel.updated_at: now,
        }

        # Add or clear failure reason based on state
        # (local import — presumably avoids a circular import with the
        # scheduler module; confirm before hoisting to file level)
        from experimaestro.scheduler.jobs import JobStateError

        if isinstance(job.state, JobStateError) and job.state.failure_reason:
            update_data[JobModel.failure_reason] = job.state.failure_reason.name
        else:
            # Clear failure reason when job is not in error state
            update_data[JobModel.failure_reason] = None

        # Add timing information (only the timestamps that are already known)
        if job.starttime:
            update_data[JobModel.started_time] = job.starttime
        if job.endtime:
            update_data[JobModel.ended_time] = job.endtime

        # Add progress information, serialized as a JSON list of
        # {level, progress, desc} entries
        if job._progress:
            update_data[JobModel.progress] = json.dumps(
                [
                    {"level": p.level, "progress": p.progress, "desc": p.desc}
                    for p in job._progress
                ]
            )

        # Update the job record
        JobModel.update(update_data).where(
            (JobModel.job_id == job.identifier)
            & (JobModel.experiment_id == experiment_id)
            & (JobModel.run_id == run_id)
        ).execute()

        logger.debug(
            "Updated job state: %s -> %s (experiment=%s, run=%s)",
            job.identifier,
            job.state.name,
            experiment_id,
            run_id,
        )

        # Notify listeners with the job's on-disk path and the same timestamp
        # that was written to the database row
        job_path = str(
            self.workspace_path / "jobs" / str(job.type.identifier) / job.identifier
        )
        self._notify_listeners(
            StateEvent(
                event_type=StateEventType.JOB_UPDATED,
                data={
                    "jobId": job.identifier,
                    "taskId": str(job.type.identifier),
                    "experimentId": experiment_id,
                    "runId": run_id,
                    "status": job.state.name,
                    "path": job_path,
                    "updatedAt": now.isoformat(),
                },
            )
        )
1188
+
1189
+ @_with_db_context
1190
+ def update_job_tags(
1191
+ self, job_id: str, experiment_id: str, run_id: str, tags_dict: Dict[str, str]
1192
+ ):
1193
+ """Update tags for a job (run-scoped - fixes GH #128)
1194
+
1195
+ Deletes existing tags for this (job_id, experiment_id, run_id) combination
1196
+ and inserts new tags. This ensures that the same job in different runs can
1197
+ have different tags.
1198
+
1199
+ Args:
1200
+ job_id: Job identifier
1201
+ experiment_id: Experiment identifier
1202
+ run_id: Run identifier
1203
+ tags_dict: Dictionary of tag key-value pairs
1204
+
1205
+ Raises:
1206
+ RuntimeError: If in read-only mode
1207
+ """
1208
+ if self.read_only:
1209
+ raise RuntimeError("Cannot update tags in read-only mode")
1210
+
1211
+ # Delete existing tags for this job/experiment/run
1212
+ JobTagModel.delete().where(
1213
+ (JobTagModel.job_id == job_id)
1214
+ & (JobTagModel.experiment_id == experiment_id)
1215
+ & (JobTagModel.run_id == run_id)
1216
+ ).execute()
1217
+
1218
+ # Insert new tags
1219
+ if tags_dict:
1220
+ tag_records = [
1221
+ {
1222
+ "job_id": job_id,
1223
+ "experiment_id": experiment_id,
1224
+ "run_id": run_id,
1225
+ "tag_key": key,
1226
+ "tag_value": value,
1227
+ }
1228
+ for key, value in tags_dict.items()
1229
+ ]
1230
+ JobTagModel.insert_many(tag_records).execute()
1231
+
1232
+ logger.debug(
1233
+ "Updated tags for job %s (experiment=%s, run=%s): %s",
1234
+ job_id,
1235
+ experiment_id,
1236
+ run_id,
1237
+ tags_dict,
1238
+ )
1239
+
1240
+ @_with_db_context
1241
+ def delete_job(self, job_id: str, experiment_id: str, run_id: str):
1242
+ """Remove a job, its tags, and partial references
1243
+
1244
+ Args:
1245
+ job_id: Job identifier
1246
+ experiment_id: Experiment identifier
1247
+ run_id: Run identifier
1248
+
1249
+ Raises:
1250
+ RuntimeError: If in read-only mode
1251
+ """
1252
+ if self.read_only:
1253
+ raise RuntimeError("Cannot delete jobs in read-only mode")
1254
+
1255
+ # Delete tags first (foreign key constraint)
1256
+ JobTagModel.delete().where(
1257
+ (JobTagModel.job_id == job_id)
1258
+ & (JobTagModel.experiment_id == experiment_id)
1259
+ & (JobTagModel.run_id == run_id)
1260
+ ).execute()
1261
+
1262
+ # Delete partial references
1263
+ JobPartialModel.delete().where(
1264
+ (JobPartialModel.job_id == job_id)
1265
+ & (JobPartialModel.experiment_id == experiment_id)
1266
+ & (JobPartialModel.run_id == run_id)
1267
+ ).execute()
1268
+
1269
+ # Delete job
1270
+ JobModel.delete().where(
1271
+ (JobModel.job_id == job_id)
1272
+ & (JobModel.experiment_id == experiment_id)
1273
+ & (JobModel.run_id == run_id)
1274
+ ).execute()
1275
+
1276
+ logger.debug(
1277
+ "Deleted job %s (experiment=%s, run=%s)", job_id, experiment_id, run_id
1278
+ )
1279
+
1280
+ # CLI utility methods for job management
1281
+
1282
+ @_with_db_context
1283
+ def get_all_jobs(
1284
+ self,
1285
+ state: Optional[str] = None,
1286
+ tags: Optional[Dict[str, str]] = None,
1287
+ since: Optional[datetime] = None,
1288
+ ) -> List[MockJob]:
1289
+ """Query all jobs across all experiments/runs
1290
+
1291
+ This method is designed for CLI tools that need to list or manage jobs
1292
+ across the entire workspace, regardless of experiment or run.
1293
+
1294
+ Args:
1295
+ state: Filter by job state (e.g., "done", "error", "running")
1296
+ tags: Filter by tags (all tags must match)
1297
+ since: If provided, only return jobs updated after this timestamp
1298
+
1299
+ Returns:
1300
+ List of MockJob objects
1301
+ """
1302
+ # Build base query
1303
+ query = JobModel.select()
1304
+
1305
+ # Apply since filter for incremental updates
1306
+ if since is not None:
1307
+ query = query.where(JobModel.updated_at > since)
1308
+
1309
+ # Apply state filter
1310
+ if state is not None:
1311
+ query = query.where(JobModel.state == state)
1312
+
1313
+ # Apply tag filters
1314
+ if tags:
1315
+ for tag_key, tag_value in tags.items():
1316
+ query = query.join(
1317
+ JobTagModel,
1318
+ on=(
1319
+ (JobTagModel.job_id == JobModel.job_id)
1320
+ & (JobTagModel.experiment_id == JobModel.experiment_id)
1321
+ & (JobTagModel.run_id == JobModel.run_id)
1322
+ & (JobTagModel.tag_key == tag_key)
1323
+ & (JobTagModel.tag_value == tag_value)
1324
+ ),
1325
+ )
1326
+
1327
+ # Execute query and convert to MockJob objects
1328
+ jobs = []
1329
+ for job_model in query:
1330
+ # Get tags for this job
1331
+ job_tags = self._get_job_tags(
1332
+ job_model.job_id, job_model.experiment_id, job_model.run_id
1333
+ )
1334
+ jobs.append(self._job_model_to_dict(job_model, job_tags))
1335
+
1336
+ return jobs
1337
+
1338
+ def kill_job(self, job: MockJob, perform: bool = False) -> bool:
1339
+ """Kill a running job process
1340
+
1341
+ This method finds the process associated with a running job and kills it.
1342
+ It also updates the job state in the database to ERROR.
1343
+
1344
+ Args:
1345
+ job: MockJob instance to kill
1346
+ perform: If True, actually kill the process. If False, just check
1347
+ if the job can be killed (dry run).
1348
+
1349
+ Returns:
1350
+ True if job was killed (or would be killed in dry run),
1351
+ False if job is not running or process not found
1352
+ """
1353
+ # Check if job is in a running state
1354
+ if not job.state.running():
1355
+ logger.debug("Job %s is not running (state=%s)", job.identifier, job.state)
1356
+ return False
1357
+
1358
+ # Get process from job
1359
+ process = job.getprocess()
1360
+ if process is None:
1361
+ logger.warning("Could not get process for job %s", job.identifier)
1362
+ return False
1363
+
1364
+ if perform:
1365
+ try:
1366
+ logger.info("Killing job %s (process: %s)", job.identifier, process)
1367
+ process.kill()
1368
+
1369
+ # Update job state in database
1370
+ if not self.read_only:
1371
+ self._update_job_state_to_error(job, "killed")
1372
+ except Exception as e:
1373
+ logger.error("Error killing job %s: %s", job.identifier, e)
1374
+ return False
1375
+
1376
+ return True
1377
+
1378
+ def _update_job_state_to_error(self, job: MockJob, reason: str):
1379
+ """Update job state to ERROR in database
1380
+
1381
+ Args:
1382
+ job: MockJob instance
1383
+ reason: Failure reason
1384
+ """
1385
+ if self.read_only:
1386
+ return
1387
+
1388
+ now = datetime.now()
1389
+ with self.workspace_db.bind_ctx([JobModel]):
1390
+ JobModel.update(
1391
+ state="error",
1392
+ failure_reason=reason,
1393
+ ended_time=now.timestamp(),
1394
+ updated_at=now,
1395
+ ).where(
1396
+ (JobModel.job_id == job.identifier)
1397
+ & (JobModel.experiment_id == job.experiment_id)
1398
+ & (JobModel.run_id == job.run_id)
1399
+ ).execute()
1400
+
1401
+ logger.debug(
1402
+ "Updated job %s state to error (reason=%s)", job.identifier, reason
1403
+ )
1404
+
1405
+ def clean_job(self, job: MockJob, perform: bool = False) -> bool:
1406
+ """Clean a finished job (delete directory and DB entry)
1407
+
1408
+ This method removes the job's working directory and its database entry.
1409
+ Only finished jobs (DONE or ERROR state) can be cleaned.
1410
+
1411
+ Args:
1412
+ job: MockJob instance to clean
1413
+ perform: If True, actually delete the job. If False, just check
1414
+ if the job can be cleaned (dry run).
1415
+
1416
+ Returns:
1417
+ True if job was cleaned (or would be cleaned in dry run),
1418
+ False if job is not finished or cannot be cleaned
1419
+ """
1420
+ from shutil import rmtree
1421
+
1422
+ # Check if job is in a finished state
1423
+ if not job.state.finished():
1424
+ logger.debug(
1425
+ "Job %s is not finished (state=%s), cannot clean",
1426
+ job.identifier,
1427
+ job.state,
1428
+ )
1429
+ return False
1430
+
1431
+ if perform:
1432
+ # Delete job directory
1433
+ if job.path.exists():
1434
+ logger.info("Cleaning job %s: removing %s", job.identifier, job.path)
1435
+ rmtree(job.path)
1436
+ else:
1437
+ logger.warning("Job directory does not exist: %s", job.path)
1438
+
1439
+ # Delete from database
1440
+ if not self.read_only:
1441
+ self.delete_job(job.identifier, job.experiment_id, job.run_id)
1442
+
1443
+ return True
1444
+
1445
+ def kill_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
1446
+ """Kill multiple jobs
1447
+
1448
+ Args:
1449
+ jobs: List of MockJob instances to kill
1450
+ perform: If True, actually kill the processes. If False, dry run.
1451
+
1452
+ Returns:
1453
+ Number of jobs that were killed (or would be killed in dry run)
1454
+ """
1455
+ count = 0
1456
+ for job in jobs:
1457
+ if self.kill_job(job, perform=perform):
1458
+ count += 1
1459
+ return count
1460
+
1461
+ def clean_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
1462
+ """Clean multiple finished jobs
1463
+
1464
+ Args:
1465
+ jobs: List of MockJob instances to clean
1466
+ perform: If True, actually delete the jobs. If False, dry run.
1467
+
1468
+ Returns:
1469
+ Number of jobs that were cleaned (or would be cleaned in dry run)
1470
+ """
1471
+ count = 0
1472
+ for job in jobs:
1473
+ if self.clean_job(job, perform=perform):
1474
+ count += 1
1475
+ return count
1476
+
1477
    def delete_job_safely(
        self, job: MockJob, cascade_orphans: bool = True
    ) -> tuple[bool, str]:
        """Delete a job with proper locking and orphan cleanup

        This method is designed for TUI/UI use. It acquires a lock on the job
        to prevent race conditions, then deletes the job directory and DB entry.

        Args:
            job: MockJob instance to delete
            cascade_orphans: If True, clean up orphan partials after deletion

        Returns:
            Tuple of (success: bool, message: str)
        """
        import fasteners
        from shutil import rmtree

        # Check if job is running — a running job must not be deleted
        if job.state.running():
            return False, "Cannot delete a running job"

        # Check if path exists
        if not job.path or not job.path.exists():
            # Just delete from database if path doesn't exist
            if not self.read_only:
                self.delete_job(job.identifier, job.experiment_id, job.run_id)
            if cascade_orphans:
                self.cleanup_orphan_partials(perform=True)
            return True, f"Job {job.identifier} deleted (directory already gone)"

        # Try to acquire job lock (non-blocking)
        # Lock file is typically {script_name}.lock, but we use .lock for general locking
        lock_path = job.path / ".lock"
        lock = fasteners.InterProcessLock(str(lock_path))

        if not lock.acquire(blocking=False):
            return False, "Job is currently locked (possibly running)"

        try:
            # Delete all files except the lock file (kept so the lock stays valid
            # for the duration of the critical section)
            for item in job.path.iterdir():
                if item.name != ".lock":
                    if item.is_dir():
                        rmtree(item)
                    else:
                        item.unlink()

            # Mark job as "phantom" in database (don't delete - keep as phantom)
            if not self.read_only:
                from datetime import datetime

                JobModel.update(
                    state="phantom",
                    updated_at=datetime.now(),
                ).where(
                    (JobModel.job_id == job.identifier)
                    & (JobModel.experiment_id == job.experiment_id)
                    & (JobModel.run_id == job.run_id)
                ).execute()

        finally:
            lock.release()
            # Now delete the lock file and directory
            # NOTE(review): there is a small window between release() and
            # unlink() in which another process could grab the lock — confirm
            # this best-effort cleanup is acceptable
            try:
                lock_path.unlink(missing_ok=True)
                if job.path.exists() and not any(job.path.iterdir()):
                    job.path.rmdir()
            except Exception as e:
                logger.warning("Could not clean up lock file: %s", e)

        # Clean up orphan partials if requested
        if cascade_orphans:
            self.cleanup_orphan_partials(perform=True)

        return True, f"Job {job.identifier} deleted successfully"
1553
+
1554
+ @_with_db_context
1555
+ def delete_experiment(
1556
+ self, experiment_id: str, delete_jobs: bool = False
1557
+ ) -> tuple[bool, str]:
1558
+ """Delete an experiment from the database
1559
+
1560
+ Args:
1561
+ experiment_id: Experiment identifier
1562
+ delete_jobs: If True, also delete associated jobs (default: False)
1563
+
1564
+ Returns:
1565
+ Tuple of (success: bool, message: str)
1566
+ """
1567
+ from shutil import rmtree
1568
+
1569
+ if self.read_only:
1570
+ return False, "Cannot delete in read-only mode"
1571
+
1572
+ # Get all jobs for this experiment
1573
+ jobs = self.get_jobs(experiment_id)
1574
+ running_jobs = [j for j in jobs if j.state.running()]
1575
+
1576
+ if running_jobs:
1577
+ return (
1578
+ False,
1579
+ f"Cannot delete experiment with {len(running_jobs)} running job(s)",
1580
+ )
1581
+
1582
+ # Delete jobs if requested
1583
+ if delete_jobs:
1584
+ for job in jobs:
1585
+ success, msg = self.delete_job_safely(job, cascade_orphans=False)
1586
+ if not success:
1587
+ logger.warning("Failed to delete job %s: %s", job.identifier, msg)
1588
+
1589
+ # Delete experiment runs
1590
+ ExperimentRunModel.delete().where(
1591
+ ExperimentRunModel.experiment_id == experiment_id
1592
+ ).execute()
1593
+
1594
+ # Delete experiment
1595
+ ExperimentModel.delete().where(
1596
+ ExperimentModel.experiment_id == experiment_id
1597
+ ).execute()
1598
+
1599
+ # Optionally delete experiment directory
1600
+ exp_path = self.workspace_path / "xp" / experiment_id
1601
+ if exp_path.exists():
1602
+ try:
1603
+ rmtree(exp_path)
1604
+ except Exception as e:
1605
+ logger.warning("Could not delete experiment directory: %s", e)
1606
+
1607
+ # Clean up orphan partials
1608
+ self.cleanup_orphan_partials(perform=True)
1609
+
1610
+ return True, f"Experiment {experiment_id} deleted successfully"
1611
+
1612
+ @_with_db_context
1613
+ def get_orphan_jobs(self) -> List[MockJob]:
1614
+ """Find jobs that have no associated experiment in the database
1615
+
1616
+ Returns:
1617
+ List of MockJob instances for orphan jobs
1618
+ """
1619
+ # Get all jobs
1620
+ all_jobs = self.get_all_jobs()
1621
+
1622
+ # Get all experiment IDs
1623
+ experiments = self.get_experiments()
1624
+ experiment_ids = {exp.experiment_id for exp in experiments}
1625
+
1626
+ # Find jobs with no matching experiment
1627
+ orphan_jobs = [
1628
+ job for job in all_jobs if job.experiment_id not in experiment_ids
1629
+ ]
1630
+
1631
+ return orphan_jobs
1632
+
1633
+ # Service operations
1634
+
1635
+ @_with_db_context
1636
+ def update_service(
1637
+ self,
1638
+ service_id: str,
1639
+ experiment_id: str,
1640
+ run_id: str,
1641
+ description: str,
1642
+ state: str,
1643
+ state_dict: Optional[str] = None,
1644
+ ):
1645
+ """Update service information
1646
+
1647
+ Args:
1648
+ service_id: Service identifier
1649
+ experiment_id: Experiment identifier
1650
+ run_id: Run identifier
1651
+ description: Human-readable description
1652
+ state: Service state
1653
+ state_dict: JSON serialized state_dict for service recreation
1654
+
1655
+ Raises:
1656
+ RuntimeError: If in read-only mode
1657
+ """
1658
+ if self.read_only:
1659
+ raise RuntimeError("Cannot update services in read-only mode")
1660
+
1661
+ insert_data = {
1662
+ "service_id": service_id,
1663
+ "experiment_id": experiment_id,
1664
+ "run_id": run_id,
1665
+ "description": description,
1666
+ "state": state,
1667
+ "created_at": datetime.now(),
1668
+ "updated_at": datetime.now(),
1669
+ }
1670
+ update_data = {
1671
+ ServiceModel.description: description,
1672
+ ServiceModel.state: state,
1673
+ ServiceModel.updated_at: datetime.now(),
1674
+ }
1675
+
1676
+ if state_dict is not None:
1677
+ insert_data["state_dict"] = state_dict
1678
+ update_data[ServiceModel.state_dict] = state_dict
1679
+
1680
+ ServiceModel.insert(**insert_data).on_conflict(
1681
+ conflict_target=[
1682
+ ServiceModel.service_id,
1683
+ ServiceModel.experiment_id,
1684
+ ServiceModel.run_id,
1685
+ ],
1686
+ update=update_data,
1687
+ ).execute()
1688
+
1689
+ logger.debug(
1690
+ "Updated service %s (experiment=%s, run=%s)",
1691
+ service_id,
1692
+ experiment_id,
1693
+ run_id,
1694
+ )
1695
+
1696
+ # Notify listeners
1697
+ self._notify_listeners(
1698
+ StateEvent(
1699
+ event_type=StateEventType.SERVICE_UPDATED,
1700
+ data={
1701
+ "serviceId": service_id,
1702
+ "experimentId": experiment_id,
1703
+ "runId": run_id,
1704
+ "state": state,
1705
+ "description": description,
1706
+ },
1707
+ )
1708
+ )
1709
+
1710
+ @_with_db_context
1711
    def get_services(
        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
    ) -> List["Service"]:
        """Get services, optionally filtered by experiment/run

        This method abstracts whether services are live (from scheduler) or
        from the database. It returns actual Service objects in both cases:
        - If a live scheduler has the experiment, return live Service objects
        - Otherwise, recreate Service objects from stored state_dict

        Args:
            experiment_id: Filter by experiment (None = all)
            run_id: Filter by run (None = current run if experiment_id provided)

        Returns:
            List of Service objects
        """
        from experimaestro.scheduler.services import Service

        # First, check for live services from the scheduler: if the experiment
        # is registered with a running scheduler, its in-memory services are
        # authoritative and returned directly.
        if experiment_id is not None:
            try:
                from experimaestro.scheduler.base import Scheduler

                if Scheduler.has_instance():
                    scheduler = Scheduler.instance()
                    # Check if experiment is registered with scheduler
                    if experiment_id in scheduler.experiments:
                        exp = scheduler.experiments[experiment_id]
                        services = list(exp.services.values())
                        logger.debug(
                            "Returning %d live services for experiment %s",
                            len(services),
                            experiment_id,
                        )
                        return services
            except Exception as e:
                # Scheduler not available or error - fall back to database
                logger.debug("Could not get live services: %s", e)

        # Fall back to database
        query = ServiceModel.select()

        if experiment_id is not None:
            # Use current run if not specified
            if run_id is None:
                run_id = self.get_current_run(experiment_id)
                if run_id is None:
                    return []

            query = query.where(
                (ServiceModel.experiment_id == experiment_id)
                & (ServiceModel.run_id == run_id)
            )

        services = []
        for service_model in query:
            # Try to recreate service from state_dict; "{}" means no usable
            # serialized state was stored for this service
            state_dict_json = service_model.state_dict
            if state_dict_json and state_dict_json != "{}":
                try:
                    state_dict = json.loads(state_dict_json)
                    if "__class__" in state_dict:
                        service = Service.from_state_dict(state_dict)
                        # Set the id from the database record
                        service.id = service_model.service_id
                        services.append(service)
                        continue
                except Exception as e:
                    logger.warning(
                        "Failed to recreate service %s from state_dict: %s",
                        service_model.service_id,
                        e,
                    )
            # If we can't recreate, skip this service (it's not usable)
            logger.debug(
                "Service %s has no state_dict for recreation, skipping",
                service_model.service_id,
            )

        return services
1792
+
1793
    def get_live_job_states(self, experiment_id: str) -> Dict[str, str]:
        """Get live job states from the scheduler if available

        This is useful for debugging to compare live state vs database state.
        Best-effort: any failure (no scheduler, import error, ...) yields an
        empty dict rather than an exception.

        Args:
            experiment_id: The experiment ID to get live jobs for

        Returns:
            Dict mapping job identifier to live state name, empty if scheduler
            not available or experiment not registered
        """
        try:
            from experimaestro.scheduler.base import Scheduler

            if not Scheduler.has_instance():
                logger.debug("No scheduler instance available for live states")
                return {}

            scheduler = Scheduler.instance()
            live_states = {}

            logger.debug(
                "get_live_job_states: looking for exp=%s, scheduler has %d jobs",
                experiment_id,
                len(scheduler.jobs),
            )

            for job_id, job in scheduler.jobs.items():
                # Filter by experiment if needed: a job's experiment is matched
                # by its workdir name (the experiment identifier on disk)
                if hasattr(job, "experiment") and job.experiment is not None:
                    if hasattr(job.experiment, "workdir"):
                        job_exp_id = job.experiment.workdir.name
                        if job_exp_id == experiment_id:
                            live_states[job_id] = job.state.name
                        else:
                            logger.debug(
                                "Job %s exp_id=%s != requested %s",
                                job_id[:8],
                                job_exp_id,
                                experiment_id,
                            )
                else:
                    # Job not associated with experiment, include it anyway
                    live_states[job_id] = job.state.name
                    logger.debug(
                        "Job %s has no experiment, including anyway", job_id[:8]
                    )

            logger.debug("Returning %d live job states", len(live_states))
            return live_states

        except Exception as e:
            # Deliberate broad catch: this helper must never break its caller
            logger.debug("Could not get live job states: %s", e)
            return {}
1848
+
1849
+ # Sync metadata methods
1850
+
1851
+ @_with_db_context
1852
+ def get_last_sync_time(self) -> Optional[datetime]:
1853
+ """Get the timestamp of the last successful sync
1854
+
1855
+ Returns:
1856
+ datetime of last sync, or None if never synced
1857
+ """
1858
+ from .state_db import WorkspaceSyncMetadata
1859
+
1860
+ metadata = WorkspaceSyncMetadata.get_or_none(
1861
+ WorkspaceSyncMetadata.id == "workspace"
1862
+ )
1863
+ if metadata and metadata.last_sync_time:
1864
+ return metadata.last_sync_time
1865
+ return None
1866
+
1867
+ @_with_db_context
1868
+ def update_last_sync_time(self) -> None:
1869
+ """Update the last sync timestamp to now
1870
+
1871
+ Raises:
1872
+ RuntimeError: If in read-only mode
1873
+ """
1874
+ if self.read_only:
1875
+ raise RuntimeError("Cannot update sync time in read-only mode")
1876
+
1877
+ from .state_db import WorkspaceSyncMetadata
1878
+
1879
+ WorkspaceSyncMetadata.insert(
1880
+ id="workspace", last_sync_time=datetime.now()
1881
+ ).on_conflict(
1882
+ conflict_target=[WorkspaceSyncMetadata.id],
1883
+ update={WorkspaceSyncMetadata.last_sync_time: datetime.now()},
1884
+ ).execute()
1885
+ logger.debug("Updated last sync time")
1886
+
1887
+ # Partial management methods
1888
+
1889
+ @_with_db_context
1890
+ def register_partial(
1891
+ self, partial_id: str, task_id: str, subparameters_name: str
1892
+ ) -> None:
1893
+ """Register a partial directory (creates if not exists)
1894
+
1895
+ Args:
1896
+ partial_id: Hex hash of the partial identifier
1897
+ task_id: Task class identifier
1898
+ subparameters_name: Name of the subparameters definition
1899
+
1900
+ Raises:
1901
+ RuntimeError: If in read-only mode
1902
+ """
1903
+ if self.read_only:
1904
+ raise RuntimeError("Cannot register partials in read-only mode")
1905
+
1906
+ PartialModel.insert(
1907
+ partial_id=partial_id,
1908
+ task_id=task_id,
1909
+ subparameters_name=subparameters_name,
1910
+ created_at=datetime.now(),
1911
+ ).on_conflict_ignore().execute()
1912
+
1913
+ logger.debug(
1914
+ "Registered partial: %s (task=%s, subparams=%s)",
1915
+ partial_id,
1916
+ task_id,
1917
+ subparameters_name,
1918
+ )
1919
+
1920
+ @_with_db_context
1921
+ def register_job_partial(
1922
+ self, job_id: str, experiment_id: str, run_id: str, partial_id: str
1923
+ ) -> None:
1924
+ """Link a job to a partial directory it uses
1925
+
1926
+ Args:
1927
+ job_id: Job identifier
1928
+ experiment_id: Experiment identifier
1929
+ run_id: Run identifier
1930
+ partial_id: Partial directory identifier
1931
+
1932
+ Raises:
1933
+ RuntimeError: If in read-only mode
1934
+ """
1935
+ if self.read_only:
1936
+ raise RuntimeError("Cannot register job partials in read-only mode")
1937
+
1938
+ JobPartialModel.insert(
1939
+ job_id=job_id,
1940
+ experiment_id=experiment_id,
1941
+ run_id=run_id,
1942
+ partial_id=partial_id,
1943
+ ).on_conflict_ignore().execute()
1944
+
1945
+ logger.debug(
1946
+ "Linked job %s to partial %s (experiment=%s, run=%s)",
1947
+ job_id,
1948
+ partial_id,
1949
+ experiment_id,
1950
+ run_id,
1951
+ )
1952
+
1953
+ @_with_db_context
1954
+ def unregister_job_partials(
1955
+ self, job_id: str, experiment_id: str, run_id: str
1956
+ ) -> None:
1957
+ """Remove all partial links for a job
1958
+
1959
+ Called when a job is deleted to clean up its partial references.
1960
+
1961
+ Args:
1962
+ job_id: Job identifier
1963
+ experiment_id: Experiment identifier
1964
+ run_id: Run identifier
1965
+
1966
+ Raises:
1967
+ RuntimeError: If in read-only mode
1968
+ """
1969
+ if self.read_only:
1970
+ raise RuntimeError("Cannot unregister job partials in read-only mode")
1971
+
1972
+ JobPartialModel.delete().where(
1973
+ (JobPartialModel.job_id == job_id)
1974
+ & (JobPartialModel.experiment_id == experiment_id)
1975
+ & (JobPartialModel.run_id == run_id)
1976
+ ).execute()
1977
+
1978
+ logger.debug(
1979
+ "Unregistered partials for job %s (experiment=%s, run=%s)",
1980
+ job_id,
1981
+ experiment_id,
1982
+ run_id,
1983
+ )
1984
+
1985
+ @_with_db_context
1986
+ def get_orphan_partials(self) -> List[Dict]:
1987
+ """Find partial directories that are not referenced by any job
1988
+
1989
+ Returns:
1990
+ List of dictionaries with partial_id, task_id, subparameters_name
1991
+ """
1992
+ # Find partials that have no job references
1993
+ # Using a subquery to find referenced partial_ids
1994
+ referenced_partials = JobPartialModel.select(JobPartialModel.partial_id)
1995
+
1996
+ orphan_query = PartialModel.select().where(
1997
+ PartialModel.partial_id.not_in(referenced_partials)
1998
+ )
1999
+
2000
+ orphans = []
2001
+ for partial in orphan_query:
2002
+ orphans.append(
2003
+ {
2004
+ "partial_id": partial.partial_id,
2005
+ "task_id": partial.task_id,
2006
+ "subparameters_name": partial.subparameters_name,
2007
+ "created_at": partial.created_at.isoformat(),
2008
+ }
2009
+ )
2010
+
2011
+ return orphans
2012
+
2013
+ def cleanup_orphan_partials(self, perform: bool = False) -> List[Path]:
2014
+ """Clean up orphan partial directories
2015
+
2016
+ Finds partial directories not referenced by any job and removes them.
2017
+
2018
+ Args:
2019
+ perform: If True, actually delete. If False, dry run (list only).
2020
+
2021
+ Returns:
2022
+ List of paths that were deleted (or would be deleted in dry run)
2023
+ """
2024
+ from shutil import rmtree
2025
+
2026
+ orphans = self.get_orphan_partials()
2027
+ deleted_paths = []
2028
+
2029
+ for orphan in orphans:
2030
+ # Reconstruct path: WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID
2031
+ partial_path = (
2032
+ self.workspace_path
2033
+ / "partials"
2034
+ / orphan["task_id"]
2035
+ / orphan["subparameters_name"]
2036
+ / orphan["partial_id"]
2037
+ )
2038
+
2039
+ if perform:
2040
+ # Delete directory if it exists
2041
+ if partial_path.exists():
2042
+ logger.info("Cleaning orphan partial: %s", partial_path)
2043
+ rmtree(partial_path)
2044
+
2045
+ # Delete from database
2046
+ if not self.read_only:
2047
+ with self.workspace_db.bind_ctx([PartialModel]):
2048
+ PartialModel.delete().where(
2049
+ PartialModel.partial_id == orphan["partial_id"]
2050
+ ).execute()
2051
+
2052
+ deleted_paths.append(partial_path)
2053
+
2054
+ return deleted_paths
2055
+
2056
+ # Utility methods
2057
+
2058
+ def close(self):
2059
+ """Close the database connection and remove from registry
2060
+
2061
+ This should be called when done with the workspace to free resources.
2062
+ """
2063
+ # Stop file watcher if running
2064
+ self._stop_file_watcher()
2065
+
2066
+ # Close database connection
2067
+ if hasattr(self, "workspace_db") and self.workspace_db is not None:
2068
+ from .state_db import close_workspace_database
2069
+
2070
+ close_workspace_database(self.workspace_db)
2071
+ self.workspace_db = None
2072
+
2073
+ # Remove from registry
2074
+ with WorkspaceStateProvider._lock:
2075
+ if self.workspace_path in WorkspaceStateProvider._instances:
2076
+ del WorkspaceStateProvider._instances[self.workspace_path]
2077
+
2078
+ logger.debug("WorkspaceStateProvider closed for %s", self.workspace_path)
2079
+
2080
+ # Listener methods for push notifications
2081
+
2082
+ def add_listener(self, listener: StateListener) -> None:
2083
+ """Register a listener for state change notifications
2084
+
2085
+ Listeners are called synchronously when state changes occur.
2086
+ For UI applications, listeners should queue updates for their
2087
+ own event loop to avoid blocking database operations.
2088
+
2089
+ When the first listener is added, starts watching the database
2090
+ file for changes to enable push notifications.
2091
+
2092
+ Args:
2093
+ listener: Callback function that receives StateEvent objects
2094
+ """
2095
+ with self._listeners_lock:
2096
+ was_empty = len(self._listeners) == 0
2097
+ self._listeners.add(listener)
2098
+
2099
+ # Start file watcher when first listener is added
2100
+ if was_empty:
2101
+ self._start_file_watcher()
2102
+
2103
+ logger.info(
2104
+ "Added state listener: %s (total: %d)", listener, len(self._listeners)
2105
+ )
2106
+
2107
+ def remove_listener(self, listener: StateListener) -> None:
2108
+ """Unregister a state change listener
2109
+
2110
+ When the last listener is removed, stops watching the database file.
2111
+
2112
+ Args:
2113
+ listener: Previously registered callback function
2114
+ """
2115
+ with self._listeners_lock:
2116
+ self._listeners.discard(listener)
2117
+ is_empty = len(self._listeners) == 0
2118
+
2119
+ # Stop file watcher when last listener is removed
2120
+ if is_empty:
2121
+ self._stop_file_watcher()
2122
+
2123
+ logger.debug("Removed state listener: %s", listener)
2124
+
2125
+ def _start_file_watcher(self) -> None:
2126
+ """Start watching the database file for changes"""
2127
+ if self._db_file_watch is not None:
2128
+ logger.info("File watcher already running for %s", self._db_dir)
2129
+ return # Already watching
2130
+
2131
+ from experimaestro.ipc import ipcom
2132
+
2133
+ # Create and start the change detector thread
2134
+ self._change_detector = _DatabaseChangeDetector(self)
2135
+ self._change_detector.start()
2136
+
2137
+ # Create the file handler that signals the detector
2138
+ self._db_file_handler = _DatabaseFileHandler(self._change_detector)
2139
+ self._db_file_watch = ipcom().fswatch(
2140
+ self._db_file_handler,
2141
+ self._db_dir,
2142
+ recursive=False,
2143
+ )
2144
+ logger.info("Started database file watcher for %s", self._db_dir)
2145
+
2146
+ def _stop_file_watcher(self) -> None:
2147
+ """Stop watching the database file"""
2148
+ if self._db_file_watch is None:
2149
+ return # Not watching
2150
+
2151
+ from experimaestro.ipc import ipcom
2152
+
2153
+ # Stop the file watcher first
2154
+ ipcom().fsunwatch(self._db_file_watch)
2155
+ self._db_file_watch = None
2156
+ self._db_file_handler = None
2157
+
2158
+ # Stop the change detector thread
2159
+ if self._change_detector is not None:
2160
+ self._change_detector.stop()
2161
+ self._change_detector = None
2162
+
2163
+ logger.debug("Stopped database file watcher for %s", self.workspace_path)
2164
+
2165
+ def _notify_listeners(self, event: StateEvent) -> None:
2166
+ """Notify all registered listeners of a state change
2167
+
2168
+ This is called internally by state-modifying methods.
2169
+ Listeners are called synchronously - they should be fast.
2170
+
2171
+ Args:
2172
+ event: State change event to broadcast
2173
+ """
2174
+ with self._listeners_lock:
2175
+ listeners = list(self._listeners)
2176
+
2177
+ for listener in listeners:
2178
+ try:
2179
+ listener(event)
2180
+ except Exception as e:
2181
+ logger.warning("Listener %s raised exception: %s", listener, e)
2182
+
2183
+ # Helper methods
2184
+
2185
+ @_with_db_context
2186
+ def _get_job_tags(
2187
+ self, job_id: str, experiment_id: str, run_id: str
2188
+ ) -> Dict[str, str]:
2189
+ """Get tags for a job
2190
+
2191
+ Args:
2192
+ job_id: Job identifier
2193
+ experiment_id: Experiment identifier
2194
+ run_id: Run identifier
2195
+
2196
+ Returns:
2197
+ Dictionary of tag key-value pairs
2198
+ """
2199
+ tags = {}
2200
+ for tag_model in JobTagModel.select().where(
2201
+ (JobTagModel.job_id == job_id)
2202
+ & (JobTagModel.experiment_id == experiment_id)
2203
+ & (JobTagModel.run_id == run_id)
2204
+ ):
2205
+ tags[tag_model.tag_key] = tag_model.tag_value
2206
+ return tags
2207
+
2208
+ def _job_model_to_dict(self, job_model: JobModel, tags: Dict[str, str]) -> MockJob:
2209
+ """Convert a JobModel to a MockJob object
2210
+
2211
+ Args:
2212
+ job_model: JobModel instance
2213
+ tags: Dictionary of tags for this job
2214
+
2215
+ Returns:
2216
+ MockJob object
2217
+ """
2218
+ # Parse progress JSON
2219
+ progress_list = json.loads(job_model.progress)
2220
+
2221
+ # Compute job path from workspace_path, task_id, and job_id
2222
+ job_path = self.workspace_path / "jobs" / job_model.task_id / job_model.job_id
2223
+
2224
+ # Convert failure_reason string to enum if present
2225
+ failure_reason = None
2226
+ if job_model.failure_reason:
2227
+ try:
2228
+ failure_reason = JobFailureStatus[job_model.failure_reason]
2229
+ except KeyError:
2230
+ pass # Unknown failure reason, leave as None
2231
+
2232
+ return MockJob(
2233
+ identifier=job_model.job_id,
2234
+ task_id=job_model.task_id,
2235
+ locator=job_model.locator,
2236
+ path=job_path,
2237
+ state=job_model.state,
2238
+ submittime=job_model.submitted_time,
2239
+ starttime=job_model.started_time,
2240
+ endtime=job_model.ended_time,
2241
+ progress=progress_list,
2242
+ tags=tags,
2243
+ experiment_id=job_model.experiment_id,
2244
+ run_id=job_model.run_id,
2245
+ updated_at=job_model.updated_at.isoformat(),
2246
+ failure_reason=failure_reason,
2247
+ )
2248
+
2249
+ def _format_time(self, timestamp: Optional[float]) -> str:
2250
+ """Format timestamp for UI
2251
+
2252
+ Args:
2253
+ timestamp: Unix timestamp or None
2254
+
2255
+ Returns:
2256
+ ISO format datetime string or empty string
2257
+ """
2258
+ if not timestamp:
2259
+ return ""
2260
+ return datetime.fromtimestamp(timestamp).isoformat()
2261
+
2262
+
2263
+ # Scheduler listener adapter
2264
+ class SchedulerListener:
2265
+ """Adapter to connect scheduler events to WorkspaceStateProvider
2266
+
2267
+ This class implements the scheduler listener interface and forwards
2268
+ events to the WorkspaceStateProvider. It tracks which experiment/run
2269
+ each job belongs to for proper database updates.
2270
+ """
2271
+
2272
    def __init__(self, state_provider: WorkspaceStateProvider):
        """Initialize listener

        Args:
            state_provider: WorkspaceStateProvider instance to update
        """
        self.state_provider = state_provider
        # Map job_id -> (experiment_id, run_id) for tracking: populated by
        # job_submitted so later job_state events can be attributed to the
        # correct experiment/run when writing to the database
        self.job_experiments: Dict[str, tuple] = {}

        logger.info("SchedulerListener initialized")
2283
+
2284
+ @_with_db_context
2285
+ def job_submitted(self, job: "Job", experiment_id: str, run_id: str):
2286
+ """Called when a job is submitted
2287
+
2288
+ Args:
2289
+ job: The submitted job
2290
+ experiment_id: Experiment this job belongs to
2291
+ run_id: Run this job belongs to
2292
+ """
2293
+ # Track job's experiment/run
2294
+ self.job_experiments[job.identifier] = (experiment_id, run_id)
2295
+
2296
+ # Update state provider
2297
+ try:
2298
+ self.state_provider.update_job_submitted(job, experiment_id, run_id)
2299
+ except Exception as e:
2300
+ logger.exception(
2301
+ "Error updating job submission for %s: %s", job.identifier, e
2302
+ )
2303
+
2304
+ @_with_db_context
2305
+ def job_state(self, job: "Job"):
2306
+ """Called when a job's state changes
2307
+
2308
+ Args:
2309
+ job: The job with updated state
2310
+ """
2311
+ # Look up job's experiment/run
2312
+ if job.identifier not in self.job_experiments:
2313
+ logger.warning(
2314
+ "State change for unknown job %s (not tracked by listener)",
2315
+ job.identifier,
2316
+ )
2317
+ return
2318
+
2319
+ experiment_id, run_id = self.job_experiments[job.identifier]
2320
+
2321
+ # Update state provider
2322
+ try:
2323
+ self.state_provider.update_job_state(job, experiment_id, run_id)
2324
+ except Exception as e:
2325
+ logger.exception("Error updating job state for %s: %s", job.identifier, e)
2326
+
2327
+ @_with_db_context
2328
+ def service_add(self, service: "Service", experiment_id: str, run_id: str):
2329
+ """Called when a service is added
2330
+
2331
+ Args:
2332
+ service: The added service
2333
+ experiment_id: Experiment identifier
2334
+ run_id: Run identifier
2335
+ """
2336
+ try:
2337
+ self.state_provider.update_service(
2338
+ service.id,
2339
+ experiment_id,
2340
+ run_id,
2341
+ service.description(),
2342
+ service.state.name,
2343
+ )
2344
+ except Exception as e:
2345
+ logger.exception("Error updating service %s: %s", service.id, e)