experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (152) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1273 @@
1
+ """Filesystem-based state provider implementation
2
+
3
+ This module provides the concrete implementation of StateProvider that
4
+ uses the filesystem for persistent state storage, replacing the SQLite/peewee
5
+ based DbStateProvider.
6
+
7
+ Classes:
8
+ - WorkspaceStateProvider: Filesystem-backed state provider (read-only for monitoring)
9
+ """
10
+
11
+ import json
12
+ import logging
13
+ import sys
14
+ import threading
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Dict, List, Optional, TYPE_CHECKING
18
+
19
+ from experimaestro.scheduler.interfaces import (
20
+ BaseExperiment,
21
+ BaseService,
22
+ JobState,
23
+ STATE_NAME_TO_JOBSTATE,
24
+ )
25
+ from experimaestro.scheduler.state_provider import (
26
+ StateProvider,
27
+ MockJob,
28
+ MockExperiment,
29
+ ProcessInfo,
30
+ )
31
+ from experimaestro.scheduler.state_status import (
32
+ EventBase,
33
+ JobEventBase,
34
+ ExperimentEventBase,
35
+ JobSubmittedEvent,
36
+ ExperimentUpdatedEvent,
37
+ RunUpdatedEvent,
38
+ EventReader,
39
+ WatchedDirectory,
40
+ job_entity_id_extractor,
41
+ )
42
+
43
+ if TYPE_CHECKING:
44
+ pass
45
+
46
+ logger = logging.getLogger("xpm.workspace_state")
47
+
48
+
49
+ class WorkspaceStateProvider(StateProvider):
50
+ """Filesystem-based state provider for monitoring experiments
51
+
52
+ This provider reads experiment state from status.json and events JSONL files.
53
+ It is read-only and used by TUI/web monitors to observe running and past experiments.
54
+
55
+ Singleton per workspace path - use get_instance() to obtain instances.
56
+ """
57
+
58
+ _instances: Dict[Path, "WorkspaceStateProvider"] = {}
59
+ _lock = threading.Lock()
60
+
61
+ @classmethod
62
+ def get_instance(
63
+ cls,
64
+ workspace_path: Path,
65
+ ) -> "WorkspaceStateProvider":
66
+ """Get or create singleton instance for workspace
67
+
68
+ Args:
69
+ workspace_path: Path to workspace directory
70
+
71
+ Returns:
72
+ WorkspaceStateProvider instance
73
+ """
74
+ workspace_path = Path(workspace_path).resolve()
75
+
76
+ with cls._lock:
77
+ instance = cls._instances.get(workspace_path)
78
+ if instance is None:
79
+ instance = cls(workspace_path)
80
+ cls._instances[workspace_path] = instance
81
+ return instance
82
+
83
    def __init__(self, workspace_path: Path):
        """Initialize workspace state provider

        All caches and their locks are created *before* the watcher starts,
        because watcher callbacks (see _on_event / _on_deleted) access them
        as soon as watching begins.

        Args:
            workspace_path: Path to workspace directory
        """
        super().__init__()
        self.workspace_path = Path(workspace_path).resolve()
        self._experiments_dir = self.workspace_path / ".events" / "experiments"

        # Experiment cache: (experiment_id, run_id) -> MockExperiment
        # Only caches active experiments (those with event files)
        self._experiment_cache: Dict[tuple[str, str], MockExperiment] = {}
        self._experiment_cache_lock = threading.Lock()

        # Job cache: job_id -> MockJob
        # Shared cache for all jobs, updated directly by job events
        self._job_cache: Dict[str, MockJob] = {}
        self._job_cache_lock = threading.Lock()

        # Service cache
        self._service_cache: Dict[tuple[str, str], Dict[str, BaseService]] = {}
        self._service_cache_lock = threading.Lock()

        # Event reader (with built-in watching capability)
        self._event_reader: Optional[EventReader] = None
        self._jobs_dir = self.workspace_path / ".events" / "jobs"
        # Must be last: starts delivering events into the caches above
        self._start_watcher()
111
+
112
    def _start_watcher(self) -> None:
        """Start the event file watcher for experiments and jobs

        Idempotent: does nothing if a reader already exists. The call order
        below matters:
          1. start_buffering  - queue events that arrive early
          2. start_watching   - install callbacks
          3. scan_existing_files - load initial on-disk state
          4. flush_buffer     - replay events queued during 2-3
        """
        if self._event_reader is None:
            self._event_reader = EventReader(
                [
                    # Experiment events live directly under the experiments dir
                    WatchedDirectory(path=self._experiments_dir),
                    # Job events: .events/jobs/*/event-*-*.jsonl, with the
                    # entity id recovered from the file path
                    WatchedDirectory(
                        path=self._jobs_dir,
                        glob_pattern="*/event-*-*.jsonl",
                        entity_id_extractor=job_entity_id_extractor,
                    ),
                ]
            )
            # Start buffering before watching to avoid race condition:
            # events arriving before scan_existing_files completes should be
            # queued and processed after initial state is loaded
            self._event_reader.start_buffering()
            self._event_reader.start_watching(
                on_event=self._on_event,
                on_deleted=self._on_deleted,
            )
            self._event_reader.scan_existing_files()
            # Now flush any events that arrived during initialization
            self._event_reader.flush_buffer()
136
+
137
+ def _stop_watcher(self) -> None:
138
+ """Stop the event file watcher"""
139
+ if self._event_reader is not None:
140
+ self._event_reader.stop_watching()
141
+ self._event_reader = None
142
+
143
+ def _on_event(self, entity_id: str, event: EventBase) -> None:
144
+ """Unified callback for events from file watcher
145
+
146
+ Uses event class hierarchy to determine how to handle events:
147
+ - ExperimentEventBase: update experiment status cache
148
+ - JobEventBase: update job cache directly
149
+ """
150
+ logger.debug("Received event for entity %s: %s", entity_id, event)
151
+
152
+ # Handle experiment events (update experiment status cache)
153
+ if isinstance(event, ExperimentEventBase):
154
+ experiment_id = entity_id
155
+ self._apply_event_to_cache(experiment_id, event)
156
+
157
+ # Handle job events (update job cache directly)
158
+ # Note: JobSubmittedEvent is both, but job is created when status loads
159
+ if isinstance(event, JobEventBase) and not isinstance(event, JobSubmittedEvent):
160
+ with self._job_cache_lock:
161
+ job = self._job_cache.get(event.job_id)
162
+ if job is not None:
163
+ job.apply_event(event)
164
+
165
+ # Always forward to listeners
166
+ self._notify_state_listeners(event)
167
+
168
    def _on_deleted(self, entity_id: str) -> None:
        """Unified callback when event files are deleted

        Experiment deletions clear the status cache and notify listeners;
        job event-file deletions need no special handling.
        """
        # Check if this is an experiment directory.
        # NOTE(review): job and experiment ids share this callback; an id is
        # treated as an experiment if its events directory exists or it has a
        # known current run — presumably job ids never collide with
        # experiment ids, verify.
        if (self._experiments_dir / entity_id).exists() or self.get_current_run(
            entity_id
        ):
            # Experiment event files deleted (experiment finalized)
            experiment_id = entity_id
            self._clear_experiment_cache(experiment_id)
            # Empty string signals "no current run" to listeners
            run_id = self.get_current_run(experiment_id) or ""
            self._notify_state_listeners(
                ExperimentUpdatedEvent(experiment_id=experiment_id)
            )
            self._notify_state_listeners(
                RunUpdatedEvent(experiment_id=experiment_id, run_id=run_id)
            )
        # Job deletion doesn't need special handling
185
+
186
+ @property
187
+ def read_only(self) -> bool:
188
+ """This provider is always read-only"""
189
+ return True
190
+
191
+ @property
192
+ def is_remote(self) -> bool:
193
+ """This is a local provider"""
194
+ return False
195
+
196
+ # =========================================================================
197
+ # Status cache methods
198
+ # =========================================================================
199
+
200
    def _get_cached_experiment(
        self, experiment_id: str, run_id: str, run_dir: Path
    ) -> MockExperiment:
        """Get experiment from cache or load from disk

        For active experiments (with event files), maintains an in-memory cache
        that is updated when events arrive via the file watcher.

        The whole lookup/load runs under ``_experiment_cache_lock`` so
        concurrent callers cannot build divergent copies of the same
        experiment; note this means disk I/O happens while holding the lock.

        Args:
            experiment_id: Experiment identifier
            run_id: Run identifier
            run_dir: Path to run directory

        Returns:
            MockExperiment with all events applied
        """
        cache_key = (experiment_id, run_id)

        with self._experiment_cache_lock:
            # Check cache first
            if cache_key in self._experiment_cache:
                return self._experiment_cache[cache_key]

            # Load from disk using MockExperiment.from_disk
            exp = MockExperiment.from_disk(run_dir, self.workspace_path)
            if exp is None:
                # Create empty experiment if no status.json exists
                exp = MockExperiment(
                    workdir=run_dir,
                    run_id=run_id,
                )

            # Apply pending events from event files
            # Events are in experiments/{experiment_id}/events-{count}.jsonl
            # A throwaway reader is used (not self._event_reader) so this
            # catch-up read does not disturb the live watcher.
            reader = EventReader([WatchedDirectory(path=self._experiments_dir)])
            events = reader.read_events_since_count(experiment_id, exp.events_count)
            for event in events:
                exp.apply_event(event)

            # Only cache if experiment is active (has event files)
            # This avoids caching finished experiments that won't change
            if self._has_event_files(experiment_id):
                self._experiment_cache[cache_key] = exp

            return exp
245
+
246
+ def _has_event_files(self, experiment_id: str) -> bool:
247
+ """Check if experiment has any event files (is active)"""
248
+ # Format: experiments/{experiment_id}/events-*.jsonl
249
+ exp_events_dir = self._experiments_dir / experiment_id
250
+ return exp_events_dir.is_dir() and any(exp_events_dir.glob("events-*.jsonl"))
251
+
252
+ def _apply_event_to_cache(self, experiment_id: str, event: EventBase) -> None:
253
+ """Apply an event to the cached experiment (called by EventFileWatcher)"""
254
+ run_id = self.get_current_run(experiment_id)
255
+ if run_id is None:
256
+ return
257
+
258
+ cache_key = (experiment_id, run_id)
259
+
260
+ with self._experiment_cache_lock:
261
+ if cache_key in self._experiment_cache:
262
+ self._experiment_cache[cache_key].apply_event(event)
263
+
264
+ def _clear_experiment_cache(self, experiment_id: str) -> None:
265
+ """Clear cached experiment for an experiment (called when experiment finishes)"""
266
+ with self._experiment_cache_lock:
267
+ # Remove all cache entries for this experiment
268
+ keys_to_remove = [
269
+ k for k in self._experiment_cache if k[0] == experiment_id
270
+ ]
271
+ for key in keys_to_remove:
272
+ del self._experiment_cache[key]
273
+
274
+ def _get_or_load_job(
275
+ self, job_id: str, task_id: str, submit_time: float | None
276
+ ) -> MockJob:
277
+ """Get job from cache or load from disk and cache it.
278
+
279
+ This ensures that job events (progress, state changes) can be applied
280
+ to cached jobs, keeping them up to date between get_jobs() calls.
281
+
282
+ Args:
283
+ job_id: Job identifier
284
+ task_id: Task identifier (for job path)
285
+ submit_time: Submit timestamp (fallback if job directory doesn't exist)
286
+
287
+ Returns:
288
+ MockJob from cache or freshly loaded from disk
289
+ """
290
+ with self._job_cache_lock:
291
+ if job_id in self._job_cache:
292
+ return self._job_cache[job_id]
293
+
294
+ # Load from disk
295
+ job_path = self.workspace_path / "jobs" / task_id / job_id
296
+ if job_path.exists():
297
+ job = self._create_mock_job_from_path(job_path, task_id, job_id)
298
+ else:
299
+ # Job directory doesn't exist - create minimal MockJob
300
+ job = MockJob(
301
+ identifier=job_id,
302
+ task_id=task_id,
303
+ path=job_path,
304
+ state="unscheduled",
305
+ submittime=submit_time,
306
+ starttime=None,
307
+ endtime=None,
308
+ progress=[],
309
+ updated_at="",
310
+ )
311
+
312
+ self._job_cache[job_id] = job
313
+ return job
314
+
315
+ # =========================================================================
316
+ # Experiment methods
317
+ # =========================================================================
318
+
319
+ def get_experiments(self, since: Optional[datetime] = None) -> List[MockExperiment]:
320
+ """Get list of all experiments (v2 and v1 layouts)"""
321
+ experiments = []
322
+ seen_ids = set()
323
+
324
+ # v2 layout: experiments/{exp-id}/{run-id}/
325
+ experiments_base = self.workspace_path / "experiments"
326
+ if experiments_base.exists():
327
+ for exp_dir in experiments_base.iterdir():
328
+ if not exp_dir.is_dir():
329
+ continue
330
+
331
+ experiment_id = exp_dir.name
332
+ seen_ids.add(experiment_id)
333
+ experiment = self._load_experiment(experiment_id)
334
+ if experiment is not None:
335
+ # Filter by since if provided
336
+ if since is not None and experiment.updated_at:
337
+ try:
338
+ updated = datetime.fromisoformat(experiment.updated_at)
339
+ if updated < since:
340
+ continue
341
+ except ValueError:
342
+ pass
343
+ experiments.append(experiment)
344
+
345
+ # v1 layout: xp/{exp-id}/ (with jobs/, jobs.bak/)
346
+ old_xp_dir = self.workspace_path / "xp"
347
+ if old_xp_dir.exists():
348
+ for exp_dir in old_xp_dir.iterdir():
349
+ if not exp_dir.is_dir():
350
+ continue
351
+
352
+ experiment_id = exp_dir.name
353
+ if experiment_id in seen_ids:
354
+ continue # Already loaded from v2
355
+
356
+ experiment = self._load_v1_experiment(experiment_id, exp_dir)
357
+ if experiment is not None:
358
+ if since is not None and experiment.updated_at:
359
+ try:
360
+ updated = datetime.fromisoformat(experiment.updated_at)
361
+ if updated < since:
362
+ continue
363
+ except ValueError:
364
+ pass
365
+ experiments.append(experiment)
366
+
367
+ return experiments
368
+
369
+ def get_experiment(self, experiment_id: str) -> Optional[MockExperiment]:
370
+ """Get a specific experiment by ID (v2 or v1 layout)"""
371
+ # Try v2 layout first
372
+ experiment = self._load_experiment(experiment_id)
373
+ if experiment is not None:
374
+ return experiment
375
+
376
+ # Try v1 layout
377
+ old_exp_dir = self.workspace_path / "xp" / experiment_id
378
+ if old_exp_dir.exists():
379
+ return self._load_v1_experiment(experiment_id, old_exp_dir)
380
+
381
+ return None
382
+
383
+ def _load_experiment(self, experiment_id: str) -> Optional[MockExperiment]:
384
+ """Load experiment from filesystem"""
385
+ exp_dir = self.workspace_path / "experiments" / experiment_id
386
+ if not exp_dir.exists():
387
+ return None
388
+
389
+ # Find current run (latest by directory name or from symlink)
390
+ current_run_id = self.get_current_run(experiment_id)
391
+ if current_run_id is None:
392
+ # No runs yet, return empty experiment
393
+ return MockExperiment(
394
+ workdir=exp_dir,
395
+ run_id="",
396
+ )
397
+
398
+ run_dir = exp_dir / current_run_id
399
+ if not run_dir.exists():
400
+ return None
401
+
402
+ # Get experiment from cache or load from disk
403
+ return self._get_cached_experiment(experiment_id, current_run_id, run_dir)
404
+
405
    def _load_v1_experiment(
        self, experiment_id: str, exp_dir: Path
    ) -> Optional[MockExperiment]:
        """Load experiment from v1 layout (xp/{exp-id}/ with jobs/, jobs.bak/)

        Reconstructs a synthetic experiment (run_id "v1") by scanning job
        symlinks. Status starts as DONE and is overwritten per job: FAILED on
        an errored job, RUNNING on an unfinished one (so the last overwrite
        wins when both occur).

        Args:
            experiment_id: Experiment identifier (the xp/ directory name)
            exp_dir: Path to the v1 experiment directory

        Returns:
            MockExperiment describing the legacy experiment
        """
        from experimaestro.scheduler.interfaces import (
            ExperimentJobInformation,
            ExperimentStatus,
        )

        # Build job_infos from jobs/ and jobs.bak/ directories
        jobs_dir = exp_dir / "jobs"
        jobs_bak_dir = exp_dir / "jobs.bak"

        job_infos: Dict[str, ExperimentJobInformation] = {}
        seen_jobs: set[str] = set()
        status = ExperimentStatus.DONE
        finished_count = 0
        failed_count = 0

        # jobs/ takes precedence over jobs.bak/ via the seen_jobs dedup below
        for jdir in [jobs_dir, jobs_bak_dir]:
            if not jdir.exists():
                continue

            for job_link in jdir.glob("*/*"):
                # Job key is task_id/job_id
                key = str(job_link.relative_to(jdir))
                if key in seen_jobs:
                    continue
                seen_jobs.add(key)

                # Resolve symlink to check if job exists
                try:
                    job_path = job_link.resolve()
                    if not job_path.is_dir():
                        continue
                except OSError:
                    # Broken symlink - skip
                    continue

                task_id = job_link.parent.name
                job_id = job_link.name

                # Create ExperimentJobInformation
                try:
                    mtime = job_path.stat().st_mtime
                except OSError:
                    mtime = None
                # NOTE(review): job_infos is keyed by job_id alone while the
                # dedup key is task_id/job_id — two tasks sharing a job hash
                # would overwrite each other here; presumably hashes are
                # unique per job, verify.
                job_infos[job_id] = ExperimentJobInformation(
                    job_id=job_id,
                    task_id=task_id,
                    tags={},
                    timestamp=mtime,
                )

                # Check job state for experiment status and counting
                job = self._create_mock_job_from_path(job_path, task_id, job_id)
                if job.state.is_error():
                    status = ExperimentStatus.FAILED
                    failed_count += 1
                elif job.state.finished():
                    finished_count += 1
                else:
                    status = ExperimentStatus.RUNNING

        # Get modification time for started_at
        try:
            mtime = exp_dir.stat().st_mtime
        except OSError:
            mtime = None

        return MockExperiment(
            workdir=exp_dir,
            run_id="v1",  # Mark as v1 experiment
            status=status,
            job_infos=job_infos,
            started_at=mtime,
            experiment_id_override=experiment_id,
            finished_jobs=finished_count,
            failed_jobs=failed_count,
        )
485
+
486
+ def _get_v1_jobs(self, experiment_id: str) -> List[MockJob]:
487
+ """Get jobs from v1 experiment layout
488
+
489
+ v1 layout: xp/{exp-id}/jobs/{task_id}/{job_hash} -> symlink to jobs/{task_id}/{job_hash}
490
+ """
491
+
492
+ exp_dir = self.workspace_path / "xp" / experiment_id
493
+ if not exp_dir.exists():
494
+ return []
495
+
496
+ jobs = []
497
+ jobs_dir = exp_dir / "jobs"
498
+ jobs_bak_dir = exp_dir / "jobs.bak"
499
+
500
+ for jdir in [jobs_dir, jobs_bak_dir]:
501
+ if not jdir.exists():
502
+ continue
503
+
504
+ for job_link in jdir.glob("*/*"):
505
+ # Resolve symlinks to get actual job path
506
+ try:
507
+ job_path = job_link.resolve()
508
+ if not job_path.is_dir():
509
+ continue
510
+ except OSError:
511
+ # Broken symlink
512
+ continue
513
+
514
+ task_id = job_link.parent.name
515
+ job_id = job_link.name
516
+
517
+ # Create MockJob from filesystem state (done/failed files, etc.)
518
+ job = self._create_mock_job_from_path(job_path, task_id, job_id)
519
+ jobs.append(job)
520
+
521
+ return jobs
522
+
523
+ def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
524
+ """Get all runs for an experiment"""
525
+ runs: List[BaseExperiment] = []
526
+ exp_dir = self.workspace_path / "experiments" / experiment_id
527
+
528
+ # Check for v1 layout first (xp/{exp-id}/ without separate runs)
529
+ v1_exp_dir = self.workspace_path / "xp" / experiment_id
530
+ if v1_exp_dir.exists() and not exp_dir.exists():
531
+ # v1 experiment: return single synthetic run
532
+ exp = self._load_v1_experiment(experiment_id, v1_exp_dir)
533
+ if exp:
534
+ runs.append(exp)
535
+ return runs
536
+
537
+ if not exp_dir.exists():
538
+ return runs
539
+
540
+ for run_dir in sorted(exp_dir.iterdir(), reverse=True):
541
+ if not run_dir.is_dir() or run_dir.name.startswith("."):
542
+ continue
543
+
544
+ # Use MockExperiment.from_disk to load the experiment
545
+ mock_exp = MockExperiment.from_disk(run_dir, self.workspace_path)
546
+ if mock_exp is not None:
547
+ runs.append(mock_exp)
548
+
549
+ return runs
550
+
551
+ def get_current_run(self, experiment_id: str) -> Optional[str]:
552
+ """Get the current run ID for an experiment"""
553
+ # Check new symlink location: .events/experiments/{experiment_id}/current
554
+ exp_events_dir = self._experiments_dir / experiment_id
555
+ symlink = exp_events_dir / "current"
556
+ if symlink.is_symlink():
557
+ try:
558
+ target = symlink.resolve()
559
+ return target.name
560
+ except OSError:
561
+ pass
562
+
563
+ # Check legacy symlink location (old .experimaestro path)
564
+ legacy_experiments_dir = self.workspace_path / ".experimaestro" / "experiments"
565
+ legacy_symlink = legacy_experiments_dir / experiment_id
566
+ if legacy_symlink.is_symlink():
567
+ try:
568
+ target = legacy_symlink.resolve()
569
+ return target.name
570
+ except OSError:
571
+ pass
572
+
573
+ # Fall back to finding latest run directory
574
+ exp_dir = self.workspace_path / "experiments" / experiment_id
575
+ if not exp_dir.exists():
576
+ return None
577
+
578
+ runs = sorted(
579
+ [d for d in exp_dir.iterdir() if d.is_dir() and not d.name.startswith(".")],
580
+ key=lambda d: d.name,
581
+ reverse=True,
582
+ )
583
+ return runs[0].name if runs else None
584
+
585
+ # =========================================================================
586
+ # Job methods
587
+ # =========================================================================
588
+
589
    def get_jobs(
        self,
        experiment_id: Optional[str] = None,
        run_id: Optional[str] = None,
        task_id: Optional[str] = None,
        state: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        since: Optional[datetime] = None,
    ) -> List[MockJob]:
        """Query jobs with optional filters

        Args:
            experiment_id: restrict to one experiment; when None, delegates
                to get_all_jobs over every experiment.
            run_id: specific run (defaults to the current run). The special
                value "v1" selects the legacy xp/ layout.
            task_id: keep only jobs of this task.
            state: keep only jobs whose name maps to this JobState; names
                missing from STATE_NAME_TO_JOBSTATE apply no filtering.
            tags: keep only jobs whose tags contain all given key/values.
            since: NOTE(review): accepted but never applied in the
                per-experiment path below; get_all_jobs forwards it back
                here, so it is effectively unused — confirm intent.

        Returns:
            Matching MockJob instances.
        """
        if experiment_id is None:
            return self.get_all_jobs(state=state, tags=tags, since=since)

        if run_id is None:
            run_id = self.get_current_run(experiment_id)

        # Check for v1 experiment
        if run_id == "v1" or run_id is None:
            v1_exp_dir = self.workspace_path / "xp" / experiment_id
            if v1_exp_dir.exists():
                return self._get_v1_jobs(experiment_id)
            if run_id is None:
                return []

        run_dir = self.workspace_path / "experiments" / experiment_id / run_id
        if not run_dir.exists():
            return []

        # Get experiment from cache or load from disk
        exp = self._get_cached_experiment(experiment_id, run_id, run_dir)

        # Load jobs using job_infos
        jobs = []
        for job_id, job_info in exp.job_infos.items():
            # Apply task_id filter early
            if task_id and job_info.task_id != task_id:
                continue

            # Apply tags filter early using job_info.tags
            if tags:
                if not all(job_info.tags.get(k) == v for k, v in tags.items()):
                    continue

            # Get job from cache or load from disk
            job = self._get_or_load_job(job_id, job_info.task_id, job_info.timestamp)

            # Apply state filter on loaded job
            if state:
                state_enum = STATE_NAME_TO_JOBSTATE.get(state)
                if state_enum and job.state != state_enum:
                    continue

            jobs.append(job)

        return jobs
644
+
645
+ def get_job(
646
+ self, job_id: str, experiment_id: str, run_id: Optional[str] = None
647
+ ) -> Optional[MockJob]:
648
+ """Get a specific job"""
649
+ if run_id is None:
650
+ run_id = self.get_current_run(experiment_id)
651
+ if run_id is None:
652
+ return None
653
+
654
+ run_dir = self.workspace_path / "experiments" / experiment_id / run_id
655
+ if not run_dir.exists():
656
+ return None
657
+
658
+ # Get experiment from cache or load from disk
659
+ exp = self._get_cached_experiment(experiment_id, run_id, run_dir)
660
+
661
+ # Get job_info and load full job data
662
+ job_info = exp.job_infos.get(job_id)
663
+ if job_info is None:
664
+ return None
665
+
666
+ return self._get_or_load_job(job_id, job_info.task_id, job_info.timestamp)
667
+
668
+ def get_all_jobs(
669
+ self,
670
+ state: Optional[str] = None,
671
+ tags: Optional[Dict[str, str]] = None,
672
+ since: Optional[datetime] = None,
673
+ ) -> List[MockJob]:
674
+ """Get all jobs across all experiments"""
675
+ all_jobs = []
676
+ experiments_base = self.workspace_path / "experiments"
677
+ if not experiments_base.exists():
678
+ return all_jobs
679
+
680
+ for exp_dir in experiments_base.iterdir():
681
+ if not exp_dir.is_dir():
682
+ continue
683
+ experiment_id = exp_dir.name
684
+ jobs = self.get_jobs(
685
+ experiment_id=experiment_id, state=state, tags=tags, since=since
686
+ )
687
+ all_jobs.extend(jobs)
688
+
689
+ return all_jobs
690
+
691
+ # =========================================================================
692
+ # Tags and dependencies
693
+ # =========================================================================
694
+
695
+ def get_tags_map(
696
+ self, experiment_id: str, run_id: Optional[str] = None
697
+ ) -> Dict[str, Dict[str, str]]:
698
+ """Get tags map for jobs in an experiment/run"""
699
+ if run_id is None:
700
+ run_id = self.get_current_run(experiment_id)
701
+ if run_id is None:
702
+ return {}
703
+
704
+ run_dir = self.workspace_path / "experiments" / experiment_id / run_id
705
+ if not run_dir.exists():
706
+ return {}
707
+
708
+ # Get experiment from cache or load from disk
709
+ exp = self._get_cached_experiment(experiment_id, run_id, run_dir)
710
+
711
+ return exp.tags
712
+
713
+ def get_dependencies_map(
714
+ self, experiment_id: str, run_id: Optional[str] = None
715
+ ) -> Dict[str, List[str]]:
716
+ """Get dependencies map for jobs in an experiment/run"""
717
+ if run_id is None:
718
+ run_id = self.get_current_run(experiment_id)
719
+ if run_id is None:
720
+ return {}
721
+
722
+ run_dir = self.workspace_path / "experiments" / experiment_id / run_id
723
+ if not run_dir.exists():
724
+ return {}
725
+
726
+ # Get experiment from cache or load from disk
727
+ exp = self._get_cached_experiment(experiment_id, run_id, run_dir)
728
+
729
+ return exp.dependencies
730
+
731
+ # =========================================================================
732
+ # Services
733
+ # =========================================================================
734
+
735
    def get_services(
        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
    ) -> List[BaseService]:
        """Get services for an experiment

        Tries to recreate real Service objects from service_config, falls back to
        MockService if recreation fails.

        If experiment_id is None, returns services from all experiments.

        Args:
            experiment_id: Experiment to fetch services for, or None for all.
            run_id: Specific run; defaults to the experiment's current run.

        Returns:
            List of service objects (real recreated services or MockService
            fallbacks); empty list when no run can be resolved.
        """
        if experiment_id is None:
            # Return services from all experiments (recursive per-experiment call)
            all_services = []
            for exp in self.get_experiments():
                exp_services = self.get_services(exp.experiment_id)
                all_services.extend(exp_services)
            return all_services

        if run_id is None:
            run_id = self.get_current_run(experiment_id)
            if run_id is None:
                # No run recorded for this experiment
                return []

        cache_key = (experiment_id, run_id)

        # Lock guards both the cache lookup and the fetch-and-store, so a
        # given (experiment, run) pair is only fetched once concurrently.
        with self._service_cache_lock:
            # Check cache
            cached = self._service_cache.get(cache_key)
            if cached is not None:
                return list(cached.values())

            # Fetch and try to recreate services
            services = self._fetch_services_from_storage(experiment_id, run_id)
            # Store experiment_id on services for global view
            for s in services:
                s._experiment_id = experiment_id
                s._run_id = run_id
            self._service_cache[cache_key] = {s.id: s for s in services}
            return services
775
    def _fetch_services_from_storage(
        self, experiment_id: Optional[str], run_id: Optional[str]
    ) -> List[BaseService]:
        """Fetch services from status.json and try to recreate real Service objects

        For each recorded service: if a service class is known, attempt to
        rebuild a real ``Service`` from its state dict; on any failure (or when
        no class is recorded) fall back to a ``MockService`` whose description
        carries the error, so the caller always gets one entry per service.

        Args:
            experiment_id: Experiment identifier (None short-circuits to []).
            run_id: Run identifier (None short-circuits to []).

        Returns:
            List of recreated services and/or MockService fallbacks.
        """
        from experimaestro.scheduler.services import Service

        if experiment_id is None or run_id is None:
            return []

        run_dir = self.workspace_path / "experiments" / experiment_id / run_id
        if not run_dir.exists():
            return []

        # Get experiment from cache or load from disk
        exp = self._get_cached_experiment(experiment_id, run_id, run_dir)

        services = []
        for service_id, mock_service in exp.services.items():
            # Try to recreate service from state_dict
            service_class = mock_service.service_class
            state_dict = mock_service.state_dict()
            if service_class:
                try:
                    service = Service.from_state_dict(service_class, state_dict)
                    # Store experiment info on the service
                    service._experiment_id = experiment_id
                    service._run_id = run_id
                    # Register as listener to emit events when state changes
                    service.add_listener(self)
                    services.append(service)
                    logger.debug("Recreated service %s from state_dict", service_id)
                except Exception as e:
                    # Failed to recreate - use MockService with error description
                    from experimaestro.scheduler.state_provider import MockService

                    service = MockService(
                        service_id=service_id,
                        description_text=f"error: {e}",
                        state_dict_data={},
                        experiment_id=experiment_id,
                        run_id=run_id,
                    )
                    services.append(service)
                    logger.warning(
                        "Failed to recreate service %s from state_dict: %s",
                        service_id,
                        e,
                    )
                    # A missing module usually means the service's package is
                    # not importable in this process — log the path to help
                    if isinstance(e, ModuleNotFoundError):
                        logger.warning(
                            "Missing module for service recreation. Python Path: %s",
                            sys.path,
                        )
            else:
                # No service_class - use MockService with error
                from experimaestro.scheduler.state_provider import MockService

                service = MockService(
                    service_id=service_id,
                    description_text="error: no service_class",
                    state_dict_data={},
                    experiment_id=experiment_id,
                    run_id=run_id,
                )
                services.append(service)
                logger.debug(
                    "Service %s has no service_class for recreation", service_id
                )

        return services
846
+ # =========================================================================
847
+ # Job operations
848
+ # =========================================================================
849
+
850
+ def kill_job(self, job: MockJob, perform: bool = False) -> bool:
851
+ """Kill a running job"""
852
+ if not perform:
853
+ return job.state.running()
854
+
855
+ process = job.getprocess()
856
+ if process is None:
857
+ return False
858
+
859
+ try:
860
+ process.kill()
861
+ return True
862
+ except Exception as e:
863
+ logger.warning("Failed to kill job %s: %s", job.identifier, e)
864
+ return False
865
+
866
+ def clean_job(self, job: MockJob, perform: bool = False) -> bool:
867
+ """Clean a finished job"""
868
+ if not job.state.finished():
869
+ return False
870
+
871
+ if not perform:
872
+ return True
873
+
874
+ try:
875
+ import shutil
876
+
877
+ if job.path.exists():
878
+ shutil.rmtree(job.path)
879
+ return True
880
+ except Exception as e:
881
+ logger.warning("Failed to clean job %s: %s", job.identifier, e)
882
+ return False
883
+
884
+ # =========================================================================
885
+ # Orphan job detection
886
+ # =========================================================================
887
+
888
+ def get_orphan_jobs(self) -> List[MockJob]:
889
+ """Get orphan jobs (jobs not associated with any experiment run)
890
+
891
+ Scans workspace/jobs/ for all job directories and compares against
892
+ jobs referenced by experiments (both v1 and v2 layouts).
893
+
894
+ Returns:
895
+ List of MockJob objects for jobs that exist on disk but are not
896
+ referenced by any experiment.
897
+ """
898
+ jobs_base = self.workspace_path / "jobs"
899
+ if not jobs_base.exists():
900
+ return []
901
+
902
+ # Collect all job paths referenced by experiments
903
+ referenced_jobs = self._collect_referenced_job_paths()
904
+
905
+ # Scan workspace/jobs/ for all job directories
906
+ orphan_jobs = []
907
+ for task_dir in jobs_base.iterdir():
908
+ if not task_dir.is_dir():
909
+ continue
910
+
911
+ task_id = task_dir.name
912
+
913
+ for job_dir in task_dir.iterdir():
914
+ if not job_dir.is_dir():
915
+ continue
916
+
917
+ job_id = job_dir.name
918
+
919
+ # Resolve to canonical path for comparison
920
+ try:
921
+ job_path = job_dir.resolve()
922
+ except OSError:
923
+ continue
924
+
925
+ # Check if this job is referenced by any experiment
926
+ if job_path not in referenced_jobs:
927
+ # This is an orphan job - create MockJob from filesystem state
928
+ job = self._create_mock_job_from_path(job_path, task_id, job_id)
929
+ orphan_jobs.append(job)
930
+
931
+ return orphan_jobs
932
+
933
+ def get_stray_jobs(self) -> list[MockJob]:
934
+ """Get stray jobs (running jobs not in the latest run of any experiment)
935
+
936
+ A stray job is a running job that was submitted by a previous run of an
937
+ experiment, but the experiment has since been relaunched with different
938
+ parameters (i.e., a new run was started).
939
+
940
+ This differs from orphan jobs which considers ALL runs. Stray jobs only
941
+ look at the LATEST run of each experiment.
942
+
943
+ Returns:
944
+ List of MockJob objects for running jobs not in any current experiment
945
+ """
946
+ jobs_base = self.workspace_path / "jobs"
947
+ if not jobs_base.exists():
948
+ return []
949
+
950
+ # Collect job paths from LATEST runs only
951
+ latest_run_jobs = self._collect_latest_run_job_paths()
952
+
953
+ # Scan workspace/jobs/ for all running job directories
954
+ stray_jobs = []
955
+ for task_dir in jobs_base.iterdir():
956
+ if not task_dir.is_dir():
957
+ continue
958
+
959
+ task_id = task_dir.name
960
+
961
+ for job_dir in task_dir.iterdir():
962
+ if not job_dir.is_dir():
963
+ continue
964
+
965
+ job_id = job_dir.name
966
+
967
+ # Resolve to canonical path for comparison
968
+ try:
969
+ job_path = job_dir.resolve()
970
+ except OSError:
971
+ continue
972
+
973
+ # Check if this job is in any latest run
974
+ if job_path not in latest_run_jobs:
975
+ # Always verify running state from PID file (don't trust metadata)
976
+ scriptname = task_id.rsplit(".", 1)[-1]
977
+ actual_state = self._check_running_from_pid(job_path, scriptname)
978
+
979
+ # Only include if the job is actually running
980
+ if actual_state == JobState.RUNNING:
981
+ # Create MockJob for the running job
982
+ job = self._create_mock_job_from_path(job_path, task_id, job_id)
983
+ # Update state to verified running state
984
+ job.state = JobState.RUNNING
985
+ stray_jobs.append(job)
986
+
987
+ return stray_jobs
988
+
989
    def _collect_latest_run_job_paths(self) -> set[Path]:
        """Collect job paths from the latest run of each experiment only

        Unlike :meth:`_collect_referenced_job_paths`, this considers only the
        most recent run per experiment (v2 layout) and only the current
        ``jobs/`` directory (v1 layout), which is what stray-job detection
        needs.

        Returns:
            Set of resolved job paths that are in the latest run of any experiment
        """
        referenced = set()

        # v2 layout: experiments/{exp-id}/{run-id}/status.json
        experiments_base = self.workspace_path / "experiments"
        if experiments_base.exists():
            for exp_dir in experiments_base.iterdir():
                if not exp_dir.is_dir():
                    continue

                experiment_id = exp_dir.name

                # Get the latest run for this experiment
                latest_run_id = self.get_current_run(experiment_id)
                if latest_run_id is None:
                    continue

                run_dir = exp_dir / latest_run_id
                if not run_dir.is_dir():
                    continue

                # Load experiment from disk (with locking)
                exp = MockExperiment.from_disk(run_dir, self.workspace_path)
                if exp is None:
                    continue

                # Also apply pending events
                # Events are in experiments/{experiment_id}/events-{count}.jsonl
                reader = EventReader([WatchedDirectory(path=self._experiments_dir)])
                events = reader.read_events_since_count(experiment_id, exp.events_count)
                for event in events:
                    exp.apply_event(event)

                # Add all job paths from this run
                for job_info in exp.job_infos.values():
                    job_path = (
                        self.workspace_path
                        / "jobs"
                        / job_info.task_id
                        / job_info.job_id
                    )
                    try:
                        referenced.add(job_path.resolve())
                    except OSError:
                        # Path cannot be resolved (e.g. broken link) - skip
                        pass

        # v1 layout: only most recent jobs/ (not jobs.bak/)
        old_xp_dir = self.workspace_path / "xp"
        if old_xp_dir.exists():
            for exp_dir in old_xp_dir.iterdir():
                if not exp_dir.is_dir():
                    continue

                # Only check current jobs/ (not jobs.bak/)
                jobs_dir = exp_dir / "jobs"
                if not jobs_dir.exists():
                    continue

                # Entries are {task_id}/{job_hash} symlinks to job directories
                for job_link in jobs_dir.glob("*/*"):
                    try:
                        job_path = job_link.resolve()
                        referenced.add(job_path)
                    except OSError:
                        # Broken symlink - skip
                        pass

        return referenced
1061
    def _collect_referenced_job_paths(self) -> set[Path]:
        """Collect all job paths referenced by experiments (v1 and v2 layouts)

        Considers EVERY run of every experiment (v2) and both ``jobs/`` and
        ``jobs.bak/`` (v1) — compare with
        :meth:`_collect_latest_run_job_paths`, which only looks at the latest
        run. Used for orphan-job detection.

        Returns:
            Set of resolved job paths that are referenced by at least one experiment
        """
        referenced = set()

        # v2 layout: experiments/{exp-id}/{run-id}/status.json
        experiments_base = self.workspace_path / "experiments"
        if experiments_base.exists():
            for exp_dir in experiments_base.iterdir():
                if not exp_dir.is_dir():
                    continue

                experiment_id = exp_dir.name

                for run_dir in exp_dir.iterdir():
                    # Skip files and hidden directories (e.g. lock/meta dirs)
                    if not run_dir.is_dir() or run_dir.name.startswith("."):
                        continue

                    # Load experiment from disk (with locking)
                    exp = MockExperiment.from_disk(run_dir, self.workspace_path)
                    if exp is None:
                        continue

                    # Also apply pending events
                    # Events are in experiments/{experiment_id}/events-{count}.jsonl
                    reader = EventReader([WatchedDirectory(path=self._experiments_dir)])
                    events = reader.read_events_since_count(
                        experiment_id, exp.events_count
                    )
                    for event in events:
                        exp.apply_event(event)

                    # Add all job paths from this run
                    for job_info in exp.job_infos.values():
                        job_path = (
                            self.workspace_path
                            / "jobs"
                            / job_info.task_id
                            / job_info.job_id
                        )
                        try:
                            referenced.add(job_path.resolve())
                        except OSError:
                            # Path cannot be resolved - skip
                            pass

        # v1 layout: xp/{exp-id}/jobs/{task_id}/{job_hash} -> symlinks
        old_xp_dir = self.workspace_path / "xp"
        if old_xp_dir.exists():
            for exp_dir in old_xp_dir.iterdir():
                if not exp_dir.is_dir():
                    continue

                # Check jobs/ and jobs.bak/ directories
                for jdir_name in ["jobs", "jobs.bak"]:
                    jobs_dir = exp_dir / jdir_name
                    if not jobs_dir.exists():
                        continue

                    for job_link in jobs_dir.glob("*/*"):
                        try:
                            job_path = job_link.resolve()
                            referenced.add(job_path)
                        except OSError:
                            # Broken symlink - skip
                            pass

        return referenced
1132
+ def _create_mock_job_from_path(
1133
+ self, job_path: Path, task_id: str, job_id: str
1134
+ ) -> MockJob:
1135
+ """Create a MockJob from a job directory path (when no metadata exists)"""
1136
+ from experimaestro.scheduler.interfaces import JobState as JobStateClass
1137
+
1138
+ # Try to determine state from marker files
1139
+ scriptname = task_id.rsplit(".", 1)[-1]
1140
+ state = JobStateClass.from_path(job_path, scriptname)
1141
+
1142
+ # If no done/failed marker, check if job is running via PID file
1143
+ if state is None:
1144
+ state = self._check_running_from_pid(job_path, scriptname)
1145
+
1146
+ if state is None:
1147
+ state = JobState.UNSCHEDULED
1148
+
1149
+ # Get modification time for timestamps
1150
+ try:
1151
+ mtime = job_path.stat().st_mtime
1152
+ except OSError:
1153
+ mtime = None
1154
+
1155
+ return MockJob(
1156
+ identifier=job_id,
1157
+ task_id=task_id,
1158
+ path=job_path,
1159
+ state=state.name,
1160
+ submittime=mtime,
1161
+ starttime=mtime,
1162
+ endtime=mtime if state.finished() else None,
1163
+ progress=[],
1164
+ updated_at="",
1165
+ )
1166
+
1167
+ def _check_running_from_pid(
1168
+ self, job_path: Path, scriptname: str
1169
+ ) -> Optional[JobState]:
1170
+ """Check if a job is running by reading its PID file and checking the process
1171
+
1172
+ Args:
1173
+ job_path: Path to the job directory
1174
+ scriptname: The script name (used for file naming)
1175
+
1176
+ Returns:
1177
+ JobState.RUNNING if the process is still running, None otherwise
1178
+ """
1179
+ pid_file = job_path / f"{scriptname}.pid"
1180
+ if not pid_file.exists():
1181
+ return None
1182
+
1183
+ try:
1184
+ pinfo = json.loads(pid_file.read_text())
1185
+ pid = pinfo.get("pid")
1186
+ if pid is None:
1187
+ return None
1188
+
1189
+ # Ensure pid is an integer (JSON may store it as string)
1190
+ pid = int(pid)
1191
+
1192
+ # Check if the process is still running
1193
+ try:
1194
+ import psutil
1195
+
1196
+ proc = psutil.Process(pid)
1197
+ if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
1198
+ return JobState.RUNNING
1199
+ except (ImportError, psutil.NoSuchProcess, psutil.AccessDenied):
1200
+ pass
1201
+
1202
+ except (json.JSONDecodeError, OSError, ValueError, TypeError):
1203
+ pass
1204
+
1205
+ return None
1206
+
1207
+ # =========================================================================
1208
+ # Process information
1209
+ # =========================================================================
1210
+
1211
+ def get_process_info(self, job: MockJob) -> Optional[ProcessInfo]:
1212
+ """Get process information for a job
1213
+
1214
+ Returns a ProcessInfo dataclass or None if not available.
1215
+ """
1216
+ if not job.path or not job.task_id:
1217
+ return None
1218
+
1219
+ # Get script name from task_id
1220
+ scriptname = job.task_id.rsplit(".", 1)[-1]
1221
+ pid_file = job.path / f"{scriptname}.pid"
1222
+
1223
+ if not pid_file.exists():
1224
+ return None
1225
+
1226
+ try:
1227
+ pinfo = json.loads(pid_file.read_text())
1228
+ pid = pinfo.get("pid")
1229
+ proc_type = pinfo.get("type", "unknown")
1230
+
1231
+ if pid is None:
1232
+ return None
1233
+
1234
+ result = ProcessInfo(pid=pid, type=proc_type, running=False)
1235
+
1236
+ # Try to get more info for running jobs
1237
+ if job.state and job.state.running():
1238
+ try:
1239
+ import psutil
1240
+
1241
+ proc = psutil.Process(pid)
1242
+ if proc.is_running():
1243
+ result.running = True
1244
+ # Get CPU and memory usage
1245
+ result.cpu_percent = proc.cpu_percent(interval=0.1)
1246
+ mem_info = proc.memory_info()
1247
+ result.memory_mb = mem_info.rss / (1024 * 1024)
1248
+ result.num_threads = proc.num_threads()
1249
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
1250
+ pass
1251
+ except ImportError:
1252
+ pass # psutil not available
1253
+
1254
+ return result
1255
+ except (json.JSONDecodeError, OSError):
1256
+ return None
1257
+
1258
+ # =========================================================================
1259
+ # Lifecycle
1260
+ # =========================================================================
1261
+
1262
+ def close(self) -> None:
1263
+ """Close the state provider and release resources"""
1264
+ self._stop_watcher()
1265
+
1266
+ with self._lock:
1267
+ if self.workspace_path in self._instances:
1268
+ del self._instances[self.workspace_path]
1269
+
1270
+
1271
# Public API of this module
__all__ = [
    "WorkspaceStateProvider",
]