experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of experimaestro might be problematic.

Files changed (133)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
experimaestro/scheduler/state_sync.py (new file)
@@ -0,0 +1,834 @@
+"""Disk-based state synchronization for workspace database
+
+This module implements synchronization from disk state (marker files) to the
+workspace database. It includes locking and throttling mechanisms to prevent
+excessive disk scanning and conflicts with running experiments.
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Optional, Tuple, TYPE_CHECKING
+from datetime import datetime
+import fasteners
+
+if TYPE_CHECKING:
+    from .state_provider import WorkspaceStateProvider
+    from .jobs import JobState
+
+from .interfaces import BaseJob
+
+from experimaestro.scheduler.state_db import (
+    ExperimentModel,
+    ExperimentRunModel,
+    JobModel,
+    JobTagModel,
+    ServiceModel,
+    WorkspaceSyncMetadata,
+)
+
+logger = logging.getLogger("xpm.state_sync")
+
+
+def read_jobs_jsonl(exp_dir: Path) -> Dict[str, Dict]:
+    """Read jobs.jsonl file and return a mapping of job_id -> record
+
+    Args:
+        exp_dir: Path to the experiment directory
+
+    Returns:
+        Dictionary mapping job_id to record (with tags, task_id, timestamp)
+    """
+    jobs_jsonl_path = exp_dir / "jobs.jsonl"
+    job_records = {}
+
+    if not jobs_jsonl_path.exists():
+        logger.debug("No jobs.jsonl found in %s", exp_dir)
+        return job_records
+
+    try:
+        with jobs_jsonl_path.open("r") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    record = json.loads(line)
+                    job_id = record.get("job_id")
+                    if job_id:
+                        job_records[job_id] = record
+                except json.JSONDecodeError as e:
+                    logger.warning("Failed to parse line in jobs.jsonl: %s", e)
+    except Exception as e:
+        logger.warning("Failed to read jobs.jsonl from %s: %s", jobs_jsonl_path, e)
+
+    logger.debug("Read %d job records from jobs.jsonl", len(job_records))
+    return job_records
+
+
+def read_services_json(exp_dir: Path) -> Dict[str, Dict]:
+    """Read services.json file and return a mapping of service_id -> record
+
+    Args:
+        exp_dir: Path to the experiment directory
+
+    Returns:
+        Dictionary mapping service_id to record (with description, state, url, timestamp)
+    """
+    services_json_path = exp_dir / "services.json"
+
+    if not services_json_path.exists():
+        logger.debug("No services.json found in %s", exp_dir)
+        return {}
+
+    try:
+        with services_json_path.open("r") as f:
+            services_data = json.load(f)
+        logger.debug(
+            "Read %d service records from services.json", len(services_data)
+        )
+        return services_data
+    except json.JSONDecodeError as e:
+        logger.warning("Failed to parse services.json: %s", e)
+    except Exception as e:
+        logger.warning(
+            "Failed to read services.json from %s: %s", services_json_path, e
+        )
+
+    return {}
+
+
+def acquire_sync_lock(
+    workspace_path: Path, blocking: bool = True
+) -> Optional[fasteners.InterProcessLock]:
+    """Acquire exclusive lock for workspace synchronization
+
+    Args:
+        workspace_path: Path to the workspace directory
+        blocking: If True, wait for lock; if False, return None if unavailable
+
+    Returns:
+        Lock object if acquired, None if not acquired (only in non-blocking mode)
+    """
+    lock_path = workspace_path / ".sync.lock"
+    lock = fasteners.InterProcessLock(str(lock_path))
+
+    if lock.acquire(blocking=blocking):
+        logger.debug("Acquired sync lock: %s", lock_path)
+        return lock
+    else:
+        logger.debug("Could not acquire sync lock (already held): %s", lock_path)
+        return None
+
+
+def should_sync(
+    provider: "WorkspaceStateProvider",
+    workspace_path: Path,
+    min_interval_minutes: int = 5,
+) -> Tuple[bool, Optional[fasteners.InterProcessLock]]:
+    """Determine if sync should be performed based on locking and timing
+
+    Args:
+        provider: WorkspaceStateProvider instance (used to check last sync time)
+        workspace_path: Path to workspace directory
+        min_interval_minutes: Minimum minutes between syncs (default: 5)
+
+    Returns:
+        Tuple of (should_sync: bool, lock: Optional[Lock])
+        If should_sync is True, lock is acquired and must be released after sync
+        If should_sync is False, lock is None
+    """
+    # Try to acquire exclusive lock (non-blocking)
+    lock = acquire_sync_lock(workspace_path, blocking=False)
+    if lock is None:
+        # Other experiments running - skip sync
+        logger.info("Skipping sync: other experiments are running")
+        return False, None
+
+    # Check last sync time using the provider
+    last_sync_time = provider.get_last_sync_time()
+    if last_sync_time is None:
+        # First sync ever
+        logger.info("Performing first sync")
+        return True, lock
+
+    time_since_last_sync = datetime.now() - last_sync_time
+    if time_since_last_sync.total_seconds() > min_interval_minutes * 60:
+        # Enough time has passed
+        logger.info(
+            "Performing sync (%.1f minutes since last sync)",
+            time_since_last_sync.total_seconds() / 60,
+        )
+        return True, lock
+
+    # Recently synced, skip
+    logger.info(
+        "Skipping sync: last sync was %.1f minutes ago (threshold: %d minutes)",
+        time_since_last_sync.total_seconds() / 60,
+        min_interval_minutes,
+    )
+    lock.release()
+    return False, None
+
+
+def check_process_alive(
+    job_path: Path, scriptname: str, update_disk: bool = True
+) -> bool:
+    """Check if a running job's process is still alive
+
+    If the process is dead and update_disk=True, this function will:
+    1. Acquire a lock on the job directory
+    2. Create a .failed file to persist the state (with detailed failure info)
+    3. Remove the .pid file
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script (for PID file name)
+        update_disk: If True, update disk state when process is dead
+
+    Returns:
+        True if process is running, False otherwise
+    """
+    import asyncio
+
+    from experimaestro.connectors import Process
+    from experimaestro.connectors.local import LocalConnector
+
+    pid_file = BaseJob.get_pidfile(job_path, scriptname)
+    if not pid_file.exists():
+        return False
+
+    # Try to acquire lock on job directory (non-blocking)
+    # If we can't acquire, assume the job is running (another process has it)
+    lock_path = job_path / ".lock"
+    lock = fasteners.InterProcessLock(str(lock_path))
+
+    if not lock.acquire(blocking=False):
+        # Can't acquire lock - job is probably running
+        logger.debug("Could not acquire lock for %s, assuming job is running", job_path)
+        return True
+
+    try:
+        pinfo = json.loads(pid_file.read_text())
+        connector = LocalConnector.instance()
+        process = Process.fromDefinition(connector, pinfo)
+
+        if process is None:
+            # Can't get process info - mark as dead
+            if update_disk:
+                _mark_job_failed_on_disk(job_path, scriptname, None)
+            return False
+
+        # Check process state (with 0 timeout for immediate check)
+        state = asyncio.run(process.aio_state(0))
+        if state is None or state.finished:
+            # Process is dead - get detailed job state from process
+            if update_disk:
+                exit_code = state.exitcode if state else 1
+                # Use get_job_state() to get detailed failure info (e.g., SLURM timeout)
+                job_state = process.get_job_state(exit_code)
+                _mark_job_failed_on_disk(job_path, scriptname, job_state, exit_code)
+            return False
+
+        return True
+    except Exception as e:
+        logger.debug("Could not check process state for %s: %s", job_path, e)
+        # On error, assume process is dead but don't update disk (we're not sure)
+        return False
+    finally:
+        lock.release()
+
+
+def _mark_job_failed_on_disk(
+    job_path: Path,
+    scriptname: str,
+    job_state: Optional["JobState"],
+    exit_code: int = 1,
+) -> None:
+    """Mark a job as failed on disk by creating .failed file and removing .pid
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script
+        job_state: JobState from process.get_job_state() - may contain detailed
+            failure info (e.g., TIMEOUT for SLURM jobs)
+        exit_code: Exit code of the process (default: 1)
+    """
+    from experimaestro.scheduler.jobs import JobStateError
+
+    pid_file = BaseJob.get_pidfile(job_path, scriptname)
+    failed_file = BaseJob.get_failedfile(job_path, scriptname)
+
+    try:
+        # Extract failure status from job_state
+        failure_status = "UNKNOWN"
+
+        if isinstance(job_state, JobStateError):
+            # JobStateError can contain detailed failure reason
+            if job_state.failure_reason:
+                failure_status = job_state.failure_reason.name
+
+        # Write .failed file with exit code and failure status (JSON format)
+        failed_data = {
+            "exit_code": exit_code,
+            "failure_status": failure_status,
+        }
+
+        failed_file.write_text(json.dumps(failed_data))
+        logger.info(
+            "Created %s for dead job (exit_code=%d, status=%s)",
+            failed_file,
+            exit_code,
+            failure_status,
+        )
+
+        # Remove .pid file
+        if pid_file.exists():
+            pid_file.unlink()
+            logger.debug("Removed stale PID file %s", pid_file)
+
+    except Exception as e:
+        logger.warning("Failed to update disk state for %s: %s", job_path, e)
+
+
+def scan_job_state_from_disk(  # noqa: C901
+    job_path: Path, scriptname: str, check_running: bool = True
+) -> Optional[Dict]:
+    """Scan a job directory to determine state from disk files
+
+    Reads job state from .experimaestro/information.json (primary) or marker
+    files (fallback).
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script (for marker file names)
+        check_running: If True, verify that jobs with PID files are actually
+            still running (default: True)
+
+    Returns:
+        Dictionary with job state information, or None if no state found:
+        {
+            'job_id': str,
+            'task_id': str,
+            'state': str,
+            'submitted_time': float or None,
+            'started_time': float or None,
+            'ended_time': float or None,
+            'exit_code': int or None,
+            'failure_reason': str or None,
+            'retry_count': int,
+            'process': dict or None  # Process spec from metadata
+        }
+    """
+    # Try reading metadata from .experimaestro/information.json (primary source)
+    metadata_path = BaseJob.get_metadata_path(job_path)
+    if metadata_path.exists():
+        try:
+            with metadata_path.open("r") as f:
+                metadata = json.load(f)
+
+            state = metadata.get("state", "unscheduled")
+            failure_reason = metadata.get("failure_reason")
+
+            # If state is "running", verify the process is still alive
+            if state == "running" and check_running:
+                if not check_process_alive(job_path, scriptname):
+                    logger.info(
+                        "Job %s marked as running but process is dead, marking as error",
+                        job_path.name,
+                    )
+                    state = "error"
+                    failure_reason = "UNKNOWN"
+
+            logger.debug("Read metadata from %s", metadata_path)
+            return {
+                "job_id": metadata.get("job_id"),
+                "task_id": metadata.get("task_id"),
+                "state": state,
+                "submitted_time": metadata.get("submitted_time"),
+                "started_time": metadata.get("started_time"),
+                "ended_time": metadata.get("ended_time"),
+                "exit_code": metadata.get("exit_code"),
+                "failure_reason": failure_reason,
+                "retry_count": metadata.get("retry_count", 0),
+                "process": metadata.get("process"),  # Process spec with launcher info
+            }
+        except Exception as e:
+            logger.warning("Failed to read metadata from %s: %s", metadata_path, e)
+            # Fall through to marker file fallback
+
+    # Fallback: Infer from marker files
+    try:
+        done_file = BaseJob.get_donefile(job_path, scriptname)
+        failed_file = BaseJob.get_failedfile(job_path, scriptname)
+        pid_file = BaseJob.get_pidfile(job_path, scriptname)
+
+        state = "unscheduled"
+        exit_code = None
+        failure_reason = None
+
+        if done_file.is_file():
+            state = "done"
+            exit_code = 0
+        elif failed_file.is_file():
+            state = "error"
+            # Try to parse .failed file (JSON or legacy integer format)
+            try:
+                content = failed_file.read_text().strip()
+                data = json.loads(content)
+                if isinstance(data, int):
+                    exit_code = data
+                else:
+                    exit_code = data.get("exit_code", 1)
+                    failure_reason = data.get("failure_status", "UNKNOWN")
+            except json.JSONDecodeError:
+                # Legacy integer format
+                try:
+                    exit_code = int(content)
+                except (ValueError, OSError):
+                    exit_code = 1
+        elif pid_file.is_file():
+            # PID file exists - check if process is actually running
+            if check_running and not check_process_alive(job_path, scriptname):
+                logger.info(
+                    "Job %s has PID file but process is dead, marking as error",
+                    job_path.name,
+                )
+                state = "error"
+                failure_reason = "UNKNOWN"
+            else:
+                state = "running"
+
+        # Use directory structure to infer job_id and task_id
+        # job_path structure: {workspace}/jobs/{task_id}/{hash}/
+        if job_path.parent and job_path.parent.parent:
+            job_id = job_path.name  # Hash hex
+            task_id = job_path.parent.name
+
+        # Infer timestamps from file modification times
+        submitted_time = None
+        started_time = None
+        ended_time = None
+
+        # Use params.json mtime as submitted_time
+        params_file = job_path / "params.json"
+        if params_file.exists():
+            try:
+                submitted_time = params_file.stat().st_mtime
+            except OSError:
+                pass
+
+        # Use stdout file (.out) mtime as started_time
+        stdout_file = job_path / f"{scriptname}.out"
+        if stdout_file.exists():
+            try:
+                stdout_stat = stdout_file.stat()
+                # Use creation time if available, otherwise mtime
+                started_time = getattr(
+                    stdout_stat, "st_birthtime", stdout_stat.st_ctime
+                )
+            except OSError:
+                pass
+
+        # Use done/failed file mtime as ended_time
+        if state == "done" and done_file.exists():
+            try:
+                ended_time = done_file.stat().st_mtime
+            except OSError:
+                pass
+        elif state == "error" and failed_file.exists():
+            try:
+                ended_time = failed_file.stat().st_mtime
+            except OSError:
+                pass
+
+        return {
+            "job_id": job_id,
+            "task_id": task_id,
+            "state": state,
+            "submitted_time": submitted_time,
+            "started_time": started_time,
+            "ended_time": ended_time,
+            "exit_code": exit_code,
+            "failure_reason": failure_reason,
+            "retry_count": 0,
+            "process": None,
+        }
+
+    except Exception as e:
+        logger.exception("Failed to scan job state from %s: %s", job_path, e)
+        return None
+
+
+def sync_workspace_from_disk(  # noqa: C901
+    workspace_path: Path,
+    write_mode: bool = True,
+    force: bool = False,
+    blocking: bool = True,
+    sync_interval_minutes: int = 5,
+) -> None:
+    """Synchronize workspace database from disk state
+
+    Scans job directories and experiment symlinks to update the database.
+    Uses exclusive locking and time-based throttling to prevent conflicts.
+
+    Args:
+        workspace_path: Path to workspace directory
+        write_mode: If True, update database; if False, dry-run mode
+        force: If True, bypass time throttling (still requires lock)
+        blocking: If True, wait for lock; if False, fail if lock unavailable
+        sync_interval_minutes: Minimum minutes between syncs (default: 5)
+
+    Raises:
+        RuntimeError: If lock unavailable in non-blocking mode
+    """
+    # Normalize path
+    if not isinstance(workspace_path, Path):
+        workspace_path = Path(workspace_path)
+    workspace_path = workspace_path.absolute()
+
+    # Get the workspace state provider FIRST (before should_sync)
+    # This ensures consistent read_only mode throughout the sync process
+    from .state_provider import WorkspaceStateProvider
+
+    provider = WorkspaceStateProvider.get_instance(
+        workspace_path,
+        read_only=not write_mode,
+        sync_on_start=False,  # Don't sync recursively
+    )
+
+    # Check if sync should proceed (unless force=True)
+    if not force:
+        should_proceed, lock = should_sync(
+            provider, workspace_path, sync_interval_minutes
+        )
+        if not should_proceed:
+            return
+    else:
+        # Force mode: skip time check but still require lock
+        lock = acquire_sync_lock(workspace_path, blocking=blocking)
+        if lock is None:
+            if blocking:
+                raise RuntimeError("Failed to acquire sync lock in blocking mode")
+            else:
+                raise RuntimeError("Sync lock unavailable (other experiments running)")
+
+    try:
+        logger.info("Starting workspace sync from disk: %s", workspace_path)
+
+        experiments_found = 0
+        runs_found = 0
+        jobs_scanned = 0
+        jobs_updated = 0
+
+        # Use database context binding for all queries
+        with provider.workspace_db.bind_ctx(
+            [
+                ExperimentModel,
+                ExperimentRunModel,
+                JobModel,
+                JobTagModel,
+                ServiceModel,
+                WorkspaceSyncMetadata,
+            ]
+        ):
+            # Scan experiments directory - this is the source of truth
+            xp_dir = workspace_path / "xp"
+
+            if not xp_dir.exists():
+                logger.info("No experiments directory found")
+                return
+
+            for exp_dir in xp_dir.iterdir():
+                if not exp_dir.is_dir():
+                    continue
+
+                experiment_id = exp_dir.name
+                experiments_found += 1
+
+                # Read jobs.jsonl to get tags for each job
+                job_records = read_jobs_jsonl(exp_dir)
+
+                # Read services.json to get services for this experiment
+                service_records = read_services_json(exp_dir)
+
+                if write_mode:
+                    # Ensure experiment exists in database
+                    now = datetime.now()
+                    ExperimentModel.insert(
+                        experiment_id=experiment_id,
+                        updated_at=now,
+                    ).on_conflict(
+                        conflict_target=[ExperimentModel.experiment_id],
+                        update={
+                            ExperimentModel.updated_at: now,
+                        },
+                    ).execute()
+
+                # Determine or create run_id for experiment
+                existing_runs = list(
+                    ExperimentRunModel.select()
+                    .where(ExperimentRunModel.experiment_id == experiment_id)
+                    .order_by(ExperimentRunModel.started_at.desc())
+                )
+
+                if existing_runs:
+                    # Use the most recent run as current
+                    current_run_id = existing_runs[0].run_id
+                    runs_found += len(existing_runs)
+                else:
+                    # Create initial run
+                    current_run_id = "initial"
+                    runs_found += 1
+
+                if write_mode:
+                    ExperimentRunModel.insert(
+                        experiment_id=experiment_id,
+                        run_id=current_run_id,
+                        status="active",
+                    ).on_conflict_ignore().execute()
+
+                    # Update experiment's current_run_id
+                    ExperimentModel.update(
+                        current_run_id=current_run_id,
+                        updated_at=datetime.now(),
+                    ).where(
+                        ExperimentModel.experiment_id == experiment_id
+                    ).execute()
+
+                logger.debug(
+                    "Experiment %s: current_run_id=%s", experiment_id, current_run_id
+                )
+
+                # Sync services from services.json
+                if write_mode and service_records:
+                    for service_id, service_data in service_records.items():
+                        now = datetime.now()
+                        # Store the full state_dict as JSON for recreation
+                        state_dict_json = json.dumps(service_data)
+                        ServiceModel.insert(
+                            service_id=service_id,
+                            experiment_id=experiment_id,
+                            run_id=current_run_id,
+                            description=service_data.get("description", ""),
+                            state=service_data.get("state", "STOPPED"),
+                            state_dict=state_dict_json,
+                            created_at=now,
+                            updated_at=now,
+                        ).on_conflict(
+                            conflict_target=[
+                                ServiceModel.service_id,
+                                ServiceModel.experiment_id,
+                                ServiceModel.run_id,
+                            ],
+                            update={
+                                ServiceModel.description: service_data.get(
+                                    "description", ""
+                                ),
+                                ServiceModel.state: service_data.get(
+                                    "state", "STOPPED"
+                                ),
+                                ServiceModel.state_dict: state_dict_json,
+                                ServiceModel.updated_at: now,
+                            },
+                        ).execute()
+                        logger.debug(
+                            "Synced service %s for experiment %s",
+                            service_id,
+                            experiment_id,
+                        )
+
+                # Scan jobs linked from this experiment
+                jobs_dir = exp_dir / "jobs"
+                if not jobs_dir.exists():
+                    continue
+
+                # Infer experiment run timestamps from jobs directory
+                if write_mode:
+                    try:
+                        from peewee import fn
+
+                        jobs_stat = jobs_dir.stat()
+                        # Use jobs dir creation time as started_at (fallback to mtime)
+                        started_at = datetime.fromtimestamp(
+                            getattr(jobs_stat, "st_birthtime", jobs_stat.st_ctime)
+                        )
+                        # Use jobs dir last modification time as ended_at
+                        ended_at = datetime.fromtimestamp(jobs_stat.st_mtime)
+
+                        # Update experiment run with inferred timestamps
+                        # Use COALESCE to only set started_at if not already set
+                        ExperimentRunModel.update(
+                            started_at=fn.COALESCE(
+                                ExperimentRunModel.started_at, started_at
+                            ),
+                            ended_at=ended_at,
+                        ).where(
+                            (ExperimentRunModel.experiment_id == experiment_id)
+                            & (ExperimentRunModel.run_id == current_run_id)
+                        ).execute()
+                    except OSError:
+                        pass
+
+                # Find all symlinks in experiment jobs directory
+                for symlink_path in jobs_dir.rglob("*"):
+                    if not symlink_path.is_symlink():
+                        continue
+
+                    jobs_scanned += 1
+
+                    # Try to resolve symlink to actual job directory
+                    try:
+                        job_path = symlink_path.resolve()
+                        job_exists = job_path.is_dir()
+                    except (OSError, RuntimeError):
+                        # Broken symlink
+                        job_path = None
+                        job_exists = False
+
+                    # Read job state from disk if job exists
+                    job_state = None
+                    if job_exists and job_path:
+                        # Try to determine scriptname
+                        scriptname = None
+                        for suffix in [".done", ".failed", ".pid"]:
+                            for file in job_path.glob(f"*{suffix}"):
+                                scriptname = file.name[: -len(suffix)]
+                                break
+                            if scriptname:
+                                break
+
+                        if not scriptname:
+                            # Infer from job_path name
+                            scriptname = job_path.name
+
+                        job_state = scan_job_state_from_disk(job_path, scriptname)
+
+                    # If we couldn't read state, create minimal entry from symlink
+                    if not job_state:
+                        # Extract job_id and task_id from symlink structure
+                        # Symlink structure: xp/{exp}/jobs/{task_id}/{hash}
+                        parts = symlink_path.parts
+                        try:
+                            jobs_idx = parts.index("jobs")
+                            if jobs_idx + 2 < len(parts):
+                                task_id = parts[jobs_idx + 1]
+                                job_id = parts[jobs_idx + 2]
+                            else:
+                                # Fallback
+                                task_id = "unknown"
+                                job_id = symlink_path.name
+                        except (ValueError, IndexError):
+                            task_id = "unknown"
+                            job_id = symlink_path.name
+
+                        job_state = {
+                            "job_id": job_id,
+                            "task_id": task_id,
+                            "state": "phantom",  # Job was deleted but symlink remains
+                            "submitted_time": None,
+                            "started_time": None,
+                            "ended_time": None,
+                            "exit_code": None,
+                            "failure_reason": None,
+                            "retry_count": 0,
+                            "process": None,
+                        }
+
+                    # Update database
+                    if write_mode and job_state and job_state["job_id"]:
+                        job_now = datetime.now()
+                        JobModel.insert(
+                            job_id=job_state["job_id"],
+                            experiment_id=experiment_id,
+                            run_id=current_run_id,
+                            task_id=job_state["task_id"],
+                            locator="",  # Not available from disk
+                            state=job_state["state"],
+                            # Only set failure_reason if job is in error state
+                            failure_reason=(
+                                job_state.get("failure_reason")
+                                if job_state["state"] == "error"
+                                else None
+                            ),
+                            submitted_time=job_state.get("submitted_time"),
+                            started_time=job_state.get("started_time"),
+                            ended_time=job_state.get("ended_time"),
+                            progress="[]",
+                            updated_at=job_now,
+                        ).on_conflict(
+                            conflict_target=[
+                                JobModel.job_id,
+                                JobModel.experiment_id,
+                                JobModel.run_id,
+                            ],
+                            update={
+                                JobModel.state: job_state["state"],
+                                # Only set failure_reason if job is in error state
+                                # Otherwise clear it to avoid stale failure reasons
+                                JobModel.failure_reason: (
+                                    job_state.get("failure_reason")
+                                    if job_state["state"] == "error"
+                                    else None
+                                ),
+                                JobModel.submitted_time: job_state.get(
+                                    "submitted_time"
+                                ),
+                                JobModel.started_time: job_state.get("started_time"),
+                                JobModel.ended_time: job_state.get("ended_time"),
+                                JobModel.updated_at: job_now,
+                            },
+                        ).execute()
+
+                        jobs_updated += 1
+
+                        # Sync tags from jobs.jsonl
+                        job_id = job_state["job_id"]
+                        if job_id in job_records:
+                            tags = job_records[job_id].get("tags", {})
+                            if tags:
+                                # Delete existing tags for this job+experiment+run
+                                JobTagModel.delete().where(
+                                    (JobTagModel.job_id == job_id)
+                                    & (JobTagModel.experiment_id == experiment_id)
+                                    & (JobTagModel.run_id == current_run_id)
+                                ).execute()
+
+                                # Insert new tags
+                                for tag_key, tag_value in tags.items():
+                                    JobTagModel.insert(
+                                        job_id=job_id,
+                                        experiment_id=experiment_id,
+                                        run_id=current_run_id,
+                                        tag_key=tag_key,
+                                        tag_value=str(tag_value),
+                                    ).on_conflict_ignore().execute()
+
+                                logger.debug(
+                                    "Synced %d tags for job %s", len(tags), job_id
+                                )
+
+                        logger.debug(
+                            "Synced job %s for experiment %s run %s: state=%s",
+                            job_state["job_id"],
+                            experiment_id,
+                            current_run_id,
+                            job_state["state"],
+                        )
+
+        logger.info(
+            "Sync complete: %d experiments, %d runs, %d jobs scanned, %d jobs updated",
+            experiments_found,
+            runs_found,
+            jobs_scanned,
+            jobs_updated,
+        )
+
+        # Update last sync time if in write mode
+        if write_mode:
+            provider.update_last_sync_time()
+
+    finally:
+        # Always release lock
+        if lock:
+            lock.release()
+            logger.debug("Released sync lock")
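
The public entry point of this new module is sync_workspace_from_disk. The following minimal usage sketch is hypothetical, inferred only from the signatures in the hunk above; the workspace path is illustrative and assumes experimaestro 2.0.0b4 is installed:

    from pathlib import Path

    from experimaestro.scheduler.state_sync import sync_workspace_from_disk

    workspace = Path("~/experiments/workspace").expanduser()

    # Throttled sync: skipped when another process holds the .sync.lock or when
    # the previous sync ran less than sync_interval_minutes ago.
    sync_workspace_from_disk(workspace, write_mode=True, sync_interval_minutes=5)

    # Forced dry run: bypasses the time throttle but still takes the lock;
    # with write_mode=False the database is not modified.
    sync_workspace_from_disk(workspace, write_mode=False, force=True)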