experimaestro-2.0.0a8-py3-none-any.whl → experimaestro-2.0.0b8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic.

Files changed (122)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +278 -7
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +111 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +510 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +256 -31
  37. experimaestro/scheduler/interfaces.py +501 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/remote/__init__.py +31 -0
  40. experimaestro/scheduler/remote/client.py +874 -0
  41. experimaestro/scheduler/remote/protocol.py +467 -0
  42. experimaestro/scheduler/remote/server.py +423 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +323 -23
  45. experimaestro/scheduler/state_db.py +437 -0
  46. experimaestro/scheduler/state_provider.py +2766 -0
  47. experimaestro/scheduler/state_sync.py +891 -0
  48. experimaestro/scheduler/workspace.py +52 -10
  49. experimaestro/scriptbuilder.py +7 -0
  50. experimaestro/server/__init__.py +147 -57
  51. experimaestro/server/data/index.css +0 -125
  52. experimaestro/server/data/index.css.map +1 -1
  53. experimaestro/server/data/index.js +194 -58
  54. experimaestro/server/data/index.js.map +1 -1
  55. experimaestro/settings.py +44 -5
  56. experimaestro/sphinx/__init__.py +3 -3
  57. experimaestro/taskglobals.py +20 -0
  58. experimaestro/tests/conftest.py +80 -0
  59. experimaestro/tests/core/test_generics.py +2 -2
  60. experimaestro/tests/identifier_stability.json +45 -0
  61. experimaestro/tests/launchers/bin/sacct +6 -2
  62. experimaestro/tests/launchers/bin/sbatch +4 -2
  63. experimaestro/tests/launchers/test_slurm.py +80 -0
  64. experimaestro/tests/tasks/test_dynamic.py +231 -0
  65. experimaestro/tests/test_cli_jobs.py +615 -0
  66. experimaestro/tests/test_deprecated.py +630 -0
  67. experimaestro/tests/test_environment.py +200 -0
  68. experimaestro/tests/test_file_progress_integration.py +1 -1
  69. experimaestro/tests/test_forward.py +3 -3
  70. experimaestro/tests/test_identifier.py +372 -41
  71. experimaestro/tests/test_identifier_stability.py +458 -0
  72. experimaestro/tests/test_instance.py +3 -3
  73. experimaestro/tests/test_multitoken.py +442 -0
  74. experimaestro/tests/test_mypy.py +433 -0
  75. experimaestro/tests/test_objects.py +312 -5
  76. experimaestro/tests/test_outputs.py +2 -2
  77. experimaestro/tests/test_param.py +8 -12
  78. experimaestro/tests/test_partial_paths.py +231 -0
  79. experimaestro/tests/test_progress.py +0 -48
  80. experimaestro/tests/test_remote_state.py +671 -0
  81. experimaestro/tests/test_resumable_task.py +480 -0
  82. experimaestro/tests/test_serializers.py +141 -1
  83. experimaestro/tests/test_state_db.py +434 -0
  84. experimaestro/tests/test_subparameters.py +160 -0
  85. experimaestro/tests/test_tags.py +136 -0
  86. experimaestro/tests/test_tasks.py +107 -121
  87. experimaestro/tests/test_token_locking.py +252 -0
  88. experimaestro/tests/test_tokens.py +17 -13
  89. experimaestro/tests/test_types.py +123 -1
  90. experimaestro/tests/test_workspace_triggers.py +158 -0
  91. experimaestro/tests/token_reschedule.py +4 -2
  92. experimaestro/tests/utils.py +2 -2
  93. experimaestro/tokens.py +154 -57
  94. experimaestro/tools/diff.py +1 -1
  95. experimaestro/tui/__init__.py +8 -0
  96. experimaestro/tui/app.py +2395 -0
  97. experimaestro/tui/app.tcss +353 -0
  98. experimaestro/tui/log_viewer.py +228 -0
  99. experimaestro/utils/__init__.py +23 -0
  100. experimaestro/utils/environment.py +148 -0
  101. experimaestro/utils/git.py +129 -0
  102. experimaestro/utils/resources.py +1 -1
  103. experimaestro/version.py +34 -0
  104. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
  105. experimaestro-2.0.0b8.dist-info/RECORD +187 -0
  106. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
  107. experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
  108. experimaestro/compat.py +0 -6
  109. experimaestro/core/objects.pyi +0 -221
  110. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  111. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  112. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  113. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  114. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  115. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  116. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  117. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  118. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  119. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  120. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  121. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  122. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/state_sync.py (new file)
@@ -0,0 +1,891 @@
+"""Disk-based state synchronization for workspace database
+
+This module implements synchronization from disk state (marker files) to the
+workspace database. It includes locking and throttling mechanisms to prevent
+excessive disk scanning and conflicts with running experiments.
+"""
+
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Optional, Tuple, TYPE_CHECKING
+from datetime import datetime
+import fasteners
+
+if TYPE_CHECKING:
+    from .state_provider import WorkspaceStateProvider
+    from .jobs import JobState
+
+from .interfaces import BaseJob
+
+from experimaestro.scheduler.state_db import (
+    ExperimentModel,
+    ExperimentRunModel,
+    JobModel,
+    JobTagModel,
+    ServiceModel,
+    WorkspaceSyncMetadata,
+)
+
+logger = logging.getLogger("xpm.state_sync")
+
+
+def read_jobs_jsonl(exp_dir: Path) -> Dict[str, Dict]:
+    """Read jobs.jsonl file and return a mapping of job_id -> record
+
+    Args:
+        exp_dir: Path to the experiment directory
+
+    Returns:
+        Dictionary mapping job_id to record (with tags, task_id, timestamp)
+    """
+    jobs_jsonl_path = exp_dir / "jobs.jsonl"
+    job_records = {}
+
+    if not jobs_jsonl_path.exists():
+        logger.debug("No jobs.jsonl found in %s", exp_dir)
+        return job_records
+
+    try:
+        with jobs_jsonl_path.open("r") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                try:
+                    record = json.loads(line)
+                    job_id = record.get("job_id")
+                    if job_id:
+                        job_records[job_id] = record
+                except json.JSONDecodeError as e:
+                    logger.warning("Failed to parse line in jobs.jsonl: %s", e)
+    except Exception as e:
+        logger.warning("Failed to read jobs.jsonl from %s: %s", jobs_jsonl_path, e)
+
+    logger.debug("Read %d job records from jobs.jsonl", len(job_records))
+    return job_records
+
+
+def read_services_json(exp_dir: Path) -> Dict[str, Dict]:
+    """Read services.json file and return a mapping of service_id -> record
+
+    Args:
+        exp_dir: Path to the experiment directory
+
+    Returns:
+        Dictionary mapping service_id to record (with description, state, url, timestamp)
+    """
+    services_json_path = exp_dir / "services.json"
+
+    if not services_json_path.exists():
+        logger.debug("No services.json found in %s", exp_dir)
+        return {}
+
+    try:
+        with services_json_path.open("r") as f:
+            services_data = json.load(f)
+        logger.debug(
+            "Read %d service records from services.json", len(services_data)
+        )
+        return services_data
+    except json.JSONDecodeError as e:
+        logger.warning("Failed to parse services.json: %s", e)
+    except Exception as e:
+        logger.warning(
+            "Failed to read services.json from %s: %s", services_json_path, e
+        )
+
+    return {}
+
+
+def read_informations_json(exp_dir: Path) -> Dict:
+    """Read informations.json file containing experiment metadata
+
+    Args:
+        exp_dir: Path to the experiment directory
+
+    Returns:
+        Dictionary with experiment informations including:
+        - runs: Dict[run_id, {hostname, started_at}]
+    """
+    info_path = exp_dir / "informations.json"
+
+    if not info_path.exists():
+        logger.debug("No informations.json found in %s", exp_dir)
+        return {}
+
+    try:
+        with info_path.open("r") as f:
+            info_data = json.load(f)
+        logger.debug("Read informations.json from %s", exp_dir)
+        return info_data
+    except json.JSONDecodeError as e:
+        logger.warning("Failed to parse informations.json: %s", e)
+    except Exception as e:
+        logger.warning("Failed to read informations.json from %s: %s", info_path, e)
+
+    return {}
+
+
+def acquire_sync_lock(
+    workspace_path: Path, blocking: bool = True
+) -> Optional[fasteners.InterProcessLock]:
+    """Acquire exclusive lock for workspace synchronization
+
+    Args:
+        workspace_path: Path to the workspace directory
+        blocking: If True, wait for lock; if False, return None if unavailable
+
+    Returns:
+        Lock object if acquired, None if not acquired (only in non-blocking mode)
+    """
+    lock_path = workspace_path / ".sync.lock"
+    lock = fasteners.InterProcessLock(str(lock_path))
+
+    if lock.acquire(blocking=blocking):
+        logger.debug("Acquired sync lock: %s", lock_path)
+        return lock
+    else:
+        logger.debug("Could not acquire sync lock (already held): %s", lock_path)
+        return None
+
+
+def should_sync(
+    provider: "WorkspaceStateProvider",
+    workspace_path: Path,
+    min_interval_minutes: int = 5,
+) -> Tuple[bool, Optional[fasteners.InterProcessLock]]:
+    """Determine if sync should be performed based on locking and timing
+
+    Args:
+        provider: WorkspaceStateProvider instance (used to check last sync time)
+        workspace_path: Path to workspace directory
+        min_interval_minutes: Minimum minutes between syncs (default: 5)
+
+    Returns:
+        Tuple of (should_sync: bool, lock: Optional[Lock])
+        If should_sync is True, lock is acquired and must be released after sync
+        If should_sync is False, lock is None
+    """
+    # Try to acquire exclusive lock (non-blocking)
+    lock = acquire_sync_lock(workspace_path, blocking=False)
+    if lock is None:
+        # Other experiments running - skip sync
+        logger.info("Skipping sync: other experiments are running")
+        return False, None
+
+    # Check last sync time using the provider
+    last_sync_time = provider.get_last_sync_time()
+    if last_sync_time is None:
+        # First sync ever
+        logger.info("Performing first sync")
+        return True, lock
+
+    time_since_last_sync = datetime.now() - last_sync_time
+    if time_since_last_sync.total_seconds() > min_interval_minutes * 60:
+        # Enough time has passed
+        logger.info(
+            "Performing sync (%.1f minutes since last sync)",
+            time_since_last_sync.total_seconds() / 60,
+        )
+        return True, lock
+
+    # Recently synced, skip
+    logger.info(
+        "Skipping sync: last sync was %.1f minutes ago (threshold: %d minutes)",
+        time_since_last_sync.total_seconds() / 60,
+        min_interval_minutes,
+    )
+    lock.release()
+    return False, None
+
+
+def check_process_alive(
+    job_path: Path, scriptname: str, update_disk: bool = True
+) -> bool:
+    """Check if a running job's process is still alive
+
+    If the process is dead and update_disk=True, this function will:
+    1. Acquire a lock on the job directory
+    2. Create a .failed file to persist the state (with detailed failure info)
+    3. Remove the .pid file
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script (for PID file name)
+        update_disk: If True, update disk state when process is dead
+
+    Returns:
+        True if process is running, False otherwise
+    """
+    import asyncio
+
+    from experimaestro.connectors import Process
+    from experimaestro.connectors.local import LocalConnector
+
+    pid_file = BaseJob.get_pidfile(job_path, scriptname)
+    if not pid_file.exists():
+        return False
+
+    # Try to acquire lock on job directory (non-blocking)
+    # If we can't acquire, assume the job is running (another process has it)
+    lock_path = job_path / ".lock"
+    lock = fasteners.InterProcessLock(str(lock_path))
+
+    if not lock.acquire(blocking=False):
+        # Can't acquire lock - job is probably running
+        logger.debug("Could not acquire lock for %s, assuming job is running", job_path)
+        return True
+
+    try:
+        pinfo = json.loads(pid_file.read_text())
+        connector = LocalConnector.instance()
+        process = Process.fromDefinition(connector, pinfo)
+
+        if process is None:
+            # Can't get process info - mark as dead
+            if update_disk:
+                _mark_job_failed_on_disk(job_path, scriptname, None)
+            return False
+
+        # Check process state (with 0 timeout for immediate check)
+        state = asyncio.run(process.aio_state(0))
+        if state is None or state.finished:
+            # Process is dead - get detailed job state from process
+            if update_disk:
+                exit_code = state.exitcode if state else 1
+                # Use get_job_state() to get detailed failure info (e.g., SLURM timeout)
+                job_state = process.get_job_state(exit_code)
+                _mark_job_failed_on_disk(job_path, scriptname, job_state, exit_code)
+            return False
+
+        return True
+    except Exception as e:
+        logger.debug("Could not check process state for %s: %s", job_path, e)
+        # On error, assume process is dead but don't update disk (we're not sure)
+        return False
+    finally:
+        lock.release()
+
+
+def _mark_job_failed_on_disk(
+    job_path: Path,
+    scriptname: str,
+    job_state: Optional["JobState"],
+    exit_code: int = 1,
+) -> None:
+    """Mark a job as failed on disk by creating .failed file and removing .pid
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script
+        job_state: JobState from process.get_job_state() - may contain detailed
+            failure info (e.g., TIMEOUT for SLURM jobs)
+        exit_code: Exit code of the process (default: 1)
+    """
+    from experimaestro.scheduler.jobs import JobStateError
+
+    pid_file = BaseJob.get_pidfile(job_path, scriptname)
+    failed_file = BaseJob.get_failedfile(job_path, scriptname)
+
+    try:
+        # Extract failure status from job_state
+        failure_status = "UNKNOWN"
+
+        if isinstance(job_state, JobStateError):
+            # JobStateError can contain detailed failure reason
+            if job_state.failure_reason:
+                failure_status = job_state.failure_reason.name
+
+        # Write .failed file with exit code and failure status (JSON format)
+        failed_data = {
+            "exit_code": exit_code,
+            "failure_status": failure_status,
+        }
+
+        failed_file.write_text(json.dumps(failed_data))
+        logger.info(
+            "Created %s for dead job (exit_code=%d, status=%s)",
+            failed_file,
+            exit_code,
+            failure_status,
+        )
+
+        # Remove .pid file
+        if pid_file.exists():
+            pid_file.unlink()
+            logger.debug("Removed stale PID file %s", pid_file)
+
+    except Exception as e:
+        logger.warning("Failed to update disk state for %s: %s", job_path, e)
+
+
+def scan_job_state_from_disk(  # noqa: C901
+    job_path: Path, scriptname: str, check_running: bool = True
+) -> Optional[Dict]:
+    """Scan a job directory to determine state from disk files
+
+    Reads job state from .experimaestro/information.json (primary) or marker
+    files (fallback).
+
+    Args:
+        job_path: Path to the job directory
+        scriptname: Name of the script (for marker file names)
+        check_running: If True, verify that jobs with PID files are actually
+            still running (default: True)
+
+    Returns:
+        Dictionary with job state information, or None if no state found:
+        {
+            'job_id': str,
+            'task_id': str,
+            'state': str,
+            'submitted_time': float or None,
+            'started_time': float or None,
+            'ended_time': float or None,
+            'exit_code': int or None,
+            'failure_reason': str or None,
+            'retry_count': int,
+            'process': dict or None  # Process spec from metadata
+        }
+    """
+    # Try reading metadata from .experimaestro/information.json (primary source)
+    metadata_path = BaseJob.get_metadata_path(job_path)
+    if metadata_path.exists():
+        try:
+            with metadata_path.open("r") as f:
+                metadata = json.load(f)
+
+            state = metadata.get("state", "unscheduled")
+            failure_reason = metadata.get("failure_reason")
+
+            # If state is "running", verify the process is still alive
+            if state == "running" and check_running:
+                if not check_process_alive(job_path, scriptname):
+                    logger.info(
+                        "Job %s marked as running but process is dead, marking as error",
+                        job_path.name,
+                    )
+                    state = "error"
+                    failure_reason = "UNKNOWN"
+
+            logger.debug("Read metadata from %s", metadata_path)
+            return {
+                "job_id": metadata.get("job_id"),
+                "task_id": metadata.get("task_id"),
+                "state": state,
+                "submitted_time": metadata.get("submitted_time"),
+                "started_time": metadata.get("started_time"),
+                "ended_time": metadata.get("ended_time"),
+                "exit_code": metadata.get("exit_code"),
+                "failure_reason": failure_reason,
+                "retry_count": metadata.get("retry_count", 0),
+                "process": metadata.get("process"),  # Process spec with launcher info
+            }
+        except Exception as e:
+            logger.warning("Failed to read metadata from %s: %s", metadata_path, e)
+            # Fall through to marker file fallback
+
+    # Fallback: Infer from marker files
+    try:
+        done_file = BaseJob.get_donefile(job_path, scriptname)
+        failed_file = BaseJob.get_failedfile(job_path, scriptname)
+        pid_file = BaseJob.get_pidfile(job_path, scriptname)
+
+        state = "unscheduled"
+        exit_code = None
+        failure_reason = None
+
+        if done_file.is_file():
+            state = "done"
+            exit_code = 0
+        elif failed_file.is_file():
+            state = "error"
+            # Try to parse .failed file (JSON or legacy integer format)
+            try:
+                content = failed_file.read_text().strip()
+                data = json.loads(content)
+                if isinstance(data, int):
+                    exit_code = data
+                else:
+                    exit_code = data.get("exit_code", 1)
+                    failure_reason = data.get("failure_status", "UNKNOWN")
+            except json.JSONDecodeError:
+                # Legacy integer format
+                try:
+                    exit_code = int(content)
+                except (ValueError, OSError):
+                    exit_code = 1
+        elif pid_file.is_file():
+            # PID file exists - check if process is actually running
+            if check_running and not check_process_alive(job_path, scriptname):
+                logger.info(
+                    "Job %s has PID file but process is dead, marking as error",
+                    job_path.name,
+                )
+                state = "error"
+                failure_reason = "UNKNOWN"
+            else:
+                state = "running"
+
+        # Use directory structure to infer job_id and task_id
+        # job_path structure: {workspace}/jobs/{task_id}/{hash}/
+        if job_path.parent and job_path.parent.parent:
+            job_id = job_path.name  # Hash hex
+            task_id = job_path.parent.name
+
+        # Infer timestamps from file modification times
+        submitted_time = None
+        started_time = None
+        ended_time = None
+
+        # Use params.json mtime as submitted_time
+        params_file = job_path / "params.json"
+        if params_file.exists():
+            try:
+                submitted_time = params_file.stat().st_mtime
+            except OSError:
+                pass
+
+        # Use stdout file (.out) mtime as started_time
+        stdout_file = job_path / f"{scriptname}.out"
+        if stdout_file.exists():
+            try:
+                stdout_stat = stdout_file.stat()
+                # Use creation time if available, otherwise mtime
+                started_time = getattr(
+                    stdout_stat, "st_birthtime", stdout_stat.st_ctime
+                )
+            except OSError:
+                pass
+
+        # Use done/failed file mtime as ended_time
+        if state == "done" and done_file.exists():
+            try:
+                ended_time = done_file.stat().st_mtime
+            except OSError:
+                pass
+        elif state == "error" and failed_file.exists():
+            try:
+                ended_time = failed_file.stat().st_mtime
+            except OSError:
+                pass
+
+        return {
+            "job_id": job_id,
+            "task_id": task_id,
+            "state": state,
+            "submitted_time": submitted_time,
+            "started_time": started_time,
+            "ended_time": ended_time,
+            "exit_code": exit_code,
+            "failure_reason": failure_reason,
+            "retry_count": 0,
+            "process": None,
+        }
+
+    except Exception as e:
+        logger.exception("Failed to scan job state from %s: %s", job_path, e)
+        return None
+
+
+def sync_workspace_from_disk(  # noqa: C901
+    workspace_path: Path,
+    write_mode: bool = True,
+    force: bool = False,
+    blocking: bool = True,
+    sync_interval_minutes: int = 5,
+) -> None:
+    """Synchronize workspace database from disk state
+
+    Scans job directories and experiment symlinks to update the database.
+    Uses exclusive locking and time-based throttling to prevent conflicts.
+
+    Args:
+        workspace_path: Path to workspace directory
+        write_mode: If True, update database; if False, dry-run mode
+        force: If True, bypass time throttling (still requires lock)
+        blocking: If True, wait for lock; if False, fail if lock unavailable
+        sync_interval_minutes: Minimum minutes between syncs (default: 5)
+
+    Raises:
+        RuntimeError: If lock unavailable in non-blocking mode
+    """
+    # Normalize path
+    if not isinstance(workspace_path, Path):
+        workspace_path = Path(workspace_path)
+    workspace_path = workspace_path.absolute()
+
+    # Get the workspace state provider FIRST (before should_sync)
+    # This ensures consistent read_only mode throughout the sync process
+    from .state_provider import WorkspaceStateProvider
+
+    provider = WorkspaceStateProvider.get_instance(
+        workspace_path,
+        read_only=not write_mode,
+        sync_on_start=False,  # Don't sync recursively
+    )
+
+    # Check if sync should proceed (unless force=True)
+    if not force:
+        should_proceed, lock = should_sync(
+            provider, workspace_path, sync_interval_minutes
+        )
+        if not should_proceed:
+            return
+    else:
+        # Force mode: skip time check but still require lock
+        lock = acquire_sync_lock(workspace_path, blocking=blocking)
+        if lock is None:
+            if blocking:
+                raise RuntimeError("Failed to acquire sync lock in blocking mode")
+            else:
+                raise RuntimeError("Sync lock unavailable (other experiments running)")
+
+    try:
+        logger.info("Starting workspace sync from disk: %s", workspace_path)
+
+        experiments_found = 0
+        runs_found = 0
+        jobs_scanned = 0
+        jobs_updated = 0
+
+        # Use database context binding for all queries
+        with provider.workspace_db.bind_ctx(
+            [
+                ExperimentModel,
+                ExperimentRunModel,
+                JobModel,
+                JobTagModel,
+                ServiceModel,
+                WorkspaceSyncMetadata,
+            ]
+        ):
+            # Scan experiments directory - this is the source of truth
+            xp_dir = workspace_path / "xp"
+
+            if not xp_dir.exists():
+                logger.info("No experiments directory found")
+                return
+
+            for exp_dir in xp_dir.iterdir():
+                if not exp_dir.is_dir():
+                    continue
+
+                experiment_id = exp_dir.name
+                experiments_found += 1
+
+                # Read jobs.jsonl to get tags for each job
+                job_records = read_jobs_jsonl(exp_dir)
+
+                # Read services.json to get services for this experiment
+                service_records = read_services_json(exp_dir)
+
+                # Read informations.json for run metadata (hostname, etc.)
+                info_data = read_informations_json(exp_dir)
+                runs_info = info_data.get("runs", {})
+
+                if write_mode:
+                    # Ensure experiment exists in database
+                    now = datetime.now()
+                    ExperimentModel.insert(
+                        experiment_id=experiment_id,
+                        updated_at=now,
+                    ).on_conflict(
+                        conflict_target=[ExperimentModel.experiment_id],
+                        update={
+                            ExperimentModel.updated_at: now,
+                        },
+                    ).execute()
+
+                # Determine or create run_id for experiment
+                existing_runs = list(
+                    ExperimentRunModel.select()
+                    .where(ExperimentRunModel.experiment_id == experiment_id)
+                    .order_by(ExperimentRunModel.started_at.desc())
+                )
+
+                if existing_runs:
+                    # Use the most recent run as current
+                    current_run_id = existing_runs[0].run_id
+                    runs_found += len(existing_runs)
+
+                    # Update hostname from informations.json if available
+                    if write_mode:
+                        for run in existing_runs:
+                            run_info = runs_info.get(run.run_id, {})
+                            hostname = run_info.get("hostname")
+                            if hostname and not run.hostname:
+                                ExperimentRunModel.update(hostname=hostname).where(
+                                    (ExperimentRunModel.experiment_id == experiment_id)
+                                    & (ExperimentRunModel.run_id == run.run_id)
+                                ).execute()
+                else:
+                    # Create initial run
+                    current_run_id = "initial"
+                    runs_found += 1
+
+                    # Get hostname from informations.json if available
+                    run_info = runs_info.get(current_run_id, {})
+                    hostname = run_info.get("hostname")
+
+                    if write_mode:
+                        ExperimentRunModel.insert(
+                            experiment_id=experiment_id,
+                            run_id=current_run_id,
+                            status="active",
+                            hostname=hostname,
+                        ).on_conflict_ignore().execute()
+
+                        # Update experiment's current_run_id
+                        ExperimentModel.update(
+                            current_run_id=current_run_id,
+                            updated_at=datetime.now(),
+                        ).where(
+                            ExperimentModel.experiment_id == experiment_id
+                        ).execute()
+
+                logger.debug(
+                    "Experiment %s: current_run_id=%s", experiment_id, current_run_id
+                )
+
+                # Sync services from services.json
+                if write_mode and service_records:
+                    for service_id, service_data in service_records.items():
+                        now = datetime.now()
+                        # Extract only the state_dict keys (not metadata like
+                        # service_id, description, state, url, timestamp)
+                        # The state_dict should have __class__ and service-specific
+                        # fields like 'path' for TensorboardService
+                        metadata_keys = {
+                            "service_id",
+                            "description",
+                            "url",
+                            "timestamp",
+                        }
+                        state_dict = {
+                            k: v
+                            for k, v in service_data.items()
+                            if k not in metadata_keys
+                        }
+                        state_dict_json = json.dumps(state_dict)
+                        ServiceModel.insert(
+                            service_id=service_id,
+                            experiment_id=experiment_id,
+                            run_id=current_run_id,
+                            description=service_data.get("description", ""),
+                            state_dict=state_dict_json,
+                            created_at=now,
+                        ).on_conflict(
+                            conflict_target=[
+                                ServiceModel.service_id,
+                                ServiceModel.experiment_id,
+                                ServiceModel.run_id,
+                            ],
+                            update={
+                                ServiceModel.description: service_data.get(
+                                    "description", ""
+                                ),
+                                ServiceModel.state_dict: state_dict_json,
+                            },
+                        ).execute()
+                        logger.debug(
+                            "Synced service %s for experiment %s",
+                            service_id,
+                            experiment_id,
+                        )
+
+                # Scan jobs linked from this experiment
+                jobs_dir = exp_dir / "jobs"
+                if not jobs_dir.exists():
+                    continue
+
+                # Infer experiment run timestamps from jobs directory
+                if write_mode:
+                    try:
+                        from peewee import fn
+
+                        jobs_stat = jobs_dir.stat()
+                        # Use jobs dir creation time as started_at (fallback to mtime)
+                        started_at = datetime.fromtimestamp(
+                            getattr(jobs_stat, "st_birthtime", jobs_stat.st_ctime)
+                        )
+                        # Use jobs dir last modification time as ended_at
+                        ended_at = datetime.fromtimestamp(jobs_stat.st_mtime)
+
+                        # Update experiment run with inferred timestamps
+                        # Use COALESCE to only set started_at if not already set
+                        ExperimentRunModel.update(
+                            started_at=fn.COALESCE(
+                                ExperimentRunModel.started_at, started_at
+                            ),
+                            ended_at=ended_at,
+                        ).where(
+                            (ExperimentRunModel.experiment_id == experiment_id)
+                            & (ExperimentRunModel.run_id == current_run_id)
+                        ).execute()
+                    except OSError:
+                        pass
+
+                # Find all symlinks in experiment jobs directory
+                for symlink_path in jobs_dir.rglob("*"):
+                    if not symlink_path.is_symlink():
+                        continue
+
+                    jobs_scanned += 1
+
+                    # Try to resolve symlink to actual job directory
+                    try:
+                        job_path = symlink_path.resolve()
+                        job_exists = job_path.is_dir()
+                    except (OSError, RuntimeError):
+                        # Broken symlink
+                        job_path = None
+                        job_exists = False
+
+                    # Read job state from disk if job exists
+                    job_state = None
+                    if job_exists and job_path:
+                        # Try to determine scriptname
+                        scriptname = None
+                        for suffix in [".done", ".failed", ".pid"]:
+                            for file in job_path.glob(f"*{suffix}"):
+                                scriptname = file.name[: -len(suffix)]
+                                break
+                            if scriptname:
+                                break
+
+                        if not scriptname:
+                            # Infer from job_path name
+                            scriptname = job_path.name
+
+                        job_state = scan_job_state_from_disk(job_path, scriptname)
+
+                    # If we couldn't read state, create minimal entry from symlink
+                    if not job_state:
+                        # Extract job_id and task_id from symlink structure
+                        # Symlink structure: xp/{exp}/jobs/{task_id}/{hash}
+                        parts = symlink_path.parts
+                        try:
+                            jobs_idx = parts.index("jobs")
+                            if jobs_idx + 2 < len(parts):
+                                task_id = parts[jobs_idx + 1]
+                                job_id = parts[jobs_idx + 2]
+                            else:
+                                # Fallback
+                                task_id = "unknown"
+                                job_id = symlink_path.name
+                        except (ValueError, IndexError):
+                            task_id = "unknown"
+                            job_id = symlink_path.name
+
+                        job_state = {
+                            "job_id": job_id,
+                            "task_id": task_id,
+                            "state": "phantom",  # Job was deleted but symlink remains
+                            "submitted_time": None,
+                            "started_time": None,
+                            "ended_time": None,
+                            "exit_code": None,
+                            "failure_reason": None,
+                            "retry_count": 0,
+                            "process": None,
+                        }
+
+                    # Update database
+                    if write_mode and job_state and job_state["job_id"]:
+                        job_now = datetime.now()
+                        JobModel.insert(
+                            job_id=job_state["job_id"],
+                            experiment_id=experiment_id,
+                            run_id=current_run_id,
+                            task_id=job_state["task_id"],
+                            locator="",  # Not available from disk
+                            state=job_state["state"],
+                            # Only set failure_reason if job is in error state
+                            failure_reason=(
+                                job_state.get("failure_reason")
+                                if job_state["state"] == "error"
+                                else None
+                            ),
+                            submitted_time=job_state.get("submitted_time"),
+                            started_time=job_state.get("started_time"),
+                            ended_time=job_state.get("ended_time"),
+                            progress="[]",
+                            updated_at=job_now,
+                        ).on_conflict(
+                            conflict_target=[
+                                JobModel.job_id,
+                                JobModel.experiment_id,
+                                JobModel.run_id,
+                            ],
+                            update={
+                                JobModel.state: job_state["state"],
+                                # Only set failure_reason if job is in error state
+                                # Otherwise clear it to avoid stale failure reasons
+                                JobModel.failure_reason: (
+                                    job_state.get("failure_reason")
+                                    if job_state["state"] == "error"
+                                    else None
+                                ),
+                                JobModel.submitted_time: job_state.get(
+                                    "submitted_time"
+                                ),
+                                JobModel.started_time: job_state.get("started_time"),
+                                JobModel.ended_time: job_state.get("ended_time"),
+                                JobModel.updated_at: job_now,
+                            },
+                        ).execute()
+
+                        jobs_updated += 1
+
+                        # Sync tags from jobs.jsonl
+                        job_id = job_state["job_id"]
+                        if job_id in job_records:
+                            tags = job_records[job_id].get("tags", {})
+                            if tags:
+                                # Delete existing tags for this job+experiment+run
+                                JobTagModel.delete().where(
+                                    (JobTagModel.job_id == job_id)
+                                    & (JobTagModel.experiment_id == experiment_id)
+                                    & (JobTagModel.run_id == current_run_id)
+                                ).execute()
+
+                                # Insert new tags
+                                for tag_key, tag_value in tags.items():
+                                    JobTagModel.insert(
+                                        job_id=job_id,
+                                        experiment_id=experiment_id,
+                                        run_id=current_run_id,
+                                        tag_key=tag_key,
+                                        tag_value=str(tag_value),
+                                    ).on_conflict_ignore().execute()
+
+                                logger.debug(
+                                    "Synced %d tags for job %s", len(tags), job_id
+                                )
+
+                        logger.debug(
+                            "Synced job %s for experiment %s run %s: state=%s",
+                            job_state["job_id"],
+                            experiment_id,
+                            current_run_id,
+                            job_state["state"],
+                        )
+
+        logger.info(
+            "Sync complete: %d experiments, %d runs, %d jobs scanned, %d jobs updated",
+            experiments_found,
+            runs_found,
+            jobs_scanned,
+            jobs_updated,
+        )
+
+        # Update last sync time if in write mode
+        if write_mode:
+            provider.update_last_sync_time()
+
+    finally:
+        # Always release lock
+        if lock:
+            lock.release()
+            logger.debug("Released sync lock")