experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (154) hide show
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/locking.py CHANGED
@@ -1,5 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from abc import ABC, abstractmethod
5
+ import json
6
+ import logging
7
+ import os.path
8
+ from pathlib import Path
9
+ import threading
10
+ import time
11
+ from typing import TYPE_CHECKING, Callable, Optional
12
+ import weakref
13
+
14
+ import fasteners
15
+ from watchdog.events import FileSystemEventHandler
16
+
1
17
  from experimaestro.utils.asyncio import asyncThreadcheck
2
- from .utils import logger
18
+ from experimaestro.dynamic import DynamicResource
19
+
20
+ logger = logging.getLogger("xpm.locking")
21
+
22
+ if TYPE_CHECKING:
23
+ from experimaestro.scheduler.jobs import Job
24
+ from experimaestro.connectors import Process
25
+ from experimaestro.dynamic import DynamicDependency
26
+
27
+
28
def get_job_lock_relpath(task_id: str, identifier: str) -> Path:
    """Build the relative path of the lock file associated with a job.

    The file stem is ``{task_id}@{identifier}`` cut to its first 256
    characters to avoid filesystem issues; the ``.json`` suffix is appended
    after the cut, so it is never truncated away.

    Args:
        task_id: The task identifier
        identifier: The job identifier (hash)

    Returns:
        Relative path in format "{task_id}@{identifier}.json"
    """
    stem = "{}@{}".format(task_id, identifier)
    shortened = stem[:256]
    return Path(shortened + ".json")
3
42
 
4
43
 
5
44
  class Lock:
@@ -66,3 +105,920 @@ class Locks(Lock):
66
105
  for lock in self.locks:
67
106
  logger.debug("[locks] Releasing %s", lock)
68
107
  lock.release()
108
+
109
+
110
class DynamicDependencyLock(Lock, ABC):
    """Lock obtained from a dynamic dependency, with job lifecycle hooks.

    In addition to the plain lock behaviour, the scheduler calls the
    ``aio_job_*`` coroutines when a job starts and finishes. This lets a
    lock:

    - Persist state for recovery (e.g., write lock info to disk)
    - Clean up resources race-safely when the job finishes
    - Serialize lock info to pass to the job process

    File structure (standardized):

    - {lock_folder}/informations.json: Resource-level info (e.g., token counts)
    - {lock_folder}/ipc.lock: IPC lock for inter-process coordination
    - {lock_folder}/jobs/{task_specific_path}.json: Per-job lock file

    Subclasses must implement:

    - lock_folder: Path to the lock folder

    ``to_json()`` emits the 'module' and 'class' keys used for dynamic
    deserialization in the job process; subclasses extend that dict with
    their own data and provide the matching ``from_json()``.
    """

    dependency: "DynamicDependency"

    def __init__(self, dependency: DynamicDependency):
        super().__init__()
        self.dependency = dependency

    @property
    @abstractmethod
    def lock_folder(self) -> Path:
        """Folder holding the lock files (provided by subclasses)."""
        ...

    @property
    def ipc_lock_path(self) -> Path:
        """Location of the IPC lock file inside the lock folder."""
        folder = self.lock_folder
        return folder / "ipc.lock"

    @property
    def lock_file_path(self) -> Path:
        """Location of the lock file of the job targeted by the dependency."""
        job = self.dependency.target
        jobs_dir = self.lock_folder / "jobs"
        return jobs_dir / get_job_lock_relpath(job.task_id, job.identifier)

    async def aio_job_before_start(self, job: Job) -> None:
        """Hook invoked before the job is started.

        Runs AFTER the job directory is created but BEFORE the job process
        is spawned; override to set up resources needed by the job. The
        default implementation does nothing.

        :param job: The job about to start
        """
        return None

    async def aio_job_started(self, job: Job, process: Process) -> None:
        """Hook invoked once the job has started successfully.

        Runs AFTER the process has been spawned but BEFORE the scheduler
        releases the connector lock; override to persist lock state for
        recovery purposes. The default implementation does nothing.

        :param job: The job that started
        :param process: The process running the job
        """
        return None

    async def aio_job_finished(self, job: Job) -> None:
        """Hook invoked when the job has finished (success or failure).

        Runs BEFORE the lock is released; override for pre-release cleanup
        that requires knowledge of the job state. The default implementation
        does nothing.

        :param job: The job that finished
        """
        return None

    def to_json(self) -> dict:
        """Serialize the lock so it can be handed to the job process.

        The returned dict carries the 'module' and 'class' keys used for the
        dynamic import on the job side. Subclasses should call
        ``super().to_json()`` and update the result with their own data.

        :return: JSON-serializable dict with lock information
        """
        lock_type = self.__class__
        return {
            "module": lock_type.__module__,
            "class": lock_type.__name__,
        }

    @classmethod
    def from_json(cls, data: dict) -> JobDependencyLock:
        """Rebuild, inside the job process, the lock described by ``data``.

        The result is a JobDependencyLock variant: a lock that is already
        held and only needs to be released on exit.

        :param data: Dict from to_json()
        :return: JobDependencyLock instance
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError(f"from_json not implemented for {cls.__name__}")
214
+
215
+
216
class DynamicDependencyLocks(Lock):
    """Group of dynamic dependency locks handled as a single lock.

    Holds DynamicDependencyLock instances and fans out acquire/release,
    the job lifecycle notifications and the serialization to each of them,
    in insertion order.
    """

    def __init__(self):
        super().__init__()
        self.locks: list[DynamicDependencyLock] = []

    def append(self, lock: DynamicDependencyLock) -> None:
        """Register ``lock`` in the container."""
        self.locks.append(lock)

    def clear(self) -> None:
        """Forget every registered lock (they are NOT released)."""
        del self.locks[:]

    def _acquire(self) -> None:
        """Acquire each registered lock, in order."""
        for member in self.locks:
            member.acquire()

    def _release(self) -> None:
        """Release each registered lock, in order."""
        count = len(self.locks)
        logger.debug("Releasing %d dynamic dependency locks", count)
        for member in self.locks:
            logger.debug("[locks] Releasing %s", member)
            member.release()

    async def aio_job_before_start(self, job: Job) -> None:
        """Forward the "before start" notification to each lock."""
        for member in self.locks:
            await member.aio_job_before_start(job)

    async def aio_job_started(self, job: Job, process: Process) -> None:
        """Forward the "job started" notification to each lock."""
        for member in self.locks:
            await member.aio_job_started(job, process)

    async def aio_job_finished(self, job: Job) -> None:
        """Forward the "job finished" notification to each lock."""
        for member in self.locks:
            await member.aio_job_finished(job)

    def to_json(self) -> list[dict]:
        """Serialize every lock (one dict per lock) for the job process."""
        serialized = []
        for member in self.locks:
            serialized.append(member.to_json())
        return serialized
265
+
266
+
267
class JobDependencyLock:
    """Lock held by job process.

    This is the job-process-side counterpart of DynamicDependencyLock.
    Created via from_json(), then acquire() is called when entering context.

    The scheduler creates the lock file before starting the job. The job process
    verifies the lock file exists on acquire() and deletes it on release().

    Subclasses should set lock_file_path in __init__ from JSON data.
    """

    #: Path to the lock file to delete on release (set from JSON data)
    lock_file_path: Optional[Path] = None

    def verify_lock_file(self) -> None:
        """Verify the lock file exists.

        If lock_file_path is None, this is a no-op.

        Raises:
            LockError: If lock file is missing
        """
        if self.lock_file_path is not None and not self.lock_file_path.is_file():
            raise LockError(f"Lock file missing: {self.lock_file_path}")

    def acquire(self) -> None:
        """Acquire the lock. Called when entering context.

        Verifies that the scheduler created the lock file.
        """
        self.verify_lock_file()

    def release(self) -> None:
        """Release the lock and delete the lock file.

        Called when exiting context. This is a no-op when no lock file path
        is set or when the file is already gone.
        """
        path = self.lock_file_path
        if path is None or not path.is_file():
            return

        logger.debug("Deleting lock file: %s", path)
        try:
            path.unlink()
        except FileNotFoundError:
            # Another watcher (e.g. DynamicLockFile.watch) may remove the same
            # file between the is_file() check and unlink(): the lock is
            # released either way, so this is not an error.
            pass

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, *args):
        self.release()
315
+
316
+
317
+ class _JobDependencyLocksContext:
318
+ """Context manager for acquiring/releasing job dependency locks."""
319
+
320
+ def __init__(self, locks: list[JobDependencyLock]):
321
+ self._locks = locks
322
+ self._acquired: list[JobDependencyLock] = []
323
+
324
+ def __enter__(self):
325
+ for lock in self._locks:
326
+ lock.acquire()
327
+ self._acquired.append(lock)
328
+ return self
329
+
330
+ def __exit__(self, *args):
331
+ for lock in reversed(self._acquired):
332
+ try:
333
+ lock.release()
334
+ except Exception:
335
+ logger.exception("Error releasing lock %s", lock)
336
+
337
+
338
class JobDependencyLocks:
    """Collection of the locks held inside the job process.

    Stores JobDependencyLock instances; dependency_locks() gives a context
    manager acquiring them on enter and releasing them on exit.
    """

    def __init__(self):
        self.locks: list[JobDependencyLock] = []

    def dependency_locks(self) -> _JobDependencyLocksContext:
        """Return a context manager that acquires locks on enter, releases on exit."""
        context = _JobDependencyLocksContext(self.locks)
        return context

    @classmethod
    def from_json(cls, locks_data: list[dict]) -> JobDependencyLocks:
        """Build the collection from serialized lock entries.

        Every entry must provide the 'module' and 'class' keys naming the
        DynamicDependencyLock subclass whose ``from_json`` rebuilds the lock.
        Entries lacking one of these keys, or whose class cannot be imported
        or resolved, are skipped with a warning.
        """
        from importlib import import_module

        container = cls()
        for entry in locks_data:
            module_name = entry.get("module")
            class_name = entry.get("class")

            if module_name is None or class_name is None:
                logger.warning("Lock data missing 'module' or 'class': %s", entry)
                continue

            try:
                lock_class = getattr(import_module(module_name), class_name)
                container.locks.append(lock_class.from_json(entry))
            except (ImportError, AttributeError) as error:
                logger.warning(
                    "Failed to load lock class %s.%s: %s",
                    module_name,
                    class_name,
                    error,
                )

        return container
385
+
386
+
387
+ # --- Generalized dynamic lock file and resource tracking ---
388
+
389
+
390
class DynamicLockFile(ABC):
    """Base class for files that track who holds a dynamic lock.

    Each lock file stores JSON with:
    - job_uri: Reference to the job holding the lock
    - information: Type-specific data

    Subclasses override from_information() and to_information() to
    handle type-specific data in the "information" field.
    """

    path: Path
    job_uri: Optional[str]

    def __init__(self, path: Path):
        """Load lock file from disk.

        Reading is attempted up to 5 times (0.1s apart) since the file may
        be read while it is still being written. A missing file is not an
        error: the instance is then left with ``job_uri`` set to None.

        Args:
            path: Path to the lock file

        Raises:
            Exception: the last read/parse error once all attempts failed
        """
        self.path = path
        self.job_uri = None

        last_error = None
        for _attempt in range(5):
            try:
                with path.open("rt") as fp:
                    data = json.load(fp)
                self.job_uri = data.get("job_uri")
                self.from_information(data.get("information"))
                return  # Success
            except FileNotFoundError:
                # File was deleted between check and read
                return
            except Exception as e:
                last_error = e
                logger.exception("Error while reading %s", self.path)
                time.sleep(0.1)

        # Exhausted retries - re-raise the last error
        if last_error is not None:
            raise last_error

    @classmethod
    def create(cls, path: Path, job_uri: str, information=None) -> "DynamicLockFile":
        """Create a new lock file on disk.

        The instance is built without calling ``__init__`` (nothing is read
        from disk); ``information`` goes through from_information() and the
        value written is the one returned by to_information().

        Args:
            path: Path where to create the file
            job_uri: URI of the job holding the lock
            information: Type-specific data for the lock file

        Returns:
            New lock file instance
        """
        self = object.__new__(cls)
        self.path = path
        self.job_uri = job_uri
        self.from_information(information)

        logger.debug("Writing lock file %s", path)
        data = {"job_uri": job_uri, "information": self.to_information()}
        with path.open("wt") as fp:
            json.dump(data, fp)
        return self

    def delete(self) -> None:
        """Delete the lock file from disk (no-op if it does not exist)."""
        if self.path.is_file():
            logger.debug("Deleting lock file %s", self.path)
            try:
                self.path.unlink()
            except FileNotFoundError:
                # Several watchers may delete the same file: losing the race
                # between is_file() and unlink() is fine, the file is gone.
                pass

    @staticmethod
    def _read_pid_definition(pidpath: Path) -> Optional[str]:
        """Return the content of the PID file, or None if there is no PID file.

        Waits (polling every 10ms) while the file exists but is still empty.
        """
        if not pidpath.is_file():
            return None

        while True:
            try:
                content = pidpath.read_text()
            except FileNotFoundError:
                # The PID file vanished while we were waiting for its content
                return None
            if content != "":
                return content
            # Avoid spinning at 100% CPU while the PID file is being written
            time.sleep(0.01)

    def watch(self, on_released: Optional[Callable[[], None]] = None) -> None:
        """Watch the job process and call callback when it finishes.

        This starts a background thread that:
        1. Waits for the job lock to be available (job started)
        2. Waits for the process to finish
        3. Deletes the lock file
        4. Calls the callback (if provided)

        Nothing is started when ``job_uri`` is None.

        Args:
            on_released: Optional callback to invoke when lock is released
        """
        if self.job_uri is None:
            return

        logger.debug("Watching process for %s (%s)", self.path, self.job_uri)
        job_path = Path(self.job_uri)
        lockpath = job_path.with_suffix(".lock")
        pidpath = job_path.with_suffix(".pid")

        def run():
            logger.debug("Locking job lock path %s", lockpath)
            process = None

            # Acquire the job lock - blocks if scheduler is still starting the job
            # Once we get the lock, the job has either started or finished
            with fasteners.InterProcessLock(lockpath):
                definition = self._read_pid_definition(pidpath)
                if definition is None:
                    logger.debug("Job already finished (no PID file %s)", pidpath)
                else:
                    logger.info("Loading job watcher from definition")
                    from experimaestro.connectors import Process
                    from experimaestro.connectors.local import LocalConnector

                    connector = LocalConnector.instance()
                    process = Process.fromDefinition(connector, json.loads(definition))

            # Wait out of the lock
            if process is not None:
                process.wait()

            self.delete()
            if on_released is not None:
                on_released()

        threading.Thread(target=run).start()

    def from_information(self, info) -> None:
        """Set type-specific data from the "information" field.

        Override in subclasses to handle extra data.

        Args:
            info: The "information" value from the JSON file
        """
        pass

    def to_information(self):
        """Get type-specific data for the "information" field.

        Override in subclasses to include extra data.

        Returns:
            Value to store in the "information" field (JSON-serializable)
        """
        return None
535
+
536
+
537
class _TrackedResourceProxy(FileSystemEventHandler):
    """File system event handler relaying events through a weak reference.

    Only a weak reference to the resource is stored, so the watcher holding
    this proxy does not keep the resource alive. Events received after the
    resource has been collected are ignored.
    """

    def __init__(self, resource: "TrackedDynamicResource"):
        self._resource_ref = weakref.ref(resource)

    def _forward(self, handler_name: str, event):
        """Call ``handler_name`` on the resource if it is still alive."""
        target = self._resource_ref()
        if target is None:
            return None
        return getattr(target, handler_name)(event)

    def on_modified(self, event):
        return self._forward("on_modified", event)

    def on_deleted(self, event):
        return self._forward("on_deleted", event)

    def on_created(self, event):
        return self._forward("on_created", event)
560
+
561
+
562
+ class TrackedDynamicResource(DynamicResource, ABC):
563
+ """Base class for resources with file-based lock tracking.
564
+
565
+ Inherits from DynamicResource to provide async_wait() via ResourcePoller.
566
+
567
+ This provides:
568
+ - File system watching for lock files
569
+ - IPC and thread locking
570
+ - Condition variable for waiting on availability
571
+ - Cache of lock files
572
+ - Async waiting via ResourcePoller
573
+
574
+ File structure:
575
+ - {lock_folder}/informations.json: Resource-level info (e.g., token counts)
576
+ - {lock_folder}/ipc.lock: IPC lock for inter-process coordination
577
+ - {lock_folder}/jobs/{task_specific_path}.json: Per-job lock files
578
+
579
+ Subclasses must implement:
580
+ - lock_folder: Path to the lock folder (abstract property)
581
+ - lock_file_class: The DynamicLockFile subclass to use
582
+ - is_available(): Check if resource is available for a dependency
583
+ - _do_acquire(): Perform acquire logic
584
+ - _do_release(): Perform release logic
585
+ """
586
+
587
+ #: Subclass of DynamicLockFile to use for lock files
588
+ lock_file_class: type[DynamicLockFile]
589
+
590
+ @property
591
+ @abstractmethod
592
+ def lock_folder(self) -> Path:
593
+ """Path to the lock folder. Must be implemented by subclasses."""
594
+ ...
595
+
596
+ @property
597
+ def informations_path(self) -> Path:
598
+ """Path to the informations.json file."""
599
+ return self.lock_folder / "informations.json"
600
+
601
+ @property
602
+ def ipc_lock_path(self) -> Path:
603
+ """Path to the IPC lock file."""
604
+ return self.lock_folder / "ipc.lock"
605
+
606
+ @property
607
+ def jobs_folder(self) -> Path:
608
+ """Path to the jobs folder containing per-job lock files."""
609
+ return self.lock_folder / "jobs"
610
+
611
+ def __init__(self, name: str):
612
+ """Initialize the resource.
613
+
614
+ Args:
615
+ name: Human-readable name for the resource
616
+ """
617
+ self.name = name
618
+ self.lock_folder.mkdir(exist_ok=True, parents=True)
619
+
620
+ self.cache: dict[str, DynamicLockFile] = {}
621
+
622
+ self.ipc_lock = fasteners.InterProcessLock(self.ipc_lock_path)
623
+ self.lock = threading.Lock()
624
+ self.available_condition = threading.Condition(self.lock)
625
+
626
+ self.timestamp = os.path.getmtime(self.lock_folder)
627
+
628
+ # Initial state update
629
+ with self.lock, self.ipc_lock:
630
+ self._update()
631
+
632
+ # Set up file system watching
633
+ from .ipc import ipcom
634
+
635
+ self.watchedpath = str(self.lock_folder.absolute())
636
+ self.proxy = _TrackedResourceProxy(self)
637
+ self.watcher = ipcom().fswatch(self.proxy, self.lock_folder, recursive=True)
638
+ logger.debug("Watching %s", self.watchedpath)
639
+
640
+ def __del__(self):
641
+ if self.watcher is not None:
642
+ logging.debug("Removing watcher on %s", self.watchedpath)
643
+ from .ipc import ipcom
644
+
645
+ ipcom().fsunwatch(self.watcher)
646
+ self.watcher = None
647
+
648
+ def refresh_state(self) -> None:
649
+ """Refresh state from disk.
650
+
651
+ This is a fallback for when file system notifications are missed.
652
+ Called by ResourcePoller periodically.
653
+ """
654
+ with self.lock, self.ipc_lock:
655
+ self._update()
656
+ self.available_condition.notify_all()
657
+
658
+ async def async_wait(self, timeout: float = 0) -> bool:
659
+ """Wait asynchronously until the resource state may have changed.
660
+
661
+ Uses ResourcePoller for efficient polling across all resources.
662
+
663
+ Args:
664
+ timeout: Maximum time to wait in seconds (0 = wait indefinitely)
665
+
666
+ Returns:
667
+ True if notified of a change, False if timed out
668
+ """
669
+ from experimaestro.dynamic import ResourcePoller
670
+
671
+ loop = asyncio.get_running_loop()
672
+ poller = ResourcePoller.instance()
673
+
674
+ event = poller.register(self, loop, timeout)
675
+
676
+ try:
677
+ if timeout > 0:
678
+ try:
679
+ await asyncio.wait_for(event.wait(), timeout=timeout)
680
+ return True
681
+ except asyncio.TimeoutError:
682
+ return False
683
+ else:
684
+ await event.wait()
685
+ return True
686
+ finally:
687
+ # Event cleanup is handled by poller
688
+ pass
689
+
690
+ def _lock_file_key(self, path: Path) -> str:
691
+ """Get the cache key for a lock file path.
692
+
693
+ The key is the relative path from jobs_folder (e.g., "task_id@identifier.json").
694
+ """
695
+ return str(path.relative_to(self.jobs_folder))
696
+
697
+ def _update(self) -> None:
698
+ """Update state by reading all lock files from disk.
699
+
700
+ Assumes IPC lock is held.
701
+ """
702
+ logging.debug("Full resource state update for %s", self.name)
703
+ old_cache = self.cache
704
+ self.cache = {}
705
+
706
+ self._reset_state()
707
+
708
+ if self.jobs_folder.exists():
709
+ for path in self.jobs_folder.glob("*.json"):
710
+ key = self._lock_file_key(path)
711
+ lf = old_cache.get(key)
712
+ if lf is None:
713
+ lf = self.lock_file_class(path)
714
+ lf.watch(lambda k=key: self._on_lock_released(k))
715
+ logging.debug("Read lock file %s", path)
716
+ else:
717
+ logging.debug("Lock file already in cache %s", key)
718
+
719
+ self.cache[key] = lf
720
+ self._account_lock_file(lf)
721
+
722
+ logging.debug("Full resource state update finished for %s", self.name)
723
+
724
+ def _on_lock_released(self, name: str) -> None:
725
+ """Called when a watched lock is released (job finished).
726
+
727
+ Args:
728
+ name: Name of the lock file
729
+ """
730
+ with self.lock:
731
+ if name in self.cache:
732
+ logging.debug("Lock released (job finished): %s", name)
733
+ lf = self.cache[name]
734
+ del self.cache[name]
735
+ self._unaccount_lock_file(lf)
736
+ self.available_condition.notify_all()
737
+ self._notify_poller()
738
+
739
+ def _is_job_lock_file(self, path: Path) -> bool:
740
+ """Check if path is a job lock file (under jobs_folder)."""
741
+ try:
742
+ path.relative_to(self.jobs_folder)
743
+ return path.suffix == ".json"
744
+ except ValueError:
745
+ return False
746
+
747
+ def _notify_poller(self) -> None:
748
+ """Notify the ResourcePoller that state has changed.
749
+
750
+ Called after file system events to wake up async waiters.
751
+ """
752
+ from experimaestro.dynamic import ResourcePoller
753
+
754
+ if ResourcePoller._instance is not None:
755
+ ResourcePoller._instance.notify(self)
756
+
757
+ def on_deleted(self, event) -> None:
758
+ """Handle file deletion event."""
759
+ logger.debug(
760
+ "Deleted path notification %s [watched %s]",
761
+ event.src_path,
762
+ self.watchedpath,
763
+ )
764
+ path = Path(event.src_path)
765
+ if not self._is_job_lock_file(path):
766
+ return
767
+
768
+ key = self._lock_file_key(path)
769
+ if key in self.cache:
770
+ with self.lock:
771
+ if key in self.cache:
772
+ logging.debug("Deleting %s from cache (event)", key)
773
+ lf = self.cache[key]
774
+ del self.cache[key]
775
+ self._unaccount_lock_file(lf)
776
+ self.available_condition.notify_all()
777
+ self._notify_poller()
778
+
779
def on_created(self, event) -> None:
    """Handle a file creation event.

    A newly created job lock file that is not cached yet is loaded,
    watched (so that ``_on_lock_released`` runs on release), cached and
    accounted for. Other paths are ignored.

    Args:
        event: File system event; only ``event.src_path`` is read
    """
    logger.debug(
        "Created path notification %s [watched %s]",
        event.src_path,
        self.watchedpath,
    )
    created = Path(event.src_path)
    if not self._is_job_lock_file(created):
        return

    try:
        cache_key = self._lock_file_key(created)
        if cache_key in self.cache:
            return

        with self.lock:
            # Re-check now that the lock is held
            if cache_key in self.cache:
                return

            lock_file = self.lock_file_class(created)
            lock_file.watch(lambda k=cache_key: self._on_lock_released(k))
            self.cache[cache_key] = lock_file
            self._account_lock_file(lock_file)
    except FileNotFoundError:
        # The file vanished before it could be read: nothing to track
        pass
    except Exception:
        logger.exception("Uncaught exception in on_created handler")
        raise
804
+
805
def on_modified(self, event) -> None:
    """Handle a file modification event.

    A change of ``self.informations_path`` is delegated to
    ``_on_information_modified()``. A modified job lock file that is not
    cached yet is loaded, watched, cached and accounted for. Anything else
    is ignored. Unexpected exceptions are logged, then re-raised.

    Args:
        event: File system event; only ``event.src_path`` is read
    """
    try:
        logger.debug(
            "on modified path: %s [watched %s]",
            event.src_path,
            self.watchedpath,
        )
        modified = Path(event.src_path)

        # informations.json has its own handler
        if event.src_path == str(self.informations_path):
            self._on_information_modified()
            return

        # From here on, only job lock files matter
        if not self._is_job_lock_file(modified):
            return

        cache_key = self._lock_file_key(modified)
        if cache_key in self.cache:
            return

        with self.lock:
            # Re-check now that the lock is held
            if cache_key in self.cache:
                return

            logger.debug("Lock file not in cache %s", cache_key)
            try:
                lock_file = self.lock_file_class(modified)
                lock_file.watch(lambda k=cache_key: self._on_lock_released(k))
                self.cache[cache_key] = lock_file
                self._account_lock_file(lock_file)
            except FileNotFoundError:
                # The file vanished before it could be read
                pass
    except Exception:
        logger.exception("Uncaught exception in on_modified handler")
        raise
839
+
840
def _on_information_modified(self) -> None:
    """React to a modification of informations.json.

    The file's modification time is compared with ``self.timestamp``;
    ``_handle_information_change()`` only runs when the file is strictly
    newer, which filters out duplicate notifications.
    """
    import os

    logger.debug("Resource information modified: %s", self.name)
    with self.lock:
        mtime = os.path.getmtime(self.informations_path)
        if mtime > self.timestamp:
            self._handle_information_change()
        else:
            logger.debug(
                "Not reading information file [%f <= %f]",
                mtime,
                self.timestamp,
            )
860
+
861
def _handle_information_change(self) -> None:
    """Hook invoked when informations.json has changed.

    ``_on_information_modified()`` calls it once the timestamp check has
    passed. The base implementation is a no-op; subclasses override it to
    apply resource-specific changes.
    """
    return None
868
+
869
@abstractmethod
def _reset_state(self) -> None:
    """Put the resource state back to its initial value.

    Meant to run at the beginning of ``_update()``, before the lock files
    are read again.
    """
    ...
876
+
877
@abstractmethod
def _account_lock_file(self, lf: DynamicLockFile) -> None:
    """Take a lock file into account in the resource state.

    Invoked for lock files that have just been read or created (see
    ``on_created`` and ``on_modified``).

    Args:
        lf: The lock file to add to the accounting
    """
    ...
887
+
888
@abstractmethod
def _unaccount_lock_file(self, lf: DynamicLockFile) -> None:
    """Withdraw a lock file from the resource state accounting.

    Invoked once a lock file has been removed from the cache (see
    ``on_deleted`` and ``_on_lock_released``).

    Args:
        lf: The lock file to remove from the accounting
    """
    ...
898
+
899
@abstractmethod
def is_available(self, dependency: "DynamicDependency") -> bool:
    """Tell whether the resource can currently serve a dependency.

    ``acquire()`` relies on this check and raises ``LockError`` when it
    fails.

    Args:
        dependency: The dependency asking for the resource

    Returns:
        True when the resource is available for this dependency
    """
    ...
910
+
911
@abstractmethod
def _do_acquire(self, dependency: "DynamicDependency") -> None:
    """Apply the resource-specific part of an acquisition.

    ``acquire()`` calls it after the availability check has succeeded and
    the lock file has been created and cached.

    Args:
        dependency: The dependency that acquires the resource
    """
    ...
921
+
922
@abstractmethod
def _do_release(self, dependency: "DynamicDependency") -> None:
    """Apply the resource-specific part of a release.

    ``release()`` calls it after the cache entry has been removed and
    before the lock file is deleted.

    Args:
        dependency: The dependency that releases the resource
    """
    ...
932
+
933
def _get_job_lock_path(self, dependency: "DynamicDependency") -> Path:
    """Compute where the lock file of a dependency is stored.

    The path is ``self.jobs_folder`` joined with the relative path that
    ``get_job_lock_relpath`` derives from the target job's task id and
    identifier.

    Args:
        dependency: The dependency whose target job owns the lock file

    Returns:
        Path of the lock file under ``jobs_folder``
    """
    target_job = dependency.target
    relative = get_job_lock_relpath(target_job.task_id, target_job.identifier)
    return self.jobs_folder / relative
940
+
941
def acquire(self, dependency: "DynamicDependency") -> None:
    """Take the resource on behalf of a dependency.

    With both ``self.lock`` and ``self.ipc_lock`` held, the state is
    refreshed, availability is checked, a lock file is created and cached,
    and ``_do_acquire()`` is run.

    Args:
        dependency: The dependency asking for the resource

    Raises:
        LockError: If the resource is not available
    """
    with self.lock, self.ipc_lock:
        self._update()
        if not self.is_available(dependency):
            raise LockError(f"Resource {self.name} not available")

        # Make sure the parent folder exists before writing the lock file
        path = self._get_job_lock_path(dependency)
        path.parent.mkdir(parents=True, exist_ok=True)
        key = self._lock_file_key(path)

        # Create the lock file and remember it under its cache key
        self.cache[key] = self.lock_file_class.create(
            path,
            self._get_job_uri(dependency),
            information=self._get_lock_file_information(dependency),
        )

        self._do_acquire(dependency)

        logger.debug("Acquired %s for %s", self.name, dependency)
970
+
971
def release(self, dependency: "DynamicDependency") -> None:
    """Give the resource back for a dependency.

    With both ``self.lock`` and ``self.ipc_lock`` held, the state is
    refreshed; if the dependency's lock file is cached, the entry is
    removed, ``_do_release()`` runs, ``available_condition`` is signalled
    and the lock file is deleted. A missing entry is only logged.

    Args:
        dependency: The dependency giving the resource back
    """
    with self.lock, self.ipc_lock:
        self._update()

        key = self._lock_file_key(self._get_job_lock_path(dependency))
        lock_file = self.cache.get(key)
        if lock_file is not None:
            logger.debug("Deleting %s from cache", key)
            del self.cache[key]

            self._do_release(dependency)

            self.available_condition.notify_all()
            lock_file.delete()
        else:
            # Nothing cached: the lock may already have been released
            # (e.g., the job completed)
            logger.debug(
                "Lock file not in cache for %s (%s) - may have been released already",
                dependency,
                key,
            )
999
+
1000
def _get_job_uri(self, dependency: "DynamicDependency") -> str:
    """Build the job URI recorded for a dependency.

    This base version returns the string form of the target job's
    ``basepath``.

    Args:
        dependency: The dependency whose target job is described

    Returns:
        The job URI as a string
    """
    job = dependency.target
    return str(job.basepath)
1012
+
1013
def _get_lock_file_information(self, dependency: "DynamicDependency"):
    """Provide the payload stored in the lock file of a dependency.

    ``acquire()`` passes the result as ``information`` when it creates the
    lock file. This base version stores nothing; subclasses override it to
    record type-specific data.

    Args:
        dependency: The dependency the lock file is created for

    Returns:
        JSON-serializable information for the lock file (None here)
    """
    return None