experimaestro 2.0.0a8__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

Files changed (122)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +278 -7
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/refactor.py +249 -0
  7. experimaestro/click.py +0 -1
  8. experimaestro/commandline.py +19 -3
  9. experimaestro/connectors/__init__.py +20 -1
  10. experimaestro/connectors/local.py +12 -0
  11. experimaestro/core/arguments.py +182 -46
  12. experimaestro/core/identifier.py +107 -6
  13. experimaestro/core/objects/__init__.py +6 -0
  14. experimaestro/core/objects/config.py +542 -25
  15. experimaestro/core/objects/config_walk.py +20 -0
  16. experimaestro/core/serialization.py +91 -34
  17. experimaestro/core/subparameters.py +164 -0
  18. experimaestro/core/types.py +175 -38
  19. experimaestro/exceptions.py +26 -0
  20. experimaestro/experiments/cli.py +111 -25
  21. experimaestro/generators.py +50 -9
  22. experimaestro/huggingface.py +3 -1
  23. experimaestro/launcherfinder/parser.py +29 -0
  24. experimaestro/launchers/__init__.py +26 -1
  25. experimaestro/launchers/direct.py +12 -0
  26. experimaestro/launchers/slurm/base.py +154 -2
  27. experimaestro/mkdocs/metaloader.py +0 -1
  28. experimaestro/mypy.py +452 -7
  29. experimaestro/notifications.py +63 -13
  30. experimaestro/progress.py +0 -2
  31. experimaestro/rpyc.py +0 -1
  32. experimaestro/run.py +19 -6
  33. experimaestro/scheduler/base.py +510 -125
  34. experimaestro/scheduler/dependencies.py +43 -28
  35. experimaestro/scheduler/dynamic_outputs.py +259 -130
  36. experimaestro/scheduler/experiment.py +256 -31
  37. experimaestro/scheduler/interfaces.py +501 -0
  38. experimaestro/scheduler/jobs.py +216 -206
  39. experimaestro/scheduler/remote/__init__.py +31 -0
  40. experimaestro/scheduler/remote/client.py +874 -0
  41. experimaestro/scheduler/remote/protocol.py +467 -0
  42. experimaestro/scheduler/remote/server.py +423 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +323 -23
  45. experimaestro/scheduler/state_db.py +437 -0
  46. experimaestro/scheduler/state_provider.py +2766 -0
  47. experimaestro/scheduler/state_sync.py +891 -0
  48. experimaestro/scheduler/workspace.py +52 -10
  49. experimaestro/scriptbuilder.py +7 -0
  50. experimaestro/server/__init__.py +147 -57
  51. experimaestro/server/data/index.css +0 -125
  52. experimaestro/server/data/index.css.map +1 -1
  53. experimaestro/server/data/index.js +194 -58
  54. experimaestro/server/data/index.js.map +1 -1
  55. experimaestro/settings.py +44 -5
  56. experimaestro/sphinx/__init__.py +3 -3
  57. experimaestro/taskglobals.py +20 -0
  58. experimaestro/tests/conftest.py +80 -0
  59. experimaestro/tests/core/test_generics.py +2 -2
  60. experimaestro/tests/identifier_stability.json +45 -0
  61. experimaestro/tests/launchers/bin/sacct +6 -2
  62. experimaestro/tests/launchers/bin/sbatch +4 -2
  63. experimaestro/tests/launchers/test_slurm.py +80 -0
  64. experimaestro/tests/tasks/test_dynamic.py +231 -0
  65. experimaestro/tests/test_cli_jobs.py +615 -0
  66. experimaestro/tests/test_deprecated.py +630 -0
  67. experimaestro/tests/test_environment.py +200 -0
  68. experimaestro/tests/test_file_progress_integration.py +1 -1
  69. experimaestro/tests/test_forward.py +3 -3
  70. experimaestro/tests/test_identifier.py +372 -41
  71. experimaestro/tests/test_identifier_stability.py +458 -0
  72. experimaestro/tests/test_instance.py +3 -3
  73. experimaestro/tests/test_multitoken.py +442 -0
  74. experimaestro/tests/test_mypy.py +433 -0
  75. experimaestro/tests/test_objects.py +312 -5
  76. experimaestro/tests/test_outputs.py +2 -2
  77. experimaestro/tests/test_param.py +8 -12
  78. experimaestro/tests/test_partial_paths.py +231 -0
  79. experimaestro/tests/test_progress.py +0 -48
  80. experimaestro/tests/test_remote_state.py +671 -0
  81. experimaestro/tests/test_resumable_task.py +480 -0
  82. experimaestro/tests/test_serializers.py +141 -1
  83. experimaestro/tests/test_state_db.py +434 -0
  84. experimaestro/tests/test_subparameters.py +160 -0
  85. experimaestro/tests/test_tags.py +136 -0
  86. experimaestro/tests/test_tasks.py +107 -121
  87. experimaestro/tests/test_token_locking.py +252 -0
  88. experimaestro/tests/test_tokens.py +17 -13
  89. experimaestro/tests/test_types.py +123 -1
  90. experimaestro/tests/test_workspace_triggers.py +158 -0
  91. experimaestro/tests/token_reschedule.py +4 -2
  92. experimaestro/tests/utils.py +2 -2
  93. experimaestro/tokens.py +154 -57
  94. experimaestro/tools/diff.py +1 -1
  95. experimaestro/tui/__init__.py +8 -0
  96. experimaestro/tui/app.py +2395 -0
  97. experimaestro/tui/app.tcss +353 -0
  98. experimaestro/tui/log_viewer.py +228 -0
  99. experimaestro/utils/__init__.py +23 -0
  100. experimaestro/utils/environment.py +148 -0
  101. experimaestro/utils/git.py +129 -0
  102. experimaestro/utils/resources.py +1 -1
  103. experimaestro/version.py +34 -0
  104. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +68 -38
  105. experimaestro-2.0.0b8.dist-info/RECORD +187 -0
  106. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +1 -1
  107. experimaestro-2.0.0b8.dist-info/entry_points.txt +16 -0
  108. experimaestro/compat.py +0 -6
  109. experimaestro/core/objects.pyi +0 -221
  110. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  111. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  112. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  113. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  114. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  115. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  116. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  117. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  118. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  119. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  120. experimaestro-2.0.0a8.dist-info/RECORD +0 -166
  121. experimaestro-2.0.0a8.dist-info/entry_points.txt +0 -17
  122. {experimaestro-2.0.0a8.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,6 @@
1
1
  import asyncio
2
- import time
3
2
  from collections import ChainMap
4
- import enum
3
+ from datetime import datetime
5
4
  from functools import cached_property
6
5
  import itertools
7
6
  from pathlib import Path
@@ -13,60 +12,42 @@ from experimaestro.core.objects import Config, ConfigWalkContext, WatchedOutput
13
12
  from experimaestro.notifications import LevelInformation, Reporter
14
13
 
15
14
  # from experimaestro.scheduler.base import Scheduler
16
- from experimaestro.scheduler.dependencies import Dependency, DependencyStatus, Resource
15
+ from experimaestro.scheduler.dependencies import Dependency, Resource
17
16
  from experimaestro.scheduler.workspace import RunMode, Workspace
18
- from experimaestro.locking import Lock, LockError, Locks
17
+ from experimaestro.scheduler.interfaces import (
18
+ BaseJob,
19
+ JobState,
20
+ JobStateUnscheduled,
21
+ JobStateWaiting,
22
+ JobStateReady,
23
+ JobStateScheduled,
24
+ JobStateRunning,
25
+ JobStateDone,
26
+ JobStateError,
27
+ JobFailureStatus,
28
+ )
29
+ from experimaestro.locking import Lock
19
30
  from experimaestro.utils import logger
20
31
 
21
32
  if TYPE_CHECKING:
22
33
  from experimaestro.connectors import Process
23
34
  from experimaestro.launchers import Launcher
35
+ from experimaestro.scheduler.experiment import experiment
24
36
 
25
37
 
26
- class JobState(enum.Enum):
27
- # Job is not yet scheduled
28
- UNSCHEDULED = 0
29
-
30
- # Job is waiting for dependencies to be done
31
- WAITING = 1
32
-
33
- # Job is ready to run
34
- READY = 2
35
-
36
- # Job is scheduled (e.g. slurm)
37
- SCHEDULED = 3
38
-
39
- # Job is running
40
- RUNNING = 4
41
-
42
- # Job is done (finished)
43
- DONE = 5
44
-
45
- # Job failed (finished)
46
- ERROR = 6
47
-
48
- def notstarted(self):
49
- return self.value <= JobState.READY.value
50
-
51
- def running(self):
52
- return (
53
- self.value == JobState.RUNNING.value
54
- or self.value == JobState.SCHEDULED.value
55
- )
56
-
57
- def finished(self):
58
- return self.value >= JobState.DONE.value
59
-
60
-
61
- class JobFailureStatus(enum.Enum):
62
- #: Job failed
63
- DEPENDENCY = 0
64
-
65
- #: Job dependency failed
66
- FAILED = 1
67
-
68
- #: Memory
69
- MEMORY = 2
38
+ # Re-export JobState for backward compatibility
39
+ __all__ = [
40
+ "JobState",
41
+ "JobStateUnscheduled",
42
+ "JobStateWaiting",
43
+ "JobStateReady",
44
+ "JobStateScheduled",
45
+ "JobStateRunning",
46
+ "JobStateDone",
47
+ "JobStateError",
48
+ "JobFailureStatus",
49
+ "Job",
50
+ ]
70
51
 
71
52
 
72
53
  class JobLock(Lock):
@@ -85,22 +66,43 @@ class JobDependency(Dependency):
85
66
  def __init__(self, job):
86
67
  super().__init__(job)
87
68
 
88
- def status(self) -> DependencyStatus:
89
- if self.origin.state == JobState.DONE:
90
- return DependencyStatus.OK
91
- elif self.origin.state == JobState.ERROR:
92
- return DependencyStatus.FAIL
93
- return DependencyStatus.WAIT
69
+ async def aio_lock(self, timeout: float = 0):
70
+ """Acquire lock on job dependency by waiting for job to complete
94
71
 
95
- def lock(self):
96
- return JobLock(self.origin)
72
+ Args:
73
+ timeout: Must be 0 (wait indefinitely) for job dependencies
97
74
 
75
+ Raises:
76
+ ValueError: If timeout is not 0
77
+ RuntimeError: If the job has not been submitted or if it failed
78
+ """
79
+ if timeout != 0:
80
+ raise ValueError(
81
+ "Job dependencies only support timeout=0 (wait indefinitely)"
82
+ )
98
83
 
99
- class Job(Resource):
84
+ # Wait for the job to finish
85
+ if self.origin._future is None:
86
+ raise RuntimeError(f"Job {self.origin} has no future - not submitted")
87
+ await asyncio.wrap_future(self.origin._future)
88
+
89
+ # Check if the job succeeded
90
+ if self.origin.state != JobState.DONE:
91
+ raise RuntimeError(
92
+ f"Dependency job {self.origin.identifier} failed with state "
93
+ f"{self.origin.state} for {self.target.identifier}"
94
+ )
95
+
96
+ # Job succeeded, acquire and return the lock
97
+ lock = JobLock(self.origin)
98
+ lock.acquire()
99
+ return lock
100
+
101
+
102
+ class Job(BaseJob, Resource):
100
103
  """A job is a resource that is produced by the execution of some code"""
101
104
 
102
105
  # Set by the scheduler
103
- _readyEvent: Optional[asyncio.Event]
104
106
  _future: Optional["concurrent.futures.Future"]
105
107
 
106
108
  def __init__(
@@ -110,6 +112,7 @@ class Job(Resource):
110
112
  workspace: Workspace = None,
111
113
  launcher: "Launcher" = None,
112
114
  run_mode: RunMode = RunMode.NORMAL,
115
+ max_retries: Optional[int] = None,
113
116
  ):
114
117
  from experimaestro.scheduler.base import Scheduler
115
118
 
@@ -128,47 +131,74 @@ class Job(Resource):
128
131
  self.name = str(self.type.identifier).rsplit(".", 1)[-1]
129
132
 
130
133
  self.scheduler: Optional["Scheduler"] = None
134
+ self.experiments: List["experiment"] = [] # Experiments this job belongs to
131
135
  self.config = config
132
136
  self.state: JobState = JobState.UNSCHEDULED
133
137
 
134
- #: If a job has failed, indicates the failure status
135
- self.failure_status: JobFailureStatus = None
136
-
137
138
  # Dependencies
138
139
  self.dependencies: Set[Dependency] = set() # as target
139
140
 
140
- # Watched outputs
141
- self.watched_outputs = {}
142
- for watched in config.__xpm__.watched_outputs:
143
- self.watch_output(watched)
141
+ # Check if this is a resumable task
142
+ from experimaestro.core.objects import ResumableTask
143
+
144
+ self.resumable = isinstance(config, ResumableTask)
145
+
146
+ # Retry configuration for resumable tasks
147
+ # Use workspace setting if max_retries is not specified
148
+ if max_retries is None and self.workspace:
149
+ max_retries = self.workspace.workspace_settings.max_retries
150
+ self.max_retries = max_retries if max_retries is not None else 3
151
+ self.retry_count = 0
152
+
153
+ # Watched outputs (stored for deferred registration with scheduler)
154
+ self.watched_outputs: List["WatchedOutput"] = list(
155
+ config.__xpm__.watched_outputs
156
+ )
144
157
 
145
158
  # Process
146
159
  self._process = None
147
- self.unsatisfied = 0
148
160
 
149
161
  # Meta-information
150
162
  self.starttime: Optional[float] = None
151
163
  self.submittime: Optional[float] = None
152
164
  self.endtime: Optional[float] = None
165
+ self.exit_code: Optional[int] = None
153
166
  self._progress: List[LevelInformation] = []
154
167
  self.tags = config.tags()
155
168
 
156
169
  def watch_output(self, watched: "WatchedOutput"):
157
- """Monitor task outputs
170
+ """Add a watched output to this job.
158
171
 
159
172
  :param watched: A description of the watched output
160
173
  """
161
- self.scheduler.xp.watch_output(watched)
174
+ self.watched_outputs.append(watched)
175
+
176
+ def register_watched_outputs(self):
177
+ """Register all watched outputs with the scheduler.
178
+
179
+ This should be called after the job is submitted and has a scheduler.
180
+ """
181
+ from experimaestro.scheduler.experiment import experiment
162
182
 
163
- def task_output_update(self, subpath: Path):
164
- """Notification of an updated task output"""
165
- if watcher := self.watched_outputs.get(subpath, None):
166
- watcher.update()
183
+ xp = experiment.current()
184
+ for watched in self.watched_outputs:
185
+ # Set the job reference so the watcher knows where to look
186
+ watched.job = self
187
+ xp.watch_output(watched)
167
188
 
168
189
  def done_handler(self):
169
- """The task has been completed"""
170
- for watcher in self.watched_outputs.values():
171
- watcher.update()
190
+ """The task has been completed.
191
+
192
+ Ensures all remaining task output events are processed by explicitly
193
+ reading the task outputs file. This is necessary because file system
194
+ watchers may have latency, and we need to process all outputs before
195
+ the experiment can exit.
196
+ """
197
+ if not self.watched_outputs:
198
+ return
199
+
200
+ for xp in self.experiments:
201
+ xp.taskOutputsWorker.process_job_outputs(self)
172
202
 
173
203
  def __str__(self):
174
204
  return "Job[{}]".format(self.identifier)
@@ -177,6 +207,57 @@ class Job(Resource):
177
207
  assert self._future, "Cannot wait a not submitted job"
178
208
  return self._future.result()
179
209
 
210
+ def set_state(self, new_state: JobState):
211
+ """Set the job state and update experiment statistics
212
+
213
+ This method should be called instead of direct state assignment
214
+ to ensure experiment statistics (unfinishedJobs, failedJobs) are
215
+ properly updated.
216
+
217
+ :param new_state: The new job state
218
+ """
219
+ old_state = self.state
220
+ self.state = new_state
221
+
222
+ # Helper to determine if a state should be "counted" in unfinishedJobs
223
+ # A job is counted when it's been submitted and hasn't finished yet
224
+ def is_counted(state):
225
+ return state != JobState.UNSCHEDULED and not state.finished()
226
+
227
+ # Update experiment statistics based on state transition
228
+ for xp in self.experiments:
229
+ # Handle transitions in/out of "counted" state
230
+ if is_counted(new_state) and not is_counted(old_state):
231
+ # Job is now being tracked (new submission or resubmit)
232
+ xp.unfinishedJobs += 1
233
+ logger.debug(
234
+ "Job %s submitted, unfinished jobs for %s: %d",
235
+ self.identifier[:8],
236
+ xp.workdir.name,
237
+ xp.unfinishedJobs,
238
+ )
239
+ elif not is_counted(new_state) and is_counted(old_state):
240
+ # Job is no longer being tracked (finished)
241
+ xp.unfinishedJobs -= 1
242
+ logger.debug(
243
+ "Job %s finished, unfinished jobs for %s: %d",
244
+ self.identifier[:8],
245
+ xp.workdir.name,
246
+ xp.unfinishedJobs,
247
+ )
248
+
249
+ # Handle error state
250
+ if new_state.is_error() and not old_state.is_error():
251
+ xp.failedJobs[self.identifier] = self
252
+
253
+ # Handle recovery from error (e.g., resubmit)
254
+ if old_state.is_error() and not new_state.is_error():
255
+ xp.failedJobs.pop(self.identifier, None)
256
+
257
+ # Notify listeners via scheduler's thread-safe mechanism
258
+ if self.scheduler:
259
+ self.scheduler.notify_job_state(self)
260
+
180
261
  @cached_property
181
262
  def python_path(self) -> Iterator[str]:
182
263
  """Returns an iterator over python path"""
@@ -220,8 +301,8 @@ class Job(Resource):
220
301
  self._progress[-1].desc = desc
221
302
  self._progress[-1].progress = value
222
303
 
223
- for listener in self.scheduler.listeners:
224
- listener.job_state(self)
304
+ # Notify listeners via scheduler's thread-safe mechanism
305
+ self.scheduler.notify_job_state(self)
225
306
 
226
307
  def add_notification_server(self, server):
227
308
  """Adds a notification server"""
@@ -271,6 +352,16 @@ class Job(Resource):
271
352
  def identifier(self):
272
353
  return self.config.__xpm__.identifier.all.hex()
273
354
 
355
+ @property
356
+ def task_id(self) -> str:
357
+ """Task class identifier (for BaseJob interface)"""
358
+ return str(self.type.identifier)
359
+
360
+ @property
361
+ def locator(self) -> str:
362
+ """Full task locator (for BaseJob interface)"""
363
+ return self.identifier
364
+
274
365
  def prepare(self, overwrite=False):
275
366
  """Prepare all files before starting a task
276
367
 
@@ -278,113 +369,12 @@ class Job(Resource):
278
369
  """
279
370
  pass
280
371
 
281
- async def aio_start(self, sched_dependency_lock, notification_server=None):
282
- """Start the job with core job starting logic
283
-
284
- This method contains the core logic for starting a job that was previously
285
- located in Scheduler.aio_start(). It handles job locking, dependency
286
- acquisition, directory setup, and job execution while using the scheduler's
287
- coordination lock to prevent race conditions between multiple jobs.
288
-
289
- :param sched_dependency_lock: The scheduler's dependency lock for coordination
290
- between jobs to prevent race conditions during dependency acquisition
291
- :param notification_server: Optional notification server from the experiment
292
- for job progress reporting
293
- :return: JobState.DONE if job completed successfully, JobState.ERROR if job
294
- failed during execution, or None if dependencies couldn't be locked
295
- (signals WAITING state to scheduler)
296
- :raises Exception: Various exceptions during job execution, dependency locking,
297
- or process creation
298
- """
299
- # We first lock the job before proceeding
300
- assert self.launcher is not None
301
-
302
- with Locks() as locks:
303
- logger.debug("[starting] Locking job %s", self)
304
- async with self.launcher.connector.lock(self.lockpath):
305
- logger.debug("[starting] Locked job %s", self)
306
-
307
- state = None
308
- try:
309
- logger.debug(
310
- "Starting job %s with %d dependencies",
311
- self,
312
- len(self.dependencies),
313
- )
314
-
315
- # Individual dependency lock acquisition
316
- # We use the scheduler-wide lock to avoid cross-jobs race conditions
317
- async with sched_dependency_lock:
318
- for dependency in self.dependencies:
319
- try:
320
- locks.append(dependency.lock().acquire())
321
- except LockError:
322
- logger.warning(
323
- "Could not lock %s, aborting start for job %s",
324
- dependency,
325
- self,
326
- )
327
- dependency.check()
328
- return None # Signal to scheduler that dependencies couldn't be locked
329
-
330
- # Dependencies have been locked, we can start the job
331
- self.starttime = time.time()
332
-
333
- # Creates the main directory
334
- directory = self.path
335
- logger.debug("Making directories job %s...", directory)
336
- if not directory.is_dir():
337
- directory.mkdir(parents=True, exist_ok=True)
338
-
339
- # Sets up the notification URL
340
- if notification_server is not None:
341
- self.add_notification_server(notification_server)
342
-
343
- except Exception:
344
- logger.warning("Error while locking job", exc_info=True)
345
- return None # Signal waiting state to scheduler
346
-
347
- try:
348
- # Runs the job
349
- process = await self.aio_run()
350
- except Exception:
351
- logger.warning("Error while starting job", exc_info=True)
352
- return JobState.ERROR
353
-
354
- try:
355
- if isinstance(process, JobState):
356
- state = process
357
- logger.debug("Job %s ended (state %s)", self, state)
358
- else:
359
- logger.debug("Waiting for job %s process to end", self)
360
-
361
- code = await process.aio_code()
362
- logger.debug("Got return code %s for %s", code, self)
363
-
364
- # Check the file if there is no return code
365
- if code is None:
366
- # Case where we cannot retrieve the code right away
367
- if self.donepath.is_file():
368
- code = 0
369
- else:
370
- code = int(self.failedpath.read_text())
371
-
372
- logger.debug("Job %s ended with code %s", self, code)
373
- state = JobState.DONE if code == 0 else JobState.ERROR
374
-
375
- except JobError:
376
- logger.warning("Error while running job")
377
- state = JobState.ERROR
378
-
379
- except Exception:
380
- logger.warning(
381
- "Error while running job (in experimaestro)", exc_info=True
382
- )
383
- state = JobState.ERROR
384
- return state
372
+ async def aio_run(self) -> "Process":
373
+ """Actually run the code
385
374
 
386
- async def aio_run(self):
387
- """Actually run the code"""
375
+ Returns:
376
+ A Process instance representing the running job
377
+ """
388
378
  raise NotImplementedError(f"Method aio_run not implemented in {self.__class__}")
389
379
 
390
380
  async def aio_process(self) -> Optional["Process"]:
@@ -420,33 +410,28 @@ class Job(Resource):
420
410
  def stderr(self) -> Path:
421
411
  return self.jobpath / ("%s.err" % self.name)
422
412
 
413
+ def rotate_logs(self) -> None:
414
+ """Rotate log files before restarting a task.
415
+
416
+ Renames non-empty stdout and stderr files with a timestamp suffix
417
+ (e.g., job.20231215143022.out) to preserve logs from previous runs.
418
+ """
419
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
420
+
421
+ for log_path in [self.stdout, self.stderr]:
422
+ if log_path.exists() and log_path.stat().st_size > 0:
423
+ # Extract extension (.out or .err)
424
+ ext = log_path.suffix
425
+ # Create new name with timestamp before extension
426
+ new_name = f"{log_path.stem}.{timestamp}{ext}"
427
+ new_path = log_path.parent / new_name
428
+ logger.info("Rotating log file %s -> %s", log_path.name, new_name)
429
+ log_path.rename(new_path)
430
+
423
431
  @property
424
432
  def basepath(self) -> Path:
425
433
  return self.jobpath / self.name
426
434
 
427
- def dependencychanged(self, dependency, oldstatus, status):
428
- """Called when a dependency has changed"""
429
-
430
- def value(s):
431
- return 1 if s == DependencyStatus.OK else 0
432
-
433
- self.unsatisfied -= value(status) - value(oldstatus)
434
-
435
- logger.debug("Job %s: unsatisfied %d", self, self.unsatisfied)
436
-
437
- if status == DependencyStatus.FAIL:
438
- # Job completed
439
- if not self.state.finished():
440
- self.state = JobState.ERROR
441
- self.failure_status = JobFailureStatus.DEPENDENCY
442
- self._readyEvent.set()
443
-
444
- if self.unsatisfied == 0:
445
- logger.info("Job %s is ready to run", self)
446
- # We are ready
447
- self.state = JobState.READY
448
- self._readyEvent.set()
449
-
450
435
  def finalState(self) -> "concurrent.futures.Future[JobState]":
451
436
  assert self._future is not None
452
437
  return self._future
@@ -469,6 +454,31 @@ class JobContext(ConfigWalkContext):
469
454
  def task(self):
470
455
  return self.job.config
471
456
 
457
+ def partial_path(self, subparameters, config) -> Path:
458
+ """Returns the partial directory path for a given subparameters instance.
459
+
460
+ The partial path structure is:
461
+ WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID/
462
+
463
+ Args:
464
+ subparameters: The Subparameters instance defining which groups to exclude
465
+ config: The configuration to compute the partial identifier for
466
+
467
+ Returns:
468
+ The partial directory path.
469
+ """
470
+ # Compute partial identifier
471
+ partial_id = config.__xpm__.get_partial_identifier(subparameters)
472
+
473
+ # Build partial directory path
474
+ task_id = str(config.__xpmtype__.identifier)
475
+ return (
476
+ self.job.workspace.partialspath
477
+ / task_id
478
+ / subparameters.name
479
+ / partial_id.all.hex()
480
+ )
481
+
472
482
 
473
483
  class JobError(Exception):
474
484
  def __init__(self, code):
@@ -0,0 +1,31 @@
1
+ """Remote monitoring support for experimaestro
2
+
3
+ This package provides SSH-based remote monitoring capabilities for experiments.
4
+
5
+ Main components:
6
+ - SSHStateProviderServer: JSON-RPC server that wraps WorkspaceStateProvider
7
+ - SSHStateProviderClient: Client that connects via SSH and implements StateProvider interface
8
+ - RemoteFileSynchronizer: Rsync-based file synchronization
9
+
10
+ Usage:
11
+ # On remote host (run via SSH):
12
+ from experimaestro.scheduler.remote.server import SSHStateProviderServer
13
+ server = SSHStateProviderServer(workspace_path)
14
+ server.start()
15
+
16
+ # On local host:
17
+ from experimaestro.scheduler.remote.client import SSHStateProviderClient
18
+ client = SSHStateProviderClient(host="server", remote_workspace="/path")
19
+ client.connect()
20
+ experiments = client.get_experiments()
21
+ """
22
+
23
+ from experimaestro.scheduler.remote.server import SSHStateProviderServer
24
+ from experimaestro.scheduler.remote.client import SSHStateProviderClient
25
+ from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
26
+
27
+ __all__ = [
28
+ "SSHStateProviderServer",
29
+ "SSHStateProviderClient",
30
+ "RemoteFileSynchronizer",
31
+ ]