experimaestro-2.0.0b8-py3-none-any.whl → experimaestro-2.0.0b17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of experimaestro has been flagged by the registry; see the registry page for details.
Files changed (152)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +239 -126
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +217 -50
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +629 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +732 -167
  36. experimaestro/scheduler/interfaces.py +316 -101
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  39. experimaestro/scheduler/remote/client.py +171 -117
  40. experimaestro/scheduler/remote/protocol.py +8 -193
  41. experimaestro/scheduler/remote/server.py +95 -71
  42. experimaestro/scheduler/services.py +53 -28
  43. experimaestro/scheduler/state_provider.py +663 -2430
  44. experimaestro/scheduler/state_status.py +1247 -0
  45. experimaestro/scheduler/transient.py +31 -0
  46. experimaestro/scheduler/workspace.py +1 -1
  47. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  48. experimaestro/scriptbuilder.py +4 -4
  49. experimaestro/settings.py +36 -0
  50. experimaestro/tests/conftest.py +33 -5
  51. experimaestro/tests/connectors/bin/executable.py +1 -1
  52. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  53. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  54. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  55. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  56. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  58. experimaestro/tests/launchers/bin/test.py +1 -0
  59. experimaestro/tests/launchers/test_slurm.py +9 -9
  60. experimaestro/tests/partial_reschedule.py +46 -0
  61. experimaestro/tests/restart.py +3 -3
  62. experimaestro/tests/restart_main.py +1 -0
  63. experimaestro/tests/scripts/notifyandwait.py +1 -0
  64. experimaestro/tests/task_partial.py +38 -0
  65. experimaestro/tests/task_tokens.py +2 -2
  66. experimaestro/tests/tasks/test_dynamic.py +6 -6
  67. experimaestro/tests/test_dependencies.py +3 -3
  68. experimaestro/tests/test_deprecated.py +15 -15
  69. experimaestro/tests/test_dynamic_locking.py +317 -0
  70. experimaestro/tests/test_environment.py +24 -14
  71. experimaestro/tests/test_experiment.py +171 -36
  72. experimaestro/tests/test_identifier.py +25 -25
  73. experimaestro/tests/test_identifier_stability.py +3 -5
  74. experimaestro/tests/test_multitoken.py +2 -4
  75. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  76. experimaestro/tests/test_partial_paths.py +81 -138
  77. experimaestro/tests/test_pre_experiment.py +219 -0
  78. experimaestro/tests/test_progress.py +2 -8
  79. experimaestro/tests/test_remote_state.py +560 -99
  80. experimaestro/tests/test_stray_jobs.py +261 -0
  81. experimaestro/tests/test_tasks.py +1 -2
  82. experimaestro/tests/test_token_locking.py +52 -67
  83. experimaestro/tests/test_tokens.py +5 -6
  84. experimaestro/tests/test_transient.py +225 -0
  85. experimaestro/tests/test_workspace_state_provider.py +768 -0
  86. experimaestro/tests/token_reschedule.py +1 -3
  87. experimaestro/tests/utils.py +2 -7
  88. experimaestro/tokens.py +227 -372
  89. experimaestro/tools/diff.py +1 -0
  90. experimaestro/tools/documentation.py +4 -5
  91. experimaestro/tools/jobs.py +1 -2
  92. experimaestro/tui/app.py +438 -1966
  93. experimaestro/tui/app.tcss +162 -0
  94. experimaestro/tui/dialogs.py +172 -0
  95. experimaestro/tui/log_viewer.py +253 -3
  96. experimaestro/tui/messages.py +137 -0
  97. experimaestro/tui/utils.py +54 -0
  98. experimaestro/tui/widgets/__init__.py +23 -0
  99. experimaestro/tui/widgets/experiments.py +468 -0
  100. experimaestro/tui/widgets/global_services.py +238 -0
  101. experimaestro/tui/widgets/jobs.py +972 -0
  102. experimaestro/tui/widgets/log.py +156 -0
  103. experimaestro/tui/widgets/orphans.py +363 -0
  104. experimaestro/tui/widgets/runs.py +185 -0
  105. experimaestro/tui/widgets/services.py +314 -0
  106. experimaestro/tui/widgets/stray_jobs.py +528 -0
  107. experimaestro/utils/__init__.py +1 -1
  108. experimaestro/utils/environment.py +105 -22
  109. experimaestro/utils/fswatcher.py +124 -0
  110. experimaestro/utils/jobs.py +1 -2
  111. experimaestro/utils/jupyter.py +1 -2
  112. experimaestro/utils/logging.py +72 -0
  113. experimaestro/version.py +2 -2
  114. experimaestro/webui/__init__.py +9 -0
  115. experimaestro/webui/app.py +117 -0
  116. experimaestro/{server → webui}/data/index.css +66 -11
  117. experimaestro/webui/data/index.css.map +1 -0
  118. experimaestro/{server → webui}/data/index.js +82763 -87217
  119. experimaestro/webui/data/index.js.map +1 -0
  120. experimaestro/webui/routes/__init__.py +5 -0
  121. experimaestro/webui/routes/auth.py +53 -0
  122. experimaestro/webui/routes/proxy.py +117 -0
  123. experimaestro/webui/server.py +200 -0
  124. experimaestro/webui/state_bridge.py +152 -0
  125. experimaestro/webui/websocket.py +413 -0
  126. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
  127. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  128. experimaestro/cli/progress.py +0 -269
  129. experimaestro/scheduler/state.py +0 -75
  130. experimaestro/scheduler/state_db.py +0 -437
  131. experimaestro/scheduler/state_sync.py +0 -891
  132. experimaestro/server/__init__.py +0 -467
  133. experimaestro/server/data/index.css.map +0 -1
  134. experimaestro/server/data/index.js.map +0 -1
  135. experimaestro/tests/test_cli_jobs.py +0 -615
  136. experimaestro/tests/test_file_progress.py +0 -425
  137. experimaestro/tests/test_file_progress_integration.py +0 -477
  138. experimaestro/tests/test_state_db.py +0 -434
  139. experimaestro-2.0.0b8.dist-info/RECORD +0 -187
  140. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  141. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  142. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  143. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  145. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  147. /experimaestro/{server → webui}/data/index.html +0 -0
  148. /experimaestro/{server → webui}/data/login.html +0 -0
  149. /experimaestro/{server → webui}/data/manifest.json +0 -0
  150. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  151. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  152. {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/progress.py CHANGED
@@ -1,381 +1,41 @@
 """File-based progress tracking system for experimaestro tasks."""
 
-import json
 import threading
-import time
-from dataclasses import dataclass, asdict
 from pathlib import Path
-from typing import Optional, List, Iterator, Dict, Any
-from datetime import datetime, timedelta
-import fcntl
-import os
+from typing import Optional
 
-from .utils import logger
 
-DEFAULT_MAX_ENTRIES_PER_FILE = 10_000
-
-
-@dataclass
-class ProgressEntry:
-    """A single progress entry in the JSONL file"""
-
-    timestamp: float
-    level: int
-    progress: float
-    desc: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary for JSON serialization"""
-        return asdict(self)
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "ProgressEntry":
-        """Create from dictionary"""
-        return cls(**data)
-
-
-class StateFile:
-    """Represents the state file for progress tracking.
-    Checks if the state must be written based on time and progress changes.
-    By default, it writes every second or when progress changes significantly (>1%)"""
-
-    def __init__(self, filename: Path):
-        self.filename = filename
-        self.state: Dict[int, ProgressEntry] = {}
-
-        # Write threshold to avoid too frequent writes
-        self._time_threshold = timedelta(seconds=1.0)
-        self._last_write_time: datetime = datetime.now()
-        # Minimum progress change to trigger write
-        self._progress_threshold = 0.01
-        self._last_write_progress: Optional[Dict[int, float]] = None
-
-        self.filename.parent.mkdir(parents=True, exist_ok=True)
-        self.load()
-
-    def _allow_write(self) -> bool:
-        """Check if the state should be written based on time and progress changes.
-        Allows writing if:
-        - BOTH: More than 1 second has passed since last write
-        - AND: Progress has changed significantly (>1%)
-        - OR: All entries are done (progress >= 1.0)"""
-        time_check = datetime.now() - self._last_write_time > self._time_threshold
-        progress_check = self._last_write_progress is None or any(
-            abs(entry.progress - self._last_write_progress.get(entry.level, 0.0))
-            > self._progress_threshold
-            for entry in self.state.values()
-        )
-        all_entries_done = all(entry.progress >= 1.0 for entry in self.state.values())
-        return all_entries_done or (time_check and progress_check)
-
-    def write(self, force: bool = False):
-        """Write the current state to the file."""
-        if self._allow_write() or force:
-            with open(self.filename, "w") as f:
-                json.dump({k: v.to_dict() for k, v in self.state.items()}, f)
-            self._last_write_time = datetime.now()
-            self._last_write_progress = {k: v.progress for k, v in self.state.items()}
-
-    def update(self, entry: ProgressEntry):
-        self.state[entry.level] = entry
-
-    def load(self):
-        """Load the state from the file"""
-        if self.filename.exists():
-            with self.filename.open("r") as f:
-                try:
-                    data = json.load(f)
-                    self.state = {
-                        int(k): ProgressEntry.from_dict(v) for k, v in data.items()
-                    }
-                except (json.JSONDecodeError, IOError):
-                    logger.warning(f"Failed to load state from {self.filename}")
-
-    def read(self) -> Dict[int, ProgressEntry]:
-        """Read the state from the file"""
-        self.load()
-        return self.state
-
-    # flush on exit
-    def __del__(self):
-        """Ensure state is written on exit"""
-        try:
-            self.write(force=True)
-        except Exception as e:
-            logger.error(f"Failed to write state on exit: {e}")
-
-
-class ProgressFileWriter:
-    def __init__(
-        self, task_path: Path, max_entries_per_file: int = DEFAULT_MAX_ENTRIES_PER_FILE
-    ):
-        self.task_path = task_path
-        self.progress_dir = task_path / ".experimaestro"
-        self.max_entries_per_file = max_entries_per_file
-        self.current_file_index = 0
-        self.current_file_entries = 0
-        self.lock = threading.Lock()
-
-        # Ensure directory exists
-        self.progress_dir.mkdir(exist_ok=True)
-
-        # State is the latest entry per level
-        self.state = StateFile(self.progress_dir / "progress_state.json")
-
-        # Find the latest file index
-        self._find_latest_file()
-
-    def _find_latest_file(self):
-        """Find the latest progress file and entry count"""
-        progress_files = list(self.progress_dir.glob("progress-*.jsonl"))
-        if not progress_files:
-            self.current_file_index = 0
-            self.current_file_entries = 0
-            return
-
-        # Sort by file index
-        max_index = None
-        for f in progress_files:
-            try:
-                index = int(f.stem.split("-")[1])
-                if max_index is None or index > max_index:
-                    max_index = index
-            except (ValueError, IndexError):
-                continue
-
-        if max_index is not None:
-            self.current_file_index = max_index
-            # Count entries in current file
-            current_file = self._get_current_file_path()
-            if current_file.exists():
-                with current_file.open("r") as f:
-                    self.current_file_entries = sum(1 for _ in f.readlines())
-            else:
-                self.current_file_entries = 0
-        else:
-            self.current_file_index = 0
-            self.current_file_entries = 0
-
-    def _get_current_file_path(self) -> Path:
-        """Get path to current progress file"""
-        return self.progress_dir / f"progress-{self.current_file_index:04d}.jsonl"
-
-    def _get_latest_symlink_path(self) -> Path:
-        """Get path to latest progress symlink"""
-        return self.progress_dir / "progress-latest.jsonl"
-
-    def _rotate_file_if_needed(self):
-        """Create new file if current one is full"""
-        if self.current_file_entries >= self.max_entries_per_file:
-            self.current_file_index += 1
-            self.current_file_entries = 0
-            logger.debug(f"Rotating to new progress file: {self.current_file_index}")
-
-    def _update_latest_symlink(self):
-        """Update symlink to point to latest file"""
-        current_file = self._get_current_file_path()
-        latest_symlink = self._get_latest_symlink_path()
-
-        # Remove existing symlink
-        if latest_symlink.exists() or latest_symlink.is_symlink():
-            latest_symlink.unlink()
-
-        # Create new symlink
-        latest_symlink.symlink_to(current_file.name)
-
-    def write_progress(self, level: int, progress: float, desc: Optional[str] = None):
-        """Write a progress entry to the file
-
-        Args:
-            level: Progress level (0 is top level)
-            progress: Progress value between 0.0 and 1.0
-            desc: Optional description
-        """
-        with self.lock:
-            # Eventually rotate internal state if needed
-            self._rotate_file_if_needed()
-
-            entry = ProgressEntry(
-                timestamp=time.time(), level=level, progress=progress, desc=desc
-            )
-            self.state.update(entry)
-            self.state.write(force=level == -1)  # Force write on EOJ
-
-            current_file = self._get_current_file_path()
-
-            # Write with file locking for concurrent access
-            with current_file.open("a") as f:
-                try:
-                    fcntl.flock(f.fileno(), fcntl.LOCK_EX)
-                    f.write(json.dumps(entry.to_dict()) + "\n")
-                    f.flush()  # Flush the file buffer
-                    os.fsync(f.fileno())  # Ensure data is written to disk
-                finally:
-                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
-
-            self.current_file_entries += 1
-            self._update_latest_symlink()
-
-            logger.debug(
-                f"Progress written: level={level}, progress={progress}, desc={desc}"
-            )
-
-    def __del__(self):
-        """Ensure state is written on exit"""
-        try:
-            self.state.write(force=True)
-        except Exception as e:
-            logger.error(f"Failed to write state on exit: {e}")
-
-
-class ProgressFileReader:
-    """Reads progress entries from JSONL files"""
-
-    def __init__(self, task_path: Path):
-        """Initialize progress file reader
-
-        Args:
-            task_path: Path to the task directory
-        """
-        self.task_path = task_path
-        self.progress_dir = task_path / ".experimaestro"
-        self.max_entries_per_file: Optional[int] = None
-        self.state = StateFile(self.progress_dir / "progress_state.json")
-
-    def get_progress_files(self) -> List[Path]:
-        """Get all progress files sorted by index"""
-        if not self.progress_dir.exists():
-            return []
-
-        progress_files = list(self.progress_dir.glob("progress-*.jsonl"))
-
-        # Filter out symlinks to avoid duplicates
-        progress_files = [f for f in progress_files if not f.is_symlink()]
-
-        # Sort by file index
-        # Alternatively, we could simply sort by filename
-        def get_index(path: Path) -> int:
-            try:
-                return int(path.stem.split("-")[1])
-            except (ValueError, IndexError):
-                return 0
-
-        return sorted(progress_files, key=get_index)
-
-    def get_latest_file(self) -> Optional[Path]:
-        """Get the latest progress file via symlink"""
-        latest_symlink = self.progress_dir / "progress-latest.jsonl"
-        if latest_symlink.exists() and latest_symlink.is_symlink():
-            return latest_symlink.resolve()
-
-        # Fallback to finding latest manually
-        files = self.get_progress_files()
-        return files[-1] if files else None
-
-    def read_entries(self, file_path: Path) -> Iterator[ProgressEntry]:
-        """Read progress entries from a file
-
-        Args:
-            file_path: Path to progress file
-
-        Yields:
-            ProgressEntry objects
-        """
-        if not file_path.exists():
-            return
-
-        try:
-            with file_path.open("r") as f:
-                fcntl.flock(f.fileno(), fcntl.LOCK_SH)
-                try:
-                    for line in f:
-                        line = line.strip()
-                        if line:
-                            try:
-                                data = json.loads(line)
-                                yield ProgressEntry.from_dict(data)
-                            except json.JSONDecodeError as e:
-                                logger.warning(
-                                    f"Invalid JSON in progress file {file_path}: {e}"
-                                )
-                finally:
-                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
-        except IOError as e:
-            logger.warning(f"Could not read progress file {file_path}: {e}")
-
-    def read_all_entries(self) -> Iterator[ProgressEntry]:
-        """Read all progress entries from all files in order
-
-        Yields:
-            ProgressEntry objects in chronological order
-        """
-        logger.warning("Reading all progress entries, this may be slow for large jobs.")
-        for file_path in self.get_progress_files():
-            yield from self.read_entries(file_path)
-
-    def read_latest_entries(self, count: Optional[int] = None) -> List[ProgressEntry]:
-        """Read the latest N progress entries"""
-        entries = []
-
-        # Read files in reverse order to get latest entries first
-        files = self.get_progress_files()
-        # Fetch the max length of files, in lines
-        if files and count is None:
-            # Fetch the number of entries in the first file
-            # This is the most likely to be the longest file
-            count = sum(1 for _ in self.read_entries(files[0]))
-        if count is None:
-            count = DEFAULT_MAX_ENTRIES_PER_FILE
-
-        for file_path in reversed(files):
-            file_entries = list(self.read_entries(file_path))
-            entries.extend(reversed(file_entries))
-
-            if len(entries) >= count:
-                break
-
-        # Return latest entries in chronological order
-        return list(reversed(entries[:count]))
-
-    def get_current_progress(
-        self, count: Optional[int] = None
-    ) -> Dict[int, ProgressEntry]:
-        """Get the current progress for each level"""
-        logger.warning(
-            "Reading current progress from progress logs, this may be slow for large jobs."
-        )
-        return {entry.level: entry for entry in self.read_latest_entries(count)}
-
-    def get_current_state(self) -> Optional[Dict[int, ProgressEntry]]:
-        """Fetch the latest progress entry from the state file"""
-        current_state = self.state.read()
-        return current_state or self.get_current_progress()
-
-    def is_done(self) -> bool:
-        """Check if the task is done by looking for a special 'done' file.
-        Fallback to checking for end-of-job (EOJ) entries."""
-
-        task_name = self.task_path.parent.stem.split(".")[-1]
-        job_done_file = self.task_path / f"{task_name}.done"
-        if job_done_file.exists() and job_done_file.is_file():
-            return True
-
-        # Check if any progress file has a level -1 entry indicating EOJ
-        return any(entry.level == -1 for entry in self.read_all_entries())
+class FileBasedProgressReporter:
+    """File-based progress reporter that writes to job event files.
 
+    Writes JobProgressEvent objects to:
+    workspace/.events/jobs/{task_id}/event-{job_id}-*.jsonl
 
-class FileBasedProgressReporter:
-    """File-based progress reporter that replaces the socket-based Reporter"""
+    These files are watched by the scheduler's EventReader to forward
+    progress updates to listeners.
+    """
 
     def __init__(self, task_path: Path):
         """Initialize file-based progress reporter
 
         Args:
-            task_path: Path to the task directory
+            task_path: Path to the task directory (workspace/jobs/task_id/job_id/)
         """
+        from experimaestro.scheduler.state_status import JobEventWriter
+
         self.task_path = task_path
-        self.writer = ProgressFileWriter(task_path)
-        self.current_progress = {}  # level -> (progress, desc)
+        # Extract workspace, task_id, and job_id from task_path
+        # task_path is typically: workspace/jobs/task_id/job_id/
+        self.job_id = task_path.name
+        self.task_id = task_path.parent.name
+        workspace_path = task_path.parent.parent.parent
+
+        # Create event writer for this job's events
+        # Pass job_path for permanent storage of events
+        self.event_writer = JobEventWriter(
+            workspace_path, self.task_id, self.job_id, 0, job_path=task_path
+        )
+        self.current_progress: dict[int, tuple[float | None, str | None]] = {}
        self.lock = threading.Lock()
 
     def set_progress(self, progress: float, level: int = 0, desc: Optional[str] = None):
@@ -386,6 +46,8 @@ class FileBasedProgressReporter:
             level: Progress level (0 is top level)
             desc: Optional description
         """
+        from experimaestro.scheduler.state_status import JobProgressEvent
+
         with self.lock:
             # Check if progress has changed significantly
             current = self.current_progress.get(level, (None, None))
@@ -395,10 +57,45 @@ class FileBasedProgressReporter:
                 or desc != current[1]
             ):
                 self.current_progress[level] = (progress, desc)
-                self.writer.write_progress(level, progress, desc)
+
+                # Write to event file (EventWriter handles periodic flushing)
+                event = JobProgressEvent(
+                    job_id=self.job_id,
+                    level=level,
+                    progress=progress,
+                    desc=desc,
+                )
+                self.event_writer.write_event(event)
+
+    def start_of_job(self):
+        """Start of job notification - called when job execution begins"""
+        from experimaestro.scheduler.state_status import JobStateChangedEvent
+        import time
+
+        with self.lock:
+            # Write JobStateChangedEvent with state="running" to event file
+            event = JobStateChangedEvent(
+                job_id=self.job_id,
+                state="running",
+                started_time=time.time(),
+            )
+            self.event_writer.write_event(event)
+            self.event_writer.flush()
 
     def eoj(self):
         """End of job notification"""
+        from experimaestro.scheduler.state_status import JobStateChangedEvent
+        import time
+
         with self.lock:
-            # Write a special end-of-job marker
-            self.writer.write_progress(-1, 1.0, "EOJ")
+            # Write JobStateChangedEvent with state="done" to event file
+            event = JobStateChangedEvent(
+                job_id=self.job_id,
+                state="done",
+                ended_time=time.time(),
+            )
+            self.event_writer.write_event(event)
+            self.event_writer.flush()
+
+            # Archive events from .events/ to job directory
+            self.event_writer.archive_events()
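
For orientation, a minimal usage sketch of the new reporter (the job directory below is hypothetical; the constructor and the set_progress/start_of_job/eoj calls are the ones added in this file):

from pathlib import Path
from experimaestro.progress import FileBasedProgressReporter

# Hypothetical job directory following the layout named in the class docstring:
# workspace/jobs/<task_id>/<job_id>/
job_dir = Path("workspace/jobs/my.module.MyTask/0123abcd")

reporter = FileBasedProgressReporter(job_dir)
reporter.start_of_job()  # writes a JobStateChangedEvent with state="running"
for step in range(10):
    reporter.set_progress((step + 1) / 10, level=0, desc="processing")
reporter.eoj()  # writes state="done", flushes, and archives the event files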
experimaestro/rpyc.py CHANGED
@@ -100,7 +100,6 @@ class ClassicService(rpyc.core.service.ClassicService):
         super().on_connect(conn)
 
     def on_disconnect(self, conn):
-        print("Disconnected")
         super().on_disconnect(conn)
 
 
@@ -119,7 +118,6 @@ def start_server(unix_path, clean=None):
     def sayhello():
         while not server.active:
             time.sleep(0.01)
-        print("HELLO", flush=True)
         logger.debug("Server started")
 
     time.sleep(5)
experimaestro/run.py CHANGED
@@ -7,9 +7,10 @@ import sys
 import json
 from typing import List
 import fasteners
-from experimaestro.notifications import progress, report_eoj
+from experimaestro.notifications import progress, report_eoj, start_of_job
 from experimaestro.utils.multiprocessing import delayed_shutdown
 from experimaestro.exceptions import GracefulTimeout
+from experimaestro.locking import JobDependencyLocks
 from .core.types import ObjectType
 from experimaestro.utils import logger
 from experimaestro.core.objects import ConfigInformation
@@ -67,6 +68,7 @@ class TaskRunner:
         self.failedpath = self.scriptpath.with_suffix(".failed")
         self.started = False
         self.locks = []
+        self.dynamic_locks = JobDependencyLocks()
         env = taskglobals.Env.instance()
         env.taskpath = self.scriptpath.parent
 
@@ -77,6 +79,8 @@
         self.cleaned = True
         logger.info("Cleaning up")
         rmfile(self.pidfile)
+
+        # Release IPC locks
         for lock in self.locks:
             try:
                 if lock.acquired:
@@ -86,6 +90,9 @@
             except Exception:
                 logger.error("Error while releasing lock %s", lock)
 
+        # Note: dynamic dependency locks are released via context manager
+        # in the run() method, not here
+
         if self.started:
             report_eoj()
         logger.info("Finished cleanup")
@@ -141,6 +148,19 @@
                 raise AssertionError("Could not lock %s", lockfile)
             self.locks.append(lock)
 
+            # Load and setup dynamic dependency locks from locks.json
+            locks_path = workdir / "locks.json"
+            if locks_path.exists():
+                logger.info("Loading dynamic dependency locks from %s", locks_path)
+                with locks_path.open() as f:
+                    locks_data = json.load(f)
+                self.dynamic_locks = JobDependencyLocks.from_json(
+                    locks_data.get("dynamic_locks", [])
+                )
+                logger.info(
+                    "Loaded %d dynamic dependency locks", len(self.dynamic_locks.locks)
+                )
+
             # Check if failed/done have been generated by another job
             if self.donepath.is_file():
                 logger.info("Job already completed")
@@ -148,13 +168,20 @@
                 logger.info("Running task")
                 rmfile(self.failedpath)
                 self.started = True
-                run(workdir / "params.json")
+
+                # Notify that the job has started
+                start_of_job()
+
+                # Acquire dynamic dependency locks while running the task
+                with self.dynamic_locks.dependency_locks():
+                    run(workdir / "params.json")
 
             # ... remove the handlers
             remove_signal_handlers(remove_cleanup=False)
 
             # Everything went OK
             logger.info("Task ended successfully")
+            self.cleanup()
             sys.exit(0)
         except GracefulTimeout as e:
             logger.info("Task requested graceful timeout: %s", e.message)
experimaestro/scheduler/__init__.py CHANGED
@@ -1,6 +1,11 @@
 from .base import Scheduler, Listener
 from .workspace import Workspace, RunMode
-from .experiment import experiment, FailedExperiment
+from .experiment import (
+    experiment,
+    FailedExperiment,
+    DirtyGitError,
+    GracefulExperimentExit,
+)
 from .jobs import Job, JobState, JobFailureStatus, JobDependency, JobContext
 
 __all__ = [
@@ -10,6 +15,8 @@ __all__ = [
     "RunMode",
     "experiment",
     "FailedExperiment",
+    "DirtyGitError",
+    "GracefulExperimentExit",
     "Job",
     "JobState",
     "JobFailureStatus",