experimaestro 1.5.1__py3-none-any.whl → 2.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic; review the release notes and advisory details before upgrading.

Files changed (118):
  1. experimaestro/__init__.py +14 -4
  2. experimaestro/__main__.py +3 -423
  3. experimaestro/annotations.py +14 -4
  4. experimaestro/cli/__init__.py +311 -0
  5. experimaestro/{filter.py → cli/filter.py} +23 -9
  6. experimaestro/cli/jobs.py +268 -0
  7. experimaestro/cli/progress.py +269 -0
  8. experimaestro/click.py +0 -35
  9. experimaestro/commandline.py +3 -7
  10. experimaestro/connectors/__init__.py +29 -14
  11. experimaestro/connectors/local.py +19 -10
  12. experimaestro/connectors/ssh.py +27 -8
  13. experimaestro/core/arguments.py +45 -3
  14. experimaestro/core/callbacks.py +52 -0
  15. experimaestro/core/context.py +8 -9
  16. experimaestro/core/identifier.py +310 -0
  17. experimaestro/core/objects/__init__.py +44 -0
  18. experimaestro/core/{objects.py → objects/config.py} +399 -772
  19. experimaestro/core/objects/config_utils.py +58 -0
  20. experimaestro/core/objects/config_walk.py +151 -0
  21. experimaestro/core/objects.pyi +15 -45
  22. experimaestro/core/serialization.py +63 -9
  23. experimaestro/core/serializers.py +1 -8
  24. experimaestro/core/types.py +104 -66
  25. experimaestro/experiments/cli.py +154 -72
  26. experimaestro/experiments/configuration.py +10 -1
  27. experimaestro/generators.py +6 -1
  28. experimaestro/ipc.py +4 -1
  29. experimaestro/launcherfinder/__init__.py +1 -1
  30. experimaestro/launcherfinder/base.py +2 -18
  31. experimaestro/launcherfinder/parser.py +8 -3
  32. experimaestro/launcherfinder/registry.py +52 -140
  33. experimaestro/launcherfinder/specs.py +49 -10
  34. experimaestro/launchers/direct.py +0 -47
  35. experimaestro/launchers/slurm/base.py +54 -14
  36. experimaestro/mkdocs/__init__.py +1 -1
  37. experimaestro/mkdocs/base.py +6 -8
  38. experimaestro/notifications.py +38 -12
  39. experimaestro/progress.py +406 -0
  40. experimaestro/run.py +24 -3
  41. experimaestro/scheduler/__init__.py +18 -1
  42. experimaestro/scheduler/base.py +108 -808
  43. experimaestro/scheduler/dynamic_outputs.py +184 -0
  44. experimaestro/scheduler/experiment.py +387 -0
  45. experimaestro/scheduler/jobs.py +475 -0
  46. experimaestro/scheduler/signal_handler.py +32 -0
  47. experimaestro/scheduler/state.py +75 -0
  48. experimaestro/scheduler/workspace.py +27 -8
  49. experimaestro/scriptbuilder.py +18 -3
  50. experimaestro/server/__init__.py +36 -5
  51. experimaestro/server/data/1815e00441357e01619e.ttf +0 -0
  52. experimaestro/server/data/2463b90d9a316e4e5294.woff2 +0 -0
  53. experimaestro/server/data/2582b0e4bcf85eceead0.ttf +0 -0
  54. experimaestro/server/data/89999bdf5d835c012025.woff2 +0 -0
  55. experimaestro/server/data/914997e1bdfc990d0897.ttf +0 -0
  56. experimaestro/server/data/c210719e60948b211a12.woff2 +0 -0
  57. experimaestro/server/data/index.css +5187 -5068
  58. experimaestro/server/data/index.css.map +1 -1
  59. experimaestro/server/data/index.js +68887 -68064
  60. experimaestro/server/data/index.js.map +1 -1
  61. experimaestro/settings.py +45 -5
  62. experimaestro/sphinx/__init__.py +7 -17
  63. experimaestro/taskglobals.py +7 -2
  64. experimaestro/tests/core/__init__.py +0 -0
  65. experimaestro/tests/core/test_generics.py +206 -0
  66. experimaestro/tests/definitions_types.py +5 -3
  67. experimaestro/tests/launchers/bin/sbatch +34 -7
  68. experimaestro/tests/launchers/bin/srun +5 -0
  69. experimaestro/tests/launchers/common.py +17 -5
  70. experimaestro/tests/launchers/config_slurm/launchers.py +25 -0
  71. experimaestro/tests/restart.py +10 -5
  72. experimaestro/tests/tasks/all.py +23 -10
  73. experimaestro/tests/tasks/foreign.py +2 -4
  74. experimaestro/tests/test_checkers.py +2 -2
  75. experimaestro/tests/test_dependencies.py +11 -17
  76. experimaestro/tests/test_experiment.py +73 -0
  77. experimaestro/tests/test_file_progress.py +425 -0
  78. experimaestro/tests/test_file_progress_integration.py +477 -0
  79. experimaestro/tests/test_findlauncher.py +12 -5
  80. experimaestro/tests/test_forward.py +5 -5
  81. experimaestro/tests/test_generators.py +93 -0
  82. experimaestro/tests/test_identifier.py +182 -158
  83. experimaestro/tests/test_instance.py +19 -27
  84. experimaestro/tests/test_objects.py +13 -20
  85. experimaestro/tests/test_outputs.py +6 -6
  86. experimaestro/tests/test_param.py +68 -30
  87. experimaestro/tests/test_progress.py +4 -4
  88. experimaestro/tests/test_serializers.py +24 -64
  89. experimaestro/tests/test_ssh.py +7 -0
  90. experimaestro/tests/test_tags.py +50 -21
  91. experimaestro/tests/test_tasks.py +42 -51
  92. experimaestro/tests/test_tokens.py +11 -8
  93. experimaestro/tests/test_types.py +24 -21
  94. experimaestro/tests/test_validation.py +67 -110
  95. experimaestro/tests/token_reschedule.py +1 -1
  96. experimaestro/tokens.py +24 -13
  97. experimaestro/tools/diff.py +8 -1
  98. experimaestro/typingutils.py +20 -11
  99. experimaestro/utils/asyncio.py +6 -2
  100. experimaestro/utils/multiprocessing.py +44 -0
  101. experimaestro/utils/resources.py +11 -3
  102. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/METADATA +28 -36
  103. experimaestro-2.0.0a8.dist-info/RECORD +166 -0
  104. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/WHEEL +1 -1
  105. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info}/entry_points.txt +0 -4
  106. experimaestro/launchers/slurm/cli.py +0 -29
  107. experimaestro/launchers/slurm/configuration.py +0 -597
  108. experimaestro/scheduler/environment.py +0 -94
  109. experimaestro/server/data/016b4a6cdced82ab3aa1.ttf +0 -0
  110. experimaestro/server/data/50701fbb8177c2dde530.ttf +0 -0
  111. experimaestro/server/data/878f31251d960bd6266f.woff2 +0 -0
  112. experimaestro/server/data/b041b1fa4fe241b23445.woff2 +0 -0
  113. experimaestro/server/data/b6879d41b0852f01ed5b.woff2 +0 -0
  114. experimaestro/server/data/d75e3fd1eb12e9bd6655.ttf +0 -0
  115. experimaestro/tests/launchers/config_slurm/launchers.yaml +0 -134
  116. experimaestro/utils/yaml.py +0 -202
  117. experimaestro-1.5.1.dist-info/RECORD +0 -148
  118. {experimaestro-1.5.1.dist-info → experimaestro-2.0.0a8.dist-info/licenses}/LICENSE +0 -0
@@ -12,6 +12,7 @@ from tqdm.auto import tqdm as std_tqdm
12
12
 
13
13
  from .utils import logger
14
14
  from experimaestro.taskglobals import Env as TaskEnv
15
+ from .progress import FileBasedProgressReporter
15
16
 
16
17
  # --- Progress and other notifications
17
18
 
@@ -41,7 +42,13 @@ class LevelInformation:
41
42
  return result
42
43
 
43
44
  def __repr__(self) -> str:
44
- return f"[{self.level}] {self.desc} {int(self.progress*1000)/10}%"
45
+ return f"[{self.level}] {self.desc} {int(self.progress * 1000) / 10}%"
46
+
47
+
48
class ListenerInformation:
    """Bookkeeping for a single progress listener.

    Holds the listener's notification URL together with a running count
    of errors encountered while notifying it.
    """

    def __init__(self, url: str):
        # No failed notifications recorded yet
        self.error_count = 0
        self.url = url
45
52
 
46
53
 
47
54
  class Reporter(threading.Thread):
@@ -59,7 +66,7 @@ class Reporter(threading.Thread):
59
66
  super().__init__(daemon=True)
60
67
  self.path = path / Reporter.NOTIFICATION_FOLDER
61
68
  self.path.mkdir(exist_ok=True)
62
- self.urls: Dict[str, str] = {}
69
+ self.urls: Dict[str, ListenerInformation] = {}
63
70
 
64
71
  # Last check of notification URLs
65
72
  self.lastcheck = 0
@@ -72,15 +79,18 @@ class Reporter(threading.Thread):
72
79
 
73
80
  self.progress_threshold = 0.01
74
81
  self.cv = threading.Condition()
75
- self.start()
82
+
83
+ # File-based progress reporter
84
+ self.file_reporter = FileBasedProgressReporter(task_path=path)
76
85
 
77
86
  def stop(self):
78
87
  self.stopping = True
79
88
  with self.cv:
80
- self.cv.notifyAll()
89
+ # self.cv.notifyAll()
90
+ self.cv.notify_all()
81
91
 
82
92
  @staticmethod
83
- def isfatal_httperror(e: Exception) -> bool:
93
+ def isfatal_httperror(e: Exception, info: ListenerInformation) -> bool:
84
94
  """Returns True if this HTTP error indicates that the server won't recover"""
85
95
  if isinstance(e, HTTPError):
86
96
  if e.code >= 400 and e.code < 500:
@@ -90,6 +100,13 @@ class Reporter(threading.Thread):
90
100
  return True
91
101
  if isinstance(e.reason, socket.gaierror) and e.reason.errno == -2:
92
102
  return True
103
+ if isinstance(e.reason, TimeoutError):
104
+ info.error_count += 1
105
+
106
+ # Too many errors
107
+ if info.error_count > 3:
108
+ logger.info("Too many errors with %s", info.error_count)
109
+ return True
93
110
 
94
111
  return False
95
112
 
@@ -97,11 +114,12 @@ class Reporter(threading.Thread):
97
114
  return any(level.modified(self) for level in self.levels)
98
115
 
99
116
  def check_urls(self):
117
+ """Check whether we have new schedulers to notify"""
100
118
  mtime = os.path.getmtime(self.path)
101
119
  if mtime > self.lastcheck:
102
120
  for f in self.path.iterdir():
103
- self.urls[f.name] = f.read_text().strip()
104
- logger.info("Added new notification URL: %s", self.urls[f.name])
121
+ self.urls[f.name] = ListenerInformation(f.read_text().strip())
122
+ logger.info("Added new notification URL: %s", self.urls[f.name].url)
105
123
  f.unlink()
106
124
 
107
125
  self.lastcheck = os.path.getmtime(self.path)
@@ -128,7 +146,9 @@ class Reporter(threading.Thread):
128
146
  params = level.report()
129
147
 
130
148
  # Go over all URLs
131
- for key, baseurl in self.urls.items():
149
+ for key, info in self.urls.items():
150
+ baseurl = info.url
151
+
132
152
  url = "{}/progress?{}".format(
133
153
  baseurl, urllib.parse.urlencode(params)
134
154
  )
@@ -147,7 +167,7 @@ class Reporter(threading.Thread):
147
167
  url,
148
168
  e,
149
169
  )
150
- if Reporter.isfatal_httperror(e):
170
+ if Reporter.isfatal_httperror(e, info):
151
171
  toremove.append(key)
152
172
 
153
173
  # Removes unvalid URLs
@@ -165,12 +185,13 @@ class Reporter(threading.Thread):
165
185
  self.check_urls()
166
186
  if self.urls:
167
187
  # Go over all URLs
168
- for key, baseurl in self.urls.items():
188
+ for key, info in self.urls.items():
189
+ baseurl = info.url
169
190
  url = "{}?status=eoj".format(baseurl)
170
191
  try:
171
192
  with urlopen(url) as _:
172
193
  logger.debug(
173
- "EOJ botification sent for %s",
194
+ "EOJ notification sent for %s",
174
195
  baseurl,
175
196
  )
176
197
  except Exception:
@@ -178,6 +199,8 @@ class Reporter(threading.Thread):
178
199
  "Could not report EOJ",
179
200
  )
180
201
 
202
+ self.file_reporter.eoj()
203
+
181
204
  def set_progress(
182
205
  self, progress: float, level: int, desc: Optional[str], console=False
183
206
  ):
@@ -196,6 +219,8 @@ class Reporter(threading.Thread):
196
219
  self.levels[level].desc = desc
197
220
  self.levels[level].progress = progress
198
221
 
222
+ self.file_reporter.set_progress(progress, level, desc)
223
+
199
224
  self.cv.notify_all()
200
225
 
201
226
  INSTANCE: ClassVar[Optional["Reporter"]] = None
@@ -206,6 +231,7 @@ class Reporter(threading.Thread):
206
231
  taskpath = TaskEnv.instance().taskpath
207
232
  assert taskpath is not None, "Task path is not defined"
208
233
  Reporter.INSTANCE = Reporter(taskpath)
234
+ Reporter.INSTANCE.start()
209
235
  return Reporter.INSTANCE
210
236
 
211
237
 
@@ -243,7 +269,7 @@ class xpm_tqdm(std_tqdm):
243
269
 
244
270
    def update(self, n=1):
        """Advance the bar by *n* and forward the ratio to the progress system.

        Mirrors ``tqdm.update`` but additionally reports ``self.n / self.total``
        at this bar's level (``self.pos``) to experimaestro's progress reporter.
        """
        result = super().update(n)
        # total > 0 guards against a division by zero for zero-length bars
        if self.total is not None and self.total > 0:
            progress(self.n / self.total, level=self.pos, console=False)
        return result
249
275
 
@@ -0,0 +1,406 @@
1
+ """File-based progress tracking system for experimaestro tasks."""
2
+
3
+ import json
4
+ import threading
5
+ import time
6
+ from dataclasses import dataclass, asdict
7
+ from pathlib import Path
8
+ from typing import Optional, List, Iterator, Dict, Any
9
+ from datetime import datetime, timedelta
10
+ import fcntl
11
+ import os
12
+
13
+ from .utils import logger
14
+
15
+ DEFAULT_MAX_ENTRIES_PER_FILE = 10_000
16
+
17
+
18
@dataclass
class ProgressEntry:
    """One progress record, as stored in the JSONL progress files."""

    # Seconds since the epoch when the entry was recorded
    timestamp: float
    # Nesting level of the progress report (level -1 is used for the EOJ marker)
    level: int
    # Completion ratio, normally between 0.0 and 1.0
    progress: float
    # Optional human-readable description
    desc: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize into a plain dictionary suitable for JSON encoding."""
        return {
            "timestamp": self.timestamp,
            "level": self.level,
            "progress": self.progress,
            "desc": self.desc,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ProgressEntry":
        """Rebuild an entry from a dictionary produced by :meth:`to_dict`."""
        return cls(**data)
35
+
36
+
37
class StateFile:
    """Persisted snapshot of the latest progress entry per level.

    The state is a mapping ``level -> ProgressEntry`` serialized as a JSON
    object. Writes are throttled: by default the file is rewritten only when
    more than one second has elapsed since the last write AND some level's
    progress changed significantly (>1%), or unconditionally when every level
    reports completion.
    """

    def __init__(self, filename: Path):
        self.filename = filename
        # Latest entry per level
        self.state: Dict[int, ProgressEntry] = {}

        # Write threshold to avoid too frequent writes
        self._time_threshold = timedelta(seconds=1.0)
        self._last_write_time: datetime = datetime.now()
        # Minimum progress change to trigger write
        self._progress_threshold = 0.01
        self._last_write_progress: Optional[Dict[int, float]] = None

        self.filename.parent.mkdir(parents=True, exist_ok=True)
        self.load()

    def _allow_write(self) -> bool:
        """Check if the state should be written based on time and progress changes.
        Allows writing if:
        - BOTH: More than 1 second has passed since last write
        - AND: Progress has changed significantly (>1%)
        - OR: All entries are done (progress >= 1.0)"""
        time_check = datetime.now() - self._last_write_time > self._time_threshold
        progress_check = self._last_write_progress is None or any(
            abs(entry.progress - self._last_write_progress.get(entry.level, 0.0))
            > self._progress_threshold
            for entry in self.state.values()
        )
        all_entries_done = all(entry.progress >= 1.0 for entry in self.state.values())
        return all_entries_done or (time_check and progress_check)

    def write(self, force: bool = False):
        """Write the current state to the file (subject to write throttling).

        Args:
            force: Bypass the throttling check and write unconditionally.
        """
        if self._allow_write() or force:
            with open(self.filename, "w") as f:
                json.dump({k: v.to_dict() for k, v in self.state.items()}, f)
            self._last_write_time = datetime.now()
            self._last_write_progress = {k: v.progress for k, v in self.state.items()}

    def update(self, entry: ProgressEntry):
        """Record *entry* as the latest for its level (does not write)."""
        self.state[entry.level] = entry

    def load(self):
        """Load the state from the file (best effort).

        The open happens inside the try block: checking existence first is
        racy, as the file can disappear between the check and the open.
        """
        try:
            with self.filename.open("r") as f:
                data = json.load(f)
            # JSON object keys are strings; restore the integer levels
            self.state = {int(k): ProgressEntry.from_dict(v) for k, v in data.items()}
        except FileNotFoundError:
            # No state written yet: keep the empty mapping
            pass
        except (json.JSONDecodeError, OSError):
            logger.warning(f"Failed to load state from {self.filename}")

    def read(self) -> Dict[int, ProgressEntry]:
        """Reload the state from disk and return the per-level mapping."""
        self.load()
        return self.state

    # flush on exit
    def __del__(self):
        """Best-effort flush when the object is garbage collected."""
        try:
            self.write(force=True)
        except Exception as e:
            logger.error(f"Failed to write state on exit: {e}")
106
+
107
+
108
class ProgressFileWriter:
    """Appends progress entries to rotating JSONL files.

    Entries go to ``.experimaestro/progress-XXXX.jsonl`` inside the task
    directory; a ``progress-latest.jsonl`` symlink always points at the most
    recent file, and a compact state file keeps the latest entry per level.
    """

    # TODO: Implement buffering and flushing

    def __init__(
        self, task_path: Path, max_entries_per_file: int = DEFAULT_MAX_ENTRIES_PER_FILE
    ):
        """Initialize the writer.

        Args:
            task_path: Path to the task directory
            max_entries_per_file: Rotate to a new file after this many entries
        """
        self.task_path = task_path
        self.progress_dir = task_path / ".experimaestro"
        self.max_entries_per_file = max_entries_per_file
        self.current_file_index = 0
        self.current_file_entries = 0
        self.lock = threading.Lock()

        # Ensure directory exists
        self.progress_dir.mkdir(exist_ok=True)

        # State is the latest entry per level
        self.state = StateFile(self.progress_dir / "progress_state.json")

        # Find the latest file index
        self._find_latest_file()

    def _find_latest_file(self):
        """Find the latest progress file and entry count"""
        progress_files = list(self.progress_dir.glob("progress-*.jsonl"))
        if not progress_files:
            self.current_file_index = 0
            self.current_file_entries = 0
            return

        # Highest numeric index among well-formed file names (the
        # "progress-latest.jsonl" symlink is skipped by the ValueError path)
        max_index = None
        for f in progress_files:
            try:
                index = int(f.stem.split("-")[1])
                if max_index is None or index > max_index:
                    max_index = index
            except (ValueError, IndexError):
                continue

        if max_index is not None:
            self.current_file_index = max_index
            # Count entries in current file
            current_file = self._get_current_file_path()
            if current_file.exists():
                with current_file.open("r") as f:
                    # Stream the count; readlines() would needlessly load
                    # the whole file into memory just to count lines
                    self.current_file_entries = sum(1 for _ in f)
            else:
                self.current_file_entries = 0
        else:
            self.current_file_index = 0
            self.current_file_entries = 0

    def _get_current_file_path(self) -> Path:
        """Get path to current progress file"""
        return self.progress_dir / f"progress-{self.current_file_index:04d}.jsonl"

    def _get_latest_symlink_path(self) -> Path:
        """Get path to latest progress symlink"""
        return self.progress_dir / "progress-latest.jsonl"

    def _rotate_file_if_needed(self):
        """Advance to a new file index once the current one is full"""
        if self.current_file_entries >= self.max_entries_per_file:
            self.current_file_index += 1
            self.current_file_entries = 0
            logger.debug(f"Rotating to new progress file: {self.current_file_index}")

    def _update_latest_symlink(self):
        """Update symlink to point to latest file"""
        current_file = self._get_current_file_path()
        latest_symlink = self._get_latest_symlink_path()

        # Remove existing symlink (is_symlink() also catches dangling links,
        # for which exists() returns False)
        if latest_symlink.exists() or latest_symlink.is_symlink():
            latest_symlink.unlink()

        # Create new symlink (relative, so the directory can be moved)
        latest_symlink.symlink_to(current_file.name)

    def write_progress(self, level: int, progress: float, desc: Optional[str] = None):
        """Write a progress entry to the file

        Args:
            level: Progress level (0 is top level; -1 marks end-of-job)
            progress: Progress value between 0.0 and 1.0
            desc: Optional description
        """
        with self.lock:
            # Eventually rotate internal state if needed
            self._rotate_file_if_needed()

            entry = ProgressEntry(
                timestamp=time.time(), level=level, progress=progress, desc=desc
            )
            self.state.update(entry)
            self.state.write(force=level == -1)  # Force write on EOJ

            current_file = self._get_current_file_path()

            # Write with file locking for concurrent access
            with current_file.open("a") as f:
                try:
                    fcntl.flock(f.fileno(), fcntl.LOCK_EX)
                    f.write(json.dumps(entry.to_dict()) + "\n")
                    f.flush()  # Flush the file buffer
                    os.fsync(f.fileno())  # Ensure data is written to disk
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)

            self.current_file_entries += 1
            self._update_latest_symlink()

            logger.debug(
                f"Progress written: level={level}, progress={progress}, desc={desc}"
            )

    def __del__(self):
        """Best-effort state flush when the writer is garbage collected"""
        try:
            self.state.write(force=True)
        except Exception as e:
            logger.error(f"Failed to write state on exit: {e}")
231
+
232
+
233
class ProgressFileReader:
    """Reads progress entries back from the JSONL progress files."""

    def __init__(self, task_path: Path):
        """Create a reader rooted at the given task directory.

        Args:
            task_path: Path to the task directory
        """
        self.task_path = task_path
        self.progress_dir = task_path / ".experimaestro"
        self.max_entries_per_file: Optional[int] = None
        self.state = StateFile(self.progress_dir / "progress_state.json")

    def get_progress_files(self) -> List[Path]:
        """Return all real (non-symlink) progress files, ordered by index."""
        if not self.progress_dir.exists():
            return []

        def index_of(path: Path) -> int:
            # Numeric suffix of progress-XXXX.jsonl; unparseable names sort first
            try:
                return int(path.stem.split("-")[1])
            except (ValueError, IndexError):
                return 0

        # Skip symlinks so the "latest" alias does not duplicate a file
        candidates = [
            p for p in self.progress_dir.glob("progress-*.jsonl") if not p.is_symlink()
        ]
        return sorted(candidates, key=index_of)

    def get_latest_file(self) -> Optional[Path]:
        """Return the newest progress file, preferring the symlink."""
        link = self.progress_dir / "progress-latest.jsonl"
        if link.exists() and link.is_symlink():
            return link.resolve()

        # No usable symlink: fall back on the highest-indexed file
        ordered = self.get_progress_files()
        if ordered:
            return ordered[-1]
        return None

    def read_entries(self, file_path: Path) -> Iterator[ProgressEntry]:
        """Yield entries from one file, holding a shared lock while reading.

        Args:
            file_path: Path to progress file
        """
        if not file_path.exists():
            return

        try:
            with file_path.open("r") as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_SH)
                try:
                    for raw in f:
                        raw = raw.strip()
                        if not raw:
                            continue
                        try:
                            yield ProgressEntry.from_dict(json.loads(raw))
                        except json.JSONDecodeError as e:
                            logger.warning(
                                f"Invalid JSON in progress file {file_path}: {e}"
                            )
                finally:
                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        except IOError as e:
            logger.warning(f"Could not read progress file {file_path}: {e}")

    def read_all_entries(self) -> Iterator[ProgressEntry]:
        """Yield every entry from every file, in chronological order."""
        logger.warning("Reading all progress entries, this may be slow for large jobs.")
        for path in self.get_progress_files():
            yield from self.read_entries(path)

    def read_latest_entries(self, count: Optional[int] = None) -> List[ProgressEntry]:
        """Return the most recent *count* entries, oldest first."""
        files = self.get_progress_files()

        # Without an explicit count, estimate one from the first file's
        # length (most likely to be a full file), falling back on the default
        if files and count is None:
            count = sum(1 for _ in self.read_entries(files[0]))
        if count is None:
            count = DEFAULT_MAX_ENTRIES_PER_FILE

        # Walk the files newest-first, accumulating entries newest-first
        collected: List[ProgressEntry] = []
        for path in reversed(files):
            collected.extend(reversed(list(self.read_entries(path))))
            if len(collected) >= count:
                break

        # Trim to count, then restore chronological order
        return list(reversed(collected[:count]))

    def get_current_progress(
        self, count: Optional[int] = None
    ) -> Dict[int, ProgressEntry]:
        """Derive the current per-level progress from the entry logs."""
        logger.warning(
            "Reading current progress from progress logs, this may be slow for large jobs."
        )
        # Later (newer) entries overwrite older ones for the same level
        return {entry.level: entry for entry in self.read_latest_entries(count)}

    def get_current_state(self) -> Optional[Dict[int, ProgressEntry]]:
        """Return the per-level state, falling back on the logs when empty."""
        return self.state.read() or self.get_current_progress()

    def is_done(self) -> bool:
        """Check if the task is done by looking for a special 'done' file.
        Fallback to checking for end-of-job (EOJ) entries."""
        task_name = self.task_path.parent.stem.split(".")[-1]
        done_marker = self.task_path / f"{task_name}.done"
        if done_marker.exists() and done_marker.is_file():
            return True

        # Level -1 is the end-of-job marker written by the reporter's eoj()
        return any(entry.level == -1 for entry in self.read_all_entries())
367
+
368
+
369
class FileBasedProgressReporter:
    """File-based progress reporter that replaces the socket-based Reporter"""

    def __init__(self, task_path: Path, progress_threshold: float = 0.01):
        """Initialize file-based progress reporter

        Args:
            task_path: Path to the task directory
            progress_threshold: Minimum per-level progress change that
                triggers a new entry (default 1%)
        """
        self.task_path = task_path
        self.writer = ProgressFileWriter(task_path)
        # Minimum change required before an update is persisted
        self.progress_threshold = progress_threshold
        self.current_progress = {}  # level -> (progress, desc)
        self.lock = threading.Lock()

    def set_progress(self, progress: float, level: int = 0, desc: Optional[str] = None):
        """Set progress for a specific level

        Args:
            progress: Progress value between 0.0 and 1.0
            level: Progress level (0 is top level)
            desc: Optional description
        """
        with self.lock:
            last_progress, last_desc = self.current_progress.get(level, (None, None))

            # Significant move, or a new/changed description?
            changed = (
                last_progress is None
                or abs(progress - last_progress) > self.progress_threshold
                or desc != last_desc
            )
            # Bug fix: always record completion even when the final delta is
            # below the threshold (e.g. 0.995 -> 1.0), but only once
            completed = progress >= 1.0 and progress != last_progress

            if changed or completed:
                self.current_progress[level] = (progress, desc)
                self.writer.write_progress(level, progress, desc)

    def eoj(self):
        """End of job notification"""
        with self.lock:
            # Write a special end-of-job marker (level -1)
            self.writer.write_progress(-1, 1.0, "EOJ")
experimaestro/run.py CHANGED
@@ -8,6 +8,7 @@ import json
8
8
  from typing import List
9
9
  import fasteners
10
10
  from experimaestro.notifications import progress, report_eoj
11
+ from experimaestro.utils.multiprocessing import delayed_shutdown
11
12
  from .core.types import ObjectType
12
13
  from experimaestro.utils import logger
13
14
  from experimaestro.core.objects import ConfigInformation
@@ -92,15 +93,27 @@ class TaskRunner:
92
93
  logger.info("Finished cleanup")
93
94
 
94
95
    def handle_error(self, code, frame_type):
        """Record a task failure and terminate the process.

        Used both as a signal handler (SIGTERM/SIGINT — then ``code`` is the
        signal number and ``frame_type`` the interrupted frame) and called
        directly with an exit code and ``None``.
        NOTE(review): the %d format assumes ``code`` is always an int —
        confirm no caller passes a string or None exit code.
        """
        logger.info("Error handler: finished with code %d", code)
        # Persist the failure code so the scheduler can see the job failed
        self.failedpath.write_text(str(code))
        self.cleanup()
        logger.info("Exiting")
        # Presumably forces the process down after 60s even if non-daemon
        # threads linger — verify against utils.multiprocessing
        delayed_shutdown(60, exit_code=code)
        sys.exit(1)
99
102
 
100
103
  def run(self):
101
104
  atexit.register(self.cleanup)
102
- signal.signal(signal.SIGTERM, self.handle_error)
103
- signal.signal(signal.SIGINT, self.handle_error)
105
+ sigterm_handler = signal.signal(signal.SIGTERM, self.handle_error)
106
+ sigint_handler = signal.signal(signal.SIGINT, self.handle_error)
107
+
108
+ def remove_signal_handlers(remove_cleanup=True):
109
+ """Removes cleanup in forked processes"""
110
+ signal.signal(signal.SIGTERM, sigterm_handler)
111
+ signal.signal(signal.SIGINT, sigint_handler)
112
+ atexit.unregister(self.cleanup)
113
+
114
+ if sys.platform != "win32":
115
+ os.register_at_fork(after_in_child=remove_signal_handlers)
116
+
104
117
  try:
105
118
  workdir = self.scriptpath.parent
106
119
  os.chdir(workdir)
@@ -128,7 +141,11 @@ class TaskRunner:
128
141
  self.started = True
129
142
  run(workdir / "params.json")
130
143
 
144
+ # ... remove the handlers
145
+ remove_signal_handlers(remove_cleanup=False)
146
+
131
147
  # Everything went OK
148
+ logger.info("Task ended successfully")
132
149
  sys.exit(0)
133
150
  except Exception:
134
151
  logger.exception("Got exception while running")
@@ -136,6 +153,10 @@ class TaskRunner:
136
153
 
137
154
  except SystemExit as e:
138
155
  if e.code == 0:
156
+ # Normal exit, just create the ".done" file
139
157
  self.donepath.touch()
158
+
159
+ # ... and finish the exit process
160
+ raise
140
161
  else:
141
162
  self.handle_error(e.code, None)
@@ -1 +1,18 @@
1
- from .base import *
1
+ from .base import Scheduler, Listener
2
+ from .workspace import Workspace, RunMode
3
+ from .experiment import experiment, FailedExperiment
4
+ from .jobs import Job, JobState, JobFailureStatus, JobDependency, JobContext
5
+
6
+ __all__ = [
7
+ "Scheduler",
8
+ "Listener",
9
+ "Workspace",
10
+ "RunMode",
11
+ "experiment",
12
+ "FailedExperiment",
13
+ "Job",
14
+ "JobState",
15
+ "JobFailureStatus",
16
+ "JobDependency",
17
+ "JobContext",
18
+ ]