experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic; consult the registry's advisory for more details.

Files changed (154)
  1. experimaestro/__init__.py +12 -5
  2. experimaestro/cli/__init__.py +393 -134
  3. experimaestro/cli/filter.py +48 -23
  4. experimaestro/cli/jobs.py +253 -71
  5. experimaestro/cli/refactor.py +1 -2
  6. experimaestro/commandline.py +7 -4
  7. experimaestro/connectors/__init__.py +9 -1
  8. experimaestro/connectors/local.py +43 -3
  9. experimaestro/core/arguments.py +18 -18
  10. experimaestro/core/identifier.py +11 -11
  11. experimaestro/core/objects/config.py +96 -39
  12. experimaestro/core/objects/config_walk.py +3 -3
  13. experimaestro/core/{subparameters.py → partial.py} +16 -16
  14. experimaestro/core/partial_lock.py +394 -0
  15. experimaestro/core/types.py +12 -15
  16. experimaestro/dynamic.py +290 -0
  17. experimaestro/experiments/__init__.py +6 -2
  18. experimaestro/experiments/cli.py +223 -52
  19. experimaestro/experiments/configuration.py +24 -0
  20. experimaestro/generators.py +5 -5
  21. experimaestro/ipc.py +118 -1
  22. experimaestro/launcherfinder/__init__.py +2 -2
  23. experimaestro/launcherfinder/registry.py +6 -7
  24. experimaestro/launcherfinder/specs.py +2 -9
  25. experimaestro/launchers/slurm/__init__.py +2 -2
  26. experimaestro/launchers/slurm/base.py +62 -0
  27. experimaestro/locking.py +957 -1
  28. experimaestro/notifications.py +89 -201
  29. experimaestro/progress.py +63 -366
  30. experimaestro/rpyc.py +0 -2
  31. experimaestro/run.py +29 -2
  32. experimaestro/scheduler/__init__.py +8 -1
  33. experimaestro/scheduler/base.py +650 -53
  34. experimaestro/scheduler/dependencies.py +20 -16
  35. experimaestro/scheduler/experiment.py +764 -169
  36. experimaestro/scheduler/interfaces.py +338 -96
  37. experimaestro/scheduler/jobs.py +58 -20
  38. experimaestro/scheduler/remote/__init__.py +31 -0
  39. experimaestro/scheduler/remote/adaptive_sync.py +265 -0
  40. experimaestro/scheduler/remote/client.py +928 -0
  41. experimaestro/scheduler/remote/protocol.py +282 -0
  42. experimaestro/scheduler/remote/server.py +447 -0
  43. experimaestro/scheduler/remote/sync.py +144 -0
  44. experimaestro/scheduler/services.py +186 -35
  45. experimaestro/scheduler/state_provider.py +811 -2157
  46. experimaestro/scheduler/state_status.py +1247 -0
  47. experimaestro/scheduler/transient.py +31 -0
  48. experimaestro/scheduler/workspace.py +1 -1
  49. experimaestro/scheduler/workspace_state_provider.py +1273 -0
  50. experimaestro/scriptbuilder.py +4 -4
  51. experimaestro/settings.py +36 -0
  52. experimaestro/tests/conftest.py +33 -5
  53. experimaestro/tests/connectors/bin/executable.py +1 -1
  54. experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
  55. experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
  56. experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
  57. experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
  58. experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
  59. experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
  60. experimaestro/tests/launchers/bin/test.py +1 -0
  61. experimaestro/tests/launchers/test_slurm.py +9 -9
  62. experimaestro/tests/partial_reschedule.py +46 -0
  63. experimaestro/tests/restart.py +3 -3
  64. experimaestro/tests/restart_main.py +1 -0
  65. experimaestro/tests/scripts/notifyandwait.py +1 -0
  66. experimaestro/tests/task_partial.py +38 -0
  67. experimaestro/tests/task_tokens.py +2 -2
  68. experimaestro/tests/tasks/test_dynamic.py +6 -6
  69. experimaestro/tests/test_dependencies.py +3 -3
  70. experimaestro/tests/test_deprecated.py +15 -15
  71. experimaestro/tests/test_dynamic_locking.py +317 -0
  72. experimaestro/tests/test_environment.py +24 -14
  73. experimaestro/tests/test_experiment.py +171 -36
  74. experimaestro/tests/test_identifier.py +25 -25
  75. experimaestro/tests/test_identifier_stability.py +3 -5
  76. experimaestro/tests/test_multitoken.py +2 -4
  77. experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
  78. experimaestro/tests/test_partial_paths.py +81 -138
  79. experimaestro/tests/test_pre_experiment.py +219 -0
  80. experimaestro/tests/test_progress.py +2 -8
  81. experimaestro/tests/test_remote_state.py +1132 -0
  82. experimaestro/tests/test_stray_jobs.py +261 -0
  83. experimaestro/tests/test_tasks.py +1 -2
  84. experimaestro/tests/test_token_locking.py +52 -67
  85. experimaestro/tests/test_tokens.py +5 -6
  86. experimaestro/tests/test_transient.py +225 -0
  87. experimaestro/tests/test_workspace_state_provider.py +768 -0
  88. experimaestro/tests/token_reschedule.py +1 -3
  89. experimaestro/tests/utils.py +2 -7
  90. experimaestro/tokens.py +227 -372
  91. experimaestro/tools/diff.py +1 -0
  92. experimaestro/tools/documentation.py +4 -5
  93. experimaestro/tools/jobs.py +1 -2
  94. experimaestro/tui/app.py +459 -1895
  95. experimaestro/tui/app.tcss +162 -0
  96. experimaestro/tui/dialogs.py +172 -0
  97. experimaestro/tui/log_viewer.py +253 -3
  98. experimaestro/tui/messages.py +137 -0
  99. experimaestro/tui/utils.py +54 -0
  100. experimaestro/tui/widgets/__init__.py +23 -0
  101. experimaestro/tui/widgets/experiments.py +468 -0
  102. experimaestro/tui/widgets/global_services.py +238 -0
  103. experimaestro/tui/widgets/jobs.py +972 -0
  104. experimaestro/tui/widgets/log.py +156 -0
  105. experimaestro/tui/widgets/orphans.py +363 -0
  106. experimaestro/tui/widgets/runs.py +185 -0
  107. experimaestro/tui/widgets/services.py +314 -0
  108. experimaestro/tui/widgets/stray_jobs.py +528 -0
  109. experimaestro/utils/__init__.py +1 -1
  110. experimaestro/utils/environment.py +105 -22
  111. experimaestro/utils/fswatcher.py +124 -0
  112. experimaestro/utils/jobs.py +1 -2
  113. experimaestro/utils/jupyter.py +1 -2
  114. experimaestro/utils/logging.py +72 -0
  115. experimaestro/version.py +2 -2
  116. experimaestro/webui/__init__.py +9 -0
  117. experimaestro/webui/app.py +117 -0
  118. experimaestro/{server → webui}/data/index.css +66 -11
  119. experimaestro/webui/data/index.css.map +1 -0
  120. experimaestro/{server → webui}/data/index.js +82763 -87217
  121. experimaestro/webui/data/index.js.map +1 -0
  122. experimaestro/webui/routes/__init__.py +5 -0
  123. experimaestro/webui/routes/auth.py +53 -0
  124. experimaestro/webui/routes/proxy.py +117 -0
  125. experimaestro/webui/server.py +200 -0
  126. experimaestro/webui/state_bridge.py +152 -0
  127. experimaestro/webui/websocket.py +413 -0
  128. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
  129. experimaestro-2.0.0b17.dist-info/RECORD +219 -0
  130. experimaestro/cli/progress.py +0 -269
  131. experimaestro/scheduler/state.py +0 -75
  132. experimaestro/scheduler/state_db.py +0 -388
  133. experimaestro/scheduler/state_sync.py +0 -834
  134. experimaestro/server/__init__.py +0 -467
  135. experimaestro/server/data/index.css.map +0 -1
  136. experimaestro/server/data/index.js.map +0 -1
  137. experimaestro/tests/test_cli_jobs.py +0 -615
  138. experimaestro/tests/test_file_progress.py +0 -425
  139. experimaestro/tests/test_file_progress_integration.py +0 -477
  140. experimaestro/tests/test_state_db.py +0 -434
  141. experimaestro-2.0.0b4.dist-info/RECORD +0 -181
  142. /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
  143. /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
  144. /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
  145. /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
  146. /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
  147. /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
  148. /experimaestro/{server → webui}/data/favicon.ico +0 -0
  149. /experimaestro/{server → webui}/data/index.html +0 -0
  150. /experimaestro/{server → webui}/data/login.html +0 -0
  151. /experimaestro/{server → webui}/data/manifest.json +0 -0
  152. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
  153. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
  154. {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1247 @@
1
+ """Filesystem-based state tracking for experiments
2
+
3
+ This module provides event and status file handling for tracking experiment state
4
+ without using a database. It replaces the SQLite/peewee-based state tracking.
5
+
6
+ Key components:
7
+ - Event dataclasses: Serializable events for JSONL event files
8
+ - EventWriter/EventReader: Base classes for event I/O
9
+ - JobEventWriter: Job-specific event handling
10
+ - ExperimentEventWriter/ExperimentEventReader: Experiment-specific event handling
11
+
12
+ File structure:
13
+ - workspace/.events/experiments/{experiment-id}/events-{count}.jsonl
14
+ - workspace/.events/jobs/{task-id}/event-{job-id}-{count}.jsonl
15
+ - workspace/experiments/{experiment-id}/{run-id}/status.json
16
+ - workspace/jobs/{task-id}/{job-id}/.experimaestro/information.json
17
+ """
18
+
19
+ import json
20
+ import logging
21
+ import os
22
+ import shutil
23
+ import time
24
+ from abc import ABC, abstractmethod
25
+ from dataclasses import asdict, dataclass, field
26
+ from pathlib import Path
27
+ from typing import Any, Callable, Optional, TYPE_CHECKING
28
+
29
+
30
+ if TYPE_CHECKING:
31
+ from experimaestro.scheduler.interfaces import BaseExperiment
32
+
33
+ logger = logging.getLogger("xpm.state_status")
34
+
35
+ # Status file version
36
+ STATUS_VERSION = 1
37
+
38
+
39
+ # =============================================================================
40
+ # Hardlink Support Utilities
41
+ # =============================================================================
42
+
43
+
44
def supports_hardlinks(path: Path) -> bool:
    """Check if the filesystem at *path* supports hardlinks.

    Creates a temporary test file plus a hardlink to it and verifies that
    both names resolve to the same inode. Useful for determining whether
    to use hardlinks for event file archiving.

    Args:
        path: Directory to test for hardlink support (created if missing)

    Returns:
        True if hardlinks are supported, False otherwise
    """
    path.mkdir(parents=True, exist_ok=True)
    test_file = path / ".hardlink_test"
    test_link = path / ".hardlink_test_link"
    try:
        # Clean up any leftover test files from a previous (crashed) check
        if test_link.exists():
            test_link.unlink()
        if test_file.exists():
            test_file.unlink()

        # Create test file and hardlink
        test_file.touch()
        os.link(test_file, test_link)

        # Same inode means it is a real hardlink, not a copy
        return test_file.stat().st_ino == test_link.stat().st_ino
    except (OSError, AttributeError):
        # os.link unavailable (AttributeError) or refused by the filesystem
        return False
    finally:
        # Always remove the probe files, whichever path was taken above
        # (the original duplicated this cleanup in both branches and could
        # leak the probes if an unexpected error occurred mid-check)
        for probe in (test_link, test_file):
            try:
                if probe.exists():
                    probe.unlink()
            except OSError:
                pass
87
+
88
+
89
def safe_link_or_copy(src: Path, dst: Path, use_hardlinks: bool = True) -> bool:
    """Hardlink *src* to *dst* when possible, otherwise copy it.

    Args:
        src: Source file path
        dst: Destination file path
        use_hardlinks: If True, attempt hardlink first; if False, always copy

    Returns:
        True if a hardlink was created, False if the file was copied
    """
    dst.parent.mkdir(parents=True, exist_ok=True)

    if use_hardlinks:
        try:
            # A hardlink cannot replace an existing file, so drop it first
            if dst.exists():
                dst.unlink()
            os.link(src, dst)
        except OSError:
            pass  # fall through to the copy path below
        else:
            return True

    # Hardlinks disabled or not supported: fall back to a full copy
    shutil.copy2(src, dst)
    return False
115
+
116
+
117
+ # =============================================================================
118
+ # Event System
119
+ # =============================================================================
120
+
121
# Registry for event deserialization (auto-populated by __init_subclass__)
EVENT_TYPES: dict[str, type["EventBase"]] = {}


@dataclass
class EventBase:
    """Base class for all events

    Events are lightweight - they carry only IDs, not object references.
    Use StateProvider to fetch actual objects (BaseJob, BaseExperiment, etc.)
    when needed.

    Subclasses are automatically registered in EVENT_TYPES by their class name.
    JSON serialization/deserialization is handled transparently via the
    ``event_type`` field.
    """

    # Creation time of the event (seconds since epoch)
    timestamp: float = field(default_factory=time.time)

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Register every subclass by class name so from_dict() can find it
        EVENT_TYPES[cls.__name__] = cls

    @property
    def event_type(self) -> str:
        """Event type derived from class name"""
        return self.__class__.__name__

    def to_json(self) -> str:
        """Serialize event to a compact JSON string (one JSONL line)"""
        d = asdict(self)
        d["event_type"] = self.event_type
        return json.dumps(d, separators=(",", ":"))

    @classmethod
    def from_dict(cls, d: dict) -> "EventBase":
        """Deserialize event from dictionary

        Unknown event types fall back to EventBase, and unknown keys are
        dropped, so readers tolerate events written by newer code.
        """
        event_type = d.get("event_type")
        event_class = EVENT_TYPES.get(event_type, EventBase)
        # Keep only keys that are actual dataclass fields of the target class
        # (set(...) instead of the redundant identity comprehension)
        valid_fields = set(event_class.__dataclass_fields__)
        filtered = {k: v for k, v in d.items() if k in valid_fields}
        return event_class(**filtered)

    @classmethod
    def get_class(cls, name: str) -> "type[EventBase] | None":
        """Get an EventBase subclass by class name, or None if unknown"""
        return EVENT_TYPES.get(name)
169
+
170
+
171
+ # -----------------------------------------------------------------------------
172
+ # Event Base Classes (for filtering)
173
+ # -----------------------------------------------------------------------------
174
+
175
+
176
@dataclass
class JobEventBase(EventBase):
    """Base class for job-related events (have job_id)"""

    # Identifier (unique hash) of the job this event refers to
    job_id: str = ""
181
+
182
+
183
@dataclass
class ExperimentEventBase(EventBase):
    """Base class for experiment-related events (have experiment_id)"""

    # Identifier of the experiment this event refers to
    experiment_id: str = ""
188
+
189
+
190
@dataclass
class ServiceEventBase(ExperimentEventBase):
    """Base class for service-related events (have service_id)"""

    # Identifier of the service within the experiment
    service_id: str = ""
195
+
196
+
197
+ # -----------------------------------------------------------------------------
198
+ # Supporting Dataclasses
199
+ # -----------------------------------------------------------------------------
200
+
201
+
202
@dataclass
class ProgressLevel:
    """Progress information for a single (possibly nested) level"""

    level: int = 0
    progress: float = 0.0
    desc: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize to a plain dictionary"""
        return dict(level=self.level, progress=self.progress, desc=self.desc)

    @classmethod
    def from_dict(cls, d: dict) -> "ProgressLevel":
        """Build a ProgressLevel from a dictionary, tolerating missing keys"""
        return cls(
            level=d.get("level", 0),
            progress=d.get("progress", 0.0),
            desc=d.get("desc"),
        )
220
+
221
+
222
@dataclass
class JobTag:
    """A single job tag, stored as a key-value pair of strings"""

    key: str
    value: str
228
+
229
+
230
+ # -----------------------------------------------------------------------------
231
+ # Job Events
232
+ # -----------------------------------------------------------------------------
233
+
234
+
235
@dataclass
class JobSubmittedEvent(JobEventBase, ExperimentEventBase):
    """Event: Job was submitted to the scheduler

    Fired when a job is added to an experiment run.
    This is both a job event and an experiment event.
    """

    # Task identifier (groups jobs of the same task type)
    task_id: str = ""
    # Identifier of the experiment run the job belongs to
    run_id: str = ""
    # Non-zero when the job is transient -- TODO confirm exact semantics
    transient: int = 0
    # Tags attached to the job at submission time
    tags: list[JobTag] = field(default_factory=list)
    # Identifiers of jobs this job depends on
    depends_on: list[str] = field(default_factory=list)
248
+
249
+
250
@dataclass
class JobStateChangedEvent(JobEventBase):
    """Event: Job state changed

    Fired when a job's state changes (scheduled, running, done, error, etc.)
    """

    # New state name, serialized as a string
    state: str = ""
    # Reason for failure when the new state is an error -- TODO confirm format
    failure_reason: Optional[str] = None
    # Lifecycle timestamps (presumably epoch seconds); None when not reached
    submitted_time: Optional[float] = None
    started_time: Optional[float] = None
    ended_time: Optional[float] = None
    # Process exit code once the job has terminated
    exit_code: Optional[int] = None
    # Number of times the job has been retried
    retry_count: int = 0
    # Snapshot of per-level progress at the time of the change
    progress: list[ProgressLevel] = field(default_factory=list)
265
+
266
+
267
@dataclass
class JobProgressEvent(JobEventBase):
    """Event: Job progress update

    Written by the running job process to report progress.
    """

    # Progress level (0 = top level; deeper values for nested progress -- TODO confirm)
    level: int = 0
    # Progress within the level; presumably in [0, 1] -- TODO confirm range
    progress: float = 0.0
    # Optional human-readable description of the current step
    desc: Optional[str] = None
277
+
278
+
279
+ # -----------------------------------------------------------------------------
280
+ # Experiment Events
281
+ # -----------------------------------------------------------------------------
282
+
283
+
284
@dataclass
class ExperimentUpdatedEvent(ExperimentEventBase):
    """Event: Experiment was created or updated"""

    # No payload beyond experiment_id/timestamp inherited from the base class
    pass
289
+
290
+
291
@dataclass
class RunUpdatedEvent(ExperimentEventBase):
    """Event: Experiment run was created or updated"""

    # Identifier of the run that changed
    run_id: str = ""
296
+
297
+
298
@dataclass
class RunCompletedEvent(ExperimentEventBase):
    """Event: Experiment run completed"""

    # Identifier of the completed run
    run_id: str = ""
    # Final status ("completed" by default; other values not visible here)
    status: str = "completed"
    # Completion time, stored as a string -- format defined by the writer
    ended_at: str = ""
305
+
306
+
307
+ # -----------------------------------------------------------------------------
308
+ # Service Events
309
+ # -----------------------------------------------------------------------------
310
+
311
+
312
@dataclass
class ServiceAddedEvent(ServiceEventBase):
    """Event: Service was added to the experiment"""

    # Run the service belongs to
    run_id: str = ""
    # Human-readable service description
    description: str = ""
    # Service class name, presumably used to re-instantiate it -- TODO confirm
    service_class: str = ""
    # Serialized service state (JSON-compatible dictionary)
    state_dict: dict[str, Any] = field(default_factory=dict)
320
+
321
+
322
@dataclass
class ServiceStateChangedEvent(ServiceEventBase):
    """Event: Service state changed (STOPPED, STARTING, RUNNING, STOPPING)"""

    # Run the service belongs to
    run_id: str = ""
    # New service state name (one of the states listed above)
    state: str = ""
328
+
329
+
330
+ # =============================================================================
331
+ # Event Writer Classes
332
+ # =============================================================================
333
+
334
+
335
+ class EventWriter(ABC):
336
+ """Base class for writing events to JSONL files
337
+
338
+ Events are written to {events_dir}/events-{count}.jsonl
339
+ Uses line buffering so each event is flushed immediately after write.
340
+
341
+ Supports proactive hardlinking: when a permanent_dir is set and hardlinks
342
+ are supported, a hardlink is created to permanent storage immediately when
343
+ the event file is opened. This ensures events are written to both locations
344
+ simultaneously and no data is lost if the process crashes.
345
+ """
346
+
347
+ def __init__(self, initial_count: int = 0, permanent_dir: Path | None = None):
348
+ """Initialize event writer
349
+
350
+ Args:
351
+ initial_count: Starting event file count for rotation
352
+ permanent_dir: Optional permanent storage directory for archiving.
353
+ If set and hardlinks are supported, events are written to both
354
+ temporary and permanent locations via hardlink.
355
+ """
356
+ self._count = initial_count
357
+ self._file = None
358
+ self._permanent_dir = permanent_dir
359
+ self._use_hardlinks: bool | None = None # None = not checked yet
360
+ self._hardlink_created_for_count: int | None = (
361
+ None # Track which file has hardlink
362
+ )
363
+
364
+ @property
365
+ @abstractmethod
366
+ def events_dir(self) -> Path:
367
+ """Get the directory where events are written"""
368
+ ...
369
+
370
+ @property
371
+ def permanent_dir(self) -> Path | None:
372
+ """Get the permanent storage directory"""
373
+ return self._permanent_dir
374
+
375
+ def _check_hardlink_support(self) -> bool:
376
+ """Check and cache hardlink support"""
377
+ if self._use_hardlinks is None:
378
+ if self._permanent_dir:
379
+ self._permanent_dir.mkdir(parents=True, exist_ok=True)
380
+ self._use_hardlinks = supports_hardlinks(self._permanent_dir)
381
+ else:
382
+ self._use_hardlinks = False
383
+ return self._use_hardlinks
384
+
385
+ def _get_event_file_path(self) -> Path:
386
+ return self.events_dir / f"events-{self._count}.jsonl"
387
+
388
+ def _get_permanent_event_file_path(self) -> Path:
389
+ """Get path for permanent event file"""
390
+ if self._permanent_dir is None:
391
+ raise ValueError("No permanent directory configured")
392
+ return self._permanent_dir / f"event-{self._count}.jsonl"
393
+
394
+ def write_event(self, event: EventBase) -> None:
395
+ """Write an event to the current event file
396
+
397
+ If permanent storage is configured and hardlinks are supported,
398
+ creates a hardlink immediately when the file is first opened.
399
+ """
400
+ if self._file is None:
401
+ self.events_dir.mkdir(parents=True, exist_ok=True)
402
+ # Use line buffering (buffering=1) so each line is flushed automatically
403
+ self._file = self._get_event_file_path().open("a", buffering=1)
404
+
405
+ # Create hardlink to permanent storage immediately if supported
406
+ if self._check_hardlink_support() and self._permanent_dir:
407
+ temp_path = self._get_event_file_path()
408
+ perm_path = self._get_permanent_event_file_path()
409
+ try:
410
+ perm_path.parent.mkdir(parents=True, exist_ok=True)
411
+ if not perm_path.exists():
412
+ os.link(temp_path, perm_path)
413
+ self._hardlink_created_for_count = self._count
414
+ logger.debug("Created hardlink %s -> %s", temp_path, perm_path)
415
+ except FileExistsError:
416
+ pass # Already linked
417
+ except OSError as e:
418
+ logger.warning("Failed to create hardlink: %s", e)
419
+
420
+ self._file.write(event.to_json() + "\n")
421
+
422
+ def flush(self) -> None:
423
+ """Flush the current event file to disk"""
424
+ if self._file is not None:
425
+ self._file.flush()
426
+ os.fsync(self._file.fileno())
427
+
428
+ def close(self) -> None:
429
+ """Close the current event file"""
430
+ if self._file is not None:
431
+ self._file.flush()
432
+ os.fsync(self._file.fileno())
433
+ self._file.close()
434
+ self._file = None
435
+
436
+ def rotate(self, new_count: int) -> None:
437
+ """Rotate to a new event file (called after status file update)"""
438
+ self.close()
439
+ self._count = new_count
440
+
441
+ def cleanup(self) -> None:
442
+ """Delete all event files in this directory (temporary files only)"""
443
+ self.close()
444
+ for i in range(self._count + 1):
445
+ path = self.events_dir / f"events-{i}.jsonl"
446
+ if path.exists():
447
+ try:
448
+ path.unlink()
449
+ except OSError as e:
450
+ logger.warning("Failed to delete event file %s: %s", path, e)
451
+
452
+ def archive_events(self) -> None:
453
+ """Archive events to permanent storage (called on completion)
454
+
455
+ For each temp file:
456
+ - If permanent file exists (hardlink already created): just delete temp
457
+ - If permanent file doesn't exist: move temp to permanent
458
+ """
459
+ self.close()
460
+
461
+ if not self._permanent_dir:
462
+ return
463
+
464
+ self._permanent_dir.mkdir(parents=True, exist_ok=True)
465
+
466
+ for i in range(self._count + 1):
467
+ temp_path = self._get_temp_event_file_path(i)
468
+ perm_path = self._permanent_dir / f"event-{i}.jsonl"
469
+
470
+ if not temp_path.exists():
471
+ continue
472
+
473
+ if perm_path.exists():
474
+ # Permanent file exists (hardlink) - just delete temp
475
+ try:
476
+ temp_path.unlink()
477
+ except OSError as e:
478
+ logger.warning("Failed to delete temp file %s: %s", temp_path, e)
479
+ else:
480
+ # No permanent file - move temp to permanent
481
+ try:
482
+ shutil.move(str(temp_path), str(perm_path))
483
+ except OSError as e:
484
+ logger.warning("Failed to archive %s: %s", temp_path, e)
485
+
486
+ def _get_temp_event_file_path(self, count: int) -> Path:
487
+ """Get path for temporary event file at a specific count"""
488
+ return self.events_dir / f"events-{count}.jsonl"
489
+
490
+
491
class JobEventWriter(EventWriter):
    """Event writer for a single job

    Temporary events: {workspace}/.events/jobs/{task_id}/event-{job_id}-{count}.jsonl
    Permanent copy:   {job_path}/.experimaestro/events/event-{count}.jsonl
    """

    def __init__(
        self,
        workspace_path: Path,
        task_id: str,
        job_id: str,
        initial_count: int = 0,
        job_path: Path | None = None,
    ):
        """Initialize job event writer

        Args:
            workspace_path: Path to workspace directory
            task_id: Task identifier (groups events by task type)
            job_id: Job identifier (unique hash for this job instance)
            initial_count: Starting event file count for rotation
            job_path: Optional job directory path for permanent storage
        """
        # Events are archived under the job directory when one is given
        permanent = None if job_path is None else job_path / ".experimaestro" / "events"
        super().__init__(initial_count, permanent)
        self.workspace_path = workspace_path
        self.task_id = task_id
        self.job_id = job_id
        self.job_path = job_path
        self._events_dir = workspace_path / ".events" / "jobs" / task_id

    @property
    def events_dir(self) -> Path:
        return self._events_dir

    def _event_file_name(self, count: int) -> str:
        """File name embedding the job id, since jobs of a task share one dir"""
        return f"event-{self.job_id}-{count}.jsonl"

    def _get_event_file_path(self) -> Path:
        """Path of the current event file (job-specific naming)"""
        return self.events_dir / self._event_file_name(self._count)

    def _get_temp_event_file_path(self, count: int) -> Path:
        """Path of the temporary event file at a given count"""
        return self.events_dir / self._event_file_name(count)
535
+
536
+
537
class ExperimentEventWriter(EventWriter):
    """Writes events to experiment event files

    Events are stored in: {workspace}/.events/experiments/{experiment_id}/events-{count}.jsonl
    Permanent storage: {run_dir}/events/event-{count}.jsonl
    """

    def __init__(
        self,
        experiment: "BaseExperiment",
        workspace_path: Path,
        initial_count: int = 0,
    ):
        """Initialize experiment event writer

        Args:
            experiment: The experiment (BaseExperiment) to write events for
            workspace_path: Path to workspace directory
            initial_count: Starting event file count for rotation

        Raises:
            AssertionError: if *experiment* is not a BaseExperiment
        """
        # Local import to avoid a circular dependency at module load time
        from experimaestro.scheduler.interfaces import BaseExperiment

        assert isinstance(experiment, BaseExperiment), (
            f"experiment must be a BaseExperiment, got {type(experiment)}"
        )
        # Permanent storage: run_dir/events/ (run_dir may be None)
        run_dir = experiment.run_dir
        permanent_dir = run_dir / "events" if run_dir else None
        super().__init__(initial_count, permanent_dir)
        self.experiment = experiment
        self.workspace_path = workspace_path
        # Temporary event files live under .events/experiments/{experiment_id}
        self._events_dir = (
            workspace_path / ".events" / "experiments" / experiment.experiment_id
        )

    @property
    def events_dir(self) -> Path:
        return self._events_dir

    @property
    def experiment_id(self) -> str:
        # Delegates to the wrapped experiment
        return self.experiment.experiment_id

    @property
    def run_dir(self) -> Path | None:
        # Delegates to the wrapped experiment; may be None
        return self.experiment.run_dir

    def init_status(self) -> None:
        """Initialize status.json for a new run

        Uses the experiment's write_status() method to write the initial status.
        """
        # Ensure run directory exists
        run_dir = self.experiment.run_dir
        if run_dir:
            run_dir.mkdir(parents=True, exist_ok=True)
        # Write initial status using experiment's write_status method
        # NOTE(review): called even when run_dir is None -- confirm intended
        self.experiment.write_status()

    def create_symlink(self) -> None:
        """Create/update symlink to current run directory

        The symlink is created at:
            .events/experiments/{experiment_id}/current -> run_dir
        """
        run_dir = self.experiment.run_dir
        if run_dir is None:
            return

        # Ensure the experiment events directory exists
        self._events_dir.mkdir(parents=True, exist_ok=True)

        # Handle legacy: if experiment_id path is a symlink (old format), remove it
        # Check both old .experimaestro and current .events paths
        for events_base in [".experimaestro", ".events"]:
            experiments_dir = self.workspace_path / events_base / "experiments"
            old_symlink = experiments_dir / self.experiment_id
            if old_symlink.is_symlink():
                old_symlink.unlink()

        # Create symlink inside the experiment directory
        symlink = self._events_dir / "current"

        # Compute relative path from symlink location to run_dir so the
        # link survives workspace relocation
        try:
            rel_path = os.path.relpath(run_dir, self._events_dir)
        except ValueError:
            # On Windows, relpath fails for paths on different drives
            rel_path = str(run_dir)

        # Remove existing symlink if present
        if symlink.is_symlink() or symlink.exists():
            symlink.unlink()

        symlink.symlink_to(rel_path)
632
+
633
+
634
+ # =============================================================================
635
+ # Event Reader Class
636
+ # =============================================================================
637
+
638
# Callback types for event watching
# (entity_id, event) - entity_id is job_id or experiment_id
EntityEventCallback = Callable[[str, EventBase], None]
# (entity_id,) - called when event files are deleted
EntityDeletedCallback = Callable[[str], None]
# Extracts entity_id from an event file path; returns None when the file
# name does not match the expected pattern
EntityIdExtractor = Callable[[Path], Optional[str]]
645
+
646
+
647
def default_entity_id_extractor(path: Path) -> Optional[str]:
    """Default extractor: the entity id is the name of the parent directory

    Path format: {base_dir}/{entity_id}/events-{count}.jsonl
    """
    return path.parent.name
653
+
654
+
655
def job_entity_id_extractor(path: Path) -> Optional[str]:
    """For jobs: entity_id (job_id) is extracted from the filename

    Path format: {base_dir}/{task_id}/event-{job_id}-{count}.jsonl
    e.g. .events/jobs/my.task/event-abc123-0.jsonl -> abc123
    """
    name = path.name
    if not name.startswith("event-"):
        return None
    # Strip the "event-" prefix and the ".jsonl" suffix (when present),
    # leaving "{job_id}-{count}"
    stem = name.removeprefix("event-").removesuffix(".jsonl")
    # The rotation count follows the last dash; everything before it is
    # the job id (which may itself contain dashes)
    job_id, dash, _count = stem.rpartition("-")
    if dash:
        return job_id
    return None
675
+
676
+
677
# Resolver type: given an entity_id, returns the permanent storage path
PermanentStorageResolver = Callable[[str], Path]


@dataclass
class WatchedDirectory:
    """Configuration for a directory to watch for events

    Attributes:
        path: Temporary events directory (.events/...)
        entity_id_extractor: Function to extract entity ID from event file path
        glob_pattern: Pattern for matching event files
        permanent_storage_resolver: Optional function that returns permanent storage
            path for an entity. Used for hardlink archiving and deletion recovery.
    """

    path: Path
    # NOTE(review): the lambda factory keeps the plain function from being
    # stored as a class attribute (where it could read as a method);
    # default_factory is invoked per instance and returns the extractor.
    entity_id_extractor: EntityIdExtractor = field(
        default_factory=lambda: default_entity_id_extractor
    )
    glob_pattern: str = "*/events-*.jsonl"
    # For archiving and deletion handling:
    permanent_storage_resolver: PermanentStorageResolver | None = None
700
+
701
+
702
class EventReader:
    """Generic reader for events from JSONL files

    Watches multiple directories with configurable entity ID extraction.

    Supports:
    - One-shot reading: read_events_since_count()
    - Incremental reading: read_new_events() - tracks file positions
    - File watching: start_watching(), stop_watching() - uses watchdog
    - Buffered mode: buffer events during initialization, flush after
    """

    def __init__(self, directories: list[WatchedDirectory]):
        """Initialize event reader

        Args:
            directories: List of directories to watch with their configurations
        """
        self.directories = directories
        # For incremental reading (live monitoring):
        # maps event file -> byte offset of the first unread byte
        self._file_positions: dict[Path, int] = {}
        # For file watching (using ipcom): opaque watch handles and handler
        self._watches: list[Any] = []
        self._handler: Any = None
        self._event_callbacks: list[EntityEventCallback] = []
        self._deleted_callbacks: list[EntityDeletedCallback] = []
        # Buffering mode: queue events instead of forwarding immediately
        self._buffering = False
        self._event_buffer: list[tuple[str, EventBase]] = []
        self._deleted_buffer: list[str] = []

    def _extract_entity_id(self, path: Path) -> Optional[str]:
        """Extract entity ID from event file path

        Uses the extractor of the WatchedDirectory that contains the path;
        returns None when the path is under none of the watched directories.
        """
        dir_config = self._find_dir_config(path)
        if dir_config:
            return dir_config.entity_id_extractor(path)
        return None

    def _find_dir_config(self, path: Path) -> WatchedDirectory | None:
        """Find the WatchedDirectory config that contains the given path"""
        for dir_config in self.directories:
            try:
                # relative_to() raises ValueError when path is not under
                # dir_config.path - used here as a containment test
                path.relative_to(dir_config.path)
                return dir_config
            except ValueError:
                continue
        return None

    def get_all_event_files(self) -> list[Path]:
        """Get all event files across all directories, sorted by modification time"""
        all_files = []
        for dir_config in self.directories:
            if dir_config.path.exists():
                all_files.extend(dir_config.path.glob(dir_config.glob_pattern))
        return sorted(
            all_files,
            # exists() guard: a file can vanish between glob() and stat()
            key=lambda p: p.stat().st_mtime if p.exists() else 0,
        )

    def scan_existing_files(self) -> None:
        """Scan for existing event files and set initial positions to end of file

        Call this before start_watching() to skip existing events and only
        receive new ones.
        """
        for path in self.get_all_event_files():
            try:
                # Position at EOF so only data appended later is read
                self._file_positions[path] = path.stat().st_size
            except OSError:
                pass

    def read_new_events(self) -> list[tuple[str, EventBase]]:
        """Read new events since last call (incremental reading)

        NOTE(review): unlike _process_file_change(), this does not skip a
        trailing partial line; a half-written line is read here (and usually
        dropped as invalid JSON) but never re-read - confirm this is
        acceptable for one-shot/polling reads.

        Returns:
            List of (entity_id, event) tuples
        """
        results = []
        for event_file in self.get_all_event_files():
            entity_id = self._extract_entity_id(event_file)
            if not entity_id:
                continue

            last_pos = self._file_positions.get(event_file, 0)
            try:
                with event_file.open("r") as f:
                    f.seek(last_pos)
                    # Use readline() instead of iterator to allow tell()
                    while True:
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            event_dict = json.loads(line)
                            event = EventBase.from_dict(event_dict)
                            results.append((entity_id, event))
                        except json.JSONDecodeError:
                            pass
                    self._file_positions[event_file] = f.tell()
            except OSError:
                pass
        return results

    def start_watching(
        self,
        on_event: Optional[EntityEventCallback] = None,
        on_deleted: Optional[EntityDeletedCallback] = None,
    ) -> None:
        """Start watching for file changes using ipcom

        Args:
            on_event: Callback called with (entity_id, event) for each new event
            on_deleted: Callback called with (entity_id,) when event files are deleted
        """
        # Local imports defer loading watchdog/ipc until watching is requested
        from watchdog.events import FileSystemEventHandler

        from experimaestro.ipc import ipcom

        # Callbacks are registered even when a watch is already active
        # (the early return below only skips re-creating the watches)
        if on_event:
            self._event_callbacks.append(on_event)
        if on_deleted:
            self._deleted_callbacks.append(on_deleted)

        if self._watches:
            return  # Already watching

        # Create event handler; alias self so the nested class can reach it
        reader = self

        class EventFileHandler(FileSystemEventHandler):
            def _is_event_file(self, path: Path) -> bool:
                """Check if path is an event file (experiment or job)"""
                # Experiment events: events-{count}.jsonl
                # Job events: event-{job_id}-{count}.jsonl
                return path.suffix == ".jsonl" and path.name.startswith(
                    ("events-", "event-")
                )

            def on_modified(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected modification of event file: %s", path)
                    reader._process_file_change(path)

            def on_created(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected creation of event file: %s", path)
                    # New file: start reading from the beginning
                    reader._file_positions[path] = 0
                    reader._process_file_change(path)

            def on_deleted(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected deletion of event file: %s", path)
                    # Resolve entity/config before dropping our bookkeeping
                    entity_id = reader._extract_entity_id(path)
                    dir_config = reader._find_dir_config(path)
                    reader._file_positions.pop(path, None)
                    if entity_id:
                        reader._handle_deletion(entity_id, path, dir_config)

        self._handler = EventFileHandler()
        ipc = ipcom()

        # Register watches for each directory
        for dir_config in self.directories:
            dir_config.path.mkdir(parents=True, exist_ok=True)
            watch = ipc.fswatch(self._handler, dir_config.path, recursive=True)
            self._watches.append(watch)
            logger.debug("Started watching %s", dir_config.path)

    def stop_watching(self) -> None:
        """Stop watching for file changes"""
        from experimaestro.ipc import ipcom

        ipc = ipcom()
        for watch in self._watches:
            ipc.fsunwatch(watch)
        self._watches.clear()
        self._handler = None
        logger.debug("Stopped watching all directories")

    def _process_file_change(self, path: Path) -> None:
        """Process a changed event file and notify callbacks (or buffer)

        Reads from the last recorded position; the position only advances past
        complete lines, so a partially written line is re-read on the next
        change notification.
        """
        entity_id = self._extract_entity_id(path)
        if not entity_id:
            return

        last_pos = self._file_positions.get(path, 0)
        try:
            with path.open("r") as f:
                f.seek(last_pos)
                # Use readline() instead of iterator to allow tell()
                while True:
                    line = f.readline()
                    if not line:
                        break

                    # Skip incomplete lines (writer may be mid-write)
                    if not line.endswith("\n"):
                        break

                    line = line.strip()
                    if not line:
                        # Update position for empty lines
                        last_pos = f.tell()
                        continue
                    try:
                        event_dict = json.loads(line)
                        event = EventBase.from_dict(event_dict)
                        if self._buffering:
                            # Queue event for later
                            self._event_buffer.append((entity_id, event))
                        else:
                            # Forward immediately
                            for callback in self._event_callbacks:
                                try:
                                    callback(entity_id, event)
                                except Exception:
                                    logger.exception("Error in event callback")
                    except json.JSONDecodeError:
                        pass

                    # Update position after each complete line
                    last_pos = f.tell()

            self._file_positions[path] = last_pos
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning("Failed to read event file %s: %s", path, e)

    def _handle_deletion(
        self,
        entity_id: str,
        deleted_path: Path | None = None,
        dir_config: WatchedDirectory | None = None,
    ) -> None:
        """Handle entity deletion - read from permanent storage if available

        When event files are deleted from the temporary (.events/) directory,
        this method attempts to read any remaining events from permanent storage
        (if configured via permanent_storage_resolver).

        Args:
            entity_id: The entity identifier (job_id or experiment_id)
            deleted_path: The path of the deleted file (optional)
            dir_config: The WatchedDirectory config for this path (optional)
        """
        # Try to read remaining events from permanent storage
        if dir_config and dir_config.permanent_storage_resolver:
            permanent_dir = dir_config.permanent_storage_resolver(entity_id)

            # Read any events from permanent storage that we haven't processed.
            # NOTE(review): _read_events_from_permanent reads EVERY event in
            # permanent storage (no position tracking), so events already seen
            # from the temporary file may be delivered twice - verify that
            # consumers deduplicate or tolerate replays.
            events = self._read_events_from_permanent(permanent_dir, entity_id)
            for event in events:
                if self._buffering:
                    self._event_buffer.append((entity_id, event))
                else:
                    for callback in self._event_callbacks:
                        try:
                            callback(entity_id, event)
                        except Exception:
                            logger.exception("Error in event callback")

        # Notify deletion callbacks
        if self._buffering:
            self._deleted_buffer.append(entity_id)
        else:
            for callback in self._deleted_callbacks:
                try:
                    callback(entity_id)
                except Exception:
                    logger.exception("Error in deleted callback")

    def _read_events_from_permanent(
        self, permanent_dir: Path, entity_id: str
    ) -> list[EventBase]:
        """Read events from permanent storage directory

        Args:
            permanent_dir: Path to permanent storage directory
            entity_id: Entity identifier (for logging)

        Returns:
            List of events read from permanent storage
        """
        events = []
        if not permanent_dir.exists():
            return events

        # Read all event files in permanent storage (job-style naming)
        event_files = sorted(permanent_dir.glob("event-*.jsonl"))
        for event_file in event_files:
            try:
                with event_file.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            event_dict = json.loads(line)
                            event = EventBase.from_dict(event_dict)
                            events.append(event)
                        except json.JSONDecodeError:
                            pass
            except OSError as e:
                logger.warning(
                    "Failed to read permanent event file %s: %s", event_file, e
                )

        if events:
            logger.debug(
                "Read %d events from permanent storage for entity %s",
                len(events),
                entity_id,
            )
        return events

    def clear_callbacks(self) -> None:
        """Clear all registered callbacks"""
        self._event_callbacks.clear()
        self._deleted_callbacks.clear()

    def start_buffering(self) -> None:
        """Start buffering events instead of forwarding to callbacks

        Call this before scan_existing_files() to ensure events arriving
        during initialization are queued and not lost.
        """
        self._buffering = True
        self._event_buffer.clear()
        self._deleted_buffer.clear()

    def flush_buffer(self) -> None:
        """Stop buffering and forward all buffered events to callbacks

        Call this after initial state loading is complete. Buffered events
        are delivered to the callbacks registered at flush time.
        """
        self._buffering = False

        # Forward buffered events
        for entity_id, event in self._event_buffer:
            for callback in self._event_callbacks:
                try:
                    callback(entity_id, event)
                except Exception:
                    logger.exception("Error in event callback")

        # Forward buffered deletions
        for entity_id in self._deleted_buffer:
            for callback in self._deleted_callbacks:
                try:
                    callback(entity_id)
                except Exception:
                    logger.exception("Error in deleted callback")

        self._event_buffer.clear()
        self._deleted_buffer.clear()

    def read_events_since_count(
        self, entity_id: str, start_count: int, base_dir: Optional[Path] = None
    ) -> list[EventBase]:
        """Read events for an entity starting from a specific file count

        Uses experiment-style naming ({entity_id}/events-{count}.jsonl);
        scanning stops at the first missing count, so gaps in the sequence
        end the read.

        Args:
            entity_id: Entity identifier (job_id or experiment_id)
            start_count: File count to start reading from (events-{count}.jsonl)
            base_dir: Optional base directory to search in (defaults to first directory)

        Returns:
            List of events from files starting at start_count
        """
        events = []

        # Determine which directory to search
        if base_dir is None and self.directories:
            base_dir = self.directories[0].path

        if base_dir is None:
            return events

        entity_events_dir = base_dir / entity_id
        count = start_count
        while True:
            event_path = entity_events_dir / f"events-{count}.jsonl"
            if not event_path.exists():
                break

            try:
                with event_path.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            try:
                                event_dict = json.loads(line)
                                event = EventBase.from_dict(event_dict)
                                events.append(event)
                            except json.JSONDecodeError:
                                pass
            except OSError:
                pass
            count += 1
        return events
1115
+
1116
+
1117
class JobProgressReader:
    """Convenience reader for job progress events

    Reads progress events from a job's event files.
    Used for monitoring job progress via CLI and scheduler.
    """

    def __init__(self, job_path: Path):
        """Initialize job progress reader

        Args:
            job_path: Path to the job directory (workspace/jobs/task_id/job_id)
        """
        self.job_path = job_path
        # Layout is workspace/jobs/{task_id}/{job_id}: the last two path
        # components give the identifiers
        self.job_id = job_path.name
        self.task_id = job_path.parent.name
        # Progress events are stored in workspace/.events/jobs/{task_id}/
        workspace = job_path.parent.parent.parent
        self._events_dir = workspace / ".events" / "jobs" / self.task_id

    def get_event_files(self) -> list[Path]:
        """Get all event files for this job, sorted by name

        NOTE(review): matches "events-*.jsonl", while job event files are
        described elsewhere as "event-{job_id}-{count}.jsonl" - confirm
        which naming scheme applies in this directory.
        """
        if not self._events_dir.exists():
            return []
        return sorted(self._events_dir.glob("events-*.jsonl"))

    def _iter_event_dicts(self):
        """Yield parsed JSON objects from every event file, in file order

        Unreadable files and malformed/blank lines are silently skipped.
        """
        for event_file in self.get_event_files():
            try:
                with event_file.open("r") as fh:
                    for raw in fh:
                        raw = raw.strip()
                        if not raw:
                            continue
                        try:
                            yield json.loads(raw)
                        except json.JSONDecodeError:
                            continue
            except OSError:
                continue

    def read_progress_events(self) -> list["JobProgressEvent"]:
        """Read all progress events from event files

        Returns:
            List of JobProgressEvent objects in chronological order
        """
        return [
            JobProgressEvent(
                job_id=data.get("job_id", ""),
                level=data.get("level", 0),
                progress=data.get("progress", 0.0),
                desc=data.get("desc"),
                timestamp=data.get("timestamp", 0.0),
            )
            for data in self._iter_event_dicts()
            if data.get("type") == "job_progress"
        ]

    def get_current_progress(self) -> dict[int, "JobProgressEvent"]:
        """Get current progress state per level

        Returns:
            Dict mapping level to the latest JobProgressEvent for that level
        """
        latest: dict[int, "JobProgressEvent"] = {}
        for event in self.read_progress_events():
            # Later events overwrite earlier ones for the same level
            latest[event.level] = event
        return latest

    def is_done(self) -> bool:
        """Check if job is complete (JobStateChangedEvent with done/error state)

        Returns:
            True if job state is "done" or "error" in event files
        """
        for data in self._iter_event_dicts():
            kind = data.get("type")
            # Job state changed event with a final state
            if kind == "job_state_changed" and data.get("state", "") in (
                "done",
                "error",
            ):
                return True
            # Old EOJ marker (progress at level -1), kept for backward
            # compatibility
            if kind == "job_progress" and data.get("level") == -1:
                return True
        return False
1219
+
1220
+
1221
# Public API of this module (consumed by `from ... import *` and docs)
__all__ = [
    # Hardlink utilities
    "supports_hardlinks",
    "safe_link_or_copy",
    # Event classes
    "EventBase",
    "JobSubmittedEvent",
    "JobStateChangedEvent",
    "JobProgressEvent",
    "ServiceAddedEvent",
    "ServiceStateChangedEvent",
    "RunCompletedEvent",
    "EVENT_TYPES",
    # Event writer classes
    "EventWriter",
    "JobEventWriter",
    "ExperimentEventWriter",
    # Event reader class
    "EventReader",
    "WatchedDirectory",
    "PermanentStorageResolver",
    "job_entity_id_extractor",
    "JobProgressReader",
    # Callback types
    "EntityEventCallback",
    "EntityDeletedCallback",
]