experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +393 -134
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +223 -52
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +650 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +764 -169
- experimaestro/scheduler/interfaces.py +338 -96
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +928 -0
- experimaestro/scheduler/remote/protocol.py +282 -0
- experimaestro/scheduler/remote/server.py +447 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +186 -35
- experimaestro/scheduler/state_provider.py +811 -2157
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +1132 -0
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +459 -1895
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +8 -9
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -388
- experimaestro/scheduler/state_sync.py +0 -834
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b4.dist-info/RECORD +0 -181
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/state_status.py (new file)

@@ -0,0 +1,1247 @@

```python
"""Filesystem-based state tracking for experiments

This module provides event and status file handling for tracking experiment state
without using a database. It replaces the SQLite/peewee-based state tracking.

Key components:
- Event dataclasses: Serializable events for JSONL event files
- EventWriter/EventReader: Base classes for event I/O
- JobEventWriter: Job-specific event handling
- ExperimentEventWriter/ExperimentEventReader: Experiment-specific event handling

File structure:
- workspace/.events/experiments/{experiment-id}/events-{count}.jsonl
- workspace/.events/jobs/{task-id}/event-{job-id}-{count}.jsonl
- workspace/experiments/{experiment-id}/{run-id}/status.json
- workspace/jobs/{task-id}/{job-id}/.experimaestro/information.json
"""

import json
import logging
import os
import shutil
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable, Optional, TYPE_CHECKING


if TYPE_CHECKING:
    from experimaestro.scheduler.interfaces import BaseExperiment

logger = logging.getLogger("xpm.state_status")

# Status file version
STATUS_VERSION = 1


# =============================================================================
# Hardlink Support Utilities
# =============================================================================


def supports_hardlinks(path: Path) -> bool:
    """Check if the filesystem at path supports hardlinks

    Creates temporary test files to verify hardlink support. Useful for
    determining whether to use hardlinks for event file archiving.

    Args:
        path: Directory to test for hardlink support

    Returns:
        True if hardlinks are supported, False otherwise
    """
    path.mkdir(parents=True, exist_ok=True)
    test_file = path / ".hardlink_test"
    test_link = path / ".hardlink_test_link"
    try:
        # Clean up any leftover test files
        if test_link.exists():
            test_link.unlink()
        if test_file.exists():
            test_file.unlink()

        # Create test file and hardlink
        test_file.touch()
        os.link(test_file, test_link)

        # Verify it's actually a hardlink (same inode)
        success = test_file.stat().st_ino == test_link.stat().st_ino

        # Clean up
        test_link.unlink()
        test_file.unlink()
        return success
    except (OSError, AttributeError):
        # Clean up on failure
        try:
            if test_link.exists():
                test_link.unlink()
            if test_file.exists():
                test_file.unlink()
        except OSError:
            pass
        return False


def safe_link_or_copy(src: Path, dst: Path, use_hardlinks: bool = True) -> bool:
    """Create hardlink if supported, otherwise copy the file

    Args:
        src: Source file path
        dst: Destination file path
        use_hardlinks: If True, attempt hardlink first; if False, always copy

    Returns:
        True if hardlink was used, False if file was copied
    """
    dst.parent.mkdir(parents=True, exist_ok=True)

    if use_hardlinks:
        try:
            # Remove destination if it exists (can't hardlink over existing file)
            if dst.exists():
                dst.unlink()
            os.link(src, dst)
            return True
        except OSError:
            pass

    # Fall back to copy
    shutil.copy2(src, dst)
    return False
```
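Together, `supports_hardlinks` probes a directory once and `safe_link_or_copy` degrades to a plain copy when linking fails. A minimal usage sketch (not part of the diff; it assumes the module is importable as `experimaestro.scheduler.state_status`):

```python
# Sketch only: exercises the two hardlink helpers above on a temp directory.
import tempfile
from pathlib import Path

from experimaestro.scheduler.state_status import safe_link_or_copy, supports_hardlinks

with tempfile.TemporaryDirectory() as tmp:
    base = Path(tmp)
    print(supports_hardlinks(base))  # True on most local POSIX filesystems

    src = base / "events-0.jsonl"
    src.write_text('{"event_type":"EventBase","timestamp":0.0}\n')

    dst = base / "archive" / "event-0.jsonl"
    if safe_link_or_copy(src, dst):
        # hardlinked: both names point at the same inode
        assert src.stat().st_ino == dst.stat().st_ino
```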
```python
# =============================================================================
# Event System
# =============================================================================

# Registry for event deserialization (auto-populated by __init_subclass__)
EVENT_TYPES: dict[str, type["EventBase"]] = {}


@dataclass
class EventBase:
    """Base class for all events

    Events are lightweight - they carry only IDs, not object references.
    Use StateProvider to fetch actual objects (BaseJob, BaseExperiment, etc.)
    when needed.

    Subclasses are automatically registered in EVENT_TYPES by their class name.
    JSON serialization/deserialization is handled transparently via event_type field.
    """

    timestamp: float = field(default_factory=time.time)

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Register by class name
        EVENT_TYPES[cls.__name__] = cls

    @property
    def event_type(self) -> str:
        """Event type derived from class name"""
        return self.__class__.__name__

    def to_json(self) -> str:
        """Serialize event to JSON string"""
        d = asdict(self)
        d["event_type"] = self.event_type
        return json.dumps(d, separators=(",", ":"))

    @classmethod
    def from_dict(cls, d: dict) -> "EventBase":
        """Deserialize event from dictionary"""
        event_type = d.get("event_type")
        event_class = EVENT_TYPES.get(event_type, EventBase)
        # Filter to only known fields for the event class
        valid_fields = {f for f in event_class.__dataclass_fields__}
        filtered = {k: v for k, v in d.items() if k in valid_fields}
        return event_class(**filtered)

    @classmethod
    def get_class(cls, name: str) -> "type[EventBase] | None":
        """Get an EventBase subclass by class name"""
        return EVENT_TYPES.get(name)


# -----------------------------------------------------------------------------
# Event Base Classes (for filtering)
# -----------------------------------------------------------------------------


@dataclass
class JobEventBase(EventBase):
    """Base class for job-related events (have job_id)"""

    job_id: str = ""


@dataclass
class ExperimentEventBase(EventBase):
    """Base class for experiment-related events (have experiment_id)"""

    experiment_id: str = ""


@dataclass
class ServiceEventBase(ExperimentEventBase):
    """Base class for service-related events (have service_id)"""

    service_id: str = ""


# -----------------------------------------------------------------------------
# Supporting Dataclasses
# -----------------------------------------------------------------------------


@dataclass
class ProgressLevel:
    """Progress information for a single level"""

    level: int = 0
    progress: float = 0.0
    desc: Optional[str] = None

    def to_dict(self) -> dict:
        return {"level": self.level, "progress": self.progress, "desc": self.desc}

    @classmethod
    def from_dict(cls, d: dict) -> "ProgressLevel":
        return cls(
            level=d.get("level", 0),
            progress=d.get("progress", 0.0),
            desc=d.get("desc"),
        )


@dataclass
class JobTag:
    """A job tag (key-value pair)"""

    key: str
    value: str
```
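Since `__init_subclass__` registers every subclass in `EVENT_TYPES`, a JSON line round-trips back to the right class with no explicit registration step. A sketch with a hypothetical `MyCustomEvent` (not part of the diff):

```python
import json
from dataclasses import dataclass

from experimaestro.scheduler.state_status import EVENT_TYPES, EventBase


@dataclass
class MyCustomEvent(EventBase):  # hypothetical event, for illustration
    detail: str = ""


assert EVENT_TYPES["MyCustomEvent"] is MyCustomEvent  # auto-registered

line = MyCustomEvent(detail="hello").to_json()
# e.g. {"timestamp":1700000000.0,"detail":"hello","event_type":"MyCustomEvent"}

event = EventBase.from_dict(json.loads(line))
assert isinstance(event, MyCustomEvent) and event.detail == "hello"
```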
```python
# -----------------------------------------------------------------------------
# Job Events
# -----------------------------------------------------------------------------


@dataclass
class JobSubmittedEvent(JobEventBase, ExperimentEventBase):
    """Event: Job was submitted to the scheduler

    Fired when a job is added to an experiment run.
    This is both a job event and an experiment event.
    """

    task_id: str = ""
    run_id: str = ""
    transient: int = 0
    tags: list[JobTag] = field(default_factory=list)
    depends_on: list[str] = field(default_factory=list)


@dataclass
class JobStateChangedEvent(JobEventBase):
    """Event: Job state changed

    Fired when a job's state changes (scheduled, running, done, error, etc.)
    """

    state: str = ""
    failure_reason: Optional[str] = None
    submitted_time: Optional[float] = None
    started_time: Optional[float] = None
    ended_time: Optional[float] = None
    exit_code: Optional[int] = None
    retry_count: int = 0
    progress: list[ProgressLevel] = field(default_factory=list)


@dataclass
class JobProgressEvent(JobEventBase):
    """Event: Job progress update

    Written by the running job process to report progress.
    """

    level: int = 0
    progress: float = 0.0
    desc: Optional[str] = None


# -----------------------------------------------------------------------------
# Experiment Events
# -----------------------------------------------------------------------------


@dataclass
class ExperimentUpdatedEvent(ExperimentEventBase):
    """Event: Experiment was created or updated"""

    pass


@dataclass
class RunUpdatedEvent(ExperimentEventBase):
    """Event: Experiment run was created or updated"""

    run_id: str = ""


@dataclass
class RunCompletedEvent(ExperimentEventBase):
    """Event: Experiment run completed"""

    run_id: str = ""
    status: str = "completed"
    ended_at: str = ""


# -----------------------------------------------------------------------------
# Service Events
# -----------------------------------------------------------------------------


@dataclass
class ServiceAddedEvent(ServiceEventBase):
    """Event: Service was added to the experiment"""

    run_id: str = ""
    description: str = ""
    service_class: str = ""
    state_dict: dict[str, Any] = field(default_factory=dict)


@dataclass
class ServiceStateChangedEvent(ServiceEventBase):
    """Event: Service state changed (STOPPED, STARTING, RUNNING, STOPPING)"""

    run_id: str = ""
    state: str = ""
```
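For reference, a sketch (not from the diff) of roughly what one JSONL line for a state change looks like on disk:

```python
from experimaestro.scheduler.state_status import JobStateChangedEvent

event = JobStateChangedEvent(
    job_id="abc123", state="running", started_time=1700000000.0
)
print(event.to_json())
# e.g. {"timestamp":...,"job_id":"abc123","state":"running","failure_reason":null,
#       ...,"retry_count":0,"progress":[],"event_type":"JobStateChangedEvent"}
```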
```python
# =============================================================================
# Event Writer Classes
# =============================================================================


class EventWriter(ABC):
    """Base class for writing events to JSONL files

    Events are written to {events_dir}/events-{count}.jsonl
    Uses line buffering so each event is flushed immediately after write.

    Supports proactive hardlinking: when a permanent_dir is set and hardlinks
    are supported, a hardlink is created to permanent storage immediately when
    the event file is opened. This ensures events are written to both locations
    simultaneously and no data is lost if the process crashes.
    """

    def __init__(self, initial_count: int = 0, permanent_dir: Path | None = None):
        """Initialize event writer

        Args:
            initial_count: Starting event file count for rotation
            permanent_dir: Optional permanent storage directory for archiving.
                If set and hardlinks are supported, events are written to both
                temporary and permanent locations via hardlink.
        """
        self._count = initial_count
        self._file = None
        self._permanent_dir = permanent_dir
        self._use_hardlinks: bool | None = None  # None = not checked yet
        self._hardlink_created_for_count: int | None = (
            None  # Track which file has hardlink
        )

    @property
    @abstractmethod
    def events_dir(self) -> Path:
        """Get the directory where events are written"""
        ...

    @property
    def permanent_dir(self) -> Path | None:
        """Get the permanent storage directory"""
        return self._permanent_dir

    def _check_hardlink_support(self) -> bool:
        """Check and cache hardlink support"""
        if self._use_hardlinks is None:
            if self._permanent_dir:
                self._permanent_dir.mkdir(parents=True, exist_ok=True)
                self._use_hardlinks = supports_hardlinks(self._permanent_dir)
            else:
                self._use_hardlinks = False
        return self._use_hardlinks

    def _get_event_file_path(self) -> Path:
        return self.events_dir / f"events-{self._count}.jsonl"

    def _get_permanent_event_file_path(self) -> Path:
        """Get path for permanent event file"""
        if self._permanent_dir is None:
            raise ValueError("No permanent directory configured")
        return self._permanent_dir / f"event-{self._count}.jsonl"

    def write_event(self, event: EventBase) -> None:
        """Write an event to the current event file

        If permanent storage is configured and hardlinks are supported,
        creates a hardlink immediately when the file is first opened.
        """
        if self._file is None:
            self.events_dir.mkdir(parents=True, exist_ok=True)
            # Use line buffering (buffering=1) so each line is flushed automatically
            self._file = self._get_event_file_path().open("a", buffering=1)

            # Create hardlink to permanent storage immediately if supported
            if self._check_hardlink_support() and self._permanent_dir:
                temp_path = self._get_event_file_path()
                perm_path = self._get_permanent_event_file_path()
                try:
                    perm_path.parent.mkdir(parents=True, exist_ok=True)
                    if not perm_path.exists():
                        os.link(temp_path, perm_path)
                        self._hardlink_created_for_count = self._count
                        logger.debug("Created hardlink %s -> %s", temp_path, perm_path)
                except FileExistsError:
                    pass  # Already linked
                except OSError as e:
                    logger.warning("Failed to create hardlink: %s", e)

        self._file.write(event.to_json() + "\n")

    def flush(self) -> None:
        """Flush the current event file to disk"""
        if self._file is not None:
            self._file.flush()
            os.fsync(self._file.fileno())

    def close(self) -> None:
        """Close the current event file"""
        if self._file is not None:
            self._file.flush()
            os.fsync(self._file.fileno())
            self._file.close()
            self._file = None

    def rotate(self, new_count: int) -> None:
        """Rotate to a new event file (called after status file update)"""
        self.close()
        self._count = new_count

    def cleanup(self) -> None:
        """Delete all event files in this directory (temporary files only)"""
        self.close()
        for i in range(self._count + 1):
            path = self.events_dir / f"events-{i}.jsonl"
            if path.exists():
                try:
                    path.unlink()
                except OSError as e:
                    logger.warning("Failed to delete event file %s: %s", path, e)

    def archive_events(self) -> None:
        """Archive events to permanent storage (called on completion)

        For each temp file:
        - If permanent file exists (hardlink already created): just delete temp
        - If permanent file doesn't exist: move temp to permanent
        """
        self.close()

        if not self._permanent_dir:
            return

        self._permanent_dir.mkdir(parents=True, exist_ok=True)

        for i in range(self._count + 1):
            temp_path = self._get_temp_event_file_path(i)
            perm_path = self._permanent_dir / f"event-{i}.jsonl"

            if not temp_path.exists():
                continue

            if perm_path.exists():
                # Permanent file exists (hardlink) - just delete temp
                try:
                    temp_path.unlink()
                except OSError as e:
                    logger.warning("Failed to delete temp file %s: %s", temp_path, e)
            else:
                # No permanent file - move temp to permanent
                try:
                    shutil.move(str(temp_path), str(perm_path))
                except OSError as e:
                    logger.warning("Failed to archive %s: %s", temp_path, e)

    def _get_temp_event_file_path(self, count: int) -> Path:
        """Get path for temporary event file at a specific count"""
        return self.events_dir / f"events-{count}.jsonl"
```
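`events_dir` is abstract, so using the base writer directly requires a small subclass. A lifecycle sketch with a hypothetical `DirEventWriter` and illustrative `/tmp` paths:

```python
from pathlib import Path

from experimaestro.scheduler.state_status import EventBase, EventWriter


class DirEventWriter(EventWriter):  # hypothetical concrete writer
    def __init__(self, events_dir: Path, permanent_dir: Path | None = None):
        super().__init__(initial_count=0, permanent_dir=permanent_dir)
        self._dir = events_dir

    @property
    def events_dir(self) -> Path:
        return self._dir


writer = DirEventWriter(Path("/tmp/xpm-events"), Path("/tmp/xpm-archive"))
writer.write_event(EventBase())  # opens events-0.jsonl, hardlinks it into the archive
writer.rotate(1)                 # close; subsequent writes go to events-1.jsonl
writer.write_event(EventBase())
writer.archive_events()          # temp files deleted or moved into the archive
```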
```python
class JobEventWriter(EventWriter):
    """Writes events to job event files

    Events are stored in: {workspace}/.events/jobs/{task_id}/event-{job_id}-{count}.jsonl
    Permanent storage: {job_path}/.experimaestro/events/event-{count}.jsonl
    """

    def __init__(
        self,
        workspace_path: Path,
        task_id: str,
        job_id: str,
        initial_count: int = 0,
        job_path: Path | None = None,
    ):
        """Initialize job event writer

        Args:
            workspace_path: Path to workspace directory
            task_id: Task identifier (groups events by task type)
            job_id: Job identifier (unique hash for this job instance)
            initial_count: Starting event file count for rotation
            job_path: Optional job directory path for permanent storage
        """
        # Permanent storage: job_path/.experimaestro/events/
        permanent_dir = job_path / ".experimaestro" / "events" if job_path else None
        super().__init__(initial_count, permanent_dir)
        self.workspace_path = workspace_path
        self.task_id = task_id
        self.job_id = job_id
        self.job_path = job_path
        self._events_dir = workspace_path / ".events" / "jobs" / task_id

    @property
    def events_dir(self) -> Path:
        return self._events_dir

    def _get_event_file_path(self) -> Path:
        """Get the path for job event file with job_id in filename"""
        return self.events_dir / f"event-{self.job_id}-{self._count}.jsonl"

    def _get_temp_event_file_path(self, count: int) -> Path:
        """Get path for temporary event file at a specific count"""
        return self.events_dir / f"event-{self.job_id}-{count}.jsonl"
```
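A job-side usage sketch (workspace layout, task and job IDs are illustrative):

```python
from pathlib import Path

from experimaestro.scheduler.state_status import JobEventWriter, JobProgressEvent

workspace = Path("/tmp/workspace")
job_path = workspace / "jobs" / "my.task" / "abc123"

writer = JobEventWriter(workspace, task_id="my.task", job_id="abc123", job_path=job_path)
# Appends to workspace/.events/jobs/my.task/event-abc123-0.jsonl, hardlinked into
# {job_path}/.experimaestro/events/event-0.jsonl when the filesystem supports it.
writer.write_event(JobProgressEvent(job_id="abc123", level=0, progress=0.5, desc="training"))
writer.close()
```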
```python
class ExperimentEventWriter(EventWriter):
    """Writes events to experiment event files

    Events are stored in: {workspace}/.events/experiments/{experiment_id}/events-{count}.jsonl
    Permanent storage: {run_dir}/events/event-{count}.jsonl
    """

    def __init__(
        self,
        experiment: "BaseExperiment",
        workspace_path: Path,
        initial_count: int = 0,
    ):
        """Initialize experiment event writer

        Args:
            experiment: The experiment (BaseExperiment) to write events for
            workspace_path: Path to workspace directory
            initial_count: Starting event file count for rotation
        """
        from experimaestro.scheduler.interfaces import BaseExperiment

        assert isinstance(experiment, BaseExperiment), (
            f"experiment must be a BaseExperiment, got {type(experiment)}"
        )
        # Permanent storage: run_dir/events/
        run_dir = experiment.run_dir
        permanent_dir = run_dir / "events" if run_dir else None
        super().__init__(initial_count, permanent_dir)
        self.experiment = experiment
        self.workspace_path = workspace_path
        self._events_dir = (
            workspace_path / ".events" / "experiments" / experiment.experiment_id
        )

    @property
    def events_dir(self) -> Path:
        return self._events_dir

    @property
    def experiment_id(self) -> str:
        return self.experiment.experiment_id

    @property
    def run_dir(self) -> Path | None:
        return self.experiment.run_dir

    def init_status(self) -> None:
        """Initialize status.json for a new run

        Uses the experiment's write_status() method to write the initial status.
        """
        # Ensure run directory exists
        run_dir = self.experiment.run_dir
        if run_dir:
            run_dir.mkdir(parents=True, exist_ok=True)
        # Write initial status using experiment's write_status method
        self.experiment.write_status()

    def create_symlink(self) -> None:
        """Create/update symlink to current run directory

        The symlink is created at:
        .events/experiments/{experiment_id}/current -> run_dir
        """
        run_dir = self.experiment.run_dir
        if run_dir is None:
            return

        # Ensure the experiment events directory exists
        self._events_dir.mkdir(parents=True, exist_ok=True)

        # Handle legacy: if experiment_id path is a symlink (old format), remove it
        # Check both old .experimaestro and current .events paths
        for events_base in [".experimaestro", ".events"]:
            experiments_dir = self.workspace_path / events_base / "experiments"
            old_symlink = experiments_dir / self.experiment_id
            if old_symlink.is_symlink():
                old_symlink.unlink()

        # Create symlink inside the experiment directory
        symlink = self._events_dir / "current"

        # Compute relative path from symlink location to run_dir
        try:
            rel_path = os.path.relpath(run_dir, self._events_dir)
        except ValueError:
            # On Windows, relpath fails for paths on different drives
            rel_path = str(run_dir)

        # Remove existing symlink if present
        if symlink.is_symlink() or symlink.exists():
            symlink.unlink()

        symlink.symlink_to(rel_path)
```
```python
# =============================================================================
# Event Reader Class
# =============================================================================

# Callback types for event watching
# (entity_id, event) - entity_id is job_id or experiment_id
EntityEventCallback = Callable[[str, EventBase], None]
# (entity_id,) - called when event files are deleted
EntityDeletedCallback = Callable[[str], None]
# Extracts entity_id from path
EntityIdExtractor = Callable[[Path], Optional[str]]


def default_entity_id_extractor(path: Path) -> Optional[str]:
    """Default: entity_id is the parent directory name

    Path format: {base_dir}/{entity_id}/events-{count}.jsonl
    """
    return path.parent.name


def job_entity_id_extractor(path: Path) -> Optional[str]:
    """For jobs: entity_id (job_id) is extracted from the filename

    Path format: {base_dir}/{task_id}/event-{job_id}-{count}.jsonl
    """
    # Job ID is extracted from the filename
    # e.g., .events/jobs/my.task/event-abc123-0.jsonl -> abc123
    name = path.name
    if not name.startswith("event-"):
        return None
    # Remove "event-" prefix and ".jsonl" suffix
    # Format: event-{job_id}-{count}.jsonl
    rest = name[6:]  # Remove "event-"
    if rest.endswith(".jsonl"):
        rest = rest[:-6]  # Remove ".jsonl"
    # Now rest is "{job_id}-{count}", split on last "-" to get job_id
    parts = rest.rsplit("-", 1)
    if len(parts) == 2:
        return parts[0]
    return None


# Resolver type: given an entity_id, returns the permanent storage path
PermanentStorageResolver = Callable[[str], Path]


@dataclass
class WatchedDirectory:
    """Configuration for a directory to watch for events

    Attributes:
        path: Temporary events directory (.events/...)
        entity_id_extractor: Function to extract entity ID from event file path
        glob_pattern: Pattern for matching event files
        permanent_storage_resolver: Optional function that returns permanent storage
            path for an entity. Used for hardlink archiving and deletion recovery.
    """

    path: Path
    entity_id_extractor: EntityIdExtractor = field(
        default_factory=lambda: default_entity_id_extractor
    )
    glob_pattern: str = "*/events-*.jsonl"
    # For archiving and deletion handling:
    permanent_storage_resolver: PermanentStorageResolver | None = None
```
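The two extractors differ in where the entity ID lives; a quick sketch with illustrative paths:

```python
from pathlib import Path

from experimaestro.scheduler.state_status import (
    default_entity_id_extractor,
    job_entity_id_extractor,
)

# Experiments: the entity ID is the parent directory name
p = Path(".events/experiments/my-experiment/events-0.jsonl")
assert default_entity_id_extractor(p) == "my-experiment"

# Jobs: the entity ID (job_id) is embedded in the filename
p = Path(".events/jobs/my.task/event-abc123-0.jsonl")
assert job_entity_id_extractor(p) == "abc123"
```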
```python
class EventReader:
    """Generic reader for events from JSONL files

    Watches multiple directories with configurable entity ID extraction.

    Supports:
    - One-shot reading: read_events_since_count()
    - Incremental reading: read_new_events() - tracks file positions
    - File watching: start_watching(), stop_watching() - uses watchdog
    - Buffered mode: buffer events during initialization, flush after
    """

    def __init__(self, directories: list[WatchedDirectory]):
        """Initialize event reader

        Args:
            directories: List of directories to watch with their configurations
        """
        self.directories = directories
        # For incremental reading (live monitoring)
        self._file_positions: dict[Path, int] = {}
        # For file watching (using ipcom)
        self._watches: list[Any] = []
        self._handler: Any = None
        self._event_callbacks: list[EntityEventCallback] = []
        self._deleted_callbacks: list[EntityDeletedCallback] = []
        # Buffering mode: queue events instead of forwarding immediately
        self._buffering = False
        self._event_buffer: list[tuple[str, EventBase]] = []
        self._deleted_buffer: list[str] = []

    def _extract_entity_id(self, path: Path) -> Optional[str]:
        """Extract entity ID from event file path"""
        dir_config = self._find_dir_config(path)
        if dir_config:
            return dir_config.entity_id_extractor(path)
        return None

    def _find_dir_config(self, path: Path) -> WatchedDirectory | None:
        """Find the WatchedDirectory config that contains the given path"""
        for dir_config in self.directories:
            try:
                path.relative_to(dir_config.path)
                return dir_config
            except ValueError:
                continue
        return None

    def get_all_event_files(self) -> list[Path]:
        """Get all event files across all directories, sorted by modification time"""
        all_files = []
        for dir_config in self.directories:
            if dir_config.path.exists():
                all_files.extend(dir_config.path.glob(dir_config.glob_pattern))
        return sorted(
            all_files,
            key=lambda p: p.stat().st_mtime if p.exists() else 0,
        )

    def scan_existing_files(self) -> None:
        """Scan for existing event files and set initial positions to end of file

        Call this before start_watching() to skip existing events and only
        receive new ones.
        """
        for path in self.get_all_event_files():
            try:
                self._file_positions[path] = path.stat().st_size
            except OSError:
                pass

    def read_new_events(self) -> list[tuple[str, EventBase]]:
        """Read new events since last call (incremental reading)

        Returns:
            List of (entity_id, event) tuples
        """
        results = []
        for event_file in self.get_all_event_files():
            entity_id = self._extract_entity_id(event_file)
            if not entity_id:
                continue

            last_pos = self._file_positions.get(event_file, 0)
            try:
                with event_file.open("r") as f:
                    f.seek(last_pos)
                    # Use readline() instead of iterator to allow tell()
                    while True:
                        line = f.readline()
                        if not line:
                            break
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            event_dict = json.loads(line)
                            event = EventBase.from_dict(event_dict)
                            results.append((entity_id, event))
                        except json.JSONDecodeError:
                            pass
                    self._file_positions[event_file] = f.tell()
            except OSError:
                pass
        return results

    def start_watching(
        self,
        on_event: Optional[EntityEventCallback] = None,
        on_deleted: Optional[EntityDeletedCallback] = None,
    ) -> None:
        """Start watching for file changes using ipcom

        Args:
            on_event: Callback called with (entity_id, event) for each new event
            on_deleted: Callback called with (entity_id,) when event files are deleted
        """
        from watchdog.events import FileSystemEventHandler

        from experimaestro.ipc import ipcom

        if on_event:
            self._event_callbacks.append(on_event)
        if on_deleted:
            self._deleted_callbacks.append(on_deleted)

        if self._watches:
            return  # Already watching

        # Create event handler
        reader = self

        class EventFileHandler(FileSystemEventHandler):
            def _is_event_file(self, path: Path) -> bool:
                """Check if path is an event file (experiment or job)"""
                # Experiment events: events-{count}.jsonl
                # Job events: event-{job_id}-{count}.jsonl
                return path.suffix == ".jsonl" and path.name.startswith(
                    ("events-", "event-")
                )

            def on_modified(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected modification of event file: %s", path)
                    reader._process_file_change(path)

            def on_created(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected creation of event file: %s", path)
                    reader._file_positions[path] = 0
                    reader._process_file_change(path)

            def on_deleted(self, event):
                if event.is_directory:
                    return
                path = Path(event.src_path)
                if self._is_event_file(path):
                    logger.debug("Detected deletion of event file: %s", path)
                    entity_id = reader._extract_entity_id(path)
                    dir_config = reader._find_dir_config(path)
                    reader._file_positions.pop(path, None)
                    if entity_id:
                        reader._handle_deletion(entity_id, path, dir_config)

        self._handler = EventFileHandler()
        ipc = ipcom()

        # Register watches for each directory
        for dir_config in self.directories:
            dir_config.path.mkdir(parents=True, exist_ok=True)
            watch = ipc.fswatch(self._handler, dir_config.path, recursive=True)
            self._watches.append(watch)
            logger.debug("Started watching %s", dir_config.path)

    def stop_watching(self) -> None:
        """Stop watching for file changes"""
        from experimaestro.ipc import ipcom

        ipc = ipcom()
        for watch in self._watches:
            ipc.fsunwatch(watch)
        self._watches.clear()
        self._handler = None
        logger.debug("Stopped watching all directories")

    def _process_file_change(self, path: Path) -> None:
        """Process a changed event file and notify callbacks (or buffer)"""
        entity_id = self._extract_entity_id(path)
        if not entity_id:
            return

        last_pos = self._file_positions.get(path, 0)
        try:
            with path.open("r") as f:
                f.seek(last_pos)
                # Use readline() instead of iterator to allow tell()
                while True:
                    line = f.readline()
                    if not line:
                        break

                    # Skip incomplete lines (writer may be mid-write)
                    if not line.endswith("\n"):
                        break

                    line = line.strip()
                    if not line:
                        # Update position for empty lines
                        last_pos = f.tell()
                        continue
                    try:
                        event_dict = json.loads(line)
                        event = EventBase.from_dict(event_dict)
                        if self._buffering:
                            # Queue event for later
                            self._event_buffer.append((entity_id, event))
                        else:
                            # Forward immediately
                            for callback in self._event_callbacks:
                                try:
                                    callback(entity_id, event)
                                except Exception:
                                    logger.exception("Error in event callback")
                    except json.JSONDecodeError:
                        pass

                    # Update position after each complete line
                    last_pos = f.tell()

                self._file_positions[path] = last_pos
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning("Failed to read event file %s: %s", path, e)

    def _handle_deletion(
        self,
        entity_id: str,
        deleted_path: Path | None = None,
        dir_config: WatchedDirectory | None = None,
    ) -> None:
        """Handle entity deletion - read from permanent storage if available

        When event files are deleted from the temporary (.events/) directory,
        this method attempts to read any remaining events from permanent storage
        (if configured via permanent_storage_resolver).

        Args:
            entity_id: The entity identifier (job_id or experiment_id)
            deleted_path: The path of the deleted file (optional)
            dir_config: The WatchedDirectory config for this path (optional)
        """
        # Try to read remaining events from permanent storage
        if dir_config and dir_config.permanent_storage_resolver:
            permanent_dir = dir_config.permanent_storage_resolver(entity_id)

            # Read any events from permanent storage that we haven't processed
            events = self._read_events_from_permanent(permanent_dir, entity_id)
            for event in events:
                if self._buffering:
                    self._event_buffer.append((entity_id, event))
                else:
                    for callback in self._event_callbacks:
                        try:
                            callback(entity_id, event)
                        except Exception:
                            logger.exception("Error in event callback")

        # Notify deletion callbacks
        if self._buffering:
            self._deleted_buffer.append(entity_id)
        else:
            for callback in self._deleted_callbacks:
                try:
                    callback(entity_id)
                except Exception:
                    logger.exception("Error in deleted callback")

    def _read_events_from_permanent(
        self, permanent_dir: Path, entity_id: str
    ) -> list[EventBase]:
        """Read events from permanent storage directory

        Args:
            permanent_dir: Path to permanent storage directory
            entity_id: Entity identifier (for logging)

        Returns:
            List of events read from permanent storage
        """
        events = []
        if not permanent_dir.exists():
            return events

        # Read all event files in permanent storage
        event_files = sorted(permanent_dir.glob("event-*.jsonl"))
        for event_file in event_files:
            try:
                with event_file.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            event_dict = json.loads(line)
                            event = EventBase.from_dict(event_dict)
                            events.append(event)
                        except json.JSONDecodeError:
                            pass
            except OSError as e:
                logger.warning(
                    "Failed to read permanent event file %s: %s", event_file, e
                )

        if events:
            logger.debug(
                "Read %d events from permanent storage for entity %s",
                len(events),
                entity_id,
            )
        return events

    def clear_callbacks(self) -> None:
        """Clear all registered callbacks"""
        self._event_callbacks.clear()
        self._deleted_callbacks.clear()

    def start_buffering(self) -> None:
        """Start buffering events instead of forwarding to callbacks

        Call this before scan_existing_files() to ensure events arriving
        during initialization are queued and not lost.
        """
        self._buffering = True
        self._event_buffer.clear()
        self._deleted_buffer.clear()

    def flush_buffer(self) -> None:
        """Stop buffering and forward all buffered events to callbacks

        Call this after initial state loading is complete.
        """
        self._buffering = False

        # Forward buffered events
        for entity_id, event in self._event_buffer:
            for callback in self._event_callbacks:
                try:
                    callback(entity_id, event)
                except Exception:
                    logger.exception("Error in event callback")

        # Forward buffered deletions
        for entity_id in self._deleted_buffer:
            for callback in self._deleted_callbacks:
                try:
                    callback(entity_id)
                except Exception:
                    logger.exception("Error in deleted callback")

        self._event_buffer.clear()
        self._deleted_buffer.clear()

    def read_events_since_count(
        self, entity_id: str, start_count: int, base_dir: Optional[Path] = None
    ) -> list[EventBase]:
        """Read events for an entity starting from a specific file count

        Args:
            entity_id: Entity identifier (job_id or experiment_id)
            start_count: File count to start reading from (events-{count}.jsonl)
            base_dir: Optional base directory to search in (defaults to first directory)

        Returns:
            List of events from files starting at start_count
        """
        events = []

        # Determine which directory to search
        if base_dir is None and self.directories:
            base_dir = self.directories[0].path

        if base_dir is None:
            return events

        entity_events_dir = base_dir / entity_id
        count = start_count
        while True:
            event_path = entity_events_dir / f"events-{count}.jsonl"
            if not event_path.exists():
                break

            try:
                with event_path.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            try:
                                event_dict = json.loads(line)
                                event = EventBase.from_dict(event_dict)
                                events.append(event)
                            except json.JSONDecodeError:
                                pass
            except OSError:
                pass
            count += 1
        return events
```
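A monitoring sketch following the buffered-startup pattern the docstrings describe (directory layout and call ordering are illustrative; `start_watching` relies on `watchdog` via `experimaestro.ipc`, so it only runs inside the package environment):

```python
from pathlib import Path

from experimaestro.scheduler.state_status import (
    EventBase,
    EventReader,
    WatchedDirectory,
    job_entity_id_extractor,
)

workspace = Path("/tmp/workspace")
reader = EventReader(
    [
        WatchedDirectory(workspace / ".events" / "experiments"),
        WatchedDirectory(
            workspace / ".events" / "jobs",
            entity_id_extractor=job_entity_id_extractor,
            glob_pattern="*/event-*.jsonl",
        ),
    ]
)


def on_event(entity_id: str, event: EventBase) -> None:
    print(entity_id, event.event_type)


reader.start_buffering()                  # queue events arriving during startup
reader.start_watching(on_event=on_event)
for entity_id, event in reader.read_new_events():
    on_event(entity_id, event)            # initial catch-up over existing files
reader.flush_buffer()                     # replay anything that arrived meanwhile
# ... later ...
reader.stop_watching()
```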
```python
class JobProgressReader:
    """Convenience reader for job progress events

    Reads progress events from a job's event files.
    Used for monitoring job progress via CLI and scheduler.
    """

    def __init__(self, job_path: Path):
        """Initialize job progress reader

        Args:
            job_path: Path to the job directory (workspace/jobs/task_id/job_id)
        """
        self.job_path = job_path
        # Extract task_id and job_id from path
        self.job_id = job_path.name
        self.task_id = job_path.parent.name
        # Progress events are stored in workspace/.events/jobs/{task_id}/
        self._events_dir = (
            job_path.parent.parent.parent / ".events" / "jobs" / self.task_id
        )

    def get_event_files(self) -> list[Path]:
        """Get all event files for this job"""
        if not self._events_dir.exists():
            return []
        return sorted(self._events_dir.glob("events-*.jsonl"))

    def read_progress_events(self) -> list[JobProgressEvent]:
        """Read all progress events from event files

        Returns:
            List of JobProgressEvent objects in chronological order
        """
        events = []
        for event_file in self.get_event_files():
            try:
                with event_file.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            if data.get("type") == "job_progress":
                                events.append(
                                    JobProgressEvent(
                                        job_id=data.get("job_id", ""),
                                        level=data.get("level", 0),
                                        progress=data.get("progress", 0.0),
                                        desc=data.get("desc"),
                                        timestamp=data.get("timestamp", 0.0),
                                    )
                                )
                        except json.JSONDecodeError:
                            pass
            except OSError:
                pass
        return events

    def get_current_progress(self) -> dict[int, JobProgressEvent]:
        """Get current progress state per level

        Returns:
            Dict mapping level to the latest JobProgressEvent for that level
        """
        progress: dict[int, JobProgressEvent] = {}
        for event in self.read_progress_events():
            progress[event.level] = event
        return progress

    def is_done(self) -> bool:
        """Check if job is complete (JobStateChangedEvent with done/error state)

        Returns:
            True if job state is "done" or "error" in event files
        """
        for event_file in self.get_event_files():
            try:
                with event_file.open("r") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            data = json.loads(line)
                            # Check for job state changed event with final state
                            if data.get("type") == "job_state_changed":
                                state = data.get("state", "")
                                if state in ("done", "error"):
                                    return True
                            # Also check for old EOJ marker for backward compatibility
                            if (
                                data.get("type") == "job_progress"
                                and data.get("level") == -1
                            ):
                                return True
                        except json.JSONDecodeError:
                            pass
            except OSError:
                pass
        return False
```
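A CLI-side polling sketch (the job path is illustrative):

```python
from pathlib import Path

from experimaestro.scheduler.state_status import JobProgressReader

reader = JobProgressReader(Path("/tmp/workspace/jobs/my.task/abc123"))
for level, event in sorted(reader.get_current_progress().items()):
    print(f"level {level}: {event.progress:.0%} {event.desc or ''}")
if reader.is_done():
    print("job finished")
```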
```python
__all__ = [
    # Hardlink utilities
    "supports_hardlinks",
    "safe_link_or_copy",
    # Event classes
    "EventBase",
    "JobSubmittedEvent",
    "JobStateChangedEvent",
    "JobProgressEvent",
    "ServiceAddedEvent",
    "ServiceStateChangedEvent",
    "RunCompletedEvent",
    "EVENT_TYPES",
    # Event writer classes
    "EventWriter",
    "JobEventWriter",
    "ExperimentEventWriter",
    # Event reader class
    "EventReader",
    "WatchedDirectory",
    "PermanentStorageResolver",
    "job_entity_id_extractor",
    "JobProgressReader",
    # Callback types
    "EntityEventCallback",
    "EntityDeletedCallback",
]
```