experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of experimaestro might be problematic. Click here for more details.
- experimaestro/__init__.py +10 -11
- experimaestro/annotations.py +167 -206
- experimaestro/cli/__init__.py +140 -16
- experimaestro/cli/filter.py +42 -74
- experimaestro/cli/jobs.py +157 -106
- experimaestro/cli/progress.py +269 -0
- experimaestro/cli/refactor.py +249 -0
- experimaestro/click.py +0 -1
- experimaestro/commandline.py +19 -3
- experimaestro/connectors/__init__.py +22 -3
- experimaestro/connectors/local.py +12 -0
- experimaestro/core/arguments.py +192 -37
- experimaestro/core/identifier.py +127 -12
- experimaestro/core/objects/__init__.py +6 -0
- experimaestro/core/objects/config.py +702 -285
- experimaestro/core/objects/config_walk.py +24 -6
- experimaestro/core/serialization.py +91 -34
- experimaestro/core/serializers.py +1 -8
- experimaestro/core/subparameters.py +164 -0
- experimaestro/core/types.py +198 -83
- experimaestro/exceptions.py +26 -0
- experimaestro/experiments/cli.py +107 -25
- experimaestro/generators.py +50 -9
- experimaestro/huggingface.py +3 -1
- experimaestro/launcherfinder/parser.py +29 -0
- experimaestro/launcherfinder/registry.py +3 -3
- experimaestro/launchers/__init__.py +26 -1
- experimaestro/launchers/direct.py +12 -0
- experimaestro/launchers/slurm/base.py +154 -2
- experimaestro/mkdocs/base.py +6 -8
- experimaestro/mkdocs/metaloader.py +0 -1
- experimaestro/mypy.py +452 -7
- experimaestro/notifications.py +75 -16
- experimaestro/progress.py +404 -0
- experimaestro/rpyc.py +0 -1
- experimaestro/run.py +19 -6
- experimaestro/scheduler/__init__.py +18 -1
- experimaestro/scheduler/base.py +504 -959
- experimaestro/scheduler/dependencies.py +43 -28
- experimaestro/scheduler/dynamic_outputs.py +259 -130
- experimaestro/scheduler/experiment.py +582 -0
- experimaestro/scheduler/interfaces.py +474 -0
- experimaestro/scheduler/jobs.py +485 -0
- experimaestro/scheduler/services.py +186 -12
- experimaestro/scheduler/signal_handler.py +32 -0
- experimaestro/scheduler/state.py +1 -1
- experimaestro/scheduler/state_db.py +388 -0
- experimaestro/scheduler/state_provider.py +2345 -0
- experimaestro/scheduler/state_sync.py +834 -0
- experimaestro/scheduler/workspace.py +52 -10
- experimaestro/scriptbuilder.py +7 -0
- experimaestro/server/__init__.py +153 -32
- experimaestro/server/data/index.css +0 -125
- experimaestro/server/data/index.css.map +1 -1
- experimaestro/server/data/index.js +194 -58
- experimaestro/server/data/index.js.map +1 -1
- experimaestro/settings.py +47 -6
- experimaestro/sphinx/__init__.py +3 -3
- experimaestro/taskglobals.py +20 -0
- experimaestro/tests/conftest.py +80 -0
- experimaestro/tests/core/test_generics.py +2 -2
- experimaestro/tests/identifier_stability.json +45 -0
- experimaestro/tests/launchers/bin/sacct +6 -2
- experimaestro/tests/launchers/bin/sbatch +4 -2
- experimaestro/tests/launchers/common.py +2 -2
- experimaestro/tests/launchers/test_slurm.py +80 -0
- experimaestro/tests/restart.py +1 -1
- experimaestro/tests/tasks/all.py +7 -0
- experimaestro/tests/tasks/test_dynamic.py +231 -0
- experimaestro/tests/test_checkers.py +2 -2
- experimaestro/tests/test_cli_jobs.py +615 -0
- experimaestro/tests/test_dependencies.py +11 -17
- experimaestro/tests/test_deprecated.py +630 -0
- experimaestro/tests/test_environment.py +200 -0
- experimaestro/tests/test_experiment.py +3 -3
- experimaestro/tests/test_file_progress.py +425 -0
- experimaestro/tests/test_file_progress_integration.py +477 -0
- experimaestro/tests/test_forward.py +3 -3
- experimaestro/tests/test_generators.py +93 -0
- experimaestro/tests/test_identifier.py +520 -169
- experimaestro/tests/test_identifier_stability.py +458 -0
- experimaestro/tests/test_instance.py +16 -21
- experimaestro/tests/test_multitoken.py +442 -0
- experimaestro/tests/test_mypy.py +433 -0
- experimaestro/tests/test_objects.py +314 -30
- experimaestro/tests/test_outputs.py +8 -8
- experimaestro/tests/test_param.py +22 -26
- experimaestro/tests/test_partial_paths.py +231 -0
- experimaestro/tests/test_progress.py +2 -50
- experimaestro/tests/test_resumable_task.py +480 -0
- experimaestro/tests/test_serializers.py +141 -60
- experimaestro/tests/test_state_db.py +434 -0
- experimaestro/tests/test_subparameters.py +160 -0
- experimaestro/tests/test_tags.py +151 -15
- experimaestro/tests/test_tasks.py +137 -160
- experimaestro/tests/test_token_locking.py +252 -0
- experimaestro/tests/test_tokens.py +25 -19
- experimaestro/tests/test_types.py +133 -11
- experimaestro/tests/test_validation.py +19 -19
- experimaestro/tests/test_workspace_triggers.py +158 -0
- experimaestro/tests/token_reschedule.py +5 -3
- experimaestro/tests/utils.py +2 -2
- experimaestro/tokens.py +154 -57
- experimaestro/tools/diff.py +8 -1
- experimaestro/tui/__init__.py +8 -0
- experimaestro/tui/app.py +2303 -0
- experimaestro/tui/app.tcss +353 -0
- experimaestro/tui/log_viewer.py +228 -0
- experimaestro/typingutils.py +11 -2
- experimaestro/utils/__init__.py +23 -0
- experimaestro/utils/environment.py +148 -0
- experimaestro/utils/git.py +129 -0
- experimaestro/utils/resources.py +1 -1
- experimaestro/version.py +34 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
- experimaestro-2.0.0b4.dist-info/RECORD +181 -0
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
- experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
- experimaestro/compat.py +0 -6
- experimaestro/core/objects.pyi +0 -225
- experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
- experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
- experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
- experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
- experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
- experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
- experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
- experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
- experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
- experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
- experimaestro-1.11.1.dist-info/RECORD +0 -158
- experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
- {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
from enum import Enum
|
|
3
3
|
import functools
|
|
4
|
+
import logging
|
|
4
5
|
import threading
|
|
5
6
|
from typing import Set
|
|
6
7
|
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
7
10
|
|
|
8
11
|
class ServiceListener:
|
|
9
12
|
"""A service listener"""
|
|
@@ -13,6 +16,12 @@ class ServiceListener:
|
|
|
13
16
|
|
|
14
17
|
|
|
15
18
|
class ServiceState(Enum):
|
|
19
|
+
"""State of a service lifecycle.
|
|
20
|
+
|
|
21
|
+
Services transition through these states:
|
|
22
|
+
STOPPED -> STARTING -> RUNNING -> STOPPING -> STOPPED
|
|
23
|
+
"""
|
|
24
|
+
|
|
16
25
|
STOPPED = 0
|
|
17
26
|
STARTING = 1
|
|
18
27
|
RUNNING = 2
|
|
@@ -24,27 +33,72 @@ class Service:
|
|
|
24
33
|
|
|
25
34
|
Services can be associated with an experiment. They send
|
|
26
35
|
notifications to service listeners.
|
|
36
|
+
|
|
37
|
+
To support restarting services from monitor mode, subclasses should
|
|
38
|
+
override :meth:`state_dict` to return the data needed to recreate
|
|
39
|
+
the service, and implement :meth:`from_state_dict` to recreate it.
|
|
27
40
|
"""
|
|
28
41
|
|
|
29
42
|
id: str
|
|
30
43
|
_state: ServiceState = ServiceState.STOPPED
|
|
31
44
|
|
|
32
45
|
def __init__(self):
|
|
33
|
-
self.
|
|
46
|
+
self._listeners: Set[ServiceListener] = set()
|
|
47
|
+
self._listeners_lock = threading.Lock()
|
|
48
|
+
|
|
49
|
+
def state_dict(self) -> dict:
|
|
50
|
+
"""Return a dictionary representation for serialization.
|
|
51
|
+
|
|
52
|
+
Subclasses should override this to include any parameters needed
|
|
53
|
+
to recreate the service. The base implementation returns the
|
|
54
|
+
class module and name.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
Dict with '__class__' key and any additional kwargs.
|
|
58
|
+
"""
|
|
59
|
+
return {
|
|
60
|
+
"__class__": f"{self.__class__.__module__}.{self.__class__.__name__}",
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
@staticmethod
|
|
64
|
+
def from_state_dict(data: dict) -> "Service":
    """Recreate a service from a state dictionary.

    The ``"__class__"`` entry must be a dotted ``module.ClassName`` path.
    The module is imported, the class is looked up on it, and the class is
    called with every other entry of ``data`` as keyword arguments.

    Args:
        data: Dictionary from :meth:`state_dict`

    Returns:
        A new Service instance.

    Raises:
        ValueError: If ``"__class__"`` is missing, empty, or is not a
            dotted ``module.ClassName`` path.
        ImportError: If the module cannot be imported.
        AttributeError: If the module does not define the class.
    """
    import importlib

    class_path = data.get("__class__")
    if not class_path:
        raise ValueError("Missing '__class__' in service state_dict")

    # rpartition never fails to unpack, so a path without a module part can
    # be reported with a message that names the offending value.
    module_name, _, class_name = class_path.rpartition(".")
    if not module_name:
        raise ValueError(
            f"Invalid '__class__' in service state_dict: {class_path!r} "
            "(expected 'module.ClassName')"
        )

    # NOTE(review): the module to import is taken from ``data``; only feed
    # this function state dictionaries that come from a trusted source.
    module = importlib.import_module(module_name)
    cls = getattr(module, class_name)

    # Remove __class__ and pass remaining as kwargs
    kwargs = {key: value for key, value in data.items() if key != "__class__"}
    return cls(**kwargs)
|
|
34
86
|
|
|
35
87
|
def add_listener(self, listener: ServiceListener):
|
|
36
88
|
"""Adds a listener
|
|
37
89
|
|
|
38
90
|
:param listener: The listener to add
|
|
39
91
|
"""
|
|
40
|
-
self.
|
|
92
|
+
with self._listeners_lock:
|
|
93
|
+
self._listeners.add(listener)
|
|
41
94
|
|
|
42
95
|
def remove_listener(self, listener: ServiceListener):
|
|
43
96
|
"""Removes a listener
|
|
44
97
|
|
|
45
98
|
:param listener: The listener to remove
|
|
46
99
|
"""
|
|
47
|
-
self.
|
|
100
|
+
with self._listeners_lock:
|
|
101
|
+
self._listeners.discard(listener)
|
|
48
102
|
|
|
49
103
|
def description(self):
|
|
50
104
|
return ""
|
|
@@ -58,35 +112,147 @@ class Service:
|
|
|
58
112
|
# Set the state
|
|
59
113
|
self._state = state
|
|
60
114
|
|
|
61
|
-
|
|
62
|
-
|
|
115
|
+
# Notify listeners with thread-safe snapshot
|
|
116
|
+
with self._listeners_lock:
|
|
117
|
+
listeners_snapshot = list(self._listeners)
|
|
118
|
+
|
|
119
|
+
for listener in listeners_snapshot:
|
|
120
|
+
try:
|
|
121
|
+
listener.service_state_changed(self)
|
|
122
|
+
except Exception:
|
|
123
|
+
logger.exception("Error notifying listener %s", listener)
|
|
63
124
|
|
|
64
125
|
|
|
65
126
|
class WebService(Service):
|
|
66
|
-
"""
|
|
127
|
+
"""Base class for web-based experiment services.
|
|
128
|
+
|
|
129
|
+
Web services provide HTTP endpoints that can be accessed through the
|
|
130
|
+
experimaestro web interface. When an experiment is running with a port
|
|
131
|
+
configured, web services are automatically proxied through the main
|
|
132
|
+
experimaestro server.
|
|
133
|
+
|
|
134
|
+
To implement a web service:
|
|
135
|
+
|
|
136
|
+
1. Subclass ``WebService``
|
|
137
|
+
2. Set a unique ``id`` class attribute
|
|
138
|
+
3. Implement the :meth:`_serve` method to start your web server
|
|
139
|
+
4. Set ``self.url`` and call ``running.set()`` when ready
|
|
140
|
+
5. Optionally check ``self.should_stop()`` to handle graceful shutdown
|
|
141
|
+
|
|
142
|
+
Example::
|
|
143
|
+
|
|
144
|
+
class MyWebService(WebService):
|
|
145
|
+
id = "myservice"
|
|
146
|
+
|
|
147
|
+
def _serve(self, running: threading.Event):
|
|
148
|
+
# Start your web server
|
|
149
|
+
self.url = "http://localhost:8080"
|
|
150
|
+
running.set()
|
|
151
|
+
# Keep serving, checking for stop signal
|
|
152
|
+
while not self.should_stop():
|
|
153
|
+
time.sleep(1)
|
|
154
|
+
"""
|
|
67
155
|
|
|
68
156
|
def __init__(self):
|
|
69
157
|
super().__init__()
|
|
70
158
|
self.url = None
|
|
159
|
+
self.thread = None
|
|
160
|
+
self._stop_event = threading.Event()
|
|
161
|
+
|
|
162
|
+
def should_stop(self) -> bool:
|
|
163
|
+
"""Check if the service should stop.
|
|
164
|
+
|
|
165
|
+
Subclasses can call this in their _serve loop to check for
|
|
166
|
+
graceful shutdown requests.
|
|
167
|
+
|
|
168
|
+
:return: True if stop() has been called
|
|
169
|
+
"""
|
|
170
|
+
return self._stop_event.is_set()
|
|
71
171
|
|
|
72
172
|
def get_url(self):
|
|
173
|
+
"""Get the URL of this web service, starting it if needed.
|
|
174
|
+
|
|
175
|
+
If the service is not running, this method will start it and
|
|
176
|
+
block until the URL is available.
|
|
177
|
+
|
|
178
|
+
:return: The URL where this service can be accessed
|
|
179
|
+
"""
|
|
73
180
|
if self.state == ServiceState.STOPPED:
|
|
181
|
+
self._stop_event.clear()
|
|
74
182
|
self.state = ServiceState.STARTING
|
|
75
183
|
self.running = threading.Event()
|
|
76
184
|
self.serve()
|
|
77
185
|
|
|
78
186
|
# Wait until the server is ready
|
|
79
187
|
self.running.wait()
|
|
188
|
+
self.state = ServiceState.RUNNING
|
|
80
189
|
|
|
81
190
|
# Returns the URL
|
|
82
191
|
return self.url
|
|
83
192
|
|
|
84
|
-
def stop(self):
|
|
85
|
-
|
|
193
|
+
def stop(self, timeout: float = 2.0):
|
|
194
|
+
"""Stop the web service.
|
|
195
|
+
|
|
196
|
+
This method signals the service to stop and waits for the thread
|
|
197
|
+
to terminate. If the thread doesn't stop gracefully within the
|
|
198
|
+
timeout, it attempts to forcefully terminate it.
|
|
199
|
+
|
|
200
|
+
:param timeout: Seconds to wait for graceful shutdown before forcing
|
|
201
|
+
"""
|
|
202
|
+
if self.state == ServiceState.STOPPED:
|
|
203
|
+
return
|
|
204
|
+
|
|
205
|
+
self.state = ServiceState.STOPPING
|
|
206
|
+
|
|
207
|
+
# Signal the service to stop
|
|
208
|
+
self._stop_event.set()
|
|
209
|
+
|
|
210
|
+
# Wait for the thread to finish
|
|
211
|
+
if self.thread is not None and self.thread.is_alive():
|
|
212
|
+
self.thread.join(timeout=timeout)
|
|
213
|
+
|
|
214
|
+
# If thread is still alive, try to terminate it forcefully
|
|
215
|
+
if self.thread.is_alive():
|
|
216
|
+
self._force_stop_thread()
|
|
217
|
+
|
|
218
|
+
self.url = None
|
|
219
|
+
self.state = ServiceState.STOPPED
|
|
220
|
+
|
|
221
|
+
def _force_stop_thread(self):
|
|
222
|
+
"""Attempt to forcefully stop the service thread.
|
|
223
|
+
|
|
224
|
+
This uses ctypes to raise an exception in the thread. It's not
|
|
225
|
+
guaranteed to work (e.g., if the thread is blocked in C code),
|
|
226
|
+
but it's the best we can do in Python.
|
|
227
|
+
"""
|
|
228
|
+
import ctypes
|
|
229
|
+
|
|
230
|
+
if self.thread is None or not self.thread.is_alive():
|
|
231
|
+
return
|
|
232
|
+
|
|
233
|
+
thread_id = self.thread.ident
|
|
234
|
+
if thread_id is None:
|
|
235
|
+
return
|
|
236
|
+
|
|
237
|
+
# Raise SystemExit in the target thread
|
|
238
|
+
res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
|
|
239
|
+
ctypes.c_ulong(thread_id), ctypes.py_object(SystemExit)
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
if res == 0:
|
|
243
|
+
# Thread ID was invalid
|
|
244
|
+
pass
|
|
245
|
+
elif res > 1:
|
|
246
|
+
# Multiple threads affected - reset
|
|
247
|
+
ctypes.pythonapi.PyThreadState_SetAsyncExc(
|
|
248
|
+
ctypes.c_ulong(thread_id), ctypes.c_long(0)
|
|
249
|
+
)
|
|
86
250
|
|
|
87
251
|
def serve(self):
|
|
88
|
-
|
|
252
|
+
"""Start the web service in a background thread.
|
|
89
253
|
|
|
254
|
+
This method creates a daemon thread that calls :meth:`_serve`.
|
|
255
|
+
"""
|
|
90
256
|
self.thread = threading.Thread(
|
|
91
257
|
target=functools.partial(self._serve, self.running),
|
|
92
258
|
name=f"service[{self.id}]",
|
|
@@ -95,9 +261,17 @@ class WebService(Service):
|
|
|
95
261
|
self.thread.start()
|
|
96
262
|
|
|
97
263
|
@abc.abstractmethod
|
|
98
|
-
def
|
|
99
|
-
"""
|
|
264
|
+
def _serve(self, running: threading.Event):
|
|
265
|
+
"""Start the web server (implement in subclasses).
|
|
266
|
+
|
|
267
|
+
This method should:
|
|
268
|
+
|
|
269
|
+
1. Start your web server
|
|
270
|
+
2. Set ``self.url`` to the service URL
|
|
271
|
+
3. Call ``running.set()`` to signal readiness
|
|
272
|
+
4. Keep the server running (this runs in a background thread)
|
|
273
|
+
5. Optionally check ``self.should_stop()`` for graceful shutdown
|
|
100
274
|
|
|
101
|
-
:param running:
|
|
275
|
+
:param running: Event to signal when ``self.url`` is set
|
|
102
276
|
"""
|
|
103
277
|
...
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import signal
|
|
2
|
+
from typing import Set
|
|
3
|
+
from experimaestro.scheduler import experiment
|
|
4
|
+
from experimaestro.utils import logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SignalHandler:
    """SIGINT handler shared by all registered experiments.

    While at least one experiment is registered, the instance itself is
    installed as the process SIGINT handler; on SIGINT it calls ``stop()``
    on every registered experiment. The handler that was in place before
    the first registration is put back when the last experiment is removed.
    """

    def __init__(self):
        # Experiments to stop when SIGINT is received
        self.experiments: Set["experiment"] = set()
        # Handler that was active before this one was installed
        self.original_sigint_handler = None

    def add(self, xp: "experiment"):
        """Register an experiment, installing the handler on first use.

        :param xp: The experiment to stop when SIGINT is received
        """
        if not self.experiments:
            self.original_sigint_handler = signal.getsignal(signal.SIGINT)

            signal.signal(signal.SIGINT, self)

        self.experiments.add(xp)

    def remove(self, xp):
        """Unregister an experiment, restoring the previous handler when
        no experiment is left.

        :param xp: The experiment to unregister
        :raises KeyError: If the experiment was not registered
        """
        self.experiments.remove(xp)
        if not self.experiments:
            previous = self.original_sigint_handler
            if previous is None:
                # signal.getsignal() returns None when the earlier handler
                # was not installed from Python; signal.signal() rejects
                # None, so fall back to the default disposition.
                previous = signal.SIG_DFL
            signal.signal(signal.SIGINT, previous)

    def __call__(self, signum, frame):
        """SIGINT signal handler"""
        logger.warning("Signal received")
        # Iterate over a snapshot: stopping an experiment may unregister it,
        # and mutating a set during iteration raises RuntimeError.
        for xp in list(self.experiments):
            xp.stop()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
SIGNAL_HANDLER = SignalHandler()
|
experimaestro/scheduler/state.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Type
|
|
|
5
5
|
from experimaestro import Task
|
|
6
6
|
|
|
7
7
|
from experimaestro.core.context import SerializationContext
|
|
8
|
-
from experimaestro.scheduler.
|
|
8
|
+
from experimaestro.scheduler.jobs import Job, JobDependency
|
|
9
9
|
from experimaestro.settings import find_workspace
|
|
10
10
|
from experimaestro.core.serialization import from_state_dict, save_definition
|
|
11
11
|
|
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
"""Database models for experiment state persistence
|
|
2
|
+
|
|
3
|
+
This module provides peewee ORM models for storing job and service state
|
|
4
|
+
in a workspace-level SQLite database. The workspace has a single database
|
|
5
|
+
file (.experimaestro/workspace.db) with WAL mode enabled for concurrent
|
|
6
|
+
read/write access.
|
|
7
|
+
|
|
8
|
+
Key design:
|
|
9
|
+
- One database per workspace at: workdir/.experimaestro/workspace.db
|
|
10
|
+
- Experiments can be run multiple times, each run tracked separately
|
|
11
|
+
- Jobs and services are scoped to (experiment_id, run_id)
|
|
12
|
+
- Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
|
|
13
|
+
- Current state and progress stored in JobModel - no history tracking
|
|
14
|
+
- Database instance is passed explicitly to avoid global state
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from peewee import (
|
|
19
|
+
Model,
|
|
20
|
+
SqliteDatabase,
|
|
21
|
+
CharField,
|
|
22
|
+
FloatField,
|
|
23
|
+
IntegerField,
|
|
24
|
+
TextField,
|
|
25
|
+
DateTimeField,
|
|
26
|
+
CompositeKey,
|
|
27
|
+
IntegrityError,
|
|
28
|
+
OperationalError,
|
|
29
|
+
)
|
|
30
|
+
from datetime import datetime
|
|
31
|
+
import fasteners
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BaseModel(Model):
|
|
35
|
+
"""Base model for workspace database tables
|
|
36
|
+
|
|
37
|
+
Models are unbound by default. Use database.bind_ctx() when querying:
|
|
38
|
+
|
|
39
|
+
with workspace.workspace_db.bind_ctx([ExperimentModel, JobModel, ...]):
|
|
40
|
+
experiments = ExperimentModel.select()
|
|
41
|
+
|
|
42
|
+
Or use the convenience method bind_models() defined below.
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
class Meta:
|
|
46
|
+
database = None # Unbound - will be bound when used
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ExperimentModel(BaseModel):
|
|
50
|
+
"""Experiment metadata - tracks experiment definitions
|
|
51
|
+
|
|
52
|
+
An experiment can be run multiple times. This table tracks the experiment
|
|
53
|
+
itself and points to the current/latest run.
|
|
54
|
+
|
|
55
|
+
Fields:
|
|
56
|
+
experiment_id: Unique identifier for the experiment
|
|
57
|
+
current_run_id: Points to the current/latest run (null if no runs yet)
|
|
58
|
+
created_at: When experiment was first created
|
|
59
|
+
updated_at: When experiment was last modified (for incremental queries)
|
|
60
|
+
|
|
61
|
+
Note: Experiment path is derivable: {workspace}/xp/{experiment_id}
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
experiment_id = CharField(primary_key=True)
|
|
65
|
+
current_run_id = CharField(null=True)
|
|
66
|
+
created_at = DateTimeField(default=datetime.now)
|
|
67
|
+
updated_at = DateTimeField(default=datetime.now, index=True)
|
|
68
|
+
|
|
69
|
+
class Meta:
|
|
70
|
+
table_name = "experiments"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class ExperimentRunModel(BaseModel):
|
|
74
|
+
"""Individual experiment runs
|
|
75
|
+
|
|
76
|
+
Each time an experiment is executed, a new run is created.
|
|
77
|
+
Runs are identified by (experiment_id, run_id) composite key.
|
|
78
|
+
|
|
79
|
+
run_id format: timestamp-based like "20250120_143022" or sequential counter
|
|
80
|
+
|
|
81
|
+
Fields:
|
|
82
|
+
experiment_id: ID of the experiment this run belongs to
|
|
83
|
+
run_id: Unique ID for this run (timestamp or sequential)
|
|
84
|
+
started_at: When this run started
|
|
85
|
+
ended_at: When this run completed (null if still active)
|
|
86
|
+
status: Run status (active, completed, failed, abandoned)
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
experiment_id = CharField(index=True)
|
|
90
|
+
run_id = CharField(index=True)
|
|
91
|
+
started_at = DateTimeField(default=datetime.now)
|
|
92
|
+
ended_at = DateTimeField(null=True)
|
|
93
|
+
status = CharField(default="active", index=True)
|
|
94
|
+
|
|
95
|
+
class Meta:
|
|
96
|
+
table_name = "experiment_runs"
|
|
97
|
+
primary_key = CompositeKey("experiment_id", "run_id")
|
|
98
|
+
indexes = ((("experiment_id", "started_at"), False),) # For finding latest run
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class WorkspaceSyncMetadata(BaseModel):
|
|
102
|
+
"""Workspace-level metadata for disk sync tracking
|
|
103
|
+
|
|
104
|
+
Single-row table to track when the last disk sync occurred.
|
|
105
|
+
Used to throttle sync operations and prevent excessive disk scanning.
|
|
106
|
+
|
|
107
|
+
Fields:
|
|
108
|
+
id: Always "workspace" (single row table)
|
|
109
|
+
last_sync_time: When last sync completed
|
|
110
|
+
sync_interval_minutes: Minimum interval between syncs
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
id = CharField(primary_key=True, default="workspace")
|
|
114
|
+
last_sync_time = DateTimeField(null=True)
|
|
115
|
+
sync_interval_minutes = IntegerField(default=5)
|
|
116
|
+
|
|
117
|
+
class Meta:
|
|
118
|
+
table_name = "workspace_sync_metadata"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class JobModel(BaseModel):
|
|
122
|
+
"""Job information linked to specific experiment run
|
|
123
|
+
|
|
124
|
+
Jobs are tied to a specific run of an experiment via (experiment_id, run_id).
|
|
125
|
+
The same job can appear in multiple runs with different states/tags.
|
|
126
|
+
|
|
127
|
+
Fields:
|
|
128
|
+
job_id: Unique identifier for the job (from task identifier)
|
|
129
|
+
experiment_id: ID of the experiment this job belongs to
|
|
130
|
+
run_id: ID of the run this job belongs to
|
|
131
|
+
task_id: Task class identifier
|
|
132
|
+
locator: Full task locator (identifier)
|
|
133
|
+
state: Current job state (e.g., "unscheduled", "waiting", "running", "done", "error")
|
|
134
|
+
failure_reason: Optional failure reason for error states (e.g., "TIMEOUT", "DEPENDENCY")
|
|
135
|
+
submitted_time: When job was submitted (Unix timestamp)
|
|
136
|
+
started_time: When job started running (Unix timestamp)
|
|
137
|
+
ended_time: When job finished (Unix timestamp)
|
|
138
|
+
progress: JSON-encoded list of progress updates
|
|
139
|
+
updated_at: When job was last modified (for incremental queries)
|
|
140
|
+
|
|
141
|
+
Note: Job path is derivable: {workspace}/jobs/{task_id}/{job_id}
|
|
142
|
+
Note: Tags are stored in separate JobTagModel table (run-scoped)
|
|
143
|
+
Note: Dependencies are NOT stored in DB (available in state.json only)
|
|
144
|
+
"""
|
|
145
|
+
|
|
146
|
+
job_id = CharField(index=True)
|
|
147
|
+
experiment_id = CharField(index=True)
|
|
148
|
+
run_id = CharField(index=True)
|
|
149
|
+
task_id = CharField(index=True)
|
|
150
|
+
locator = CharField()
|
|
151
|
+
state = CharField(default="unscheduled", index=True)
|
|
152
|
+
failure_reason = CharField(null=True)
|
|
153
|
+
submitted_time = FloatField(null=True)
|
|
154
|
+
started_time = FloatField(null=True)
|
|
155
|
+
ended_time = FloatField(null=True)
|
|
156
|
+
progress = TextField(default="[]")
|
|
157
|
+
updated_at = DateTimeField(default=datetime.now, index=True)
|
|
158
|
+
|
|
159
|
+
class Meta:
|
|
160
|
+
table_name = "jobs"
|
|
161
|
+
primary_key = CompositeKey("job_id", "experiment_id", "run_id")
|
|
162
|
+
indexes = (
|
|
163
|
+
(
|
|
164
|
+
("experiment_id", "run_id", "state"),
|
|
165
|
+
False,
|
|
166
|
+
), # Query jobs by run and state
|
|
167
|
+
(
|
|
168
|
+
("experiment_id", "run_id", "task_id"),
|
|
169
|
+
False,
|
|
170
|
+
), # Query jobs by run and task
|
|
171
|
+
(
|
|
172
|
+
("experiment_id", "run_id", "updated_at"),
|
|
173
|
+
False,
|
|
174
|
+
), # Query jobs by run and update time
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class JobTagModel(BaseModel):
|
|
179
|
+
"""Job tags for efficient searching (fixes GH #128)
|
|
180
|
+
|
|
181
|
+
**FIX FOR GH ISSUE #128**: Tags are now experiment-run-dependent, not job-dependent.
|
|
182
|
+
The same job in different experiment runs can have different tags, because tags
|
|
183
|
+
are scoped to the (job_id, experiment_id, run_id) combination.
|
|
184
|
+
|
|
185
|
+
Tags are stored as key-value pairs in a separate table for efficient indexing.
|
|
186
|
+
Each job can have multiple tags within an experiment run context.
|
|
187
|
+
|
|
188
|
+
Key change from old behavior:
|
|
189
|
+
- OLD: Tags were global per job_id (broken - same job in different experiments/runs shared tags)
|
|
190
|
+
- NEW: Tags are scoped per (job_id, experiment_id, run_id) - same job can have different tags in different runs
|
|
191
|
+
|
|
192
|
+
Fields:
|
|
193
|
+
job_id: ID of the job
|
|
194
|
+
experiment_id: ID of the experiment
|
|
195
|
+
run_id: ID of the run
|
|
196
|
+
tag_key: Tag name
|
|
197
|
+
tag_value: Tag value
|
|
198
|
+
"""
|
|
199
|
+
|
|
200
|
+
job_id = CharField(index=True)
|
|
201
|
+
experiment_id = CharField(index=True)
|
|
202
|
+
run_id = CharField(index=True)
|
|
203
|
+
tag_key = CharField(index=True)
|
|
204
|
+
tag_value = CharField(index=True)
|
|
205
|
+
|
|
206
|
+
class Meta:
|
|
207
|
+
table_name = "job_tags"
|
|
208
|
+
primary_key = CompositeKey("job_id", "experiment_id", "run_id", "tag_key")
|
|
209
|
+
indexes = (
|
|
210
|
+
(("tag_key", "tag_value"), False), # For tag-based queries
|
|
211
|
+
(
|
|
212
|
+
("experiment_id", "run_id", "tag_key"),
|
|
213
|
+
False,
|
|
214
|
+
), # For experiment run tag queries
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class ServiceModel(BaseModel):
|
|
219
|
+
"""Service information linked to specific experiment run
|
|
220
|
+
|
|
221
|
+
Services are tied to a specific run of an experiment via (experiment_id, run_id).
|
|
222
|
+
|
|
223
|
+
Fields:
|
|
224
|
+
service_id: Unique identifier for the service
|
|
225
|
+
experiment_id: ID of the experiment this service belongs to
|
|
226
|
+
run_id: ID of the run this service belongs to
|
|
227
|
+
description: Human-readable description
|
|
228
|
+
state: Service state (e.g., "running", "stopped")
|
|
229
|
+
state_dict: JSON serialized state_dict for service recreation
|
|
230
|
+
created_at: When service was created
|
|
231
|
+
updated_at: Timestamp of last update
|
|
232
|
+
"""
|
|
233
|
+
|
|
234
|
+
service_id = CharField()
|
|
235
|
+
experiment_id = CharField(index=True)
|
|
236
|
+
run_id = CharField(index=True)
|
|
237
|
+
description = TextField(default="")
|
|
238
|
+
state = CharField()
|
|
239
|
+
state_dict = TextField(default="{}") # JSON for service recreation
|
|
240
|
+
created_at = DateTimeField(default=datetime.now)
|
|
241
|
+
updated_at = DateTimeField(default=datetime.now)
|
|
242
|
+
|
|
243
|
+
class Meta:
|
|
244
|
+
table_name = "services"
|
|
245
|
+
primary_key = CompositeKey("service_id", "experiment_id", "run_id")
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class PartialModel(BaseModel):
|
|
249
|
+
"""Partial directory tracking for subparameters
|
|
250
|
+
|
|
251
|
+
Tracks partial directories that are shared across jobs with different
|
|
252
|
+
parameter values (but same partial identifier). These directories are
|
|
253
|
+
at WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID/ (reconstructible).
|
|
254
|
+
|
|
255
|
+
Fields:
|
|
256
|
+
partial_id: Hex hash of the partial identifier
|
|
257
|
+
task_id: Task class identifier
|
|
258
|
+
subparameters_name: Name of the subparameters definition
|
|
259
|
+
created_at: When this partial directory was first created
|
|
260
|
+
"""
|
|
261
|
+
|
|
262
|
+
partial_id = CharField(primary_key=True)
|
|
263
|
+
task_id = CharField(index=True)
|
|
264
|
+
subparameters_name = CharField(index=True)
|
|
265
|
+
created_at = DateTimeField(default=datetime.now)
|
|
266
|
+
|
|
267
|
+
class Meta:
|
|
268
|
+
table_name = "partials"
|
|
269
|
+
indexes = ((("task_id", "subparameters_name"), False),)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class JobPartialModel(BaseModel):
|
|
273
|
+
"""Links jobs to partial directories they use
|
|
274
|
+
|
|
275
|
+
Tracks which jobs reference which partial directories. This enables
|
|
276
|
+
cleanup of orphan partials when all referencing jobs are deleted.
|
|
277
|
+
|
|
278
|
+
A job can use multiple partials (different subparameters definitions),
|
|
279
|
+
and a partial can be used by multiple jobs.
|
|
280
|
+
|
|
281
|
+
Fields:
|
|
282
|
+
job_id: ID of the job using this partial
|
|
283
|
+
experiment_id: ID of the experiment
|
|
284
|
+
run_id: ID of the run
|
|
285
|
+
partial_id: ID of the partial directory being used
|
|
286
|
+
"""
|
|
287
|
+
|
|
288
|
+
job_id = CharField(index=True)
|
|
289
|
+
experiment_id = CharField(index=True)
|
|
290
|
+
run_id = CharField(index=True)
|
|
291
|
+
partial_id = CharField(index=True)
|
|
292
|
+
|
|
293
|
+
class Meta:
|
|
294
|
+
table_name = "job_partials"
|
|
295
|
+
primary_key = CompositeKey("job_id", "experiment_id", "run_id", "partial_id")
|
|
296
|
+
indexes = ((("partial_id",), False),) # For finding jobs using a partial
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# List of all models for binding
|
|
300
|
+
ALL_MODELS = [
|
|
301
|
+
ExperimentModel,
|
|
302
|
+
ExperimentRunModel,
|
|
303
|
+
WorkspaceSyncMetadata,
|
|
304
|
+
JobModel,
|
|
305
|
+
JobTagModel,
|
|
306
|
+
ServiceModel,
|
|
307
|
+
PartialModel,
|
|
308
|
+
JobPartialModel,
|
|
309
|
+
]
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def initialize_workspace_database(
    db_path: Path, read_only: bool = False
) -> SqliteDatabase:
    """Initialize a workspace database connection with proper configuration

    Creates and configures a SQLite database connection for the workspace,
    binds every model in ``ALL_MODELS`` to it and, in write mode, creates
    the tables and the default ``WorkspaceSyncMetadata`` row.

    Uses file-based locking to prevent multiple processes from initializing
    the database simultaneously, which could cause SQLite locking issues.

    Args:
        db_path: Path to the workspace SQLite database file
        read_only: If True, open database in read-only mode

    Returns:
        Configured SqliteDatabase instance
    """
    # Ensure parent directory exists (unless read-only)
    if not read_only:
        db_path.parent.mkdir(parents=True, exist_ok=True)

    pragmas = {
        "journal_mode": "wal",  # Write-Ahead Logging for concurrent reads
        "foreign_keys": 1,  # Enable foreign key constraints
        "ignore_check_constraints": 0,
        "synchronous": 1,  # NORMAL mode (balance safety/speed)
        "busy_timeout": 5000,  # Wait up to 5 seconds for locks
    }
    if read_only:
        # Declared as a pragma so that it is applied to every connection the
        # database object opens, not only to the one used during setup.
        pragmas["query_only"] = 1

    # Use file-based lock to prevent concurrent initialization from multiple
    # processes. This prevents SQLite locking issues during table creation.
    # NOTE(review): the lock file is also used in read-only mode and lives
    # next to the database - confirm that directory is always writable.
    lock_path = db_path.parent / f".{db_path.name}.init.lock"
    lock = fasteners.InterProcessLock(str(lock_path))

    # Acquire lock (blocking) - only one process can initialize at a time
    with lock:
        # check_same_thread=False is forwarded to the SQLite driver so a
        # connection is not tied to the thread that created it.
        db = SqliteDatabase(
            str(db_path),
            pragmas=pragmas,
            check_same_thread=False,
        )

        if read_only:
            # Open the connection right away, while the lock is held, so
            # that failures to open the file surface here.
            db.connect(reuse_if_open=True)

        # Bind all models to this database
        db.bind(ALL_MODELS)

        # Create tables if they don't exist (only in write mode)
        if not read_only:
            db.create_tables(ALL_MODELS, safe=True)

            # Initialize WorkspaceSyncMetadata with default row if not exists
            # Use try/except to handle race condition (shouldn't happen with
            # lock, but be safe)
            try:
                WorkspaceSyncMetadata.get_or_create(
                    id="workspace",
                    defaults={"last_sync_time": None, "sync_interval_minutes": 5},
                )
            except (IntegrityError, OperationalError):
                # If get_or_create fails, the row likely already exists
                pass

    return db
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def close_workspace_database(db: SqliteDatabase):
|
|
382
|
+
"""Close a workspace database connection
|
|
383
|
+
|
|
384
|
+
Args:
|
|
385
|
+
db: The database connection to close
|
|
386
|
+
"""
|
|
387
|
+
if db and not db.is_closed():
|
|
388
|
+
db.close()
|