experimaestro 1.11.1__py3-none-any.whl → 2.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro has been flagged as potentially problematic; review the advisory details below before upgrading.

Files changed (133)
  1. experimaestro/__init__.py +10 -11
  2. experimaestro/annotations.py +167 -206
  3. experimaestro/cli/__init__.py +140 -16
  4. experimaestro/cli/filter.py +42 -74
  5. experimaestro/cli/jobs.py +157 -106
  6. experimaestro/cli/progress.py +269 -0
  7. experimaestro/cli/refactor.py +249 -0
  8. experimaestro/click.py +0 -1
  9. experimaestro/commandline.py +19 -3
  10. experimaestro/connectors/__init__.py +22 -3
  11. experimaestro/connectors/local.py +12 -0
  12. experimaestro/core/arguments.py +192 -37
  13. experimaestro/core/identifier.py +127 -12
  14. experimaestro/core/objects/__init__.py +6 -0
  15. experimaestro/core/objects/config.py +702 -285
  16. experimaestro/core/objects/config_walk.py +24 -6
  17. experimaestro/core/serialization.py +91 -34
  18. experimaestro/core/serializers.py +1 -8
  19. experimaestro/core/subparameters.py +164 -0
  20. experimaestro/core/types.py +198 -83
  21. experimaestro/exceptions.py +26 -0
  22. experimaestro/experiments/cli.py +107 -25
  23. experimaestro/generators.py +50 -9
  24. experimaestro/huggingface.py +3 -1
  25. experimaestro/launcherfinder/parser.py +29 -0
  26. experimaestro/launcherfinder/registry.py +3 -3
  27. experimaestro/launchers/__init__.py +26 -1
  28. experimaestro/launchers/direct.py +12 -0
  29. experimaestro/launchers/slurm/base.py +154 -2
  30. experimaestro/mkdocs/base.py +6 -8
  31. experimaestro/mkdocs/metaloader.py +0 -1
  32. experimaestro/mypy.py +452 -7
  33. experimaestro/notifications.py +75 -16
  34. experimaestro/progress.py +404 -0
  35. experimaestro/rpyc.py +0 -1
  36. experimaestro/run.py +19 -6
  37. experimaestro/scheduler/__init__.py +18 -1
  38. experimaestro/scheduler/base.py +504 -959
  39. experimaestro/scheduler/dependencies.py +43 -28
  40. experimaestro/scheduler/dynamic_outputs.py +259 -130
  41. experimaestro/scheduler/experiment.py +582 -0
  42. experimaestro/scheduler/interfaces.py +474 -0
  43. experimaestro/scheduler/jobs.py +485 -0
  44. experimaestro/scheduler/services.py +186 -12
  45. experimaestro/scheduler/signal_handler.py +32 -0
  46. experimaestro/scheduler/state.py +1 -1
  47. experimaestro/scheduler/state_db.py +388 -0
  48. experimaestro/scheduler/state_provider.py +2345 -0
  49. experimaestro/scheduler/state_sync.py +834 -0
  50. experimaestro/scheduler/workspace.py +52 -10
  51. experimaestro/scriptbuilder.py +7 -0
  52. experimaestro/server/__init__.py +153 -32
  53. experimaestro/server/data/index.css +0 -125
  54. experimaestro/server/data/index.css.map +1 -1
  55. experimaestro/server/data/index.js +194 -58
  56. experimaestro/server/data/index.js.map +1 -1
  57. experimaestro/settings.py +47 -6
  58. experimaestro/sphinx/__init__.py +3 -3
  59. experimaestro/taskglobals.py +20 -0
  60. experimaestro/tests/conftest.py +80 -0
  61. experimaestro/tests/core/test_generics.py +2 -2
  62. experimaestro/tests/identifier_stability.json +45 -0
  63. experimaestro/tests/launchers/bin/sacct +6 -2
  64. experimaestro/tests/launchers/bin/sbatch +4 -2
  65. experimaestro/tests/launchers/common.py +2 -2
  66. experimaestro/tests/launchers/test_slurm.py +80 -0
  67. experimaestro/tests/restart.py +1 -1
  68. experimaestro/tests/tasks/all.py +7 -0
  69. experimaestro/tests/tasks/test_dynamic.py +231 -0
  70. experimaestro/tests/test_checkers.py +2 -2
  71. experimaestro/tests/test_cli_jobs.py +615 -0
  72. experimaestro/tests/test_dependencies.py +11 -17
  73. experimaestro/tests/test_deprecated.py +630 -0
  74. experimaestro/tests/test_environment.py +200 -0
  75. experimaestro/tests/test_experiment.py +3 -3
  76. experimaestro/tests/test_file_progress.py +425 -0
  77. experimaestro/tests/test_file_progress_integration.py +477 -0
  78. experimaestro/tests/test_forward.py +3 -3
  79. experimaestro/tests/test_generators.py +93 -0
  80. experimaestro/tests/test_identifier.py +520 -169
  81. experimaestro/tests/test_identifier_stability.py +458 -0
  82. experimaestro/tests/test_instance.py +16 -21
  83. experimaestro/tests/test_multitoken.py +442 -0
  84. experimaestro/tests/test_mypy.py +433 -0
  85. experimaestro/tests/test_objects.py +314 -30
  86. experimaestro/tests/test_outputs.py +8 -8
  87. experimaestro/tests/test_param.py +22 -26
  88. experimaestro/tests/test_partial_paths.py +231 -0
  89. experimaestro/tests/test_progress.py +2 -50
  90. experimaestro/tests/test_resumable_task.py +480 -0
  91. experimaestro/tests/test_serializers.py +141 -60
  92. experimaestro/tests/test_state_db.py +434 -0
  93. experimaestro/tests/test_subparameters.py +160 -0
  94. experimaestro/tests/test_tags.py +151 -15
  95. experimaestro/tests/test_tasks.py +137 -160
  96. experimaestro/tests/test_token_locking.py +252 -0
  97. experimaestro/tests/test_tokens.py +25 -19
  98. experimaestro/tests/test_types.py +133 -11
  99. experimaestro/tests/test_validation.py +19 -19
  100. experimaestro/tests/test_workspace_triggers.py +158 -0
  101. experimaestro/tests/token_reschedule.py +5 -3
  102. experimaestro/tests/utils.py +2 -2
  103. experimaestro/tokens.py +154 -57
  104. experimaestro/tools/diff.py +8 -1
  105. experimaestro/tui/__init__.py +8 -0
  106. experimaestro/tui/app.py +2303 -0
  107. experimaestro/tui/app.tcss +353 -0
  108. experimaestro/tui/log_viewer.py +228 -0
  109. experimaestro/typingutils.py +11 -2
  110. experimaestro/utils/__init__.py +23 -0
  111. experimaestro/utils/environment.py +148 -0
  112. experimaestro/utils/git.py +129 -0
  113. experimaestro/utils/resources.py +1 -1
  114. experimaestro/version.py +34 -0
  115. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/METADATA +70 -39
  116. experimaestro-2.0.0b4.dist-info/RECORD +181 -0
  117. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info}/WHEEL +1 -1
  118. experimaestro-2.0.0b4.dist-info/entry_points.txt +16 -0
  119. experimaestro/compat.py +0 -6
  120. experimaestro/core/objects.pyi +0 -225
  121. experimaestro/server/data/0c35d18bf06992036b69.woff2 +0 -0
  122. experimaestro/server/data/219aa9140e099e6c72ed.woff2 +0 -0
  123. experimaestro/server/data/3a4004a46a653d4b2166.woff +0 -0
  124. experimaestro/server/data/3baa5b8f3469222b822d.woff +0 -0
  125. experimaestro/server/data/4d73cb90e394b34b7670.woff +0 -0
  126. experimaestro/server/data/4ef4218c522f1eb6b5b1.woff2 +0 -0
  127. experimaestro/server/data/5d681e2edae8c60630db.woff +0 -0
  128. experimaestro/server/data/6f420cf17cc0d7676fad.woff2 +0 -0
  129. experimaestro/server/data/c380809fd3677d7d6903.woff2 +0 -0
  130. experimaestro/server/data/f882956fd323fd322f31.woff +0 -0
  131. experimaestro-1.11.1.dist-info/RECORD +0 -158
  132. experimaestro-1.11.1.dist-info/entry_points.txt +0 -17
  133. {experimaestro-1.11.1.dist-info → experimaestro-2.0.0b4.dist-info/licenses}/LICENSE +0 -0
@@ -1,9 +1,12 @@
1
1
  import abc
2
2
  from enum import Enum
3
3
  import functools
4
+ import logging
4
5
  import threading
5
6
  from typing import Set
6
7
 
8
+ logger = logging.getLogger(__name__)
9
+
7
10
 
8
11
  class ServiceListener:
9
12
  """A service listener"""
@@ -13,6 +16,12 @@ class ServiceListener:
13
16
 
14
17
 
15
18
  class ServiceState(Enum):
19
+ """State of a service lifecycle.
20
+
21
+ Services transition through these states:
22
+ STOPPED -> STARTING -> RUNNING -> STOPPING -> STOPPED
23
+ """
24
+
16
25
  STOPPED = 0
17
26
  STARTING = 1
18
27
  RUNNING = 2
@@ -24,27 +33,72 @@ class Service:
24
33
 
25
34
  Services can be associated with an experiment. They send
26
35
  notifications to service listeners.
36
+
37
+ To support restarting services from monitor mode, subclasses should
38
+ override :meth:`state_dict` to return the data needed to recreate
39
+ the service, and implement :meth:`from_state_dict` to recreate it.
27
40
  """
28
41
 
29
42
  id: str
30
43
  _state: ServiceState = ServiceState.STOPPED
31
44
 
32
45
    def __init__(self):
        # Listeners live in a set guarded by a lock: they may be added or
        # removed from other threads while the state setter iterates over
        # a snapshot to deliver notifications.
        self._listeners: Set[ServiceListener] = set()
        self._listeners_lock = threading.Lock()
48
+
49
+ def state_dict(self) -> dict:
50
+ """Return a dictionary representation for serialization.
51
+
52
+ Subclasses should override this to include any parameters needed
53
+ to recreate the service. The base implementation returns the
54
+ class module and name.
55
+
56
+ Returns:
57
+ Dict with '__class__' key and any additional kwargs.
58
+ """
59
+ return {
60
+ "__class__": f"{self.__class__.__module__}.{self.__class__.__name__}",
61
+ }
62
+
63
+ @staticmethod
64
+ def from_state_dict(data: dict) -> "Service":
65
+ """Recreate a service from a state dictionary.
66
+
67
+ Args:
68
+ data: Dictionary from :meth:`state_dict`
69
+
70
+ Returns:
71
+ A new Service instance, or raises if the class cannot be loaded.
72
+ """
73
+ import importlib
74
+
75
+ class_path = data.get("__class__")
76
+ if not class_path:
77
+ raise ValueError("Missing '__class__' in service state_dict")
78
+
79
+ module_name, class_name = class_path.rsplit(".", 1)
80
+ module = importlib.import_module(module_name)
81
+ cls = getattr(module, class_name)
82
+
83
+ # Remove __class__ and pass remaining as kwargs
84
+ kwargs = {k: v for k, v in data.items() if k != "__class__"}
85
+ return cls(**kwargs)
34
86
 
35
87
  def add_listener(self, listener: ServiceListener):
36
88
  """Adds a listener
37
89
 
38
90
  :param listener: The listener to add
39
91
  """
40
- self.listeners.add(listener)
92
+ with self._listeners_lock:
93
+ self._listeners.add(listener)
41
94
 
42
95
  def remove_listener(self, listener: ServiceListener):
43
96
  """Removes a listener
44
97
 
45
98
  :param listener: The listener to remove
46
99
  """
47
- self.listeners.remove(listener)
100
+ with self._listeners_lock:
101
+ self._listeners.discard(listener)
48
102
 
49
103
    def description(self):
        """Return a human-readable description of the service (empty by default)."""
        return ""
@@ -58,35 +112,147 @@ class Service:
58
112
  # Set the state
59
113
  self._state = state
60
114
 
61
- for listener in self.listeners:
62
- listener.service_state_changed(self)
115
+ # Notify listeners with thread-safe snapshot
116
+ with self._listeners_lock:
117
+ listeners_snapshot = list(self._listeners)
118
+
119
+ for listener in listeners_snapshot:
120
+ try:
121
+ listener.service_state_changed(self)
122
+ except Exception:
123
+ logger.exception("Error notifying listener %s", listener)
63
124
 
64
125
 
65
126
  class WebService(Service):
66
- """Web service"""
127
+ """Base class for web-based experiment services.
128
+
129
+ Web services provide HTTP endpoints that can be accessed through the
130
+ experimaestro web interface. When an experiment is running with a port
131
+ configured, web services are automatically proxied through the main
132
+ experimaestro server.
133
+
134
+ To implement a web service:
135
+
136
+ 1. Subclass ``WebService``
137
+ 2. Set a unique ``id`` class attribute
138
+ 3. Implement the :meth:`_serve` method to start your web server
139
+ 4. Set ``self.url`` and call ``running.set()`` when ready
140
+ 5. Optionally check ``self.should_stop()`` to handle graceful shutdown
141
+
142
+ Example::
143
+
144
+ class MyWebService(WebService):
145
+ id = "myservice"
146
+
147
+ def _serve(self, running: threading.Event):
148
+ # Start your web server
149
+ self.url = "http://localhost:8080"
150
+ running.set()
151
+ # Keep serving, checking for stop signal
152
+ while not self.should_stop():
153
+ time.sleep(1)
154
+ """
67
155
 
68
156
    def __init__(self):
        super().__init__()
        self.url = None  # URL of the running server; set by _serve implementations
        self.thread = None  # Background thread created by serve()
        self._stop_event = threading.Event()  # Set by stop() for graceful shutdown
161
+
162
+ def should_stop(self) -> bool:
163
+ """Check if the service should stop.
164
+
165
+ Subclasses can call this in their _serve loop to check for
166
+ graceful shutdown requests.
167
+
168
+ :return: True if stop() has been called
169
+ """
170
+ return self._stop_event.is_set()
71
171
 
72
172
    def get_url(self):
        """Get the URL of this web service, starting it if needed.

        If the service is not running, this method will start it and
        block until the URL is available.

        :return: The URL where this service can be accessed
        """
        if self.state == ServiceState.STOPPED:
            # Reset the shutdown flag so a restarted service does not
            # immediately see should_stop() == True
            self._stop_event.clear()
            self.state = ServiceState.STARTING
            self.running = threading.Event()
            self.serve()

        # Wait until the server is ready
        # NOTE(review): if two threads call get_url() concurrently while
        # STOPPED, or a caller arrives while state is STARTING, self.running
        # may not exist yet — confirm this is only called from one thread.
        self.running.wait()
        self.state = ServiceState.RUNNING

        # Returns the URL
        return self.url
83
192
 
84
    def stop(self, timeout: float = 2.0):
        """Stop the web service.

        This method signals the service to stop and waits for the thread
        to terminate. If the thread doesn't stop gracefully within the
        timeout, it attempts to forcefully terminate it.

        :param timeout: Seconds to wait for graceful shutdown before forcing
        """
        if self.state == ServiceState.STOPPED:
            return

        self.state = ServiceState.STOPPING

        # Signal the service to stop (picked up via should_stop())
        self._stop_event.set()

        # Wait for the thread to finish
        if self.thread is not None and self.thread.is_alive():
            self.thread.join(timeout=timeout)

            # If thread is still alive, try to terminate it forcefully
            # (best effort — see _force_stop_thread)
            if self.thread.is_alive():
                self._force_stop_thread()

        self.url = None
        self.state = ServiceState.STOPPED
220
+
221
+ def _force_stop_thread(self):
222
+ """Attempt to forcefully stop the service thread.
223
+
224
+ This uses ctypes to raise an exception in the thread. It's not
225
+ guaranteed to work (e.g., if the thread is blocked in C code),
226
+ but it's the best we can do in Python.
227
+ """
228
+ import ctypes
229
+
230
+ if self.thread is None or not self.thread.is_alive():
231
+ return
232
+
233
+ thread_id = self.thread.ident
234
+ if thread_id is None:
235
+ return
236
+
237
+ # Raise SystemExit in the target thread
238
+ res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
239
+ ctypes.c_ulong(thread_id), ctypes.py_object(SystemExit)
240
+ )
241
+
242
+ if res == 0:
243
+ # Thread ID was invalid
244
+ pass
245
+ elif res > 1:
246
+ # Multiple threads affected - reset
247
+ ctypes.pythonapi.PyThreadState_SetAsyncExc(
248
+ ctypes.c_ulong(thread_id), ctypes.c_long(0)
249
+ )
86
250
 
87
251
  def serve(self):
88
- import threading
252
+ """Start the web service in a background thread.
89
253
 
254
+ This method creates a daemon thread that calls :meth:`_serve`.
255
+ """
90
256
  self.thread = threading.Thread(
91
257
  target=functools.partial(self._serve, self.running),
92
258
  name=f"service[{self.id}]",
@@ -95,9 +261,17 @@ class WebService(Service):
95
261
  self.thread.start()
96
262
 
97
263
    @abc.abstractmethod
    def _serve(self, running: threading.Event):
        """Start the web server (implement in subclasses).

        This method should:

        1. Start your web server
        2. Set ``self.url`` to the service URL
        3. Call ``running.set()`` to signal readiness
        4. Keep the server running (this runs in a background thread)
        5. Optionally check ``self.should_stop()`` for graceful shutdown

        The service thread started by :meth:`serve` terminates when this
        method returns.

        :param running: Event to signal when ``self.url`` is set
        """
        ...
@@ -0,0 +1,32 @@
1
+ import signal
2
+ from typing import Set
3
+ from experimaestro.scheduler import experiment
4
+ from experimaestro.utils import logger
5
+
6
+
7
class SignalHandler:
    """Process-wide SIGINT handler that stops registered experiments.

    The handler previously installed for SIGINT is saved when the first
    experiment is registered and restored when the last one is removed.
    """

    def __init__(self):
        # Experiments to stop when SIGINT is received
        self.experiments: Set["experiment"] = set()
        # Handler that was installed before this object took over
        self.original_sigint_handler = None

    def add(self, xp: "experiment"):
        """Register an experiment and install this object as SIGINT handler

        :param xp: The experiment to stop on SIGINT
        """
        if not self.experiments:
            # First experiment: remember the current handler so it can be
            # restored once all experiments are removed
            self.original_sigint_handler = signal.getsignal(signal.SIGINT)

        signal.signal(signal.SIGINT, self)

        self.experiments.add(xp)

    def remove(self, xp):
        """Unregister an experiment, restoring the original handler if none is left

        :param xp: The experiment to remove (a no-op if it is not registered)
        """
        # discard() instead of remove(): removing an experiment that was
        # never registered (e.g. after a setup failure) must not raise
        self.experiments.discard(xp)
        if not self.experiments:
            signal.signal(signal.SIGINT, self.original_sigint_handler)

    def __call__(self, signum, frame):
        """SIGINT signal handler"""
        logger.warning("Signal received")
        # Iterate over a snapshot: xp.stop() may unregister the experiment,
        # which would otherwise mutate the set during iteration
        for xp in list(self.experiments):
            xp.stop()


SIGNAL_HANDLER = SignalHandler()
@@ -5,7 +5,7 @@ from typing import Iterable, Optional, Type
5
5
  from experimaestro import Task
6
6
 
7
7
  from experimaestro.core.context import SerializationContext
8
- from experimaestro.scheduler.base import Job, JobDependency
8
+ from experimaestro.scheduler.jobs import Job, JobDependency
9
9
  from experimaestro.settings import find_workspace
10
10
  from experimaestro.core.serialization import from_state_dict, save_definition
11
11
 
@@ -0,0 +1,388 @@
1
+ """Database models for experiment state persistence
2
+
3
+ This module provides peewee ORM models for storing job and service state
4
+ in a workspace-level SQLite database. The workspace has a single database
5
+ file (.experimaestro/workspace.db) with WAL mode enabled for concurrent
6
+ read/write access.
7
+
8
+ Key design:
9
+ - One database per workspace at: workdir/.experimaestro/workspace.db
10
+ - Experiments can be run multiple times, each run tracked separately
11
+ - Jobs and services are scoped to (experiment_id, run_id)
12
+ - Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
13
+ - Current state and progress stored in JobModel - no history tracking
14
+ - Database instance is passed explicitly to avoid global state
15
+ """
16
+
17
+ from pathlib import Path
18
+ from peewee import (
19
+ Model,
20
+ SqliteDatabase,
21
+ CharField,
22
+ FloatField,
23
+ IntegerField,
24
+ TextField,
25
+ DateTimeField,
26
+ CompositeKey,
27
+ IntegrityError,
28
+ OperationalError,
29
+ )
30
+ from datetime import datetime
31
+ import fasteners
32
+
33
+
34
class BaseModel(Model):
    """Base model for workspace database tables

    Models are unbound by default. Use database.bind_ctx() when querying:

        with workspace.workspace_db.bind_ctx([ExperimentModel, JobModel, ...]):
            experiments = ExperimentModel.select()

    Or use the convenience method bind_models() defined below.

    NOTE(review): no bind_models() helper is defined in this module —
    initialize_workspace_database() calls db.bind(ALL_MODELS) instead;
    confirm the docstring reference.
    """

    class Meta:
        database = None  # Unbound - will be bound when used
47
+
48
+
49
class ExperimentModel(BaseModel):
    """Experiment metadata - tracks experiment definitions

    An experiment can be run multiple times. This table tracks the experiment
    itself and points to the current/latest run.

    Fields:
        experiment_id: Unique identifier for the experiment
        current_run_id: Points to the current/latest run (null if no runs yet)
        created_at: When experiment was first created
        updated_at: When experiment was last modified (for incremental queries)

    Note: Experiment path is derivable: {workspace}/xp/{experiment_id}
    """

    experiment_id = CharField(primary_key=True)
    current_run_id = CharField(null=True)
    created_at = DateTimeField(default=datetime.now)
    # Indexed so "what changed since T" incremental queries stay cheap
    updated_at = DateTimeField(default=datetime.now, index=True)

    class Meta:
        table_name = "experiments"
71
+
72
+
73
class ExperimentRunModel(BaseModel):
    """Individual experiment runs

    Each time an experiment is executed, a new run is created.
    Runs are identified by (experiment_id, run_id) composite key.

    run_id format: timestamp-based like "20250120_143022" or sequential counter

    Fields:
        experiment_id: ID of the experiment this run belongs to
        run_id: Unique ID for this run (timestamp or sequential)
        started_at: When this run started
        ended_at: When this run completed (null if still active)
        status: Run status (active, completed, failed, abandoned)
    """

    experiment_id = CharField(index=True)
    run_id = CharField(index=True)
    started_at = DateTimeField(default=datetime.now)
    ended_at = DateTimeField(null=True)
    status = CharField(default="active", index=True)

    class Meta:
        table_name = "experiment_runs"
        # Composite key: the same run_id may exist for different experiments
        primary_key = CompositeKey("experiment_id", "run_id")
        indexes = ((("experiment_id", "started_at"), False),)  # For finding latest run
99
+
100
+
101
class WorkspaceSyncMetadata(BaseModel):
    """Workspace-level metadata for disk sync tracking

    Single-row table to track when the last disk sync occurred.
    Used to throttle sync operations and prevent excessive disk scanning.

    Fields:
        id: Always "workspace" (single row table)
        last_sync_time: When last sync completed
        sync_interval_minutes: Minimum interval between syncs
    """

    # Fixed primary key enforces the single-row invariant
    id = CharField(primary_key=True, default="workspace")
    last_sync_time = DateTimeField(null=True)
    sync_interval_minutes = IntegerField(default=5)

    class Meta:
        table_name = "workspace_sync_metadata"
119
+
120
+
121
class JobModel(BaseModel):
    """Job information linked to specific experiment run

    Jobs are tied to a specific run of an experiment via (experiment_id, run_id).
    The same job can appear in multiple runs with different states/tags.

    Fields:
        job_id: Unique identifier for the job (from task identifier)
        experiment_id: ID of the experiment this job belongs to
        run_id: ID of the run this job belongs to
        task_id: Task class identifier
        locator: Full task locator (identifier)
        state: Current job state (e.g., "unscheduled", "waiting", "running", "done", "error")
        failure_reason: Optional failure reason for error states (e.g., "TIMEOUT", "DEPENDENCY")
        submitted_time: When job was submitted (Unix timestamp)
        started_time: When job started running (Unix timestamp)
        ended_time: When job finished (Unix timestamp)
        progress: JSON-encoded list of progress updates
        updated_at: When job was last modified (for incremental queries)

    Note: Job path is derivable: {workspace}/jobs/{task_id}/{job_id}
    Note: Tags are stored in separate JobTagModel table (run-scoped)
    Note: Dependencies are NOT stored in DB (available in state.json only)
    """

    job_id = CharField(index=True)
    experiment_id = CharField(index=True)
    run_id = CharField(index=True)
    task_id = CharField(index=True)
    locator = CharField()
    state = CharField(default="unscheduled", index=True)
    failure_reason = CharField(null=True)
    # Timestamps are Unix epoch floats, null until the phase is reached
    submitted_time = FloatField(null=True)
    started_time = FloatField(null=True)
    ended_time = FloatField(null=True)
    progress = TextField(default="[]")  # JSON list of progress updates
    updated_at = DateTimeField(default=datetime.now, index=True)

    class Meta:
        table_name = "jobs"
        # A job is unique within one run of one experiment
        primary_key = CompositeKey("job_id", "experiment_id", "run_id")
        indexes = (
            (
                ("experiment_id", "run_id", "state"),
                False,
            ),  # Query jobs by run and state
            (
                ("experiment_id", "run_id", "task_id"),
                False,
            ),  # Query jobs by run and task
            (
                ("experiment_id", "run_id", "updated_at"),
                False,
            ),  # Query jobs by run and update time
        )
176
+
177
+
178
class JobTagModel(BaseModel):
    """Job tags for efficient searching (fixes GH #128)

    **FIX FOR GH ISSUE #128**: Tags are now experiment-run-dependent, not job-dependent.
    The same job in different experiment runs can have different tags, because tags
    are scoped to the (job_id, experiment_id, run_id) combination.

    Tags are stored as key-value pairs in a separate table for efficient indexing.
    Each job can have multiple tags within an experiment run context.

    Key change from old behavior:
    - OLD: Tags were global per job_id (broken - same job in different experiments/runs shared tags)
    - NEW: Tags are scoped per (job_id, experiment_id, run_id) - same job can have different tags in different runs

    Fields:
        job_id: ID of the job
        experiment_id: ID of the experiment
        run_id: ID of the run
        tag_key: Tag name
        tag_value: Tag value
    """

    job_id = CharField(index=True)
    experiment_id = CharField(index=True)
    run_id = CharField(index=True)
    tag_key = CharField(index=True)
    tag_value = CharField(index=True)

    class Meta:
        table_name = "job_tags"
        # One value per tag key for a given job within a given run
        primary_key = CompositeKey("job_id", "experiment_id", "run_id", "tag_key")
        indexes = (
            (("tag_key", "tag_value"), False),  # For tag-based queries
            (
                ("experiment_id", "run_id", "tag_key"),
                False,
            ),  # For experiment run tag queries
        )
216
+
217
+
218
class ServiceModel(BaseModel):
    """Service information linked to specific experiment run

    Services are tied to a specific run of an experiment via (experiment_id, run_id).

    Fields:
        service_id: Unique identifier for the service
        experiment_id: ID of the experiment this service belongs to
        run_id: ID of the run this service belongs to
        description: Human-readable description
        state: Service state (e.g., "running", "stopped")
        state_dict: JSON serialized state_dict for service recreation
        created_at: When service was created
        updated_at: Timestamp of last update
    """

    service_id = CharField()
    experiment_id = CharField(index=True)
    run_id = CharField(index=True)
    description = TextField(default="")
    state = CharField()
    state_dict = TextField(default="{}")  # JSON for service recreation
    created_at = DateTimeField(default=datetime.now)
    updated_at = DateTimeField(default=datetime.now)

    class Meta:
        table_name = "services"
        # A service id is unique within one run of one experiment
        primary_key = CompositeKey("service_id", "experiment_id", "run_id")
246
+
247
+
248
class PartialModel(BaseModel):
    """Partial directory tracking for subparameters

    Tracks partial directories that are shared across jobs with different
    parameter values (but same partial identifier). These directories are
    at WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID/ (reconstructible).

    Fields:
        partial_id: Hex hash of the partial identifier
        task_id: Task class identifier
        subparameters_name: Name of the subparameters definition
        created_at: When this partial directory was first created
    """

    partial_id = CharField(primary_key=True)
    task_id = CharField(index=True)
    subparameters_name = CharField(index=True)
    created_at = DateTimeField(default=datetime.now)

    class Meta:
        table_name = "partials"
        # For listing all partials of one subparameters definition of a task
        indexes = ((("task_id", "subparameters_name"), False),)
270
+
271
+
272
class JobPartialModel(BaseModel):
    """Links jobs to partial directories they use

    Tracks which jobs reference which partial directories. This enables
    cleanup of orphan partials when all referencing jobs are deleted.

    A job can use multiple partials (different subparameters definitions),
    and a partial can be used by multiple jobs.

    Fields:
        job_id: ID of the job using this partial
        experiment_id: ID of the experiment
        run_id: ID of the run
        partial_id: ID of the partial directory being used
    """

    job_id = CharField(index=True)
    experiment_id = CharField(index=True)
    run_id = CharField(index=True)
    partial_id = CharField(index=True)

    class Meta:
        table_name = "job_partials"
        # Many-to-many link table between run-scoped jobs and partials
        primary_key = CompositeKey("job_id", "experiment_id", "run_id", "partial_id")
        indexes = ((("partial_id",), False),)  # For finding jobs using a partial
297
+
298
+
299
# List of all models for binding: initialize_workspace_database() binds this
# list to the connection and creates the corresponding tables
ALL_MODELS = [
    ExperimentModel,
    ExperimentRunModel,
    WorkspaceSyncMetadata,
    JobModel,
    JobTagModel,
    ServiceModel,
    PartialModel,
    JobPartialModel,
]
310
+
311
+
312
def initialize_workspace_database(
    db_path: Path, read_only: bool = False
) -> SqliteDatabase:
    """Initialize a workspace database connection with proper configuration

    Creates and configures a SQLite database connection for the workspace.
    Models must be bound to this database before querying.

    Uses file-based locking to prevent multiple processes from initializing
    the database simultaneously, which could cause SQLite locking issues.

    Args:
        db_path: Path to the workspace SQLite database file
        read_only: If True, open database in read-only mode

    Returns:
        Configured SqliteDatabase instance
    """
    # Ensure parent directory exists (unless read-only)
    if not read_only:
        db_path.parent.mkdir(parents=True, exist_ok=True)

    # Use file-based lock to prevent concurrent initialization from multiple processes
    # This prevents SQLite locking issues during table creation
    # NOTE(review): the lock file is created even in read-only mode, which
    # writes to db_path.parent — confirm this is acceptable for RO callers
    lock_path = db_path.parent / f".{db_path.name}.init.lock"
    lock = fasteners.InterProcessLock(str(lock_path))

    # Acquire lock (blocking) - only one process can initialize at a time
    with lock:
        # Create database connection
        # check_same_thread=False allows the connection to be used from multiple threads
        # This is safe with WAL mode and proper locking
        db = SqliteDatabase(
            str(db_path),
            pragmas={
                "journal_mode": "wal",  # Write-Ahead Logging for concurrent reads
                "foreign_keys": 1,  # Enable foreign key constraints
                "ignore_check_constraints": 0,
                "synchronous": 1,  # NORMAL mode (balance safety/speed)
                "busy_timeout": 5000,  # Wait up to 5 seconds for locks
            },
            check_same_thread=False,
        )

        if read_only:
            # Set query-only mode for read-only access
            db.execute_sql("PRAGMA query_only = ON")

        # Bind all models to this database
        db.bind(ALL_MODELS)

        # Create tables if they don't exist (only in write mode)
        if not read_only:
            db.create_tables(ALL_MODELS, safe=True)

            # Initialize WorkspaceSyncMetadata with default row if not exists
            # Use try/except to handle race condition (shouldn't happen with lock, but be safe)
            try:
                WorkspaceSyncMetadata.get_or_create(
                    id="workspace",
                    defaults={"last_sync_time": None, "sync_interval_minutes": 5},
                )
            except (IntegrityError, OperationalError):
                # If get_or_create fails, the row likely already exists
                pass

    return db
379
+
380
+
381
def close_workspace_database(db: SqliteDatabase):
    """Close a workspace database connection

    Args:
        db: The database connection to close
    """
    # Guard clauses: nothing to do for a missing or already-closed handle
    if not db:
        return
    if db.is_closed():
        return
    db.close()