experimaestro 2.0.0b8__py3-none-any.whl → 2.0.0b17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of experimaestro has been flagged as possibly problematic.
- experimaestro/__init__.py +12 -5
- experimaestro/cli/__init__.py +239 -126
- experimaestro/cli/filter.py +48 -23
- experimaestro/cli/jobs.py +253 -71
- experimaestro/cli/refactor.py +1 -2
- experimaestro/commandline.py +7 -4
- experimaestro/connectors/__init__.py +9 -1
- experimaestro/connectors/local.py +43 -3
- experimaestro/core/arguments.py +18 -18
- experimaestro/core/identifier.py +11 -11
- experimaestro/core/objects/config.py +96 -39
- experimaestro/core/objects/config_walk.py +3 -3
- experimaestro/core/{subparameters.py → partial.py} +16 -16
- experimaestro/core/partial_lock.py +394 -0
- experimaestro/core/types.py +12 -15
- experimaestro/dynamic.py +290 -0
- experimaestro/experiments/__init__.py +6 -2
- experimaestro/experiments/cli.py +217 -50
- experimaestro/experiments/configuration.py +24 -0
- experimaestro/generators.py +5 -5
- experimaestro/ipc.py +118 -1
- experimaestro/launcherfinder/__init__.py +2 -2
- experimaestro/launcherfinder/registry.py +6 -7
- experimaestro/launcherfinder/specs.py +2 -9
- experimaestro/launchers/slurm/__init__.py +2 -2
- experimaestro/launchers/slurm/base.py +62 -0
- experimaestro/locking.py +957 -1
- experimaestro/notifications.py +89 -201
- experimaestro/progress.py +63 -366
- experimaestro/rpyc.py +0 -2
- experimaestro/run.py +29 -2
- experimaestro/scheduler/__init__.py +8 -1
- experimaestro/scheduler/base.py +629 -53
- experimaestro/scheduler/dependencies.py +20 -16
- experimaestro/scheduler/experiment.py +732 -167
- experimaestro/scheduler/interfaces.py +316 -101
- experimaestro/scheduler/jobs.py +58 -20
- experimaestro/scheduler/remote/adaptive_sync.py +265 -0
- experimaestro/scheduler/remote/client.py +171 -117
- experimaestro/scheduler/remote/protocol.py +8 -193
- experimaestro/scheduler/remote/server.py +95 -71
- experimaestro/scheduler/services.py +53 -28
- experimaestro/scheduler/state_provider.py +663 -2430
- experimaestro/scheduler/state_status.py +1247 -0
- experimaestro/scheduler/transient.py +31 -0
- experimaestro/scheduler/workspace.py +1 -1
- experimaestro/scheduler/workspace_state_provider.py +1273 -0
- experimaestro/scriptbuilder.py +4 -4
- experimaestro/settings.py +36 -0
- experimaestro/tests/conftest.py +33 -5
- experimaestro/tests/connectors/bin/executable.py +1 -1
- experimaestro/tests/fixtures/pre_experiment/experiment_check_env.py +16 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_check_mock.py +14 -0
- experimaestro/tests/fixtures/pre_experiment/experiment_simple.py +12 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_env.py +5 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_error.py +3 -0
- experimaestro/tests/fixtures/pre_experiment/pre_setup_mock.py +8 -0
- experimaestro/tests/launchers/bin/test.py +1 -0
- experimaestro/tests/launchers/test_slurm.py +9 -9
- experimaestro/tests/partial_reschedule.py +46 -0
- experimaestro/tests/restart.py +3 -3
- experimaestro/tests/restart_main.py +1 -0
- experimaestro/tests/scripts/notifyandwait.py +1 -0
- experimaestro/tests/task_partial.py +38 -0
- experimaestro/tests/task_tokens.py +2 -2
- experimaestro/tests/tasks/test_dynamic.py +6 -6
- experimaestro/tests/test_dependencies.py +3 -3
- experimaestro/tests/test_deprecated.py +15 -15
- experimaestro/tests/test_dynamic_locking.py +317 -0
- experimaestro/tests/test_environment.py +24 -14
- experimaestro/tests/test_experiment.py +171 -36
- experimaestro/tests/test_identifier.py +25 -25
- experimaestro/tests/test_identifier_stability.py +3 -5
- experimaestro/tests/test_multitoken.py +2 -4
- experimaestro/tests/{test_subparameters.py → test_partial.py} +25 -25
- experimaestro/tests/test_partial_paths.py +81 -138
- experimaestro/tests/test_pre_experiment.py +219 -0
- experimaestro/tests/test_progress.py +2 -8
- experimaestro/tests/test_remote_state.py +560 -99
- experimaestro/tests/test_stray_jobs.py +261 -0
- experimaestro/tests/test_tasks.py +1 -2
- experimaestro/tests/test_token_locking.py +52 -67
- experimaestro/tests/test_tokens.py +5 -6
- experimaestro/tests/test_transient.py +225 -0
- experimaestro/tests/test_workspace_state_provider.py +768 -0
- experimaestro/tests/token_reschedule.py +1 -3
- experimaestro/tests/utils.py +2 -7
- experimaestro/tokens.py +227 -372
- experimaestro/tools/diff.py +1 -0
- experimaestro/tools/documentation.py +4 -5
- experimaestro/tools/jobs.py +1 -2
- experimaestro/tui/app.py +438 -1966
- experimaestro/tui/app.tcss +162 -0
- experimaestro/tui/dialogs.py +172 -0
- experimaestro/tui/log_viewer.py +253 -3
- experimaestro/tui/messages.py +137 -0
- experimaestro/tui/utils.py +54 -0
- experimaestro/tui/widgets/__init__.py +23 -0
- experimaestro/tui/widgets/experiments.py +468 -0
- experimaestro/tui/widgets/global_services.py +238 -0
- experimaestro/tui/widgets/jobs.py +972 -0
- experimaestro/tui/widgets/log.py +156 -0
- experimaestro/tui/widgets/orphans.py +363 -0
- experimaestro/tui/widgets/runs.py +185 -0
- experimaestro/tui/widgets/services.py +314 -0
- experimaestro/tui/widgets/stray_jobs.py +528 -0
- experimaestro/utils/__init__.py +1 -1
- experimaestro/utils/environment.py +105 -22
- experimaestro/utils/fswatcher.py +124 -0
- experimaestro/utils/jobs.py +1 -2
- experimaestro/utils/jupyter.py +1 -2
- experimaestro/utils/logging.py +72 -0
- experimaestro/version.py +2 -2
- experimaestro/webui/__init__.py +9 -0
- experimaestro/webui/app.py +117 -0
- experimaestro/{server → webui}/data/index.css +66 -11
- experimaestro/webui/data/index.css.map +1 -0
- experimaestro/{server → webui}/data/index.js +82763 -87217
- experimaestro/webui/data/index.js.map +1 -0
- experimaestro/webui/routes/__init__.py +5 -0
- experimaestro/webui/routes/auth.py +53 -0
- experimaestro/webui/routes/proxy.py +117 -0
- experimaestro/webui/server.py +200 -0
- experimaestro/webui/state_bridge.py +152 -0
- experimaestro/webui/websocket.py +413 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/METADATA +5 -6
- experimaestro-2.0.0b17.dist-info/RECORD +219 -0
- experimaestro/cli/progress.py +0 -269
- experimaestro/scheduler/state.py +0 -75
- experimaestro/scheduler/state_db.py +0 -437
- experimaestro/scheduler/state_sync.py +0 -891
- experimaestro/server/__init__.py +0 -467
- experimaestro/server/data/index.css.map +0 -1
- experimaestro/server/data/index.js.map +0 -1
- experimaestro/tests/test_cli_jobs.py +0 -615
- experimaestro/tests/test_file_progress.py +0 -425
- experimaestro/tests/test_file_progress_integration.py +0 -477
- experimaestro/tests/test_state_db.py +0 -434
- experimaestro-2.0.0b8.dist-info/RECORD +0 -187
- /experimaestro/{server → webui}/data/1815e00441357e01619e.ttf +0 -0
- /experimaestro/{server → webui}/data/2463b90d9a316e4e5294.woff2 +0 -0
- /experimaestro/{server → webui}/data/2582b0e4bcf85eceead0.ttf +0 -0
- /experimaestro/{server → webui}/data/89999bdf5d835c012025.woff2 +0 -0
- /experimaestro/{server → webui}/data/914997e1bdfc990d0897.ttf +0 -0
- /experimaestro/{server → webui}/data/c210719e60948b211a12.woff2 +0 -0
- /experimaestro/{server → webui}/data/favicon.ico +0 -0
- /experimaestro/{server → webui}/data/index.html +0 -0
- /experimaestro/{server → webui}/data/login.html +0 -0
- /experimaestro/{server → webui}/data/manifest.json +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b8.dist-info → experimaestro-2.0.0b17.dist-info}/licenses/LICENSE +0 -0
experimaestro/scheduler/state_provider.py

@@ -1,164 +1,170 @@
-"""
+"""State provider interfaces for accessing experiment and job information

-This module provides
-
-
+This module provides the abstract StateProvider interface and related data classes.
+The concrete implementations are in db_state_provider.py (DbStateProvider) and
+remote/client.py (SSHStateProviderClient).

 Key features:
--
--
--
-
-
-- Push notifications via listener callbacks (for reactive UI)
+- StateProvider ABC: Abstract base class for all state providers
+- Mock classes: Concrete implementations for database-loaded state objects
+- StateListener: Type alias for listener callbacks
+
+Note: Event classes are defined in state_status.py (EventBase and subclasses).
 """

 import json
 import logging
-import socket
 import threading
-import time
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum, auto
 from pathlib import Path
 from abc import ABC, abstractmethod
-from typing import Callable, Dict, List, Optional, Set, Tuple
-
-from watchdog.events import FileSystemEventHandler
-from watchdog.observers.api import ObservedWatch
-
-from experimaestro.scheduler.state_db import (
-    ExperimentModel,
-    ExperimentRunModel,
-    JobModel,
-    JobTagModel,
-    ServiceModel,
-    PartialModel,
-    JobPartialModel,
-    WorkspaceSyncMetadata,
-    ALL_MODELS,
-    CURRENT_DB_VERSION,
-)
+from typing import Callable, Dict, List, Optional, Set, Tuple
+
 from experimaestro.scheduler.interfaces import (
     BaseJob,
     BaseExperiment,
     BaseService,
+    ExperimentJobInformation,
+    ExperimentStatus,
     JobState,
     JobFailureStatus,
     STATE_NAME_TO_JOBSTATE,
+    deserialize_timestamp,
 )
-
-
-
-
+from experimaestro.scheduler.transient import TransientMode
+from experimaestro.notifications import (
+    ProgressInformation,
+    get_progress_information_from_dict,
+)
+from experimaestro.scheduler.state_status import EventBase

 logger = logging.getLogger("xpm.state")


-#
-
-
-
-    EXPERIMENT_UPDATED = auto()
-    RUN_UPDATED = auto()
-    JOB_UPDATED = auto()
-    SERVICE_UPDATED = auto()
+# =============================================================================
+# Process Information
+# =============================================================================


 @dataclass
-class
-    """
+class ProcessInfo:
+    """Information about a running or completed process"""

-
-
-
-
+    pid: int
+    """Process ID"""
+
+    type: str
+    """Process type (e.g., 'local', 'slurm', 'oar')"""

-
-
+    running: bool = False
+    """Whether the process is currently running"""

+    cpu_percent: Optional[float] = None
+    """CPU usage percentage (if available)"""

-
-
+    memory_mb: Optional[float] = None
+    """Memory usage in MB (if available)"""
+
+    num_threads: Optional[int] = None
+    """Number of threads (if available)"""
+
+
+# Type alias for listener callbacks (uses EventBase from state_status)
+StateListener = Callable[[EventBase], None]
+
+
+# =============================================================================
+# State Provider ABC
+# =============================================================================


 class StateProvider(ABC):
     """Abstract base class for state providers

     Defines the interface that all state providers must implement.
-    This enables both local (
-    (
+    This enables both local (DbStateProvider), remote (SSHStateProviderClient),
+    and live (Scheduler) providers to be used interchangeably.
+
+    Concrete implementations:
+    - Scheduler: Live in-memory state from running experiments
+    - OfflineStateProvider: Base for cached/persistent state (in db_state_provider.py)
+    - DbStateProvider: SQLite database-backed state
+    - SSHStateProviderClient: Remote SSH-based state

-
-    (and their URLs) across calls to get_services(). Subclasses should call
-    _init_service_cache() in their __init__ and implement _fetch_services_from_storage().
+    State listener management is provided by the base class with default implementations.
     """

-
-
-        self._service_cache: Dict[Tuple[str, str], Dict[str, "BaseService"]] = {}
-        self._service_cache_lock = threading.Lock()
+    #: Whether this provider is connected to a live scheduler
+    is_live: bool = False

-    def
-        """
-
-
+    def __init__(self) -> None:
+        """Initialize state listener management"""
+        self._state_listeners: Set[StateListener] = set()
+        self._state_listener_lock = threading.Lock()

-    def
-
-    ) -> List[BaseService]:
-        """Get services for an experiment
+    def add_listener(self, listener: StateListener) -> None:
+        """Register a listener for state change events

-
-
-        and must implement _fetch_services_from_storage() for persistent storage.
+        Args:
+            listener: Callback function that receives StateEvent objects
         """
-
-
-        run_id = self.get_current_run(experiment_id)
-        if run_id is None:
-            return []
+        with self._state_listener_lock:
+            self._state_listeners.add(listener)

-
+    def remove_listener(self, listener: StateListener) -> None:
+        """Unregister a listener

-
-
-
-
-
-            self._service_cache[cache_key] = {s.id: s for s in live_services}
-            return live_services
+        Args:
+            listener: Previously registered callback function
+        """
+        with self._state_listener_lock:
+            self._state_listeners.discard(listener)

-
-
-        if cached is not None:
-            return list(cached.values())
+    def _notify_state_listeners(self, event: EventBase) -> None:
+        """Notify all state listeners of an event

-
-
-
-
+        Args:
+            event: State change event to broadcast
+        """
+        with self._state_listener_lock:
+            listeners = list(self._state_listeners)

-
-
-
-
+        logger.debug(
+            "Notifying %d listeners of %s", len(listeners), type(event).__name__
+        )
+        for listener in listeners:
+            try:
+                listener(event)
+            except Exception as e:
+                logger.exception("Error in state listener: %s", e)

-
-
+    def service_state_changed(self, service) -> None:
+        """Called when a service's state changes - emit event to listeners
+
+        StateProvider registers itself as a listener on services it returns,
+        so this method is called when those services' states change.
         """
-
+        from experimaestro.scheduler.state_status import ServiceStateChangedEvent

-
-
-
-    ) -> List[BaseService]:
-        """Fetch services from persistent storage (DB or remote).
+        experiment_id = getattr(service, "_experiment_id", "") or ""
+        run_id = getattr(service, "_run_id", "") or ""
+        state_name = service.state.name if hasattr(service.state, "name") else "UNKNOWN"

-
-
-
+        logger.debug(
+            "Service %s state changed to %s (experiment=%s)",
+            service.id,
+            state_name,
+            experiment_id,
+        )
+
+        event = ServiceStateChangedEvent(
+            experiment_id=experiment_id,
+            run_id=run_id,
+            service_id=service.id,
+            state=state_name,
+        )
+        self._notify_state_listeners(event)

     @abstractmethod
     def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
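The hunk above replaces the old enum-based notifications (EXPERIMENT_UPDATED, JOB_UPDATED, ...) with a push listener API on StateProvider itself. A minimal sketch of how a consumer such as a TUI widget might subscribe, using only the add_listener/remove_listener methods shown in the diff; the watch_provider helper is invented for illustration:

from typing import Callable

from experimaestro.scheduler.state_provider import StateProvider
from experimaestro.scheduler.state_status import EventBase


def watch_provider(provider: StateProvider) -> Callable[[], None]:
    """Subscribe to push notifications; returns an unsubscribe callback."""

    def on_event(event: EventBase) -> None:
        # _notify_state_listeners catches and logs exceptions raised here,
        # so a faulty listener cannot break the provider.
        print(f"state change: {type(event).__name__}")

    provider.add_listener(on_event)
    return lambda: provider.remove_listener(on_event)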
@@ -171,8 +177,13 @@ class StateProvider(ABC):
         ...

     @abstractmethod
-    def get_experiment_runs(self, experiment_id: str) -> List[
-        """Get all runs for an experiment
+    def get_experiment_runs(self, experiment_id: str) -> List[BaseExperiment]:
+        """Get all runs for an experiment
+
+        Returns:
+            List of BaseExperiment instances (MockExperiment for past runs,
+            or live experiment for the current run in Scheduler)
+        """
         ...

     @abstractmethod
@@ -210,25 +221,55 @@ class StateProvider(ABC):
         """Get all jobs across all experiments"""
         ...

-    # Note: get_services is implemented in base class using _fetch_services_from_storage
-
     @abstractmethod
-    def
-        self,
-
-
+    def get_tags_map(
+        self,
+        experiment_id: str,
+        run_id: Optional[str] = None,
+    ) -> Dict[str, Dict[str, str]]:
+        """Get tags map for jobs in an experiment/run
+
+        Tags are stored per (job_id, experiment_id, run_id) in JobTagModel.
+        This method returns a map from job_id to {tag_key: tag_value}.
+
+        Args:
+            experiment_id: Experiment identifier
+            run_id: Run identifier (None = current run)
+
+        Returns:
+            Dictionary mapping job identifiers to their tags dict
+        """
         ...

     @abstractmethod
-    def
-
+    def get_dependencies_map(
+        self,
+        experiment_id: str,
+        run_id: Optional[str] = None,
+    ) -> Dict[str, List[str]]:
+        """Get dependencies map for jobs in an experiment/run
+
+        Dependencies are stored per (job_id, experiment_id, run_id) in JobDependenciesModel.
+        This method returns a map from job_id to list of job_ids it depends on.
+
+        Args:
+            experiment_id: Experiment identifier
+            run_id: Run identifier (None = current run)
+
+        Returns:
+            Dictionary mapping job identifiers to list of job IDs they depend on
+        """
         ...

     @abstractmethod
-    def
-
+    def get_services(
+        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
+    ) -> List[BaseService]:
+        """Get services for an experiment"""
         ...

+    # add_listener and remove_listener are implemented in base class
+
     @abstractmethod
     def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
         """Kill a running job"""
@@ -257,6 +298,19 @@
         """Get orphan jobs (jobs not associated with any experiment run)"""
         return []

+    def get_stray_jobs(self) -> List[BaseJob]:
+        """Get stray jobs (running jobs not associated with any active experiment)
+
+        Stray jobs are a subset of orphan jobs - they are orphan jobs that are
+        currently running or scheduled. These represent jobs where the experimental
+        plan changed but the job process is still running.
+
+        Returns:
+            List of running/scheduled jobs not in any active experiment
+        """
+        # Default implementation: filter orphan jobs to running ones
+        return [j for j in self.get_orphan_jobs() if j.state and j.state.running()]
+
     def delete_job_safely(self, job: BaseJob, perform: bool = True) -> Tuple[bool, str]:
         """Safely delete a job and its data"""
         return False, "Not implemented"
@@ -271,6 +325,13 @@
         """Clean up orphan partial directories"""
         return []

+    def get_process_info(self, job: BaseJob) -> Optional[ProcessInfo]:
+        """Get process information for a job
+
+        Returns a ProcessInfo dataclass or None if not available.
+        """
+        return None
+
     def get_last_sync_time(self) -> Optional[datetime]:
         """Get the last sync time (for incremental updates)"""
         return None
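get_process_info defaults to None, so a concrete provider has to fill the ProcessInfo dataclass itself. A hedged sketch of one way to do that with psutil; the helper name and its psutil-based strategy are assumptions, not part of the diff, and ProcessInfo is imported from the diffed state_provider module:

from typing import Optional

import psutil

from experimaestro.scheduler.state_provider import ProcessInfo


def process_info_from_pid(pid: int, process_type: str = "local") -> Optional[ProcessInfo]:
    """Build a ProcessInfo snapshot for a PID, or None if the process is gone."""
    try:
        proc = psutil.Process(pid)
        return ProcessInfo(
            pid=pid,
            type=process_type,
            running=proc.is_running(),
            # cpu_percent blocks briefly to sample usage over the interval
            cpu_percent=proc.cpu_percent(interval=0.1),
            memory_mb=proc.memory_info().rss / (1024 * 1024),
            num_threads=proc.num_threads(),
        )
    except psutil.NoSuchProcess:
        return None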
@@ -290,165 +351,102 @@
         return False


-
-
+# =============================================================================
+# Offline State Provider (with service caching)
+# =============================================================================

-    Uses a semaphore pattern so that the watchdog event handler never blocks.
-    The watchdog just signals the semaphore, and this thread does the actual
-    database queries and listener notifications.

-
-
-
-
+class OfflineStateProvider(StateProvider):
+    """State provider for offline/cached state access
+
+    Provides state listener management and service caching shared by
+    WorkspaceStateProvider and SSHStateProviderClient.
+
+    This is an intermediate class between StateProvider (the ABC) and concrete
+    implementations that need state listener support and service caching.
     """

-    def __init__(self
-
-
-        self.
-        self._change_pending = False  # Protected by _change_condition
-        self._thread: Optional[threading.Thread] = None
-        self._debounce_seconds = 0.5  # Wait before processing to batch rapid changes
-        self._state_lock = threading.Lock()  # Protects start/stop transitions
-        self._stopped = False  # Once True, cannot be restarted
-
-    def start(self) -> None:
-        """Start the change detection thread"""
-        with self._state_lock:
-            # Once stopped, cannot restart
-            if self._stopped:
-                logger.debug("Cannot start change detector - already stopped")
-                return
-
-            if self._thread is not None and self._thread.is_alive():
-                return  # Already running
-
-            self._thread = threading.Thread(
-                target=self._run,
-                daemon=True,
-                name="DBChangeDetector",
-            )
-            self._thread.start()
-            logger.debug("Started database change detector thread")
-
-    def stop(self) -> None:
-        """Stop the change detection thread"""
-        with self._state_lock:
-            self._stopped = True  # Mark as permanently stopped
-
-        # Wake up the thread so it can exit
-        with self._change_condition:
-            self._change_condition.notify_all()
-
-        # Join outside the lock to avoid deadlock
-        if self._thread is not None:
-            self._thread.join(timeout=2.0)
-            self._thread = None
-        logger.debug("Stopped database change detector thread")
-
-    def signal_change(self) -> None:
-        """Signal that a database change was detected (non-blocking)"""
-        with self._change_condition:
-            self._change_pending = True
-            self._change_condition.notify()
-
-    def _run(self) -> None:
-        """Main loop: wait for changes and process them"""
-        while not self._stopped:
-            # Wait for a change signal and clear it atomically
-            with self._change_condition:
-                while not self._change_pending and not self._stopped:
-                    self._change_condition.wait()
-
-                if self._stopped:
-                    break
-
-                # Clear the pending flag atomically while holding the lock
-                self._change_pending = False
-
-            # Debounce - wait a bit for more changes to accumulate
-            time.sleep(self._debounce_seconds)
-
-            # Process all accumulated changes
-            self._detect_and_notify_changes()
-
-    def _detect_and_notify_changes(self) -> None:
-        """Query the database to detect what changed and send events"""
-        try:
-            since = self._last_check_time
-            self._last_check_time = datetime.now()
-
-            # Query for changed experiments
-            with self.state_provider.workspace_db.bind_ctx([ExperimentModel]):
-                query = ExperimentModel.select()
-                if since:
-                    query = query.where(ExperimentModel.updated_at > since)
-
-                for exp in query:
-                    self.state_provider._notify_listeners(
-                        StateEvent(
-                            event_type=StateEventType.EXPERIMENT_UPDATED,
-                            data={
-                                "experiment_id": exp.experiment_id,
-                            },
-                        )
-                    )
-
-            # Query for changed jobs
-            with self.state_provider.workspace_db.bind_ctx([JobModel]):
-                query = JobModel.select()
-                if since:
-                    query = query.where(JobModel.updated_at > since)
-
-                for job in query:
-                    self.state_provider._notify_listeners(
-                        StateEvent(
-                            event_type=StateEventType.JOB_UPDATED,
-                            data={
-                                "jobId": job.job_id,
-                                "experimentId": job.experiment_id,
-                                "runId": job.run_id,
-                                "status": job.state,
-                            },
-                        )
-                    )
+    def __init__(self):
+        """Initialize offline state provider with service cache and listener management"""
+        super().__init__()  # Initialize state listener management
+        self._init_service_cache()

-
-
+    # =========================================================================
+    # Service caching methods
+    # =========================================================================

+    def _init_service_cache(self) -> None:
+        """Initialize service cache - call from subclass __init__"""
+        self._service_cache: Dict[tuple[str, str], Dict[str, "BaseService"]] = {}
+        self._service_cache_lock = threading.Lock()

-
-
+    def _clear_service_cache(self) -> None:
+        """Clear the service cache"""
+        with self._service_cache_lock:
+            self._service_cache.clear()

-
-
-    ""
+    def get_services(
+        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
+    ) -> List["BaseService"]:
+        """Get services for an experiment
+
+        Uses caching to preserve service instances (and their URLs) across calls.
+        Subclasses can override _get_live_services() for live service support
+        and must implement _fetch_services_from_storage() for persistent storage.
+        """
+        # Resolve run_id if needed
+        if experiment_id is not None and run_id is None:
+            run_id = self.get_current_run(experiment_id)
+            if run_id is None:
+                return []
+
+        cache_key = (experiment_id or "", run_id or "")
+
+        with self._service_cache_lock:
+            # Try to get live services (scheduler, etc.) - may return None
+            live_services = self._get_live_services(experiment_id, run_id)
+            if live_services is not None:
+                # Cache and return live services
+                self._service_cache[cache_key] = {s.id: s for s in live_services}
+                return live_services
+
+            # Check cache
+            cached = self._service_cache.get(cache_key)
+            if cached is not None:
+                return list(cached.values())
+
+            # Fetch from persistent storage (filesystem or remote)
+            services = self._fetch_services_from_storage(experiment_id, run_id)
+            self._service_cache[cache_key] = {s.id: s for s in services}
+            return services
+
+    def _get_live_services(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> Optional[List["BaseService"]]:
+        """Get live services if available (e.g., from scheduler).

-
-
-
+        Returns None if no live services are available (default).
+        Subclasses may override to check for live services.
+        """
+        return None

-
-
-
-
-
+    @abstractmethod
+    def _fetch_services_from_storage(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> List["BaseService"]:
+        """Fetch services from persistent storage (filesystem or remote).

-
-
+        Called when no live services and cache is empty.
+        """
+        ...

-
-
-        if path.name not in ("workspace.db", "workspace.db-wal"):
-            return
+    # State listener methods (add_listener, remove_listener, _notify_state_listeners)
+    # are inherited from StateProvider base class

-        logger.debug(
-            "Database file changed: %s (event: %s)", path.name, event.event_type
-        )

-
-
+# =============================================================================
+# Mock Classes for Database-Loaded State
+# =============================================================================


 class MockJob(BaseJob):
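OfflineStateProvider layers service lookup: live services first, then the per-(experiment_id, run_id) cache, and only then persistent storage, all under the cache lock. A hedged sketch of the smallest subclass this contract seems to require; the class name, directory layout, and get_current_run stub are invented for illustration, and the remaining StateProvider abstract methods are omitted:

from pathlib import Path
from typing import List, Optional

from experimaestro.scheduler.state_provider import OfflineStateProvider


class DirectoryStateProvider(OfflineStateProvider):
    """Sketch: serve cached services from a directory tree (the other
    abstract methods from StateProvider would still need implementing)."""

    def __init__(self, root: Path):
        super().__init__()  # sets up listener state and the service cache
        self.root = root

    def get_current_run(self, experiment_id: str) -> Optional[str]:
        # get_services() calls this to resolve run_id when it is not given
        runs = sorted(p.name for p in (self.root / experiment_id).glob("*") if p.is_dir())
        return runs[-1] if runs else None

    def _fetch_services_from_storage(
        self, experiment_id: Optional[str], run_id: Optional[str]
    ) -> List["BaseService"]:
        # Only reached when _get_live_services() returned None and the
        # (experiment_id, run_id) cache entry is empty; the result is cached.
        return []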
@@ -458,28 +456,69 @@ class MockJob(BaseJob):
     as opposed to live Job instances which are created during experiment runs.
     """

+    def apply_event(self, event: "EventBase") -> None:
+        """Apply a job event to update this job's state"""
+        from experimaestro.scheduler.state_status import (
+            JobStateChangedEvent,
+            JobProgressEvent,
+        )
+        from experimaestro.notifications import LevelInformation
+
+        if isinstance(event, JobStateChangedEvent):
+            self.state = STATE_NAME_TO_JOBSTATE.get(event.state, self.state)
+            if event.failure_reason:
+                try:
+                    self.failure_reason = JobFailureStatus[event.failure_reason]
+                except KeyError:
+                    pass
+            if event.submitted_time is not None:
+                self.submittime = event.submitted_time
+            if event.started_time is not None:
+                self.starttime = event.started_time
+            if event.ended_time is not None:
+                self.endtime = event.ended_time
+            if event.exit_code is not None:
+                self.exit_code = event.exit_code
+            if event.retry_count:
+                self.retry_count = event.retry_count
+            logger.debug(
+                "Applied state change to job %s: %s", self.identifier, self.state
+            )
+
+        elif isinstance(event, JobProgressEvent):
+            level = event.level
+            # Truncate to level + 1 entries
+            self.progress = self.progress[: (level + 1)]
+            # Extend if needed
+            while len(self.progress) <= level:
+                self.progress.append(LevelInformation(len(self.progress), None, 0.0))
+            # Update the level's progress and description
+            if event.desc:
+                self.progress[-1].desc = event.desc
+            self.progress[-1].progress = event.progress
+            logger.debug(
+                "Applied progress to job %s: %s", self.identifier, self.progress
+            )
+
     def __init__(
         self,
         identifier: str,
         task_id: str,
-        locator: str,
         path: Path,
         state: str,  # State name string from DB
         submittime: Optional[float],
         starttime: Optional[float],
         endtime: Optional[float],
-        progress:
-        tags: Dict[str, str],
-        experiment_id: str,
-        run_id: str,
+        progress: ProgressInformation,
         updated_at: str,
         exit_code: Optional[int] = None,
         retry_count: int = 0,
         failure_reason: Optional[JobFailureStatus] = None,
+        transient: TransientMode = TransientMode.NONE,
+        process: dict | None = None,
     ):
         self.identifier = identifier
         self.task_id = task_id
-        self.locator = locator
         self.path = path
         # Convert state name to JobState instance
         self.state = STATE_NAME_TO_JOBSTATE.get(state, JobState.UNSCHEDULED)
@@ -487,56 +526,16 @@ class MockJob(BaseJob):
         self.starttime = starttime
         self.endtime = endtime
         self.progress = progress
-        self.tags = tags
-        self.experiment_id = experiment_id
-        self.run_id = run_id
         self.updated_at = updated_at
         self.exit_code = exit_code
         self.retry_count = retry_count
         self.failure_reason = failure_reason
+        self.transient = transient
+        self._process_dict = process

-
-
-
-
-        Args:
-            path: Path to the job directory
-
-        Returns:
-            MockJob instance if metadata exists, None otherwise
-        """
-        metadata_path = path / ".xpm_metadata.json"
-        if not metadata_path.exists():
-            return None
-
-        try:
-            import json
-
-            with metadata_path.open("r") as f:
-                metadata = json.load(f)
-
-            return cls(
-                identifier=metadata.get("job_id", path.name),
-                task_id=metadata.get(
-                    "task_id", path.parent.name if path.parent else "unknown"
-                ),
-                locator=metadata.get("job_id", path.name),
-                path=path,
-                state=metadata.get("state", "unscheduled"),
-                submittime=metadata.get("submitted_time"),
-                starttime=metadata.get("started_time"),
-                endtime=metadata.get("ended_time"),
-                progress=[],  # Progress not stored in metadata
-                tags={},  # Tags come from jobs.jsonl, not metadata
-                experiment_id="",  # Not stored in job metadata
-                run_id="",  # Not stored in job metadata
-                updated_at=str(metadata.get("last_updated", "")),
-                exit_code=metadata.get("exit_code"),
-                retry_count=metadata.get("retry_count", 0),
-            )
-        except Exception as e:
-            logger.warning("Failed to read job metadata from %s: %s", path, e)
-            return None
+    def process_state_dict(self) -> dict | None:
+        """Get process state as dictionary."""
+        return self._process_dict

     def getprocess(self):
         """Get process handle for running job
@@ -565,2202 +564,436 @@ class MockJob(BaseJob):
|
|
|
565
564
|
logger.warning("Could not get process for job at %s: %s", self.path, e)
|
|
566
565
|
return None
|
|
567
566
|
|
|
567
|
+
@classmethod
|
|
568
|
+
def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockJob":
|
|
569
|
+
"""Create MockJob from state dictionary
|
|
570
|
+
|
|
571
|
+
Args:
|
|
572
|
+
d: Dictionary from state_dict()
|
|
573
|
+
workspace_path: Workspace path to compute job path if not provided
|
|
574
|
+
|
|
575
|
+
Returns:
|
|
576
|
+
MockJob instance
|
|
577
|
+
"""
|
|
578
|
+
task_id = d["task_id"]
|
|
579
|
+
identifier = d["job_id"]
|
|
580
|
+
|
|
581
|
+
# Use path from dict if it's already a Path, otherwise compute it
|
|
582
|
+
path = d.get("path")
|
|
583
|
+
if path is None:
|
|
584
|
+
path = workspace_path / "jobs" / task_id / identifier
|
|
585
|
+
elif isinstance(path, str):
|
|
586
|
+
path = Path(path)
|
|
587
|
+
|
|
588
|
+
failure_reason = None
|
|
589
|
+
if d.get("failure_reason"):
|
|
590
|
+
failure_reason = JobFailureStatus[d["failure_reason"]]
|
|
591
|
+
|
|
592
|
+
# Convert progress dicts to LevelInformation objects
|
|
593
|
+
progress_list = get_progress_information_from_dict(d.get("progress", []))
|
|
594
|
+
|
|
595
|
+
return cls(
|
|
596
|
+
identifier=identifier,
|
|
597
|
+
task_id=task_id,
|
|
598
|
+
path=path,
|
|
599
|
+
state=d["state"],
|
|
600
|
+
submittime=deserialize_timestamp(d.get("submitted_time")),
|
|
601
|
+
starttime=deserialize_timestamp(d.get("started_time")),
|
|
602
|
+
endtime=deserialize_timestamp(d.get("ended_time")),
|
|
603
|
+
progress=progress_list,
|
|
604
|
+
updated_at=d.get("updated_at", ""),
|
|
605
|
+
exit_code=d.get("exit_code"),
|
|
606
|
+
retry_count=d.get("retry_count", 0),
|
|
607
|
+
failure_reason=failure_reason,
|
|
608
|
+
process=d.get("process"),
|
|
609
|
+
)
|
|
610
|
+
|
|
568
611
|
|
|
569
612
|
class MockExperiment(BaseExperiment):
|
|
570
|
-
"""Concrete implementation of BaseExperiment for
|
|
613
|
+
"""Concrete implementation of BaseExperiment for loaded experiments
|
|
571
614
|
|
|
572
|
-
This class is used when loading experiment information from
|
|
615
|
+
This class is used when loading experiment information from disk,
|
|
573
616
|
as opposed to live experiment instances which are created during runs.
|
|
617
|
+
|
|
618
|
+
It stores all experiment state including jobs, services, tags,
|
|
619
|
+
dependencies, and event tracking (replaces StatusData).
|
|
574
620
|
"""
|
|
575
621
|
|
|
576
622
|
def __init__(
|
|
577
623
|
self,
|
|
578
624
|
workdir: Path,
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
625
|
+
run_id: str,
|
|
626
|
+
*,
|
|
627
|
+
status: ExperimentStatus = ExperimentStatus.RUNNING,
|
|
628
|
+
events_count: int = 0,
|
|
629
|
+
hostname: Optional[str] = None,
|
|
584
630
|
started_at: Optional[float] = None,
|
|
585
631
|
ended_at: Optional[float] = None,
|
|
586
|
-
|
|
632
|
+
job_infos: Optional[Dict[str, "ExperimentJobInformation"]] = None,
|
|
633
|
+
services: Optional[Dict[str, "MockService"]] = None,
|
|
634
|
+
dependencies: Optional[Dict[str, List[str]]] = None,
|
|
635
|
+
experiment_id_override: Optional[str] = None,
|
|
636
|
+
finished_jobs: int = 0,
|
|
637
|
+
failed_jobs: int = 0,
|
|
587
638
|
):
|
|
588
639
|
self.workdir = workdir
|
|
589
|
-
self.
|
|
590
|
-
self.
|
|
591
|
-
self.
|
|
592
|
-
self.
|
|
593
|
-
self.
|
|
594
|
-
self.
|
|
595
|
-
self.
|
|
596
|
-
self.
|
|
640
|
+
self.run_id = run_id
|
|
641
|
+
self._status = status
|
|
642
|
+
self._events_count = events_count
|
|
643
|
+
self._hostname = hostname
|
|
644
|
+
self._started_at = started_at
|
|
645
|
+
self._ended_at = ended_at
|
|
646
|
+
self._job_infos = job_infos or {}
|
|
647
|
+
self._services = services or {}
|
|
648
|
+
self._dependencies = dependencies or {}
|
|
649
|
+
self._experiment_id_override = experiment_id_override
|
|
650
|
+
self._finished_jobs = finished_jobs
|
|
651
|
+
self._failed_jobs = failed_jobs
|
|
597
652
|
|
|
598
653
|
@property
|
|
599
654
|
def experiment_id(self) -> str:
|
|
600
|
-
"""
|
|
601
|
-
|
|
655
|
+
"""Return experiment_id (overriding base class if needed for v1 layout)"""
|
|
656
|
+
if self._experiment_id_override:
|
|
657
|
+
return self._experiment_id_override
|
|
658
|
+
return super().experiment_id
|
|
602
659
|
|
|
660
|
+
# Implement abstract properties from BaseExperiment
|
|
603
661
|
|
|
604
|
-
|
|
605
|
-
|
|
662
|
+
@property
|
|
663
|
+
def status(self) -> ExperimentStatus:
|
|
664
|
+
return self._status
|
|
606
665
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
def __init__(
|
|
613
|
-
self,
|
|
614
|
-
service_id: str,
|
|
615
|
-
description_text: str,
|
|
616
|
-
state_dict_data: dict,
|
|
617
|
-
experiment_id: Optional[str] = None,
|
|
618
|
-
run_id: Optional[str] = None,
|
|
619
|
-
url: Optional[str] = None,
|
|
620
|
-
state: str = "STOPPED",
|
|
621
|
-
):
|
|
622
|
-
self.id = service_id
|
|
623
|
-
self._description = description_text
|
|
624
|
-
self._state_name = state
|
|
625
|
-
self._state_dict_data = state_dict_data
|
|
626
|
-
self.experiment_id = experiment_id
|
|
627
|
-
self.run_id = run_id
|
|
628
|
-
self.url = url
|
|
666
|
+
@property
|
|
667
|
+
def job_infos(self) -> Dict[str, "ExperimentJobInformation"]:
|
|
668
|
+
"""Lightweight job info from jobs.jsonl (job_id, task_id, tags, timestamp)"""
|
|
669
|
+
return self._job_infos
|
|
629
670
|
|
|
630
671
|
@property
|
|
631
|
-
def
|
|
632
|
-
|
|
633
|
-
from experimaestro.scheduler.services import ServiceState
|
|
634
|
-
|
|
635
|
-
# Convert state name to ServiceState enum
|
|
636
|
-
try:
|
|
637
|
-
return ServiceState[self._state_name]
|
|
638
|
-
except KeyError:
|
|
639
|
-
# Return a mock object with name attribute for unknown states
|
|
640
|
-
class MockState:
|
|
641
|
-
def __init__(self, name):
|
|
642
|
-
self.name = name
|
|
643
|
-
|
|
644
|
-
return MockState(self._state_name)
|
|
645
|
-
|
|
646
|
-
def description(self) -> str:
|
|
647
|
-
"""Return service description"""
|
|
648
|
-
return self._description
|
|
649
|
-
|
|
650
|
-
def state_dict(self) -> dict:
|
|
651
|
-
"""Return state dictionary for service recreation"""
|
|
652
|
-
return self._state_dict_data
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
def _with_db_context(func):
|
|
656
|
-
"""Decorator to wrap method in database bind context
|
|
657
|
-
|
|
658
|
-
This ensures all database queries have the models bound to the database.
|
|
659
|
-
"""
|
|
660
|
-
from functools import wraps
|
|
661
|
-
|
|
662
|
-
@wraps(func)
|
|
663
|
-
def wrapper(self, *args, **kwargs):
|
|
664
|
-
try:
|
|
665
|
-
with self.workspace_db.bind_ctx(ALL_MODELS):
|
|
666
|
-
return func(self, *args, **kwargs)
|
|
667
|
-
except Exception as e:
|
|
668
|
-
logger.exception("Error in %s with database context: %s", func.__name__, e)
|
|
669
|
-
raise
|
|
670
|
-
|
|
671
|
-
return wrapper
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
class WorkspaceStateProvider(StateProvider):
|
|
675
|
-
"""Unified state provider for workspace-level database (singleton per workspace path)
|
|
676
|
-
|
|
677
|
-
Provides access to experiment and job state from a single workspace database.
|
|
678
|
-
Supports both read-only (monitoring) and read-write (scheduler) modes.
|
|
679
|
-
|
|
680
|
-
Only one WorkspaceStateProvider instance exists per workspace path. Subsequent
|
|
681
|
-
requests for the same path return the existing instance.
|
|
682
|
-
|
|
683
|
-
Thread safety:
|
|
684
|
-
- Database connections are thread-local (managed by state_db module)
|
|
685
|
-
- Singleton registry is protected by a lock
|
|
686
|
-
- Each thread gets its own database connection
|
|
687
|
-
|
|
688
|
-
Run tracking:
|
|
689
|
-
- Each experiment can have multiple runs
|
|
690
|
-
- Jobs/services are scoped to (experiment_id, run_id)
|
|
691
|
-
- Tags are scoped to (job_id, experiment_id, run_id) - fixes GH #128
|
|
692
|
-
"""
|
|
693
|
-
|
|
694
|
-
# Registry of state provider instances by absolute path
|
|
695
|
-
_instances: Dict[Path, "WorkspaceStateProvider"] = {}
|
|
696
|
-
_lock = threading.Lock()
|
|
697
|
-
|
|
698
|
-
@classmethod
|
|
699
|
-
def get_instance(
|
|
700
|
-
cls,
|
|
701
|
-
workspace_path: Path,
|
|
702
|
-
read_only: bool = False,
|
|
703
|
-
sync_on_start: bool = False,
|
|
704
|
-
sync_interval_minutes: int = 5,
|
|
705
|
-
) -> "WorkspaceStateProvider":
|
|
706
|
-
"""Get or create WorkspaceStateProvider instance for a workspace path
|
|
707
|
-
|
|
708
|
-
Args:
|
|
709
|
-
workspace_path: Root workspace directory
|
|
710
|
-
read_only: If True, database is in read-only mode
|
|
711
|
-
sync_on_start: If True, sync from disk on initialization
|
|
712
|
-
sync_interval_minutes: Minimum interval between syncs (default: 5)
|
|
713
|
-
|
|
714
|
-
Returns:
|
|
715
|
-
WorkspaceStateProvider instance (singleton per path)
|
|
716
|
-
"""
|
|
717
|
-
# Normalize path
|
|
718
|
-
if isinstance(workspace_path, Path):
|
|
719
|
-
workspace_path = workspace_path.absolute()
|
|
720
|
-
else:
|
|
721
|
-
workspace_path = Path(workspace_path).absolute()
|
|
722
|
-
|
|
723
|
-
# Check if instance already exists
|
|
724
|
-
with cls._lock:
|
|
725
|
-
if workspace_path in cls._instances:
|
|
726
|
-
existing = cls._instances[workspace_path]
|
|
727
|
-
# Fail if requesting different read_only mode than cached instance
|
|
728
|
-
if existing.read_only != read_only:
|
|
729
|
-
raise RuntimeError(
|
|
730
|
-
f"WorkspaceStateProvider for {workspace_path} already exists "
|
|
731
|
-
f"with read_only={existing.read_only}, cannot open with "
|
|
732
|
-
f"read_only={read_only}. Close the existing instance first."
|
|
733
|
-
)
|
|
734
|
-
return existing
|
|
735
|
-
|
|
736
|
-
# Create new instance - register BEFORE __init__ to handle
|
|
737
|
-
# nested get_instance calls during sync_on_start
|
|
738
|
-
instance = object.__new__(cls)
|
|
739
|
-
cls._instances[workspace_path] = instance
|
|
740
|
-
|
|
741
|
-
# Initialize outside the lock to avoid deadlock during sync
|
|
742
|
-
try:
|
|
743
|
-
instance.__init__(
|
|
744
|
-
workspace_path, read_only, sync_on_start, sync_interval_minutes
|
|
745
|
-
)
|
|
746
|
-
except Exception:
|
|
747
|
-
# Remove from registry if initialization fails
|
|
748
|
-
with cls._lock:
|
|
749
|
-
cls._instances.pop(workspace_path, None)
|
|
750
|
-
raise
|
|
751
|
-
return instance
|
|
752
|
-
|
|
753
|
-
def __init__(
|
|
754
|
-
self,
|
|
755
|
-
workspace_path: Path,
|
|
756
|
-
read_only: bool = False,
|
|
757
|
-
sync_on_start: bool = False,
|
|
758
|
-
sync_interval_minutes: int = 5,
|
|
759
|
-
):
|
|
760
|
-
"""Initialize workspace state provider (called by get_instance())
|
|
761
|
-
|
|
762
|
-
Args:
|
|
763
|
-
workspace_path: Root workspace directory
|
|
764
|
-
read_only: If True, database is in read-only mode
|
|
765
|
-
sync_on_start: If True, sync from disk on initialization
|
|
766
|
-
sync_interval_minutes: Minimum interval between syncs (default: 5)
|
|
767
|
-
"""
|
|
768
|
-
# Normalize path
|
|
769
|
-
if isinstance(workspace_path, Path):
|
|
770
|
-
workspace_path = workspace_path.absolute()
|
|
771
|
-
else:
|
|
772
|
-
workspace_path = Path(workspace_path).absolute()
|
|
773
|
-
|
|
774
|
-
self.workspace_path = workspace_path
|
|
775
|
-
self._read_only = read_only
|
|
776
|
-
self.sync_interval_minutes = sync_interval_minutes
|
|
777
|
-
|
|
778
|
-
# Listeners for push notifications
|
|
779
|
-
self._listeners: Set[StateListener] = set()
|
|
780
|
-
self._listeners_lock = threading.Lock()
|
|
781
|
-
|
|
782
|
-
# Service cache (from base class)
|
|
783
|
-
self._init_service_cache()
|
|
672
|
+
def services(self) -> Dict[str, "BaseService"]:
|
|
673
|
+
return self._services
|
|
784
674
|
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
version_file = self.workspace_path / ".__experimaestro__"
|
|
794
|
-
|
|
795
|
-
if version_file.exists():
|
|
796
|
-
# Read existing version
|
|
797
|
-
content = version_file.read_text().strip()
|
|
798
|
-
if content == "":
|
|
799
|
-
# Empty file = v0
|
|
800
|
-
workspace_version = 0
|
|
801
|
-
else:
|
|
802
|
-
try:
|
|
803
|
-
workspace_version = int(content)
|
|
804
|
-
except ValueError:
|
|
805
|
-
raise RuntimeError(
|
|
806
|
-
f"Invalid workspace version file at {version_file}: "
|
|
807
|
-
f"expected integer, got '{content}'"
|
|
808
|
-
)
|
|
809
|
-
|
|
810
|
-
# Check if workspace version is supported
|
|
811
|
-
if workspace_version > WORKSPACE_VERSION:
|
|
812
|
-
raise RuntimeError(
|
|
813
|
-
f"Workspace version {workspace_version} is not supported by "
|
|
814
|
-
f"this version of experimaestro (supports up to version "
|
|
815
|
-
f"{WORKSPACE_VERSION}). Please upgrade experimaestro."
|
|
816
|
-
)
|
|
817
|
-
if workspace_version < WORKSPACE_VERSION:
|
|
818
|
-
raise RuntimeError(
|
|
819
|
-
f"Workspace version {workspace_version} is not supported by "
|
|
820
|
-
"this version of experimaestro (please upgrade the experimaestro "
|
|
821
|
-
"workspace)"
|
|
822
|
-
)
|
|
823
|
-
else:
|
|
824
|
-
# New workspace - create the file
|
|
825
|
-
workspace_version = WORKSPACE_VERSION
|
|
826
|
-
|
|
827
|
-
# Write current version to file (update empty v0 workspaces)
|
|
828
|
-
if not read_only and (
|
|
829
|
-
not version_file.exists() or version_file.read_text().strip() == ""
|
|
830
|
-
):
|
|
831
|
-
version_file.write_text(str(WORKSPACE_VERSION))
|
|
832
|
-
|
|
833
|
-
# Initialize workspace database in hidden .experimaestro directory
|
|
834
|
-
from .state_db import initialize_workspace_database
|
|
675
|
+
@property
|
|
676
|
+
def tags(self) -> Dict[str, Dict[str, str]]:
|
|
677
|
+
"""Build tags dict from job_infos"""
|
|
678
|
+
return {
|
|
679
|
+
job_id: job_info.tags
|
|
680
|
+
for job_id, job_info in self._job_infos.items()
|
|
681
|
+
if job_info.tags
|
|
682
|
+
}
|
|
835
683
|
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
684
|
+
@property
|
|
685
|
+
def dependencies(self) -> Dict[str, List[str]]:
|
|
686
|
+
return self._dependencies
|
|
839
687
|
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
)
|
|
844
|
-
self._db_dir = experimaestro_dir # Store for file watcher
|
|
688
|
+
@property
|
|
689
|
+
def events_count(self) -> int:
|
|
690
|
+
return self._events_count
|
|
845
691
|
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
"Database schema version changed, triggering full resync from disk"
|
|
850
|
-
)
|
|
851
|
-
sync_on_start = True # Force sync
|
|
852
|
-
|
|
853
|
-
# Optionally sync from disk on start (only in write mode)
|
|
854
|
-
# Syncing requires write access to update the database and sync timestamp
|
|
855
|
-
if sync_on_start and not read_only:
|
|
856
|
-
from .state_sync import sync_workspace_from_disk
|
|
857
|
-
|
|
858
|
-
sync_workspace_from_disk(
|
|
859
|
-
self.workspace_path,
|
|
860
|
-
write_mode=True,
|
|
861
|
-
force=needs_resync, # Force full sync if schema changed
|
|
862
|
-
sync_interval_minutes=sync_interval_minutes,
|
|
863
|
-
)
|
|
692
|
+
@property
|
|
693
|
+
def hostname(self) -> Optional[str]:
|
|
694
|
+
return self._hostname
|
|
864
695
|
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
WorkspaceSyncMetadata.update(db_version=CURRENT_DB_VERSION).where(
|
|
869
|
-
WorkspaceSyncMetadata.id == "workspace"
|
|
870
|
-
).execute()
|
|
871
|
-
logger.info("Database schema updated to version %d", CURRENT_DB_VERSION)
|
|
872
|
-
|
|
873
|
-
logger.info(
|
|
874
|
-
"WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
|
|
875
|
-
read_only,
|
|
876
|
-
workspace_path,
|
|
877
|
-
)
|
|
696
|
+
@property
|
|
697
|
+
def started_at(self) -> Optional[float]:
|
|
698
|
+
return self._started_at
|
|
878
699
|
|
|
879
700
|
@property
|
|
880
|
-
def
|
|
881
|
-
|
|
882
|
-
return self._read_only
|
|
701
|
+
def ended_at(self) -> Optional[float]:
|
|
702
|
+
return self._ended_at
|
|
883
703
|
|
|
884
|
-
|
|
704
|
+
@property
|
|
705
|
+
def total_jobs(self) -> int:
|
|
706
|
+
return len(self._job_infos)
|
|
885
707
|
|
|
886
|
-
@
|
|
887
|
-
def
|
|
888
|
-
|
|
708
|
+
+    @property
+    def finished_jobs(self) -> int:
+        return self._finished_jobs

-
-
-
-        if self.read_only:
-            raise RuntimeError("Cannot modify experiments in read-only mode")
+    @property
+    def failed_jobs(self) -> int:
+        return self._failed_jobs

-
-        ExperimentModel.insert(
-            experiment_id=experiment_id,
-            created_at=now,
-            updated_at=now,
-        ).on_conflict(
-            conflict_target=[ExperimentModel.experiment_id],
-            update={
-                ExperimentModel.updated_at: now,
-            },
-        ).execute()
-
-        logger.debug("Ensured experiment: %s", experiment_id)
-
-        # Notify listeners
-        exp_path = str(self.workspace_path / "xp" / experiment_id)
-        self._notify_listeners(
-            StateEvent(
-                event_type=StateEventType.EXPERIMENT_UPDATED,
-                data={
-                    "experiment_id": experiment_id,
-                    "workdir_path": exp_path,
-                    "updated_at": now.isoformat(),
-                },
-            )
-        )
+    # state_dict() is inherited from BaseExperiment

-    @
-    def
-
+    @classmethod
+    def from_disk(
+        cls, run_dir: Path, workspace_path: Path
+    ) -> Optional["MockExperiment"]:
+        """Load MockExperiment from status.json and jobs.jsonl on disk

         Args:
-
-
+            run_dir: Path to the run directory containing status.json
+            workspace_path: Workspace path for resolving relative paths

         Returns:
-
-
-        Raises:
-            RuntimeError: If in read-only mode
+            MockExperiment instance or None if status.json doesn't exist
         """
-
-        raise RuntimeError("Cannot create runs in read-only mode")
-
-        # Auto-generate run_id from timestamp if not provided
-        if run_id is None:
-            now = datetime.now()
-            run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
+        import fasteners

-
-
-
+        status_path = run_dir / "status.json"
+        if not status_path.exists():
+            return None

-
-
-
-            run_id=run_id,
-            started_at=started_at,
-            status="active",
-            hostname=hostname,
-        ).execute()
-
-        # Persist to disk in experiment folder (informations.json)
-        exp_dir = self.workspace_path / "xp" / experiment_id
-        exp_dir.mkdir(parents=True, exist_ok=True)
-        info_file = exp_dir / "informations.json"
-
-        # Merge with existing data (may have multiple runs)
-        info_data: Dict = {}
-        if info_file.exists():
+        lock_path = status_path.parent / f".{status_path.name}.lock"
+        lock = fasteners.InterProcessLock(str(lock_path))
+        with lock:
             try:
-
-
-
-
-
-                info_data["runs"] = {}
-            info_data["runs"][run_id] = {
-                "hostname": hostname,
-                "started_at": started_at.isoformat(),
-            }
-            info_file.write_text(json.dumps(info_data, indent=2))
-
-        # Update experiment's current_run_id and updated_at
-        now = datetime.now()
-        ExperimentModel.update(
-            current_run_id=run_id,
-            updated_at=now,
-        ).where(ExperimentModel.experiment_id == experiment_id).execute()
-
-        logger.info(
-            "Created run %s for experiment %s on host %s",
-            run_id,
-            experiment_id,
-            hostname,
-        )
-
-        # Notify listeners
-        self._notify_listeners(
-            StateEvent(
-                event_type=StateEventType.RUN_UPDATED,
-                data={
-                    "experiment_id": experiment_id,
-                    "run_id": run_id,
-                    "status": "active",
-                    "started_at": now.isoformat(),
-                    "hostname": hostname,
-                },
-            )
-        )
-
-        return run_id
-
-    @_with_db_context
-    def get_current_run(self, experiment_id: str) -> Optional[str]:
-        """Get the current/latest run_id for an experiment
+                with status_path.open("r") as f:
+                    data = json.load(f)
+            except (json.JSONDecodeError, OSError) as e:
+                logger.warning("Failed to read %s: %s", status_path, e)
+                return None

-
-
+            # Create experiment from status.json
+            exp = cls.from_state_dict(data, workspace_path)

-
-
-
-
-
-
-
-
-
-
+            # Load jobs from jobs.jsonl
+            jobs_jsonl_path = run_dir / "jobs.jsonl"
+            if jobs_jsonl_path.exists():
+                try:
+                    with jobs_jsonl_path.open("r") as f:
+                        for line in f:
+                            line = line.strip()
+                            if not line:
+                                continue
+                            try:
+                                record = json.loads(line)
+                                job_info = ExperimentJobInformation.from_dict(record)
+                                exp._job_infos[job_info.job_id] = job_info
+                            except (json.JSONDecodeError, KeyError):
+                                continue
+                except OSError as e:
+                    logger.warning("Failed to read %s: %s", jobs_jsonl_path, e)
+
+            return exp

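The hunk above replaces the database-backed run bookkeeping with file-based loading under an inter-process lock. A minimal usage sketch follows; the import path and the workspace/run directories are assumptions for illustration, not taken from the diff:

    # Illustrative sketch: load a run back from disk with the new from_disk API.
    # Module path and directory layout below are assumptions.
    from pathlib import Path

    from experimaestro.scheduler.workspace_state_provider import MockExperiment

    workspace = Path.home() / ".experimaestro" / "workspace"   # hypothetical workspace
    run_dir = workspace / "experiments" / "my-xp" / "20240101_120000_000000"

    exp = MockExperiment.from_disk(run_dir, workspace)
    if exp is None:
        print("no status.json in", run_dir)   # nothing recorded for this run yet
    else:
        print(exp.finished_jobs, "finished /", exp.failed_jobs, "failed")
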
-    @
-    def
-        """
+    @classmethod
+    def from_state_dict(cls, d: Dict, workspace_path: Path) -> "MockExperiment":
+        """Create MockExperiment from state dictionary

         Args:
-
+            d: Dictionary from state_dict()
+            workspace_path: Workspace path to compute experiment path if not provided

         Returns:
-
-            - workdir: Path to experiment directory
-            - experiment_id: Unique identifier (property derived from workdir.name)
-            - current_run_id: Current/latest run ID
-            - total_jobs: Total number of jobs (for current run)
-            - finished_jobs: Number of completed jobs (for current run)
-            - failed_jobs: Number of failed jobs (for current run)
-            - updated_at: When experiment was last modified
-            - hostname: Host where the current run was launched
+            MockExperiment instance
         """
-
-
-        query = ExperimentModel.select()
-        if since is not None:
-            query = query.where(ExperimentModel.updated_at > since)
-
-        for exp_model in query:
-            # Count jobs for current run
-            total_jobs = 0
-            finished_jobs = 0
-            failed_jobs = 0
-
-            started_at = None
-            ended_at = None
-            hostname = None
-
-            if exp_model.current_run_id:
-                total_jobs = (
-                    JobModel.select()
-                    .where(
-                        (JobModel.experiment_id == exp_model.experiment_id)
-                        & (JobModel.run_id == exp_model.current_run_id)
-                    )
-                    .count()
-                )
-                finished_jobs = (
-                    JobModel.select()
-                    .where(
-                        (JobModel.experiment_id == exp_model.experiment_id)
-                        & (JobModel.run_id == exp_model.current_run_id)
-                        & (JobModel.state == "done")
-                    )
-                    .count()
-                )
-                failed_jobs = (
-                    JobModel.select()
-                    .where(
-                        (JobModel.experiment_id == exp_model.experiment_id)
-                        & (JobModel.run_id == exp_model.current_run_id)
-                        & (JobModel.state == "error")
-                    )
-                    .count()
-                )
-
-                # Get run timestamps and hostname
-                try:
-                    run_model = ExperimentRunModel.get(
-                        (ExperimentRunModel.experiment_id == exp_model.experiment_id)
-                        & (ExperimentRunModel.run_id == exp_model.current_run_id)
-                    )
-                    if run_model.started_at:
-                        started_at = run_model.started_at.timestamp()
-                    if run_model.ended_at:
-                        ended_at = run_model.ended_at.timestamp()
-                    hostname = run_model.hostname
-                except ExperimentRunModel.DoesNotExist:
-                    pass
-
-            # Compute experiment path from workspace_path and experiment_id
-            exp_path = self.workspace_path / "xp" / exp_model.experiment_id
-
-            experiments.append(
-                MockExperiment(
-                    workdir=exp_path,
-                    current_run_id=exp_model.current_run_id,
-                    total_jobs=total_jobs,
-                    finished_jobs=finished_jobs,
-                    failed_jobs=failed_jobs,
-                    updated_at=exp_model.updated_at.isoformat(),
-                    started_at=started_at,
-                    ended_at=ended_at,
-                    hostname=hostname,
-                )
-            )
+        experiment_id = d.get("experiment_id", "")
+        run_id = d.get("run_id", "")

-
-
-
-
-
-
-
-            experiment_id: Experiment identifier
+        # Use workdir from dict if provided, otherwise compute it
+        workdir = d.get("workdir")
+        if workdir is None:
+            # New layout: experiments/{experiment_id}/{run_id}/
+            workdir = workspace_path / "experiments" / experiment_id / run_id
+        elif isinstance(workdir, str):
+            workdir = Path(workdir)

-
-
-        """
+        # Parse status from string to enum
+        status_str = d.get("status", "running")
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    .where(
-                        (JobModel.experiment_id == exp_model.experiment_id)
-                        & (JobModel.run_id == exp_model.current_run_id)
-                        & (JobModel.state == "done")
-                    )
-                    .count()
-                )
-                failed_jobs = (
-                    JobModel.select()
-                    .where(
-                        (JobModel.experiment_id == exp_model.experiment_id)
-                        & (JobModel.run_id == exp_model.current_run_id)
-                        & (JobModel.state == "error")
-                    )
-                    .count()
-                )
+            status = ExperimentStatus(status_str)
+        except ValueError:
+            # Handle legacy status values
+            if status_str in ("active", "running"):
+                status = ExperimentStatus.RUNNING
+            elif status_str in ("completed", "done"):
+                status = ExperimentStatus.DONE
+            elif status_str == "failed":
+                status = ExperimentStatus.FAILED
+            else:
+                status = ExperimentStatus.RUNNING
+
+        # Parse services from dict (can be list or dict)
+        services_data = d.get("services", {})
+        if isinstance(services_data, list):
+            services = {
+                s.get("service_id", ""): MockService.from_full_state_dict(s)
+                for s in services_data
+            }
+        else:
+            services = {
+                k: MockService.from_full_state_dict(v) for k, v in services_data.items()
+            }

-
-
-
-
-
-
-
-
-
-
-
-
-
-            return MockExperiment(
-                workdir=exp_path,
-                current_run_id=exp_model.current_run_id,
-                total_jobs=total_jobs,
-                finished_jobs=finished_jobs,
-                failed_jobs=failed_jobs,
-                updated_at=exp_model.updated_at.isoformat(),
-                hostname=hostname,
+        return cls(
+            workdir=workdir,
+            run_id=run_id,
+            status=status,
+            events_count=d.get("events_count", 0),
+            hostname=d.get("hostname"),
+            started_at=d.get("started_at"),
+            ended_at=d.get("ended_at"),
+            services=services,
+            dependencies=d.get("dependencies", {}),
+            finished_jobs=d.get("finished_jobs", 0),
+            failed_jobs=d.get("failed_jobs", 0),
         )

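from_state_dict accepts both the current status.json keys and legacy status strings ("active", "completed", ...), mapping them onto the ExperimentStatus enum. A sketch of that mapping; the import path and the dictionary values are assumptions for illustration:

    # Sketch: build a MockExperiment from a state dictionary, exercising the
    # legacy-status mapping shown in the diff above.
    from pathlib import Path

    from experimaestro.scheduler.workspace_state_provider import MockExperiment

    state = {
        "experiment_id": "my-xp",                # hypothetical values
        "run_id": "20240101_120000_000000",
        "status": "active",                      # legacy value -> ExperimentStatus.RUNNING
        "finished_jobs": 3,
        "failed_jobs": 1,
        "services": [],                          # a list is accepted as well as a dict
    }
    exp = MockExperiment.from_state_dict(state, Path("/tmp/workspace"))
    assert exp.finished_jobs == 3 and exp.failed_jobs == 1
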
-
-
-        """Get all runs for an experiment
+    def apply_event(self, event: "EventBase") -> None:
+        """Apply an event to update experiment state

         Args:
-
-
-        Returns:
-            List of run dictionaries with keys:
-            - experiment_id: Experiment ID
-            - run_id: Run ID
-            - started_at: When run started
-            - ended_at: When run completed (None if active)
-            - status: Run status (active, completed, failed, abandoned)
+            event: Event to apply
         """
-
-
-
-
-
-            )
-            runs.append(
-                {
-                    "experiment_id": run_model.experiment_id,
-                    "run_id": run_model.run_id,
-                    "started_at": run_model.started_at.isoformat(),
-                    "ended_at": (
-                        run_model.ended_at.isoformat() if run_model.ended_at else None
-                    ),
-                    "status": run_model.status,
-                }
-            )
-        return runs
-
-    @_with_db_context
-    def complete_run(self, experiment_id: str, run_id: str, status: str = "completed"):
-        """Mark a run as completed
-
-        Args:
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-            status: Final status (completed, failed, abandoned)
+        from experimaestro.scheduler.state_status import (
+            JobSubmittedEvent,
+            JobStateChangedEvent,
+            ServiceAddedEvent,
+            RunCompletedEvent,
+        )

-
-
-
-
-
+        if isinstance(event, JobSubmittedEvent):
+            # Add lightweight job info (tags are stored in ExperimentJobInformation)
+            self._job_infos[event.job_id] = ExperimentJobInformation(
+                job_id=event.job_id,
+                task_id=event.task_id,
+                tags=event.tags or {},
+                timestamp=event.timestamp,
+            )
+            if event.depends_on:
+                self._dependencies[event.job_id] = event.depends_on
+
+        elif isinstance(event, ServiceAddedEvent):
+            self._services[event.service_id] = MockService(
+                service_id=event.service_id,
+                description_text=event.description,
+                state_dict_data=event.state_dict,
+                service_class=event.service_class,
+                experiment_id=self.experiment_id,
+                run_id=self.run_id,
+            )
+
+        elif isinstance(event, JobStateChangedEvent):
+            # Update finished/failed counters when jobs complete
+            if event.state == "done":
+                self._finished_jobs += 1
+            elif event.state == "error":
+                self._failed_jobs += 1
+
+        elif isinstance(event, RunCompletedEvent):
+            # Map status string to ExperimentStatus
+            if event.status in ("completed", "done"):
+                self._status = ExperimentStatus.DONE
+            elif event.status == "failed":
+                self._status = ExperimentStatus.FAILED
+            else:
+                self._status = ExperimentStatus.RUNNING
+            self._ended_at = event.ended_at

-        ExperimentRunModel.update(ended_at=datetime.now(), status=status).where(
-            (ExperimentRunModel.experiment_id == experiment_id)
-            & (ExperimentRunModel.run_id == run_id)
-        ).execute()

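apply_event dispatches on event type and folds each event into the experiment's in-memory state. The real event constructors are not shown in this diff, so the self-contained sketch below uses illustrative dataclass stand-ins (not the classes from experimaestro.scheduler.state_status) to demonstrate the same counting rule:

    # Stand-alone sketch of the event-dispatch counting used by apply_event.
    # JobStateChangedEvent here is an illustrative stand-in, NOT the real class.
    from dataclasses import dataclass

    @dataclass
    class JobStateChangedEvent:
        job_id: str
        state: str

    @dataclass
    class Counters:
        finished: int = 0
        failed: int = 0

        def apply(self, event: JobStateChangedEvent) -> None:
            # Same rule as the diff: "done" and "error" bump the counters
            if event.state == "done":
                self.finished += 1
            elif event.state == "error":
                self.failed += 1

    c = Counters()
    for ev in [JobStateChangedEvent("a", "done"), JobStateChangedEvent("b", "error")]:
        c.apply(ev)
    assert (c.finished, c.failed) == (1, 1)
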
-
+class MockService(BaseService):
+    """Mock service object for remote monitoring

-
+    This class provides a service-like interface for services loaded from
+    the remote server. It mimics the Service class interface sufficiently
+    for display in the TUI ServicesList widget.
+    """

-
-    def get_jobs(
+    def __init__(
         self,
+        service_id: str,
+        description_text: str,
+        state_dict_data: dict,
+        service_class: Optional[str] = None,
         experiment_id: Optional[str] = None,
         run_id: Optional[str] = None,
-
-
-
-
-
-
-
-
-
-
-            task_id: Filter by task class identifier
-            state: Filter by job state
-            tags: Filter by tags (all tags must match)
-            since: If provided, only return jobs updated after this timestamp
-
-        Returns:
-            List of MockJob objects
-        """
-        # Build base query
-        query = JobModel.select()
-
-        # Apply since filter for incremental updates
-        if since is not None:
-            query = query.where(JobModel.updated_at > since)
-
-        # Apply experiment filter
-        if experiment_id is not None:
-            # If experiment_id provided but not run_id, use current run
-            if run_id is None:
-                current_run = self.get_current_run(experiment_id)
-                if current_run is None:
-                    return []  # No runs exist for this experiment
-                run_id = current_run
-
-            query = query.where(
-                (JobModel.experiment_id == experiment_id) & (JobModel.run_id == run_id)
-            )
-
-        # Apply task_id filter
-        if task_id is not None:
-            query = query.where(JobModel.task_id == task_id)
-
-        # Apply state filter
-        if state is not None:
-            query = query.where(JobModel.state == state)
-
-        # Apply tag filters
-        if tags:
-            for tag_key, tag_value in tags.items():
-                # Join with JobTagModel for each tag filter
-                query = query.join(
-                    JobTagModel,
-                    on=(
-                        (JobTagModel.job_id == JobModel.job_id)
-                        & (JobTagModel.experiment_id == JobModel.experiment_id)
-                        & (JobTagModel.run_id == JobModel.run_id)
-                        & (JobTagModel.tag_key == tag_key)
-                        & (JobTagModel.tag_value == tag_value)
-                    ),
-                )
-
-        # Execute query and convert to dictionaries
-        jobs = []
-        for job_model in query:
-            # Get tags for this job
-            job_tags = self._get_job_tags(
-                job_model.job_id, job_model.experiment_id, job_model.run_id
-            )
-
-            jobs.append(self._job_model_to_dict(job_model, job_tags))
-
-        return jobs
-
-    @_with_db_context
-    def get_job(
-        self, job_id: str, experiment_id: str, run_id: Optional[str] = None
-    ) -> Optional[MockJob]:
-        """Get a specific job
-
-        Args:
-            job_id: Job identifier
-            experiment_id: Experiment identifier
-            run_id: Run identifier (None = current run)
+        url: Optional[str] = None,
+    ):
+        self.id = service_id
+        self._description = description_text
+        self._state_name = "MOCK"  # MockService always has MOCK state
+        self._state_dict_data = state_dict_data
+        self._service_class = service_class
+        self.experiment_id = experiment_id
+        self.run_id = run_id
+        self.url = url

-
-
-        """
-
-        if run_id is None:
-            run_id = self.get_current_run(experiment_id)
-            if run_id is None:
-                return None
+    @property
+    def state(self):
+        """Return state as a ServiceState-like object with a name attribute"""
+        from experimaestro.scheduler.services import ServiceState

+        # Convert state name to ServiceState enum
         try:
-
-
-
-
-
-
-                return None
-
-        # Get tags for this job
-        job_tags = self._get_job_tags(job_id, experiment_id, run_id)
-
-        return self._job_model_to_dict(job_model, job_tags)
-
-    @_with_db_context
-    def update_job_submitted(self, job: "Job", experiment_id: str, run_id: str):
-        """Record that a job has been submitted
-
-        Args:
-            job: Job instance
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-
-        Raises:
-            RuntimeError: If in read-only mode
-        """
-        if self.read_only:
-            raise RuntimeError("Cannot update jobs in read-only mode")
-
-        task_id = str(job.type.identifier)
-
-        # Create or update job record
-        now = datetime.now()
-        JobModel.insert(
-            job_id=job.identifier,
-            experiment_id=experiment_id,
-            run_id=run_id,
-            task_id=task_id,
-            locator=job.identifier,
-            state=job.state.name,
-            submitted_time=job.submittime,
-            updated_at=now,
-        ).on_conflict(
-            conflict_target=[JobModel.job_id, JobModel.experiment_id, JobModel.run_id],
-            update={
-                JobModel.state: job.state.name,
-                JobModel.submitted_time: job.submittime,
-                JobModel.updated_at: now,
-                JobModel.failure_reason: None,  # Clear old failure reason on resubmit
-            },
-        ).execute()
-
-        # Update tags (run-scoped)
-        self.update_job_tags(job.identifier, experiment_id, run_id, job.tags)
-
-        # Register partials for all declared subparameters
-        subparameters = job.type._subparameters
-        for name, sp in subparameters.items():
-            partial_id = job.config.__xpm__.get_partial_identifier(sp)
-            partial_id_hex = partial_id.all.hex()
-
-            # Register the partial directory
-            self.register_partial(partial_id_hex, task_id, name)
-
-            # Link job to partial
-            self.register_job_partial(
-                job.identifier, experiment_id, run_id, partial_id_hex
-            )
+            return ServiceState[self._state_name]
+        except KeyError:
+            # Return a mock object with name attribute for unknown states
+            class MockState:
+                def __init__(self, name):
+                    self.name = name

-
-            "Recorded job submission: %s (experiment=%s, run=%s)",
-            job.identifier,
-            experiment_id,
-            run_id,
-        )
+            return MockState(self._state_name)

-
-
-
-        )
-        self._notify_listeners(
-            StateEvent(
-                event_type=StateEventType.JOB_UPDATED,
-                data={
-                    "jobId": job.identifier,
-                    "taskId": str(job.type.identifier),
-                    "experimentId": experiment_id,
-                    "runId": run_id,
-                    "status": job.state.name,
-                    "path": job_path,
-                    "updatedAt": now.isoformat(),
-                },
-            )
-        )
+    def description(self) -> str:
+        """Return service description"""
+        return self._description

-
-
-
+    def state_dict(self) -> dict:
+        """Return service state for recreation"""
+        return self._state_dict_data

-
-
-            experiment_id: Experiment identifier
-            run_id: Run identifier
+    def full_state_dict(self) -> dict:
+        """Get full state as dictionary for JSON serialization.

-
-
+        Overrides BaseService.full_state_dict() to preserve the original
+        service class name instead of using MockService's class name.
         """
-
-
-
-
-
-        update_data = {
-            JobModel.state: job.state.name,
-            JobModel.updated_at: now,
+        return {
+            "service_id": self.id,
+            "description": self._description,
+            "class": self._service_class,
+            "state_dict": self._state_dict_data,
         }

-
-
-
-
-            update_data[JobModel.failure_reason] = job.state.failure_reason.name
-        else:
-            # Clear failure reason when job is not in error state
-            update_data[JobModel.failure_reason] = None
-
-        # Add timing information
-        if job.starttime:
-            update_data[JobModel.started_time] = job.starttime
-        if job.endtime:
-            update_data[JobModel.ended_time] = job.endtime
-
-        # Add progress information
-        if job._progress:
-            update_data[JobModel.progress] = json.dumps(
-                [
-                    {"level": p.level, "progress": p.progress, "desc": p.desc}
-                    for p in job._progress
-                ]
-            )
+    @property
+    def service_class(self) -> Optional[str]:
+        """Return service class name"""
+        return self._service_class

-
-
-
-            & (JobModel.experiment_id == experiment_id)
-            & (JobModel.run_id == run_id)
-        ).execute()
+    @classmethod
+    def from_full_state_dict(cls, d: Dict) -> "MockService":
+        """Create MockService from full state dictionary

-
-
-            job.identifier,
-            job.state.name,
-            experiment_id,
-            run_id,
-        )
+        Args:
+            d: Dictionary from full_state_dict()

-
-
-
-
-
-
-
-
-
-
-
-                    "runId": run_id,
-                    "status": job.state.name,
-                    "path": job_path,
-                    "updatedAt": now.isoformat(),
-                },
-            )
+        Returns:
+            MockService instance (state is always MOCK, not from dict)
+        """
+        return cls(
+            service_id=d["service_id"],
+            description_text=d.get("description", ""),
+            state_dict_data=d.get("state_dict", {}),
+            service_class=d.get("class"),
+            experiment_id=d.get("experiment_id"),
+            run_id=d.get("run_id"),
+            url=d.get("url"),
         )

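full_state_dict and from_full_state_dict form a JSON-safe round trip that keeps the original service class name while the state itself stays MOCK. A sketch under that assumption (import path and service values are hypothetical):

    # Sketch of the MockService serialization round trip defined above.
    from experimaestro.scheduler.workspace_state_provider import MockService

    svc = MockService(
        service_id="tensorboard",                 # hypothetical service
        description_text="TensorBoard on port 6006",
        state_dict_data={"logdir": "/tmp/logs"},
        service_class="TensorboardService",       # original class name is preserved
    )
    restored = MockService.from_full_state_dict(svc.full_state_dict())
    assert restored.id == "tensorboard"
    assert restored.service_class == "TensorboardService"
    assert restored.state.name == "MOCK"          # state is always MOCK, not restored
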
-
-
-        self, job_id: str, experiment_id: str, run_id: str, tags_dict: Dict[str, str]
-    ):
-        """Update tags for a job (run-scoped - fixes GH #128)
+    def to_service(self) -> "BaseService":
+        """Try to recreate a live Service instance from this mock.

-
-
-        have different tags.
-
-        Args:
-            job_id: Job identifier
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-            tags_dict: Dictionary of tag key-value pairs
-
-        Raises:
-            RuntimeError: If in read-only mode
-        """
-        if self.read_only:
-            raise RuntimeError("Cannot update tags in read-only mode")
-
-        # Delete existing tags for this job/experiment/run
-        JobTagModel.delete().where(
-            (JobTagModel.job_id == job_id)
-            & (JobTagModel.experiment_id == experiment_id)
-            & (JobTagModel.run_id == run_id)
-        ).execute()
-
-        # Insert new tags
-        if tags_dict:
-            tag_records = [
-                {
-                    "job_id": job_id,
-                    "experiment_id": experiment_id,
-                    "run_id": run_id,
-                    "tag_key": key,
-                    "tag_value": value,
-                }
-                for key, value in tags_dict.items()
-            ]
-            JobTagModel.insert_many(tag_records).execute()
-
-        logger.debug(
-            "Updated tags for job %s (experiment=%s, run=%s): %s",
-            job_id,
-            experiment_id,
-            run_id,
-            tags_dict,
-        )
-
-    @_with_db_context
-    def delete_job(self, job_id: str, experiment_id: str, run_id: str):
-        """Remove a job, its tags, and partial references
-
-        Args:
-            job_id: Job identifier
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-
-        Raises:
-            RuntimeError: If in read-only mode
-        """
-        if self.read_only:
-            raise RuntimeError("Cannot delete jobs in read-only mode")
-
-        # Delete tags first (foreign key constraint)
-        JobTagModel.delete().where(
-            (JobTagModel.job_id == job_id)
-            & (JobTagModel.experiment_id == experiment_id)
-            & (JobTagModel.run_id == run_id)
-        ).execute()
-
-        # Delete partial references
-        JobPartialModel.delete().where(
-            (JobPartialModel.job_id == job_id)
-            & (JobPartialModel.experiment_id == experiment_id)
-            & (JobPartialModel.run_id == run_id)
-        ).execute()
-
-        # Delete job
-        JobModel.delete().where(
-            (JobModel.job_id == job_id)
-            & (JobModel.experiment_id == experiment_id)
-            & (JobModel.run_id == run_id)
-        ).execute()
-
-        logger.debug(
-            "Deleted job %s (experiment=%s, run=%s)", job_id, experiment_id, run_id
-        )
-
-    # CLI utility methods for job management
-
-    @_with_db_context
-    def get_all_jobs(
-        self,
-        state: Optional[str] = None,
-        tags: Optional[Dict[str, str]] = None,
-        since: Optional[datetime] = None,
-    ) -> List[MockJob]:
-        """Query all jobs across all experiments/runs
-
-        This method is designed for CLI tools that need to list or manage jobs
-        across the entire workspace, regardless of experiment or run.
-
-        Args:
-            state: Filter by job state (e.g., "done", "error", "running")
-            tags: Filter by tags (all tags must match)
-            since: If provided, only return jobs updated after this timestamp
-
-        Returns:
-            List of MockJob objects
-        """
-        # Build base query
-        query = JobModel.select()
-
-        # Apply since filter for incremental updates
-        if since is not None:
-            query = query.where(JobModel.updated_at > since)
-
-        # Apply state filter
-        if state is not None:
-            query = query.where(JobModel.state == state)
-
-        # Apply tag filters
-        if tags:
-            for tag_key, tag_value in tags.items():
-                query = query.join(
-                    JobTagModel,
-                    on=(
-                        (JobTagModel.job_id == JobModel.job_id)
-                        & (JobTagModel.experiment_id == JobModel.experiment_id)
-                        & (JobTagModel.run_id == JobModel.run_id)
-                        & (JobTagModel.tag_key == tag_key)
-                        & (JobTagModel.tag_value == tag_value)
-                    ),
-                )
-
-        # Execute query and convert to MockJob objects
-        jobs = []
-        for job_model in query:
-            # Get tags for this job
-            job_tags = self._get_job_tags(
-                job_model.job_id, job_model.experiment_id, job_model.run_id
-            )
-            jobs.append(self._job_model_to_dict(job_model, job_tags))
-
-        return jobs
-
-    def kill_job(self, job: MockJob, perform: bool = False) -> bool:
-        """Kill a running job process
-
-        This method finds the process associated with a running job and kills it.
-        It also updates the job state in the database to ERROR.
-
-        Args:
-            job: MockJob instance to kill
-            perform: If True, actually kill the process. If False, just check
-                if the job can be killed (dry run).
+        Attempts to recreate the service using the stored configuration.
+        If recreation fails, returns self.

         Returns:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Update job state in database
-            if not self.read_only:
-                self._update_job_state_to_error(job, "killed")
-        except Exception as e:
-            logger.error("Error killing job %s: %s", job.identifier, e)
-            return False
-
-        return True
-
-    def _update_job_state_to_error(self, job: MockJob, reason: str):
-        """Update job state to ERROR in database
-
-        Args:
-            job: MockJob instance
-            reason: Failure reason
-        """
-        if self.read_only:
-            return
-
-        now = datetime.now()
-        with self.workspace_db.bind_ctx([JobModel]):
-            JobModel.update(
-                state="error",
-                failure_reason=reason,
-                ended_time=now.timestamp(),
-                updated_at=now,
-            ).where(
-                (JobModel.job_id == job.identifier)
-                & (JobModel.experiment_id == job.experiment_id)
-                & (JobModel.run_id == job.run_id)
-            ).execute()
-
-        logger.debug(
-            "Updated job %s state to error (reason=%s)", job.identifier, reason
-        )
-
-    def clean_job(self, job: MockJob, perform: bool = False) -> bool:
-        """Clean a finished job (delete directory and DB entry)
-
-        This method removes the job's working directory and its database entry.
-        Only finished jobs (DONE or ERROR state) can be cleaned.
-
-        Args:
-            job: MockJob instance to clean
-            perform: If True, actually delete the job. If False, just check
-                if the job can be cleaned (dry run).
-
-        Returns:
-            True if job was cleaned (or would be cleaned in dry run),
-            False if job is not finished or cannot be cleaned
-        """
-        from shutil import rmtree
-
-        # Check if job is in a finished state
-        if not job.state.finished():
-            logger.debug(
-                "Job %s is not finished (state=%s), cannot clean",
-                job.identifier,
-                job.state,
-            )
-            return False
-
-        if perform:
-            # Delete job directory
-            if job.path.exists():
-                logger.info("Cleaning job %s: removing %s", job.identifier, job.path)
-                rmtree(job.path)
-            else:
-                logger.warning("Job directory does not exist: %s", job.path)
-
-            # Delete from database
-            if not self.read_only:
-                self.delete_job(job.identifier, job.experiment_id, job.run_id)
-
-        return True
-
-    def kill_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
-        """Kill multiple jobs
-
-        Args:
-            jobs: List of MockJob instances to kill
-            perform: If True, actually kill the processes. If False, dry run.
-
-        Returns:
-            Number of jobs that were killed (or would be killed in dry run)
-        """
-        count = 0
-        for job in jobs:
-            if self.kill_job(job, perform=perform):
-                count += 1
-        return count
-
-    def clean_jobs(self, jobs: List[MockJob], perform: bool = False) -> int:
-        """Clean multiple finished jobs
-
-        Args:
-            jobs: List of MockJob instances to clean
-            perform: If True, actually delete the jobs. If False, dry run.
-
-        Returns:
-            Number of jobs that were cleaned (or would be cleaned in dry run)
-        """
-        count = 0
-        for job in jobs:
-            if self.clean_job(job, perform=perform):
-                count += 1
-        return count
-
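The removed kill/clean helpers all follow a dry-run convention: perform=False only reports what would happen, perform=True applies it. A self-contained sketch of that convention; DemoProvider is an illustrative stand-in, not the removed class:

    # Self-contained sketch of the perform-flag (dry run) convention.
    from typing import List

    class DemoProvider:
        def clean_job(self, job: str, perform: bool = False) -> bool:
            finished = job.endswith(":done")      # toy "finished" check
            if finished and perform:
                print(f"deleting {job}")
            return finished

        def clean_jobs(self, jobs: List[str], perform: bool = False) -> int:
            # Mirrors the removed clean_jobs: count jobs accepted by clean_job
            return sum(1 for job in jobs if self.clean_job(job, perform=perform))

    p = DemoProvider()
    jobs = ["a:done", "b:running", "c:done"]
    print("dry run:", p.clean_jobs(jobs))             # 2, nothing deleted
    print("performed:", p.clean_jobs(jobs, perform=True))
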
-    def delete_job_safely(
-        self, job: MockJob, cascade_orphans: bool = True
-    ) -> tuple[bool, str]:
-        """Delete a job with proper locking and orphan cleanup
-
-        This method is designed for TUI/UI use. It acquires a lock on the job
-        to prevent race conditions, then deletes the job directory and DB entry.
-
-        Args:
-            job: MockJob instance to delete
-            cascade_orphans: If True, clean up orphan partials after deletion
-
-        Returns:
-            Tuple of (success: bool, message: str)
-        """
-        import fasteners
-        from shutil import rmtree
-
-        # Check if job is running
-        if job.state.running():
-            return False, "Cannot delete a running job"
-
-        # Check if path exists
-        if not job.path or not job.path.exists():
-            # Just delete from database if path doesn't exist
-            if not self.read_only:
-                self.delete_job(job.identifier, job.experiment_id, job.run_id)
-            if cascade_orphans:
-                self.cleanup_orphan_partials(perform=True)
-            return True, f"Job {job.identifier} deleted (directory already gone)"
-
-        # Try to acquire job lock (non-blocking)
-        # Lock file is typically {script_name}.lock, but we use .lock for general locking
-        lock_path = job.path / ".lock"
-        lock = fasteners.InterProcessLock(str(lock_path))
-
-        if not lock.acquire(blocking=False):
-            return False, "Job is currently locked (possibly running)"
-
-        try:
-            # Delete all files except the lock file
-            for item in job.path.iterdir():
-                if item.name != ".lock":
-                    if item.is_dir():
-                        rmtree(item)
-                    else:
-                        item.unlink()
-
-            # Mark job as "phantom" in database (don't delete - keep as phantom)
-            if not self.read_only:
-                from datetime import datetime
-
-                JobModel.update(
-                    state="phantom",
-                    updated_at=datetime.now(),
-                ).where(
-                    (JobModel.job_id == job.identifier)
-                    & (JobModel.experiment_id == job.experiment_id)
-                    & (JobModel.run_id == job.run_id)
-                ).execute()
-
-        finally:
-            lock.release()
-            # Now delete the lock file and directory
-            try:
-                lock_path.unlink(missing_ok=True)
-                if job.path.exists() and not any(job.path.iterdir()):
-                    job.path.rmdir()
-            except Exception as e:
-                logger.warning("Could not clean up lock file: %s", e)
-
-        # Clean up orphan partials if requested
-        if cascade_orphans:
-            self.cleanup_orphan_partials(perform=True)
-
-        return True, f"Job {job.identifier} deleted successfully"
-
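delete_job_safely guards the deletion with a non-blocking fasteners inter-process lock: if another process holds the lock, the method refuses rather than waits. A minimal sketch of that acquisition pattern (the lock path is a hypothetical example):

    # Sketch of the non-blocking lock acquisition pattern, using the fasteners
    # library as in the code above; /tmp/demo.lock is a hypothetical path.
    import fasteners

    lock = fasteners.InterProcessLock("/tmp/demo.lock")
    if not lock.acquire(blocking=False):
        print("locked by another process, refusing to delete")
    else:
        try:
            print("safe to mutate the job directory here")
        finally:
            lock.release()
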
1892
|
-
@_with_db_context
|
|
1893
|
-
def delete_experiment(
|
|
1894
|
-
self, experiment_id: str, delete_jobs: bool = False
|
|
1895
|
-
) -> tuple[bool, str]:
|
|
1896
|
-
"""Delete an experiment from the database
|
|
1897
|
-
|
|
1898
|
-
Args:
|
|
1899
|
-
experiment_id: Experiment identifier
|
|
1900
|
-
delete_jobs: If True, also delete associated jobs (default: False)
|
|
1901
|
-
|
|
1902
|
-
Returns:
|
|
1903
|
-
Tuple of (success: bool, message: str)
|
|
1904
|
-
"""
|
|
1905
|
-
from shutil import rmtree
|
|
1906
|
-
|
|
1907
|
-
if self.read_only:
|
|
1908
|
-
return False, "Cannot delete in read-only mode"
|
|
1909
|
-
|
|
1910
|
-
# Get all jobs for this experiment
|
|
1911
|
-
jobs = self.get_jobs(experiment_id)
|
|
1912
|
-
running_jobs = [j for j in jobs if j.state.running()]
|
|
1913
|
-
|
|
1914
|
-
if running_jobs:
|
|
1915
|
-
return (
|
|
1916
|
-
False,
|
|
1917
|
-
f"Cannot delete experiment with {len(running_jobs)} running job(s)",
|
|
1918
|
-
)
|
|
1919
|
-
|
|
1920
|
-
# Delete jobs if requested
|
|
1921
|
-
if delete_jobs:
|
|
1922
|
-
for job in jobs:
|
|
1923
|
-
success, msg = self.delete_job_safely(job, cascade_orphans=False)
|
|
1924
|
-
if not success:
|
|
1925
|
-
logger.warning("Failed to delete job %s: %s", job.identifier, msg)
|
|
1926
|
-
|
|
1927
|
-
# Delete experiment runs
|
|
1928
|
-
ExperimentRunModel.delete().where(
|
|
1929
|
-
ExperimentRunModel.experiment_id == experiment_id
|
|
1930
|
-
).execute()
|
|
1931
|
-
|
|
1932
|
-
# Delete experiment
|
|
1933
|
-
ExperimentModel.delete().where(
|
|
1934
|
-
ExperimentModel.experiment_id == experiment_id
|
|
1935
|
-
).execute()
|
|
1936
|
-
|
|
1937
|
-
# Optionally delete experiment directory
|
|
1938
|
-
exp_path = self.workspace_path / "xp" / experiment_id
|
|
1939
|
-
if exp_path.exists():
|
|
1940
|
-
try:
|
|
1941
|
-
rmtree(exp_path)
|
|
1942
|
-
except Exception as e:
|
|
1943
|
-
logger.warning("Could not delete experiment directory: %s", e)
|
|
1944
|
-
|
|
1945
|
-
# Clean up orphan partials
|
|
1946
|
-
self.cleanup_orphan_partials(perform=True)
|
|
1947
|
-
|
|
1948
|
-
return True, f"Experiment {experiment_id} deleted successfully"
|
|
1949
|
-
|
|
1950
|
-
@_with_db_context
|
|
1951
|
-
def get_orphan_jobs(self) -> List[MockJob]:
|
|
1952
|
-
"""Find jobs that have no associated experiment in the database
|
|
1953
|
-
|
|
1954
|
-
Returns:
|
|
1955
|
-
List of MockJob instances for orphan jobs
|
|
1956
|
-
"""
|
|
1957
|
-
# Get all jobs
|
|
1958
|
-
all_jobs = self.get_all_jobs()
|
|
1959
|
-
|
|
1960
|
-
# Get all experiment IDs
|
|
1961
|
-
experiments = self.get_experiments()
|
|
1962
|
-
experiment_ids = {exp.experiment_id for exp in experiments}
|
|
1963
|
-
|
|
1964
|
-
# Find jobs with no matching experiment
|
|
1965
|
-
orphan_jobs = [
|
|
1966
|
-
job for job in all_jobs if job.experiment_id not in experiment_ids
|
|
1967
|
-
]
|
|
1968
|
-
|
|
1969
|
-
return orphan_jobs
|
|
1970
|
-
|
|
1971
|
-
# Service operations
|
|
1972
|
-
|
|
1973
|
-
@_with_db_context
|
|
1974
|
-
def register_service(
|
|
1975
|
-
self,
|
|
1976
|
-
service_id: str,
|
|
1977
|
-
experiment_id: str,
|
|
1978
|
-
run_id: str,
|
|
1979
|
-
description: str,
|
|
1980
|
-
state_dict: Optional[str] = None,
|
|
1981
|
-
):
|
|
1982
|
-
"""Register a service in the database
|
|
1983
|
-
|
|
1984
|
-
Services are only added or removed, not updated. Runtime state
|
|
1985
|
-
is managed by the Service object itself.
|
|
1986
|
-
|
|
1987
|
-
Args:
|
|
1988
|
-
service_id: Service identifier
|
|
1989
|
-
experiment_id: Experiment identifier
|
|
1990
|
-
run_id: Run identifier
|
|
1991
|
-
description: Human-readable description
|
|
1992
|
-
state_dict: JSON serialized state_dict for service recreation
|
|
1993
|
-
|
|
1994
|
-
Raises:
|
|
1995
|
-
RuntimeError: If in read-only mode
|
|
1996
|
-
"""
|
|
1997
|
-
if self.read_only:
|
|
1998
|
-
raise RuntimeError("Cannot register services in read-only mode")
|
|
1999
|
-
|
|
2000
|
-
insert_data = {
|
|
2001
|
-
"service_id": service_id,
|
|
2002
|
-
"experiment_id": experiment_id,
|
|
2003
|
-
"run_id": run_id,
|
|
2004
|
-
"description": description,
|
|
2005
|
-
"created_at": datetime.now(),
|
|
2006
|
-
}
|
|
2007
|
-
|
|
2008
|
-
if state_dict is not None:
|
|
2009
|
-
insert_data["state_dict"] = state_dict
|
|
2010
|
-
|
|
2011
|
-
# Use INSERT OR IGNORE - services are only added, not updated
|
|
2012
|
-
ServiceModel.insert(**insert_data).on_conflict_ignore().execute()
|
|
2013
|
-
|
|
2014
|
-
logger.debug(
|
|
2015
|
-
"Registered service %s (experiment=%s, run=%s)",
|
|
2016
|
-
service_id,
|
|
2017
|
-
experiment_id,
|
|
2018
|
-
run_id,
|
|
2019
|
-
)
|
|
2020
|
-
|
|
2021
|
-
# Notify listeners
|
|
2022
|
-
self._notify_listeners(
|
|
2023
|
-
StateEvent(
|
|
2024
|
-
event_type=StateEventType.SERVICE_UPDATED,
|
|
2025
|
-
data={
|
|
2026
|
-
"serviceId": service_id,
|
|
2027
|
-
"experimentId": experiment_id,
|
|
2028
|
-
"runId": run_id,
|
|
2029
|
-
"description": description,
|
|
2030
|
-
},
|
|
2031
|
-
)
|
|
2032
|
-
)
|
|
2033
|
-
|
|
2034
|
-
def _get_live_services(
|
|
2035
|
-
self, experiment_id: Optional[str], run_id: Optional[str]
|
|
2036
|
-
) -> Optional[List["Service"]]:
|
|
2037
|
-
"""Get live services from scheduler if available.
|
|
2038
|
-
|
|
2039
|
-
Returns None if no live services (experiment not in scheduler).
|
|
2040
|
-
"""
|
|
2041
|
-
if experiment_id is None:
|
|
2042
|
-
return None
|
|
2043
|
-
|
|
2044
|
-
try:
|
|
2045
|
-
from experimaestro.scheduler.base import Scheduler
|
|
2046
|
-
|
|
2047
|
-
if not Scheduler.has_instance():
|
|
2048
|
-
return None
|
|
2049
|
-
|
|
2050
|
-
scheduler = Scheduler.instance()
|
|
2051
|
-
if experiment_id not in scheduler.experiments:
|
|
2052
|
-
logger.debug("Experiment %s not in scheduler", experiment_id)
|
|
2053
|
-
return None
|
|
2054
|
-
|
|
2055
|
-
exp = scheduler.experiments[experiment_id]
|
|
2056
|
-
services = list(exp.services.values())
|
|
2057
|
-
logger.debug(
|
|
2058
|
-
"Returning %d live services for experiment %s",
|
|
2059
|
-
len(services),
|
|
2060
|
-
experiment_id,
|
|
2061
|
-
)
|
|
2062
|
-
return services
|
|
2063
|
-
|
|
2064
|
-
except Exception as e:
|
|
2065
|
-
logger.warning("Could not get live services: %s", e)
|
|
2066
|
-
return None
|
|
2067
|
-
|
|
2068
|
-
@_with_db_context
|
|
2069
|
-
def _fetch_services_from_storage(
|
|
2070
|
-
self, experiment_id: Optional[str], run_id: Optional[str]
|
|
2071
|
-
) -> List["Service"]:
|
|
2072
|
-
"""Fetch services from database.
|
|
2073
|
-
|
|
2074
|
-
Called when no live services and cache is empty.
|
|
2075
|
-
"""
|
|
2076
|
-
from experimaestro.scheduler.services import Service
|
|
2077
|
-
|
|
2078
|
-
query = ServiceModel.select()
|
|
2079
|
-
|
|
2080
|
-
if experiment_id is not None:
|
|
2081
|
-
query = query.where(
|
|
2082
|
-
(ServiceModel.experiment_id == experiment_id)
|
|
2083
|
-
& (ServiceModel.run_id == run_id)
|
|
2084
|
-
)
|
|
2085
|
-
|
|
2086
|
-
services = []
|
|
2087
|
-
|
|
2088
|
-
for service_model in query:
|
|
2089
|
-
service_id = service_model.service_id
|
|
2090
|
-
|
|
2091
|
-
# Try to recreate service from state_dict
|
|
2092
|
-
state_dict_json = service_model.state_dict
|
|
2093
|
-
if state_dict_json and state_dict_json != "{}":
|
|
2094
|
-
try:
|
|
2095
|
-
state_dict = json.loads(state_dict_json)
|
|
2096
|
-
if "__class__" in state_dict:
|
|
2097
|
-
service = Service.from_state_dict(state_dict)
|
|
2098
|
-
except Exception as e:
|
|
2099
|
-
service = MockService(
|
|
2100
|
-
service_id,
|
|
2101
|
-
f"error: {e}",
|
|
2102
|
-
{},
|
|
2103
|
-
experiment_id=experiment_id,
|
|
2104
|
-
run_id=run_id,
|
|
2105
|
-
)
|
|
2106
|
-
|
|
2107
|
-
logger.warning(
|
|
2108
|
-
"Failed to recreate service %s from state_dict: %s",
|
|
2109
|
-
service_id,
|
|
2110
|
-
e,
|
|
2111
|
-
)
|
|
2112
|
-
else:
|
|
2113
|
-
# If we can't recreate, skip this service (it's not usable)
|
|
2114
|
-
logger.debug(
|
|
2115
|
-
"Service %s has no state_dict for recreation, skipping",
|
|
2116
|
-
service_id,
|
|
2117
|
-
)
|
|
2118
|
-
service = MockService(
|
|
2119
|
-
service_id,
|
|
2120
|
-
"error: no state_dict",
|
|
2121
|
-
{},
|
|
2122
|
-
experiment_id=experiment_id,
|
|
2123
|
-
run_id=run_id,
|
|
2124
|
-
)
|
|
2125
|
-
|
|
2126
|
-
# Add to services
|
|
2127
|
-
service.id = service_id
|
|
2128
|
-
services.append(service)
|
|
2129
|
-
continue
|
|
2130
|
-
|
|
2131
|
-
return services
|
|
2132
|
-
|
|
2133
|
-
@_with_db_context
|
|
2134
|
-
def get_services_raw(
|
|
2135
|
-
self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
|
|
2136
|
-
) -> List[Dict]:
|
|
2137
|
-
"""Get raw service data from database without recreating Service objects
|
|
2138
|
-
|
|
2139
|
-
This is useful for remote monitoring where the client may have different
|
|
2140
|
-
modules installed than the server. Returns dictionaries with service
|
|
2141
|
-
metadata that can be serialized over JSON-RPC.
|
|
2142
|
-
|
|
2143
|
-
Args:
|
|
2144
|
-
experiment_id: Filter by experiment (None = all)
|
|
2145
|
-
run_id: Filter by run (None = current run if experiment_id provided)
|
|
2146
|
-
|
|
2147
|
-
Returns:
|
|
2148
|
-
List of dictionaries with service data
|
|
2149
|
-
"""
|
|
2150
|
-
query = ServiceModel.select()
|
|
2151
|
-
|
|
2152
|
-
if experiment_id is not None:
|
|
2153
|
-
# Use current run if not specified
|
|
2154
|
-
if run_id is None:
|
|
2155
|
-
run_id = self.get_current_run(experiment_id)
|
|
2156
|
-
if run_id is None:
|
|
2157
|
-
return []
|
|
2158
|
-
|
|
2159
|
-
query = query.where(
|
|
2160
|
-
(ServiceModel.experiment_id == experiment_id)
|
|
2161
|
-
& (ServiceModel.run_id == run_id)
|
|
2162
|
-
)
|
|
2163
|
-
|
|
2164
|
-
services = []
|
|
2165
|
-
for service_model in query:
|
|
2166
|
-
state_dict = {}
|
|
2167
|
-
if service_model.state_dict and service_model.state_dict != "{}":
|
|
2168
|
-
try:
|
|
2169
|
-
state_dict = json.loads(service_model.state_dict)
|
|
2170
|
-
except json.JSONDecodeError:
|
|
2171
|
-
pass
|
|
2172
|
-
|
|
2173
|
-
services.append(
|
|
2174
|
-
{
|
|
2175
|
-
"service_id": service_model.service_id,
|
|
2176
|
-
"description": service_model.description,
|
|
2177
|
-
"state_dict": state_dict,
|
|
2178
|
-
"experiment_id": service_model.experiment_id,
|
|
2179
|
-
"run_id": service_model.run_id,
|
|
2180
|
-
}
|
|
2181
|
-
)
|
|
2182
|
-
|
|
2183
|
-
return services
|
|
2184
|
-
|
|
2185
|
-
def get_live_job_states(self, experiment_id: str) -> Dict[str, str]:
|
|
2186
|
-
"""Get live job states from the scheduler if available
|
|
2187
|
-
|
|
2188
|
-
This is useful for debugging to compare live state vs database state.
|
|
2189
|
-
|
|
2190
|
-
Args:
|
|
2191
|
-
experiment_id: The experiment ID to get live jobs for
|
|
2192
|
-
|
|
2193
|
-
Returns:
|
|
2194
|
-
Dict mapping job identifier to live state name, empty if scheduler
|
|
2195
|
-
not available or experiment not registered
|
|
2196
|
-
"""
|
|
2197
|
-
try:
|
|
2198
|
-
from experimaestro.scheduler.base import Scheduler
|
|
2199
|
-
|
|
2200
|
-
if not Scheduler.has_instance():
|
|
2201
|
-
logger.debug("No scheduler instance available for live states")
|
|
2202
|
-
return {}
|
|
2203
|
-
|
|
2204
|
-
scheduler = Scheduler.instance()
|
|
2205
|
-
live_states = {}
|
|
2206
|
-
|
|
2207
|
-
logger.debug(
|
|
2208
|
-
"get_live_job_states: looking for exp=%s, scheduler has %d jobs",
|
|
2209
|
-
experiment_id,
|
|
2210
|
-
len(scheduler.jobs),
|
|
2211
|
-
)
|
|
2212
|
-
|
|
2213
|
-
for job_id, job in scheduler.jobs.items():
|
|
2214
|
-
# Filter by experiment if needed
|
|
2215
|
-
if hasattr(job, "experiment") and job.experiment is not None:
|
|
2216
|
-
if hasattr(job.experiment, "workdir"):
|
|
2217
|
-
job_exp_id = job.experiment.workdir.name
|
|
2218
|
-
if job_exp_id == experiment_id:
|
|
2219
|
-
live_states[job_id] = job.state.name
|
|
2220
|
-
else:
|
|
2221
|
-
logger.debug(
|
|
2222
|
-
"Job %s exp_id=%s != requested %s",
|
|
2223
|
-
job_id[:8],
|
|
2224
|
-
job_exp_id,
|
|
2225
|
-
experiment_id,
|
|
2226
|
-
)
|
|
2227
|
-
else:
|
|
2228
|
-
# Job not associated with experiment, include it anyway
|
|
2229
|
-
live_states[job_id] = job.state.name
|
|
2230
|
-
logger.debug(
|
|
2231
|
-
"Job %s has no experiment, including anyway", job_id[:8]
|
|
2232
|
-
)
|
|
2233
|
-
|
|
2234
|
-
logger.debug("Returning %d live job states", len(live_states))
|
|
2235
|
-
return live_states
|
|
2236
|
-
|
|
2237
|
-
except Exception as e:
|
|
2238
|
-
logger.debug("Could not get live job states: %s", e)
|
|
2239
|
-
return {}
|
|
2240
|
-
|
|
2241
|
-
# Sync metadata methods
|
|
2242
|
-
|
|
2243
|
-
@_with_db_context
|
|
2244
|
-
def get_last_sync_time(self) -> Optional[datetime]:
|
|
2245
|
-
"""Get the timestamp of the last successful sync
|
|
2246
|
-
|
|
2247
|
-
Returns:
|
|
2248
|
-
datetime of last sync, or None if never synced
|
|
2249
|
-
"""
|
|
2250
|
-
from peewee import OperationalError
|
|
2251
|
-
|
|
2252
|
-
from .state_db import WorkspaceSyncMetadata
|
|
2253
|
-
|
|
2254
|
-
try:
|
|
2255
|
-
metadata = WorkspaceSyncMetadata.get_or_none(
|
|
2256
|
-
WorkspaceSyncMetadata.id == "workspace"
|
|
2257
|
-
)
|
|
2258
|
-
if metadata and metadata.last_sync_time:
|
|
2259
|
-
return metadata.last_sync_time
|
|
2260
|
-
except OperationalError:
|
|
2261
|
-
# Table might not exist in older workspaces opened in read-only mode
|
|
2262
|
-
pass
|
|
2263
|
-
return None
|
|
2264
|
-
|
|
2265
|
-
@_with_db_context
|
|
2266
|
-
def update_last_sync_time(self) -> None:
|
|
2267
|
-
"""Update the last sync timestamp to now
|
|
2268
|
-
|
|
2269
|
-
Raises:
|
|
2270
|
-
RuntimeError: If in read-only mode
|
|
2271
|
-
"""
|
|
2272
|
-
if self.read_only:
|
|
2273
|
-
raise RuntimeError("Cannot update sync time in read-only mode")
|
|
2274
|
-
|
|
2275
|
-
from .state_db import WorkspaceSyncMetadata
|
|
2276
|
-
|
|
2277
|
-
WorkspaceSyncMetadata.insert(
|
|
2278
|
-
id="workspace", last_sync_time=datetime.now()
|
|
2279
|
-
).on_conflict(
|
|
2280
|
-
conflict_target=[WorkspaceSyncMetadata.id],
|
|
2281
|
-
update={WorkspaceSyncMetadata.last_sync_time: datetime.now()},
|
|
2282
|
-
).execute()
|
|
2283
|
-
logger.debug("Updated last sync time")
|
|
2284
|
-
|
|
2285
|
-
# Partial management methods
|
|
2286
|
-
|
|
2287
|
-
@_with_db_context
|
|
2288
|
-
def register_partial(
|
|
2289
|
-
self, partial_id: str, task_id: str, subparameters_name: str
|
|
2290
|
-
) -> None:
|
|
2291
|
-
"""Register a partial directory (creates if not exists)
|
|
2292
|
-
|
|
2293
|
-
Args:
|
|
2294
|
-
partial_id: Hex hash of the partial identifier
|
|
2295
|
-
task_id: Task class identifier
|
|
2296
|
-
subparameters_name: Name of the subparameters definition
|
|
2297
|
-
|
|
2298
|
-
Raises:
|
|
2299
|
-
RuntimeError: If in read-only mode
|
|
2300
|
-
"""
|
|
2301
|
-
if self.read_only:
|
|
2302
|
-
raise RuntimeError("Cannot register partials in read-only mode")
|
|
2303
|
-
|
|
2304
|
-
PartialModel.insert(
|
|
2305
|
-
partial_id=partial_id,
|
|
2306
|
-
task_id=task_id,
|
|
2307
|
-
subparameters_name=subparameters_name,
|
|
2308
|
-
created_at=datetime.now(),
|
|
2309
|
-
).on_conflict_ignore().execute()
|
|
2310
|
-
|
|
2311
|
-
logger.debug(
|
|
2312
|
-
"Registered partial: %s (task=%s, subparams=%s)",
|
|
2313
|
-
partial_id,
|
|
2314
|
-
task_id,
|
|
2315
|
-
subparameters_name,
|
|
2316
|
-
)
|
|
2317
|
-
|
|
2318
|
-
@_with_db_context
|
|
2319
|
-
def register_job_partial(
|
|
2320
|
-
self, job_id: str, experiment_id: str, run_id: str, partial_id: str
|
|
2321
|
-
) -> None:
|
|
2322
|
-
"""Link a job to a partial directory it uses
|
|
2323
|
-
|
|
2324
|
-
Args:
|
|
2325
|
-
job_id: Job identifier
|
|
2326
|
-
experiment_id: Experiment identifier
|
|
2327
|
-
run_id: Run identifier
|
|
2328
|
-
partial_id: Partial directory identifier
|
|
2329
|
-
|
|
2330
|
-
Raises:
|
|
2331
|
-
RuntimeError: If in read-only mode
|
|
2332
|
-
"""
|
|
2333
|
-
if self.read_only:
|
|
2334
|
-
raise RuntimeError("Cannot register job partials in read-only mode")
|
|
2335
|
-
|
|
2336
|
-
JobPartialModel.insert(
|
|
2337
|
-
job_id=job_id,
|
|
2338
|
-
experiment_id=experiment_id,
|
|
2339
|
-
run_id=run_id,
|
|
2340
|
-
partial_id=partial_id,
|
|
2341
|
-
).on_conflict_ignore().execute()
|
|
2342
|
-
|
|
2343
|
-
logger.debug(
|
|
2344
|
-
"Linked job %s to partial %s (experiment=%s, run=%s)",
|
|
2345
|
-
job_id,
|
|
2346
|
-
partial_id,
|
|
2347
|
-
experiment_id,
|
|
2348
|
-
run_id,
|
|
2349
|
-
)
|
|
2350
|
-
|
|
2351
|
-
@_with_db_context
|
|
2352
|
-
def unregister_job_partials(
|
|
2353
|
-
self, job_id: str, experiment_id: str, run_id: str
|
|
2354
|
-
) -> None:
|
|
2355
|
-
"""Remove all partial links for a job
|
|
2356
|
-
|
|
2357
|
-
Called when a job is deleted to clean up its partial references.
|
|
2358
|
-
|
|
2359
|
-
Args:
|
|
2360
|
-
job_id: Job identifier
|
|
2361
|
-
experiment_id: Experiment identifier
|
|
2362
|
-
run_id: Run identifier
|
|
2363
|
-
|
|
2364
|
-
Raises:
|
|
2365
|
-
RuntimeError: If in read-only mode
|
|
2366
|
-
"""
|
|
2367
|
-
if self.read_only:
|
|
2368
|
-
raise RuntimeError("Cannot unregister job partials in read-only mode")
|
|
2369
|
-
|
|
2370
|
-
JobPartialModel.delete().where(
|
|
2371
|
-
(JobPartialModel.job_id == job_id)
|
|
2372
|
-
& (JobPartialModel.experiment_id == experiment_id)
|
|
2373
|
-
& (JobPartialModel.run_id == run_id)
|
|
2374
|
-
).execute()
|
|
2375
|
-
|
|
2376
|
-
logger.debug(
|
|
2377
|
-
"Unregistered partials for job %s (experiment=%s, run=%s)",
|
|
2378
|
-
job_id,
|
|
2379
|
-
experiment_id,
|
|
2380
|
-
run_id,
|
|
2381
|
-
)
|
|
2382
|
-
|
|
2383
|
-
@_with_db_context
|
|
2384
|
-
def get_orphan_partials(self) -> List[Dict]:
|
|
2385
|
-
"""Find partial directories that are not referenced by any job
|
|
2386
|
-
|
|
2387
|
-
Returns:
|
|
2388
|
-
List of dictionaries with partial_id, task_id, subparameters_name
|
|
2389
|
-
"""
|
|
2390
|
-
# Find partials that have no job references
|
|
2391
|
-
# Using a subquery to find referenced partial_ids
|
|
2392
|
-
referenced_partials = JobPartialModel.select(JobPartialModel.partial_id)
|
|
2393
|
-
|
|
2394
|
-
orphan_query = PartialModel.select().where(
|
|
2395
|
-
PartialModel.partial_id.not_in(referenced_partials)
|
|
2396
|
-
)
|
|
2397
|
-
|
|
2398
|
-
orphans = []
|
|
2399
|
-
for partial in orphan_query:
|
|
2400
|
-
orphans.append(
|
|
2401
|
-
{
|
|
2402
|
-
"partial_id": partial.partial_id,
|
|
2403
|
-
"task_id": partial.task_id,
|
|
2404
|
-
"subparameters_name": partial.subparameters_name,
|
|
2405
|
-
"created_at": partial.created_at.isoformat(),
|
|
2406
|
-
}
|
|
2407
|
-
)
|
|
2408
|
-
|
|
2409
|
-
return orphans
-
-    def cleanup_orphan_partials(self, perform: bool = False) -> List[Path]:
-        """Clean up orphan partial directories
-
-        Finds partial directories not referenced by any job and removes them.
-
-        Args:
-            perform: If True, actually delete. If False, dry run (list only).
-
-        Returns:
-            List of paths that were deleted (or would be deleted in dry run)
-        """
-        from shutil import rmtree
-
-        orphans = self.get_orphan_partials()
-        deleted_paths = []
-
-        for orphan in orphans:
-            # Reconstruct path: WORKSPACE/partials/TASK_ID/SUBPARAM_NAME/PARTIAL_ID
-            partial_path = (
-                self.workspace_path
-                / "partials"
-                / orphan["task_id"]
-                / orphan["subparameters_name"]
-                / orphan["partial_id"]
-            )
-
-            if perform:
-                # Delete directory if it exists
-                if partial_path.exists():
-                    logger.info("Cleaning orphan partial: %s", partial_path)
-                    rmtree(partial_path)
-
-                # Delete from database
-                if not self.read_only:
-                    with self.workspace_db.bind_ctx([PartialModel]):
-                        PartialModel.delete().where(
-                            PartialModel.partial_id == orphan["partial_id"]
-                        ).execute()
-
-            deleted_paths.append(partial_path)
-
-        return deleted_paths
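cleanup_orphan_partials follows the usual dry-run convention: by default it only reports, and deletion happens only with perform=True. A hypothetical call sequence:

# Dry run (default): report orphan paths without deleting anything
for path in provider.cleanup_orphan_partials():
    print("would remove:", path)

# Actually delete the directories and their database rows
removed = provider.cleanup_orphan_partials(perform=True)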
-
-    # Utility methods
-
-    def close(self):
-        """Close the database connection and remove from registry
-
-        This should be called when done with the workspace to free resources.
-        """
-        # Stop file watcher if running
-        self._stop_file_watcher()
-
-        # Close database connection
-        if hasattr(self, "workspace_db") and self.workspace_db is not None:
-            from .state_db import close_workspace_database
-
-            close_workspace_database(self.workspace_db)
-            self.workspace_db = None
-
-        # Remove from registry
-        with WorkspaceStateProvider._lock:
-            if self.workspace_path in WorkspaceStateProvider._instances:
-                del WorkspaceStateProvider._instances[self.workspace_path]
-
-        logger.debug("WorkspaceStateProvider closed for %s", self.workspace_path)
-
-    # Listener methods for push notifications
-
-    def add_listener(self, listener: StateListener) -> None:
-        """Register a listener for state change notifications
-
-        Listeners are called synchronously when state changes occur.
-        For UI applications, listeners should queue updates for their
-        own event loop to avoid blocking database operations.
-
-        When the first listener is added, starts watching the database
-        file for changes to enable push notifications.
-
-        Args:
-            listener: Callback function that receives StateEvent objects
-        """
-        with self._listeners_lock:
-            was_empty = len(self._listeners) == 0
-            self._listeners.add(listener)
-
-            # Start file watcher when first listener is added
-            if was_empty:
-                self._start_file_watcher()
-
-        logger.info(
-            "Added state listener: %s (total: %d)", listener, len(self._listeners)
-        )
-
-    def remove_listener(self, listener: StateListener) -> None:
-        """Unregister a state change listener
-
-        When the last listener is removed, stops watching the database file.
-
-        Args:
-            listener: Previously registered callback function
-        """
-        with self._listeners_lock:
-            self._listeners.discard(listener)
-            is_empty = len(self._listeners) == 0
-
-            # Stop file watcher when last listener is removed
-            if is_empty:
-                self._stop_file_watcher()
-
-        logger.debug("Removed state listener: %s", listener)
-
-    def _start_file_watcher(self) -> None:
-        """Start watching the database file for changes"""
-        if self._db_file_watch is not None:
-            logger.info("File watcher already running for %s", self._db_dir)
-            return  # Already watching
-
-        from experimaestro.ipc import ipcom
-
-        # Create and start the change detector thread
-        self._change_detector = _DatabaseChangeDetector(self)
-        self._change_detector.start()
-
-        # Create the file handler that signals the detector
-        self._db_file_handler = _DatabaseFileHandler(self._change_detector)
-        self._db_file_watch = ipcom().fswatch(
-            self._db_file_handler,
-            self._db_dir,
-            recursive=False,
-        )
-        logger.info("Started database file watcher for %s", self._db_dir)
-
-    def _stop_file_watcher(self) -> None:
-        """Stop watching the database file"""
-        if self._db_file_watch is None:
-            return  # Not watching
-
-        from experimaestro.ipc import ipcom
-
-        # Stop the file watcher first
-        ipcom().fsunwatch(self._db_file_watch)
-        self._db_file_watch = None
-        self._db_file_handler = None
-
-        # Stop the change detector thread
-        if self._change_detector is not None:
-            self._change_detector.stop()
-            self._change_detector = None
-
-        logger.debug("Stopped database file watcher for %s", self.workspace_path)
-
-    def _notify_listeners(self, event: StateEvent) -> None:
-        """Notify all registered listeners of a state change
-
-        This is called internally by state-modifying methods.
-        Listeners are called synchronously - they should be fast.
-
-        Args:
-            event: State change event to broadcast
-        """
-        with self._listeners_lock:
-            listeners = list(self._listeners)
-
-        for listener in listeners:
-            try:
-                listener(event)
-            except Exception as e:
-                logger.warning("Listener %s raised exception: %s", listener, e)
-
-    # Helper methods
-
-    @_with_db_context
-    def _get_job_tags(
-        self, job_id: str, experiment_id: str, run_id: str
-    ) -> Dict[str, str]:
-        """Get tags for a job
-
-        Args:
-            job_id: Job identifier
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-
-        Returns:
-            Dictionary of tag key-value pairs
-        """
-        tags = {}
-        for tag_model in JobTagModel.select().where(
-            (JobTagModel.job_id == job_id)
-            & (JobTagModel.experiment_id == experiment_id)
-            & (JobTagModel.run_id == run_id)
-        ):
-            tags[tag_model.tag_key] = tag_model.tag_value
-        return tags
-
-    def _job_model_to_dict(self, job_model: JobModel, tags: Dict[str, str]) -> MockJob:
-        """Convert a JobModel to a MockJob object
-
-        Args:
-            job_model: JobModel instance
-            tags: Dictionary of tags for this job
-
-        Returns:
-            MockJob object
-        """
-        # Parse progress JSON
-        progress_list = json.loads(job_model.progress)
-
-        # Compute job path from workspace_path, task_id, and job_id
-        job_path = self.workspace_path / "jobs" / job_model.task_id / job_model.job_id
-
-        # Convert failure_reason string to enum if present
-        failure_reason = None
-        if job_model.failure_reason:
-            try:
-                failure_reason = JobFailureStatus[job_model.failure_reason]
-            except KeyError:
-                pass  # Unknown failure reason, leave as None
-
-        return MockJob(
-            identifier=job_model.job_id,
-            task_id=job_model.task_id,
-            locator=job_model.locator,
-            path=job_path,
-            state=job_model.state,
-            submittime=job_model.submitted_time,
-            starttime=job_model.started_time,
-            endtime=job_model.ended_time,
-            progress=progress_list,
-            tags=tags,
-            experiment_id=job_model.experiment_id,
-            run_id=job_model.run_id,
-            updated_at=job_model.updated_at.isoformat(),
-            failure_reason=failure_reason,
-        )
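_job_model_to_dict restores the failure_reason enum by member name and degrades unknown strings to None. The same pattern with a stand-in enum (not the real JobFailureStatus):

from enum import Enum
from typing import Optional

class Reason(Enum):          # stand-in for the real enum
    TIMEOUT = "timeout"
    MEMORY = "memory"

def parse_reason(name: Optional[str]) -> Optional[Reason]:
    if not name:
        return None
    try:
        return Reason[name]  # lookup by member *name*, e.g. "TIMEOUT"
    except KeyError:
        return None          # unknown values degrade to None

assert parse_reason("TIMEOUT") is Reason.TIMEOUT
assert parse_reason("bogus") is None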
-
-    def _format_time(self, timestamp: Optional[float]) -> str:
-        """Format timestamp for UI
-
-        Args:
-            timestamp: Unix timestamp or None
-
-        Returns:
-            ISO format datetime string or empty string
-        """
-        if not timestamp:
-            return ""
-        return datetime.fromtimestamp(timestamp).isoformat()
-
-
-# Scheduler listener adapter
-class SchedulerListener:
-    """Adapter to connect scheduler events to WorkspaceStateProvider
-
-    This class implements the scheduler listener interface and forwards
-    events to the WorkspaceStateProvider. It tracks which experiment/run
-    each job belongs to for proper database updates.
-    """
-
-    def __init__(self, state_provider: WorkspaceStateProvider):
-        """Initialize listener
-
-        Args:
-            state_provider: WorkspaceStateProvider instance to update
-        """
-        self.state_provider = state_provider
-        # Map job_id -> (experiment_id, run_id) for tracking
-        self.job_experiments: Dict[str, tuple] = {}
-
-        logger.info("SchedulerListener initialized")
-
-    @_with_db_context
-    def job_submitted(self, job: "Job", experiment_id: str, run_id: str):
-        """Called when a job is submitted
-
-        Args:
-            job: The submitted job
-            experiment_id: Experiment this job belongs to
-            run_id: Run this job belongs to
-        """
-        # Track job's experiment/run
-        self.job_experiments[job.identifier] = (experiment_id, run_id)
-
-        # Update state provider
-        try:
-            self.state_provider.update_job_submitted(job, experiment_id, run_id)
-        except Exception as e:
-            logger.exception(
-                "Error updating job submission for %s: %s", job.identifier, e
-            )
-
-    @_with_db_context
-    def job_state(self, job: "Job"):
-        """Called when a job's state changes
-
-        Args:
-            job: The job with updated state
-        """
-        # Look up job's experiment/run
-        if job.identifier not in self.job_experiments:
-            logger.warning(
-                "State change for unknown job %s (not tracked by listener)",
-                job.identifier,
-            )
-            return
-
-        experiment_id, run_id = self.job_experiments[job.identifier]
-
-        # Update state provider
-        try:
-            self.state_provider.update_job_state(job, experiment_id, run_id)
-        except Exception as e:
-            logger.exception("Error updating job state for %s: %s", job.identifier, e)
-
-    @_with_db_context
-    def service_add(self, service: "Service", experiment_id: str, run_id: str):
-        """Called when a service is added
-
-        Args:
-            service: The added service
-            experiment_id: Experiment identifier
-            run_id: Run identifier
-        """
-        from experimaestro.scheduler.services import Service
-
-        try:
-            # Get state_dict for service recreation
-            state_dict_json = None
-            try:
-                # _full_state_dict includes __class__ automatically
-                state_dict = service._full_state_dict()
-                # Serialize paths automatically
-                serialized = Service.serialize_state_dict(state_dict)
-                state_dict_json = json.dumps(serialized)
-            except Exception as e:
-                # Service cannot be serialized - store unserializable marker
-                logger.warning(
-                    "Could not get state_dict for service %s: %s", service.id, e
-                )
-                state_dict_json = json.dumps(
-                    {
-                        "__class__": f"{service.__class__.__module__}.{service.__class__.__name__}",
-                        "__unserializable__": True,
-                        "__reason__": f"Cannot serialize: {e}",
-                    }
-                )
-
-            self.state_provider.register_service(
-                service.id,
-                experiment_id,
-                run_id,
-                service.description(),
-                state_dict=state_dict_json,
-            )
-        except Exception as e:
-            logger.exception("Error updating service %s: %s", service.id, e)
+            A live Service instance or self if recreation is not possible
+        """
+        # Just return self - service recreation from config not implemented
+        return self
+
+
+__all__ = [
+    # Data classes
+    "ProcessInfo",
+    # Listener type alias
+    "StateListener",
+    # ABC
+    "StateProvider",
+    "OfflineStateProvider",
+    # Mock classes
+    "MockJob",
+    "MockExperiment",
+    "MockService",
+]
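The new __all__ pins down the public surface of the slimmed-down state_provider module; a consumer import might look like this (illustrative only):

# Illustrative import of the module's public names
from experimaestro.scheduler.state_provider import (
    StateProvider,
    StateListener,
    MockJob,
)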