experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl
This diff shows the changes between publicly available package versions as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of experimaestro may be problematic.
- experimaestro/cli/__init__.py +177 -31
- experimaestro/experiments/cli.py +6 -2
- experimaestro/scheduler/base.py +21 -0
- experimaestro/scheduler/experiment.py +64 -34
- experimaestro/scheduler/interfaces.py +27 -0
- experimaestro/scheduler/remote/__init__.py +31 -0
- experimaestro/scheduler/remote/client.py +874 -0
- experimaestro/scheduler/remote/protocol.py +467 -0
- experimaestro/scheduler/remote/server.py +423 -0
- experimaestro/scheduler/remote/sync.py +144 -0
- experimaestro/scheduler/services.py +158 -32
- experimaestro/scheduler/state_db.py +58 -9
- experimaestro/scheduler/state_provider.py +512 -91
- experimaestro/scheduler/state_sync.py +65 -8
- experimaestro/tests/test_cli_jobs.py +3 -3
- experimaestro/tests/test_remote_state.py +671 -0
- experimaestro/tests/test_state_db.py +8 -8
- experimaestro/tui/app.py +100 -8
- experimaestro/version.py +2 -2
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +4 -4
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/RECORD +24 -18
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/entry_points.txt +0 -0
- {experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0
The expanded hunks below appear to come from experimaestro/scheduler/state_provider.py (the +512 -91 entry above), judging by the classes they modify.

@@ -15,13 +15,15 @@ Key features:
 
 import json
 import logging
+import socket
 import threading
 import time
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum, auto
 from pathlib import Path
-from
+from abc import ABC, abstractmethod
+from typing import Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
 
 from watchdog.events import FileSystemEventHandler
 from watchdog.observers.api import ObservedWatch
@@ -34,11 +36,14 @@ from experimaestro.scheduler.state_db import (
     ServiceModel,
     PartialModel,
     JobPartialModel,
+    WorkspaceSyncMetadata,
     ALL_MODELS,
+    CURRENT_DB_VERSION,
 )
 from experimaestro.scheduler.interfaces import (
     BaseJob,
     BaseExperiment,
+    BaseService,
     JobState,
     JobFailureStatus,
     STATE_NAME_TO_JOBSTATE,
@@ -78,6 +83,213 @@ class StateEvent:
 StateListener = Callable[[StateEvent], None]
 
 
+class StateProvider(ABC):
+    """Abstract base class for state providers
+
+    Defines the interface that all state providers must implement.
+    This enables both local (WorkspaceStateProvider) and remote
+    (SSHStateProviderClient) providers to be used interchangeably.
+
+    Provides common service caching logic to preserve service instances
+    (and their URLs) across calls to get_services(). Subclasses should call
+    _init_service_cache() in their __init__ and implement _fetch_services_from_storage().
+    """
+
+    def _init_service_cache(self) -> None:
+        """Initialize service cache - call from subclass __init__"""
+        self._service_cache: Dict[Tuple[str, str], Dict[str, "BaseService"]] = {}
+        self._service_cache_lock = threading.Lock()
+
+    def _clear_service_cache(self) -> None:
+        """Clear the service cache"""
+        with self._service_cache_lock:
+            self._service_cache.clear()
+
+    def get_services(
+        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
+    ) -> List[BaseService]:
+        """Get services for an experiment
+
+        Uses caching to preserve service instances (and their URLs) across calls.
+        Subclasses can override _get_live_services() for live service support
+        and must implement _fetch_services_from_storage() for persistent storage.
+        """
+        # Resolve run_id if needed
+        if experiment_id is not None and run_id is None:
+            run_id = self.get_current_run(experiment_id)
+            if run_id is None:
+                return []
+
+        cache_key = (experiment_id or "", run_id or "")
+
+        with self._service_cache_lock:
+            # Try to get live services (scheduler, etc.) - may return None
+            live_services = self._get_live_services(experiment_id, run_id)
+            if live_services is not None:
+                # Cache and return live services
+                self._service_cache[cache_key] = {s.id: s for s in live_services}
+                return live_services
+
+            # Check cache
+            cached = self._service_cache.get(cache_key)
+            if cached is not None:
+                return list(cached.values())
+
+            # Fetch from persistent storage (DB or remote)
+            services = self._fetch_services_from_storage(experiment_id, run_id)
+            self._service_cache[cache_key] = {s.id: s for s in services}
+            return services
+
+    def _get_live_services(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> Optional[List[BaseService]]:
+        """Get live services if available (e.g., from scheduler).
+
+        Returns None if no live services are available (default).
+        Subclasses may override to check for live services.
+        """
+        return None
+
+    @abstractmethod
+    def _fetch_services_from_storage(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> List[BaseService]:
+        """Fetch services from persistent storage (DB or remote).
+
+        Called when no live services and cache is empty.
+        """
+        ...
+
+    @abstractmethod
+    def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
+        """Get list of all experiments"""
+        ...
+
+    @abstractmethod
+    def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
+        """Get a specific experiment by ID"""
+        ...
+
+    @abstractmethod
+    def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
+        """Get all runs for an experiment"""
+        ...
+
+    @abstractmethod
+    def get_current_run(self, experiment_id: str) -> Optional[str]:
+        """Get the current run ID for an experiment"""
+        ...
+
+    @abstractmethod
+    def get_jobs(
+        self,
+        experiment_id: Optional[str] = None,
+        run_id: Optional[str] = None,
+        task_id: Optional[str] = None,
+        state: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None,
+        since: Optional[datetime] = None,
+    ) -> List[BaseJob]:
+        """Query jobs with optional filters"""
+        ...
+
+    @abstractmethod
+    def get_job(
+        self, job_id: str, experiment_id: str, run_id: Optional[str] = None
+    ) -> Optional[BaseJob]:
+        """Get a specific job"""
+        ...
+
+    @abstractmethod
+    def get_all_jobs(
+        self,
+        state: Optional[str] = None,
+        tags: Optional[Dict[str, str]] = None,
+        since: Optional[datetime] = None,
+    ) -> List[BaseJob]:
+        """Get all jobs across all experiments"""
+        ...
+
+    # Note: get_services is implemented in base class using _fetch_services_from_storage
+
+    @abstractmethod
+    def get_services_raw(
+        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
+    ) -> List[Dict]:
+        """Get raw service data as dictionaries (for serialization)"""
+        ...
+
+    @abstractmethod
+    def add_listener(self, listener: StateListener) -> None:
+        """Register a listener for state change events"""
+        ...
+
+    @abstractmethod
+    def remove_listener(self, listener: StateListener) -> None:
+        """Unregister a listener"""
+        ...
+
+    @abstractmethod
+    def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
+        """Kill a running job"""
+        ...
+
+    @abstractmethod
+    def clean_job(self, job: BaseJob, perform: bool = False) -> bool:
+        """Clean a finished job"""
+        ...
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the state provider and release resources"""
+        ...
+
+    # Optional methods with default implementations
+
+    def sync_path(self, path: str) -> Optional[Path]:
+        """Sync a specific path from remote (remote providers only)
+
+        Returns None for local providers or if sync fails.
+        """
+        return None
+
+    def get_orphan_jobs(self) -> List[BaseJob]:
+        """Get orphan jobs (jobs not associated with any experiment run)"""
+        return []
+
+    def delete_job_safely(self, job: BaseJob, perform: bool = True) -> Tuple[bool, str]:
+        """Safely delete a job and its data"""
+        return False, "Not implemented"
+
+    def delete_experiment(
+        self, experiment_id: str, perform: bool = True
+    ) -> Tuple[bool, str]:
+        """Delete an experiment and all its data"""
+        return False, "Not implemented"
+
+    def cleanup_orphan_partials(self, perform: bool = False) -> List[str]:
+        """Clean up orphan partial directories"""
+        return []
+
+    def get_last_sync_time(self) -> Optional[datetime]:
+        """Get the last sync time (for incremental updates)"""
+        return None
+
+    @property
+    def read_only(self) -> bool:
+        """Whether this provider is read-only"""
+        return True
+
+    @property
+    def is_remote(self) -> bool:
+        """Whether this is a remote provider (e.g., SSH)
+
+        Remote providers use periodic refresh instead of push notifications
+        and support sync_path for on-demand file synchronization.
+        """
+        return False
+
+
 class _DatabaseChangeDetector:
     """Background thread that detects database changes and notifies listeners
 
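
To see how the new StateProvider base class is meant to be used, here is a standalone sketch of the caching pattern it implements. Names are simplified and nothing from experimaestro is imported; only the template-method structure (live services, then cache, then persistent storage) mirrors the class above.

import threading
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple


class ToyService:
    def __init__(self, service_id: str):
        self.id = service_id


class ToyStateProvider(ABC):
    """Simplified stand-in for the StateProvider caching logic above."""

    def _init_service_cache(self) -> None:
        self._cache: Dict[Tuple[str, str], Dict[str, ToyService]] = {}
        self._lock = threading.Lock()

    def get_services(
        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
    ) -> List[ToyService]:
        key = (experiment_id or "", run_id or "")
        with self._lock:
            live = self._get_live_services(experiment_id, run_id)
            if live is not None:  # live services win and refresh the cache
                self._cache[key] = {s.id: s for s in live}
                return live
            if key in self._cache:  # otherwise reuse cached instances
                return list(self._cache[key].values())
            fetched = self._fetch_services_from_storage(experiment_id, run_id)
            self._cache[key] = {s.id: s for s in fetched}
            return fetched

    def _get_live_services(self, experiment_id, run_id) -> Optional[List[ToyService]]:
        return None  # default: no live source available

    @abstractmethod
    def _fetch_services_from_storage(self, experiment_id, run_id) -> List[ToyService]:
        ...


class DictBackedProvider(ToyStateProvider):
    def __init__(self, storage: Dict[str, List[str]]):
        self._storage = storage
        self._init_service_cache()

    def _fetch_services_from_storage(self, experiment_id, run_id) -> List[ToyService]:
        return [ToyService(sid) for sid in self._storage.get(experiment_id or "", [])]


provider = DictBackedProvider({"exp1": ["tensorboard"]})
first = provider.get_services("exp1", "run1")
second = provider.get_services("exp1", "run1")
assert first[0] is second[0]  # cached instance (and any URL it holds) is preserved
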
@@ -371,6 +583,7 @@ class MockExperiment(BaseExperiment):
         updated_at: str,
         started_at: Optional[float] = None,
         ended_at: Optional[float] = None,
+        hostname: Optional[str] = None,
     ):
         self.workdir = workdir
         self.current_run_id = current_run_id
@@ -380,6 +593,7 @@ class MockExperiment(BaseExperiment):
         self.updated_at = updated_at
         self.started_at = started_at
         self.ended_at = ended_at
+        self.hostname = hostname
 
     @property
     def experiment_id(self) -> str:
@@ -387,6 +601,57 @@ class MockExperiment(BaseExperiment):
         return self.workdir.name
 
 
+class MockService(BaseService):
+    """Mock service object for remote monitoring
+
+    This class provides a service-like interface for services loaded from
+    the remote server. It mimics the Service class interface sufficiently
+    for display in the TUI ServicesList widget.
+    """
+
+    def __init__(
+        self,
+        service_id: str,
+        description_text: str,
+        state_dict_data: dict,
+        experiment_id: Optional[str] = None,
+        run_id: Optional[str] = None,
+        url: Optional[str] = None,
+        state: str = "STOPPED",
+    ):
+        self.id = service_id
+        self._description = description_text
+        self._state_name = state
+        self._state_dict_data = state_dict_data
+        self.experiment_id = experiment_id
+        self.run_id = run_id
+        self.url = url
+
+    @property
+    def state(self):
+        """Return state as a ServiceState-like object with a name attribute"""
+        from experimaestro.scheduler.services import ServiceState
+
+        # Convert state name to ServiceState enum
+        try:
+            return ServiceState[self._state_name]
+        except KeyError:
+            # Return a mock object with name attribute for unknown states
+            class MockState:
+                def __init__(self, name):
+                    self.name = name
+
+            return MockState(self._state_name)
+
+    def description(self) -> str:
+        """Return service description"""
+        return self._description
+
+    def state_dict(self) -> dict:
+        """Return state dictionary for service recreation"""
+        return self._state_dict_data
+
+
 def _with_db_context(func):
     """Decorator to wrap method in database bind context
 
@@ -406,7 +671,7 @@ def _with_db_context(func):
     return wrapper
 
 
-class WorkspaceStateProvider:
+class WorkspaceStateProvider(StateProvider):
     """Unified state provider for workspace-level database (singleton per workspace path)
 
     Provides access to experiment and job state from a single workspace database.
@@ -507,13 +772,16 @@ class WorkspaceStateProvider:
         workspace_path = Path(workspace_path).absolute()
 
         self.workspace_path = workspace_path
-        self.
+        self._read_only = read_only
         self.sync_interval_minutes = sync_interval_minutes
 
         # Listeners for push notifications
         self._listeners: Set[StateListener] = set()
         self._listeners_lock = threading.Lock()
 
+        # Service cache (from base class)
+        self._init_service_cache()
+
         # File watcher for database changes (started when listeners are added)
         self._change_detector: Optional[_DatabaseChangeDetector] = None
         self._db_file_handler: Optional[_DatabaseFileHandler] = None
@@ -570,9 +838,18 @@ class WorkspaceStateProvider:
         experimaestro_dir.mkdir(parents=True, exist_ok=True)
 
         db_path = experimaestro_dir / "workspace.db"
-        self.workspace_db = initialize_workspace_database(
+        self.workspace_db, needs_resync = initialize_workspace_database(
+            db_path, read_only=read_only
+        )
         self._db_dir = experimaestro_dir  # Store for file watcher
 
+        # Sync from disk if needed due to schema version change
+        if needs_resync and not read_only:
+            logger.info(
+                "Database schema version changed, triggering full resync from disk"
+            )
+            sync_on_start = True  # Force sync
+
         # Optionally sync from disk on start (only in write mode)
         # Syncing requires write access to update the database and sync timestamp
         if sync_on_start and not read_only:
@@ -581,16 +858,29 @@ class WorkspaceStateProvider:
             sync_workspace_from_disk(
                 self.workspace_path,
                 write_mode=True,
-                force=
+                force=needs_resync,  # Force full sync if schema changed
                 sync_interval_minutes=sync_interval_minutes,
             )
 
+            # Update db_version after successful sync
+            if needs_resync:
+                with self.workspace_db.bind_ctx([WorkspaceSyncMetadata]):
+                    WorkspaceSyncMetadata.update(db_version=CURRENT_DB_VERSION).where(
+                        WorkspaceSyncMetadata.id == "workspace"
+                    ).execute()
+                logger.info("Database schema updated to version %d", CURRENT_DB_VERSION)
+
         logger.info(
             "WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
             read_only,
             workspace_path,
         )
 
+    @property
+    def read_only(self) -> bool:
+        """Whether this provider is read-only"""
+        return self._read_only
+
     # Experiment management methods
 
     @_with_db_context
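
The two hunks above have initialize_workspace_database report a schema-version mismatch, and make the provider force a full resync from disk (then bump db_version in WorkspaceSyncMetadata) when one is detected. Below is a standalone sketch of that version-gating pattern, using plain sqlite3 and an invented meta table rather than experimaestro's actual schema.

import sqlite3
from typing import Tuple

CURRENT_DB_VERSION = 2  # bump whenever the schema layout changes


def open_database(path: str) -> Tuple[sqlite3.Connection, bool]:
    """Return (connection, needs_resync); resync when the stored version differs."""
    db = sqlite3.connect(path)
    db.execute(
        "CREATE TABLE IF NOT EXISTS meta (id TEXT PRIMARY KEY, db_version INTEGER)"
    )
    row = db.execute("SELECT db_version FROM meta WHERE id = 'workspace'").fetchone()
    return db, (row is None or row[0] != CURRENT_DB_VERSION)


db, needs_resync = open_database(":memory:")
if needs_resync:
    # ... rebuild the database contents from the on-disk source of truth here ...
    db.execute(
        "INSERT INTO meta (id, db_version) VALUES ('workspace', ?) "
        "ON CONFLICT(id) DO UPDATE SET db_version = excluded.db_version",
        (CURRENT_DB_VERSION,),
    )
    db.commit()
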
@@ -652,14 +942,40 @@ class WorkspaceStateProvider:
         now = datetime.now()
         run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
 
-        #
+        # Capture hostname
+        hostname = socket.gethostname()
+        started_at = datetime.now()
+
+        # Create run record with hostname
         ExperimentRunModel.insert(
             experiment_id=experiment_id,
             run_id=run_id,
-            started_at=
+            started_at=started_at,
             status="active",
+            hostname=hostname,
         ).execute()
 
+        # Persist to disk in experiment folder (informations.json)
+        exp_dir = self.workspace_path / "xp" / experiment_id
+        exp_dir.mkdir(parents=True, exist_ok=True)
+        info_file = exp_dir / "informations.json"
+
+        # Merge with existing data (may have multiple runs)
+        info_data: Dict = {}
+        if info_file.exists():
+            try:
+                info_data = json.loads(info_file.read_text())
+            except json.JSONDecodeError:
+                logger.warning("Could not parse existing informations.json")
+
+        if "runs" not in info_data:
+            info_data["runs"] = {}
+        info_data["runs"][run_id] = {
+            "hostname": hostname,
+            "started_at": started_at.isoformat(),
+        }
+        info_file.write_text(json.dumps(info_data, indent=2))
+
         # Update experiment's current_run_id and updated_at
         now = datetime.now()
         ExperimentModel.update(
@@ -667,7 +983,12 @@ class WorkspaceStateProvider:
             updated_at=now,
         ).where(ExperimentModel.experiment_id == experiment_id).execute()
 
-        logger.info(
+        logger.info(
+            "Created run %s for experiment %s on host %s",
+            run_id,
+            experiment_id,
+            hostname,
+        )
 
         # Notify listeners
         self._notify_listeners(
@@ -678,6 +999,7 @@ class WorkspaceStateProvider:
                     "run_id": run_id,
                     "status": "active",
                     "started_at": now.isoformat(),
+                    "hostname": hostname,
                 },
             )
         )
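
Besides the ExperimentRunModel row, each new run is now also recorded on disk in xp/<experiment_id>/informations.json, keyed by run_id with its hostname and start time. A small illustrative reader for that file follows; the layout comes from the hunk above, but the helper itself is not an experimaestro API.

import json
from pathlib import Path
from typing import Optional


def run_hostname(workspace: Path, experiment_id: str, run_id: str) -> Optional[str]:
    """Return the hostname recorded for a run, or None if unknown."""
    info_file = workspace / "xp" / experiment_id / "informations.json"
    if not info_file.exists():
        return None
    try:
        info = json.loads(info_file.read_text())
    except json.JSONDecodeError:
        return None
    return info.get("runs", {}).get(run_id, {}).get("hostname")
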
@@ -718,6 +1040,7 @@ class WorkspaceStateProvider:
             - finished_jobs: Number of completed jobs (for current run)
             - failed_jobs: Number of failed jobs (for current run)
             - updated_at: When experiment was last modified
+            - hostname: Host where the current run was launched
         """
         experiments = []
 
@@ -733,6 +1056,7 @@ class WorkspaceStateProvider:
 
             started_at = None
             ended_at = None
+            hostname = None
 
             if exp_model.current_run_id:
                 total_jobs = (
@@ -762,7 +1086,7 @@ class WorkspaceStateProvider:
                     .count()
                 )
 
-                # Get run timestamps
+                # Get run timestamps and hostname
                 try:
                     run_model = ExperimentRunModel.get(
                         (ExperimentRunModel.experiment_id == exp_model.experiment_id)
@@ -772,6 +1096,7 @@ class WorkspaceStateProvider:
                         started_at = run_model.started_at.timestamp()
                     if run_model.ended_at:
                         ended_at = run_model.ended_at.timestamp()
+                    hostname = run_model.hostname
                 except ExperimentRunModel.DoesNotExist:
                     pass
 
@@ -788,6 +1113,7 @@ class WorkspaceStateProvider:
                     updated_at=exp_model.updated_at.isoformat(),
                     started_at=started_at,
                     ended_at=ended_at,
+                    hostname=hostname,
                 )
             )
 
@@ -814,6 +1140,7 @@ class WorkspaceStateProvider:
         total_jobs = 0
         finished_jobs = 0
         failed_jobs = 0
+        hostname = None
 
         if exp_model.current_run_id:
             total_jobs = (
@@ -843,6 +1170,16 @@ class WorkspaceStateProvider:
                 .count()
             )
 
+            # Get hostname from run model
+            try:
+                run_model = ExperimentRunModel.get(
+                    (ExperimentRunModel.experiment_id == exp_model.experiment_id)
+                    & (ExperimentRunModel.run_id == exp_model.current_run_id)
+                )
+                hostname = run_model.hostname
+            except ExperimentRunModel.DoesNotExist:
+                pass
+
         # Compute experiment path from workspace_path and experiment_id
         exp_path = self.workspace_path / "xp" / exp_model.experiment_id
 
@@ -853,6 +1190,7 @@ class WorkspaceStateProvider:
             finished_jobs=finished_jobs,
             failed_jobs=failed_jobs,
             updated_at=exp_model.updated_at.isoformat(),
+            hostname=hostname,
         )
 
     @_with_db_context
@@ -1633,61 +1971,48 @@ class WorkspaceStateProvider:
     # Service operations
 
     @_with_db_context
-    def
+    def register_service(
         self,
         service_id: str,
         experiment_id: str,
         run_id: str,
         description: str,
-        state: str,
         state_dict: Optional[str] = None,
     ):
-        """
+        """Register a service in the database
+
+        Services are only added or removed, not updated. Runtime state
+        is managed by the Service object itself.
 
         Args:
             service_id: Service identifier
             experiment_id: Experiment identifier
             run_id: Run identifier
             description: Human-readable description
-            state: Service state
             state_dict: JSON serialized state_dict for service recreation
 
         Raises:
             RuntimeError: If in read-only mode
         """
         if self.read_only:
-            raise RuntimeError("Cannot
+            raise RuntimeError("Cannot register services in read-only mode")
 
         insert_data = {
             "service_id": service_id,
             "experiment_id": experiment_id,
             "run_id": run_id,
             "description": description,
-            "state": state,
             "created_at": datetime.now(),
-            "updated_at": datetime.now(),
-        }
-        update_data = {
-            ServiceModel.description: description,
-            ServiceModel.state: state,
-            ServiceModel.updated_at: datetime.now(),
         }
 
         if state_dict is not None:
             insert_data["state_dict"] = state_dict
-
-
-        ServiceModel.insert(**insert_data).
-            conflict_target=[
-                ServiceModel.service_id,
-                ServiceModel.experiment_id,
-                ServiceModel.run_id,
-            ],
-            update=update_data,
-        ).execute()
+
+        # Use INSERT OR IGNORE - services are only added, not updated
+        ServiceModel.insert(**insert_data).on_conflict_ignore().execute()
 
         logger.debug(
-            "
+            "Registered service %s (experiment=%s, run=%s)",
             service_id,
             experiment_id,
             run_id,
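
register_service now relies on peewee's on_conflict_ignore() (INSERT OR IGNORE on SQLite), so registering the same (service_id, experiment_id, run_id) twice is a no-op rather than an update; runtime state stays on the Service object. A standalone peewee sketch of that behaviour, with an invented model rather than experimaestro's ServiceModel:

from peewee import CharField, CompositeKey, Model, SqliteDatabase

db = SqliteDatabase(":memory:")


class ToyServiceRow(Model):
    service_id = CharField()
    experiment_id = CharField()
    run_id = CharField()
    description = CharField()

    class Meta:
        database = db
        primary_key = CompositeKey("service_id", "experiment_id", "run_id")


db.create_tables([ToyServiceRow])
row = {
    "service_id": "tb",
    "experiment_id": "exp1",
    "run_id": "r1",
    "description": "first",
}
ToyServiceRow.insert(**row).on_conflict_ignore().execute()
# A second insert with the same key is silently ignored: no update happens.
ToyServiceRow.insert(**{**row, "description": "second"}).on_conflict_ignore().execute()
assert ToyServiceRow.get(ToyServiceRow.service_id == "tb").description == "first"
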
@@ -1701,70 +2026,68 @@ class WorkspaceStateProvider:
                     "serviceId": service_id,
                     "experimentId": experiment_id,
                     "runId": run_id,
-                    "state": state,
                     "description": description,
                 },
             )
         )
 
-
-
-
-
-        """Get services, optionally filtered by experiment/run
+    def _get_live_services(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> Optional[List["Service"]]:
+        """Get live services from scheduler if available.
 
-
-
-
-
+        Returns None if no live services (experiment not in scheduler).
+        """
+        if experiment_id is None:
+            return None
 
-
-
-            run_id: Filter by run (None = current run if experiment_id provided)
+        try:
+            from experimaestro.scheduler.base import Scheduler
 
-
-
+            if not Scheduler.has_instance():
+                return None
+
+            scheduler = Scheduler.instance()
+            if experiment_id not in scheduler.experiments:
+                logger.debug("Experiment %s not in scheduler", experiment_id)
+                return None
+
+            exp = scheduler.experiments[experiment_id]
+            services = list(exp.services.values())
+            logger.debug(
+                "Returning %d live services for experiment %s",
+                len(services),
+                experiment_id,
+            )
+            return services
+
+        except Exception as e:
+            logger.warning("Could not get live services: %s", e)
+            return None
+
+    @_with_db_context
+    def _fetch_services_from_storage(
+        self, experiment_id: Optional[str], run_id: Optional[str]
+    ) -> List["Service"]:
+        """Fetch services from database.
+
+        Called when no live services and cache is empty.
         """
         from experimaestro.scheduler.services import Service
 
-        # First, check for live services from the scheduler
-        if experiment_id is not None:
-            try:
-                from experimaestro.scheduler.base import Scheduler
-
-                if Scheduler.has_instance():
-                    scheduler = Scheduler.instance()
-                    # Check if experiment is registered with scheduler
-                    if experiment_id in scheduler.experiments:
-                        exp = scheduler.experiments[experiment_id]
-                        services = list(exp.services.values())
-                        logger.debug(
-                            "Returning %d live services for experiment %s",
-                            len(services),
-                            experiment_id,
-                        )
-                        return services
-            except Exception as e:
-                # Scheduler not available or error - fall back to database
-                logger.debug("Could not get live services: %s", e)
-
-        # Fall back to database
         query = ServiceModel.select()
 
         if experiment_id is not None:
-            # Use current run if not specified
-            if run_id is None:
-                run_id = self.get_current_run(experiment_id)
-                if run_id is None:
-                    return []
-
             query = query.where(
                 (ServiceModel.experiment_id == experiment_id)
                 & (ServiceModel.run_id == run_id)
             )
 
         services = []
+
         for service_model in query:
+            service_id = service_model.service_id
+
             # Try to recreate service from state_dict
             state_dict_json = service_model.state_dict
             if state_dict_json and state_dict_json != "{}":
@@ -1772,20 +2095,89 @@ class WorkspaceStateProvider:
                     state_dict = json.loads(state_dict_json)
                     if "__class__" in state_dict:
                         service = Service.from_state_dict(state_dict)
-                        # Set the id from the database record
-                        service.id = service_model.service_id
-                        services.append(service)
-                        continue
                 except Exception as e:
+                    service = MockService(
+                        service_id,
+                        f"error: {e}",
+                        {},
+                        experiment_id=experiment_id,
+                        run_id=run_id,
+                    )
+
                     logger.warning(
                         "Failed to recreate service %s from state_dict: %s",
-
+                        service_id,
                         e,
                     )
-
-
-
-
+            else:
+                # If we can't recreate, skip this service (it's not usable)
+                logger.debug(
+                    "Service %s has no state_dict for recreation, skipping",
+                    service_id,
+                )
+                service = MockService(
+                    service_id,
+                    "error: no state_dict",
+                    {},
+                    experiment_id=experiment_id,
+                    run_id=run_id,
+                )
+
+            # Add to services
+            service.id = service_id
+            services.append(service)
+            continue
+
+        return services
+
+    @_with_db_context
+    def get_services_raw(
+        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
+    ) -> List[Dict]:
+        """Get raw service data from database without recreating Service objects
+
+        This is useful for remote monitoring where the client may have different
+        modules installed than the server. Returns dictionaries with service
+        metadata that can be serialized over JSON-RPC.
+
+        Args:
+            experiment_id: Filter by experiment (None = all)
+            run_id: Filter by run (None = current run if experiment_id provided)
+
+        Returns:
+            List of dictionaries with service data
+        """
+        query = ServiceModel.select()
+
+        if experiment_id is not None:
+            # Use current run if not specified
+            if run_id is None:
+                run_id = self.get_current_run(experiment_id)
+                if run_id is None:
+                    return []
+
+            query = query.where(
+                (ServiceModel.experiment_id == experiment_id)
+                & (ServiceModel.run_id == run_id)
+            )
+
+        services = []
+        for service_model in query:
+            state_dict = {}
+            if service_model.state_dict and service_model.state_dict != "{}":
+                try:
+                    state_dict = json.loads(service_model.state_dict)
+                except json.JSONDecodeError:
+                    pass
+
+            services.append(
+                {
+                    "service_id": service_model.service_id,
+                    "description": service_model.description,
+                    "state_dict": state_dict,
+                    "experiment_id": service_model.experiment_id,
+                    "run_id": service_model.run_id,
+                }
             )
 
         return services
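
get_services_raw() returns plain dictionaries precisely so that a remote client can display services without importing the server-side service classes. Below is a sketch of the client-side half, using an invented stand-in with the same constructor shape as the MockService class added earlier in this diff.

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class DisplayService:
    """Stand-in with the same constructor shape as MockService (illustrative)."""

    id: str
    description: str
    state_dict: Dict
    experiment_id: Optional[str] = None
    run_id: Optional[str] = None


def build_display_services(raw_services: List[Dict]) -> List[DisplayService]:
    # get_services_raw() returns plain dicts; rebuild lightweight display objects
    return [
        DisplayService(
            raw["service_id"],
            raw.get("description", ""),
            raw.get("state_dict", {}),
            experiment_id=raw.get("experiment_id"),
            run_id=raw.get("run_id"),
        )
        for raw in raw_services
    ]


raw = [
    {
        "service_id": "tensorboard",
        "description": "TensorBoard",
        "state_dict": {},
        "experiment_id": "exp1",
        "run_id": "r1",
    }
]
print(build_display_services(raw)[0].id)  # -> tensorboard
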
@@ -1855,13 +2247,19 @@ class WorkspaceStateProvider:
         Returns:
             datetime of last sync, or None if never synced
         """
+        from peewee import OperationalError
+
         from .state_db import WorkspaceSyncMetadata
 
-
-        WorkspaceSyncMetadata.
-
-
-
+        try:
+            metadata = WorkspaceSyncMetadata.get_or_none(
+                WorkspaceSyncMetadata.id == "workspace"
+            )
+            if metadata and metadata.last_sync_time:
+                return metadata.last_sync_time
+        except OperationalError:
+            # Table might not exist in older workspaces opened in read-only mode
+            pass
         return None
 
     @_with_db_context
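
get_last_sync_time now distinguishes a missing row (get_or_none returns None) from a missing table (peewee raises OperationalError, e.g. an older workspace opened read-only). A minimal peewee sketch of those two cases, with an invented model:

from peewee import CharField, DateTimeField, Model, OperationalError, SqliteDatabase

db = SqliteDatabase(":memory:")


class SyncMeta(Model):
    id = CharField(primary_key=True)
    last_sync_time = DateTimeField(null=True)

    class Meta:
        database = db


try:
    # Table never created (e.g. an old workspace opened read-only): OperationalError
    SyncMeta.get_or_none(SyncMeta.id == "workspace")
except OperationalError:
    print("table missing")

db.create_tables([SyncMeta])
print(SyncMeta.get_or_none(SyncMeta.id == "workspace"))  # -> None (row missing)
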
@@ -2333,13 +2731,36 @@ class SchedulerListener:
             experiment_id: Experiment identifier
             run_id: Run identifier
         """
+        from experimaestro.scheduler.services import Service
+
         try:
-
+            # Get state_dict for service recreation
+            state_dict_json = None
+            try:
+                # _full_state_dict includes __class__ automatically
+                state_dict = service._full_state_dict()
+                # Serialize paths automatically
+                serialized = Service.serialize_state_dict(state_dict)
+                state_dict_json = json.dumps(serialized)
+            except Exception as e:
+                # Service cannot be serialized - store unserializable marker
+                logger.warning(
+                    "Could not get state_dict for service %s: %s", service.id, e
+                )
+                state_dict_json = json.dumps(
+                    {
+                        "__class__": f"{service.__class__.__module__}.{service.__class__.__name__}",
+                        "__unserializable__": True,
+                        "__reason__": f"Cannot serialize: {e}",
+                    }
+                )
+
+            self.state_provider.register_service(
                 service.id,
                 experiment_id,
                 run_id,
                 service.description(),
-
+                state_dict=state_dict_json,
             )
         except Exception as e:
             logger.exception("Error updating service %s: %s", service.id, e)
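
The listener now persists a JSON state_dict for each service and falls back to an "__unserializable__" marker when serialization fails. A standalone sketch of that fallback pattern follows; dump_state and the toy services are illustrative, while the package itself uses service._full_state_dict() and Service.serialize_state_dict().

import json


def dump_state(obj) -> str:
    """Serialize obj.state_dict() to JSON, or store an explanatory marker."""
    try:
        return json.dumps(obj.state_dict())
    except Exception as exc:  # not JSON-serializable: keep enough info to explain why
        return json.dumps(
            {
                "__class__": f"{obj.__class__.__module__}.{obj.__class__.__name__}",
                "__unserializable__": True,
                "__reason__": f"Cannot serialize: {exc}",
            }
        )


class GoodService:
    def state_dict(self):
        return {"__class__": "demo.GoodService", "port": 6006}


class BadService:
    def state_dict(self):
        return {"socket": object()}  # json.dumps will raise TypeError on this


print(dump_state(GoodService()))  # regular JSON payload
print(json.loads(dump_state(BadService()))["__unserializable__"])  # -> True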
|