experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

@@ -15,13 +15,15 @@ Key features:
15
15
 
16
16
  import json
17
17
  import logging
18
+ import socket
18
19
  import threading
19
20
  import time
20
21
  from dataclasses import dataclass
21
22
  from datetime import datetime
22
23
  from enum import Enum, auto
23
24
  from pathlib import Path
24
- from typing import Callable, Dict, List, Optional, Set, TYPE_CHECKING
25
+ from abc import ABC, abstractmethod
26
+ from typing import Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
25
27
 
26
28
  from watchdog.events import FileSystemEventHandler
27
29
  from watchdog.observers.api import ObservedWatch
@@ -34,11 +36,14 @@ from experimaestro.scheduler.state_db import (
34
36
  ServiceModel,
35
37
  PartialModel,
36
38
  JobPartialModel,
39
+ WorkspaceSyncMetadata,
37
40
  ALL_MODELS,
41
+ CURRENT_DB_VERSION,
38
42
  )
39
43
  from experimaestro.scheduler.interfaces import (
40
44
  BaseJob,
41
45
  BaseExperiment,
46
+ BaseService,
42
47
  JobState,
43
48
  JobFailureStatus,
44
49
  STATE_NAME_TO_JOBSTATE,
@@ -78,6 +83,213 @@ class StateEvent:
78
83
  StateListener = Callable[[StateEvent], None]
79
84
 
80
85
 
86
class StateProvider(ABC):
    """Common interface implemented by every state provider.

    Local providers (WorkspaceStateProvider) and remote providers
    (SSHStateProviderClient) both derive from this class, which lets
    callers treat them interchangeably.

    The base class also owns a small service cache so that successive
    get_services() calls return the same service instances (and therefore
    keep their URLs). Subclasses should call _init_service_cache() from
    their __init__ and must implement _fetch_services_from_storage().
    """

    # ------------------------------------------------------------------
    # Service cache
    # ------------------------------------------------------------------

    def _init_service_cache(self) -> None:
        """Create the service cache and its lock (call from subclass __init__)"""
        self._service_cache_lock = threading.Lock()
        # (experiment_id, run_id) -> {service id -> service instance}
        self._service_cache: Dict[Tuple[str, str], Dict[str, "BaseService"]] = {}

    def _clear_service_cache(self) -> None:
        """Forget every cached service"""
        with self._service_cache_lock:
            self._service_cache.clear()

    def get_services(
        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
    ) -> List[BaseService]:
        """Return the services attached to an experiment run

        Lookup order, performed while holding the cache lock:

        1. live services from _get_live_services() (cached, then returned);
        2. the cached entry for (experiment_id, run_id), if any;
        3. _fetch_services_from_storage() (cached, then returned).

        When an experiment is given without a run, the current run is
        resolved first; an empty list is returned if there is none.
        """
        if run_id is None and experiment_id is not None:
            run_id = self.get_current_run(experiment_id)
            if run_id is None:
                return []

        # Missing identifiers are normalised to "" in the cache key
        key = (experiment_id or "", run_id or "")

        with self._service_cache_lock:
            found = self._get_live_services(experiment_id, run_id)
            if found is None:
                entry = self._service_cache.get(key)
                if entry is not None:
                    return list(entry.values())

                # Nothing live and nothing cached: go to DB / remote
                found = self._fetch_services_from_storage(experiment_id, run_id)

            self._service_cache[key] = {svc.id: svc for svc in found}
            return found

    def _get_live_services(
        self, experiment_id: Optional[str], run_id: Optional[str]
    ) -> Optional[List[BaseService]]:
        """Hook returning live services (e.g., from a scheduler)

        The default implementation has no live source and returns None.
        Subclasses may override it.
        """
        return None

    @abstractmethod
    def _fetch_services_from_storage(
        self, experiment_id: Optional[str], run_id: Optional[str]
    ) -> List[BaseService]:
        """Load services from persistent storage (DB or remote)

        Invoked by get_services() when there are no live services and the
        cache holds no entry for the requested key.
        """
        ...

    # ------------------------------------------------------------------
    # Experiments and runs
    # ------------------------------------------------------------------

    @abstractmethod
    def get_experiments(self, since: Optional[datetime] = None) -> List[BaseExperiment]:
        """Return the list of all experiments"""
        ...

    @abstractmethod
    def get_experiment(self, experiment_id: str) -> Optional[BaseExperiment]:
        """Return the experiment with the given ID"""
        ...

    @abstractmethod
    def get_experiment_runs(self, experiment_id: str) -> List[Dict]:
        """Return all runs of an experiment"""
        ...

    @abstractmethod
    def get_current_run(self, experiment_id: str) -> Optional[str]:
        """Return the current run ID of an experiment"""
        ...

    # ------------------------------------------------------------------
    # Jobs
    # ------------------------------------------------------------------

    @abstractmethod
    def get_jobs(
        self,
        experiment_id: Optional[str] = None,
        run_id: Optional[str] = None,
        task_id: Optional[str] = None,
        state: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        since: Optional[datetime] = None,
    ) -> List[BaseJob]:
        """Query jobs, optionally filtered"""
        ...

    @abstractmethod
    def get_job(
        self, job_id: str, experiment_id: str, run_id: Optional[str] = None
    ) -> Optional[BaseJob]:
        """Return one specific job"""
        ...

    @abstractmethod
    def get_all_jobs(
        self,
        state: Optional[str] = None,
        tags: Optional[Dict[str, str]] = None,
        since: Optional[datetime] = None,
    ) -> List[BaseJob]:
        """Return jobs across all experiments"""
        ...

    # ------------------------------------------------------------------
    # Services (get_services itself lives in this base class and relies on
    # _fetch_services_from_storage)
    # ------------------------------------------------------------------

    @abstractmethod
    def get_services_raw(
        self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
    ) -> List[Dict]:
        """Return raw service data as dictionaries (for serialization)"""
        ...

    # ------------------------------------------------------------------
    # Listeners and job actions
    # ------------------------------------------------------------------

    @abstractmethod
    def add_listener(self, listener: StateListener) -> None:
        """Subscribe a listener to state change events"""
        ...

    @abstractmethod
    def remove_listener(self, listener: StateListener) -> None:
        """Unsubscribe a listener"""
        ...

    @abstractmethod
    def kill_job(self, job: BaseJob, perform: bool = False) -> bool:
        """Kill a running job"""
        ...

    @abstractmethod
    def clean_job(self, job: BaseJob, perform: bool = False) -> bool:
        """Clean a finished job"""
        ...

    @abstractmethod
    def close(self) -> None:
        """Close the provider and release its resources"""
        ...

    # ------------------------------------------------------------------
    # Optional operations: the defaults below are no-ops
    # ------------------------------------------------------------------

    def sync_path(self, path: str) -> Optional[Path]:
        """Sync one path from the remote side (remote providers only)

        The default returns None, which is also what is returned for local
        providers or when a sync fails.
        """
        return None

    def get_orphan_jobs(self) -> List[BaseJob]:
        """Return jobs that belong to no experiment run (default: none)"""
        return []

    def delete_job_safely(self, job: BaseJob, perform: bool = True) -> Tuple[bool, str]:
        """Safely delete a job and its data (default: not implemented)"""
        return False, "Not implemented"

    def delete_experiment(
        self, experiment_id: str, perform: bool = True
    ) -> Tuple[bool, str]:
        """Delete an experiment and all its data (default: not implemented)"""
        return False, "Not implemented"

    def cleanup_orphan_partials(self, perform: bool = False) -> List[str]:
        """Clean up orphan partial directories (default: nothing to clean)"""
        return []

    def get_last_sync_time(self) -> Optional[datetime]:
        """Return the last sync time, used for incremental updates (default: None)"""
        return None

    @property
    def read_only(self) -> bool:
        """Whether this provider is read-only (True unless overridden)"""
        return True

    @property
    def is_remote(self) -> bool:
        """Whether this provider is remote (e.g., SSH); False unless overridden

        Remote providers rely on periodic refresh rather than push
        notifications and support sync_path for on-demand file
        synchronization.
        """
        return False
291
+
292
+
81
293
  class _DatabaseChangeDetector:
82
294
  """Background thread that detects database changes and notifies listeners
83
295
 
@@ -371,6 +583,7 @@ class MockExperiment(BaseExperiment):
371
583
  updated_at: str,
372
584
  started_at: Optional[float] = None,
373
585
  ended_at: Optional[float] = None,
586
+ hostname: Optional[str] = None,
374
587
  ):
375
588
  self.workdir = workdir
376
589
  self.current_run_id = current_run_id
@@ -380,6 +593,7 @@ class MockExperiment(BaseExperiment):
380
593
  self.updated_at = updated_at
381
594
  self.started_at = started_at
382
595
  self.ended_at = ended_at
596
+ self.hostname = hostname
383
597
 
384
598
  @property
385
599
  def experiment_id(self) -> str:
@@ -387,6 +601,57 @@ class MockExperiment(BaseExperiment):
387
601
  return self.workdir.name
388
602
 
389
603
 
604
class MockService(BaseService):
    """Stand-in service used for remote monitoring

    Instances are built from service data loaded from the remote server
    rather than from a running service. They expose just enough of the
    Service interface (id, state, description, state_dict, url) to be
    shown in the TUI ServicesList widget.
    """

    def __init__(
        self,
        service_id: str,
        description_text: str,
        state_dict_data: dict,
        experiment_id: Optional[str] = None,
        run_id: Optional[str] = None,
        url: Optional[str] = None,
        state: str = "STOPPED",
    ):
        # Public identification / location attributes
        self.id = service_id
        self.experiment_id = experiment_id
        self.run_id = run_id
        self.url = url

        # Backing values for state, description() and state_dict()
        self._state_name = state
        self._description = description_text
        self._state_dict_data = state_dict_data

    @property
    def state(self):
        """Return the state as a ServiceState, or a look-alike with a name attribute"""
        from experimaestro.scheduler.services import ServiceState

        state_name = self._state_name
        try:
            # Known names map straight onto the enum
            return ServiceState[state_name]
        except KeyError:
            pass

        # Unknown name: hand back a minimal object exposing ``name``
        class MockState:
            def __init__(self, name):
                self.name = name

        return MockState(state_name)

    def description(self) -> str:
        """Return the service description given at construction"""
        return self._description

    def state_dict(self) -> dict:
        """Return the state dictionary used for service recreation"""
        return self._state_dict_data
653
+
654
+
390
655
  def _with_db_context(func):
391
656
  """Decorator to wrap method in database bind context
392
657
 
@@ -406,7 +671,7 @@ def _with_db_context(func):
406
671
  return wrapper
407
672
 
408
673
 
409
- class WorkspaceStateProvider:
674
+ class WorkspaceStateProvider(StateProvider):
410
675
  """Unified state provider for workspace-level database (singleton per workspace path)
411
676
 
412
677
  Provides access to experiment and job state from a single workspace database.
@@ -507,13 +772,16 @@ class WorkspaceStateProvider:
507
772
  workspace_path = Path(workspace_path).absolute()
508
773
 
509
774
  self.workspace_path = workspace_path
510
- self.read_only = read_only
775
+ self._read_only = read_only
511
776
  self.sync_interval_minutes = sync_interval_minutes
512
777
 
513
778
  # Listeners for push notifications
514
779
  self._listeners: Set[StateListener] = set()
515
780
  self._listeners_lock = threading.Lock()
516
781
 
782
+ # Service cache (from base class)
783
+ self._init_service_cache()
784
+
517
785
  # File watcher for database changes (started when listeners are added)
518
786
  self._change_detector: Optional[_DatabaseChangeDetector] = None
519
787
  self._db_file_handler: Optional[_DatabaseFileHandler] = None
@@ -570,9 +838,18 @@ class WorkspaceStateProvider:
570
838
  experimaestro_dir.mkdir(parents=True, exist_ok=True)
571
839
 
572
840
  db_path = experimaestro_dir / "workspace.db"
573
- self.workspace_db = initialize_workspace_database(db_path, read_only=read_only)
841
+ self.workspace_db, needs_resync = initialize_workspace_database(
842
+ db_path, read_only=read_only
843
+ )
574
844
  self._db_dir = experimaestro_dir # Store for file watcher
575
845
 
846
+ # Sync from disk if needed due to schema version change
847
+ if needs_resync and not read_only:
848
+ logger.info(
849
+ "Database schema version changed, triggering full resync from disk"
850
+ )
851
+ sync_on_start = True # Force sync
852
+
576
853
  # Optionally sync from disk on start (only in write mode)
577
854
  # Syncing requires write access to update the database and sync timestamp
578
855
  if sync_on_start and not read_only:
@@ -581,16 +858,29 @@ class WorkspaceStateProvider:
581
858
  sync_workspace_from_disk(
582
859
  self.workspace_path,
583
860
  write_mode=True,
584
- force=False,
861
+ force=needs_resync, # Force full sync if schema changed
585
862
  sync_interval_minutes=sync_interval_minutes,
586
863
  )
587
864
 
865
+ # Update db_version after successful sync
866
+ if needs_resync:
867
+ with self.workspace_db.bind_ctx([WorkspaceSyncMetadata]):
868
+ WorkspaceSyncMetadata.update(db_version=CURRENT_DB_VERSION).where(
869
+ WorkspaceSyncMetadata.id == "workspace"
870
+ ).execute()
871
+ logger.info("Database schema updated to version %d", CURRENT_DB_VERSION)
872
+
588
873
  logger.info(
589
874
  "WorkspaceStateProvider initialized (read_only=%s, workspace=%s)",
590
875
  read_only,
591
876
  workspace_path,
592
877
  )
593
878
 
879
+ @property
880
+ def read_only(self) -> bool:
881
+ """Whether this provider is read-only (returns the stored ``_read_only`` flag)"""
882
+ return self._read_only
883
+
594
884
  # Experiment management methods
595
885
 
596
886
  @_with_db_context
@@ -652,14 +942,40 @@ class WorkspaceStateProvider:
652
942
  now = datetime.now()
653
943
  run_id = now.strftime("%Y%m%d_%H%M%S") + f"_{now.microsecond:06d}"
654
944
 
655
- # Create run record
945
+ # Capture hostname
946
+ hostname = socket.gethostname()
947
+ started_at = datetime.now()
948
+
949
+ # Create run record with hostname
656
950
  ExperimentRunModel.insert(
657
951
  experiment_id=experiment_id,
658
952
  run_id=run_id,
659
- started_at=datetime.now(),
953
+ started_at=started_at,
660
954
  status="active",
955
+ hostname=hostname,
661
956
  ).execute()
662
957
 
958
+ # Persist to disk in experiment folder (informations.json)
959
+ exp_dir = self.workspace_path / "xp" / experiment_id
960
+ exp_dir.mkdir(parents=True, exist_ok=True)
961
+ info_file = exp_dir / "informations.json"
962
+
963
+ # Merge with existing data (may have multiple runs)
964
+ info_data: Dict = {}
965
+ if info_file.exists():
966
+ try:
967
+ info_data = json.loads(info_file.read_text())
968
+ except json.JSONDecodeError:
969
+ logger.warning("Could not parse existing informations.json")
970
+
971
+ if "runs" not in info_data:
972
+ info_data["runs"] = {}
973
+ info_data["runs"][run_id] = {
974
+ "hostname": hostname,
975
+ "started_at": started_at.isoformat(),
976
+ }
977
+ info_file.write_text(json.dumps(info_data, indent=2))
978
+
663
979
  # Update experiment's current_run_id and updated_at
664
980
  now = datetime.now()
665
981
  ExperimentModel.update(
@@ -667,7 +983,12 @@ class WorkspaceStateProvider:
667
983
  updated_at=now,
668
984
  ).where(ExperimentModel.experiment_id == experiment_id).execute()
669
985
 
670
- logger.info("Created run %s for experiment %s", run_id, experiment_id)
986
+ logger.info(
987
+ "Created run %s for experiment %s on host %s",
988
+ run_id,
989
+ experiment_id,
990
+ hostname,
991
+ )
671
992
 
672
993
  # Notify listeners
673
994
  self._notify_listeners(
@@ -678,6 +999,7 @@ class WorkspaceStateProvider:
678
999
  "run_id": run_id,
679
1000
  "status": "active",
680
1001
  "started_at": now.isoformat(),
1002
+ "hostname": hostname,
681
1003
  },
682
1004
  )
683
1005
  )
@@ -718,6 +1040,7 @@ class WorkspaceStateProvider:
718
1040
  - finished_jobs: Number of completed jobs (for current run)
719
1041
  - failed_jobs: Number of failed jobs (for current run)
720
1042
  - updated_at: When experiment was last modified
1043
+ - hostname: Host where the current run was launched
721
1044
  """
722
1045
  experiments = []
723
1046
 
@@ -733,6 +1056,7 @@ class WorkspaceStateProvider:
733
1056
 
734
1057
  started_at = None
735
1058
  ended_at = None
1059
+ hostname = None
736
1060
 
737
1061
  if exp_model.current_run_id:
738
1062
  total_jobs = (
@@ -762,7 +1086,7 @@ class WorkspaceStateProvider:
762
1086
  .count()
763
1087
  )
764
1088
 
765
- # Get run timestamps
1089
+ # Get run timestamps and hostname
766
1090
  try:
767
1091
  run_model = ExperimentRunModel.get(
768
1092
  (ExperimentRunModel.experiment_id == exp_model.experiment_id)
@@ -772,6 +1096,7 @@ class WorkspaceStateProvider:
772
1096
  started_at = run_model.started_at.timestamp()
773
1097
  if run_model.ended_at:
774
1098
  ended_at = run_model.ended_at.timestamp()
1099
+ hostname = run_model.hostname
775
1100
  except ExperimentRunModel.DoesNotExist:
776
1101
  pass
777
1102
 
@@ -788,6 +1113,7 @@ class WorkspaceStateProvider:
788
1113
  updated_at=exp_model.updated_at.isoformat(),
789
1114
  started_at=started_at,
790
1115
  ended_at=ended_at,
1116
+ hostname=hostname,
791
1117
  )
792
1118
  )
793
1119
 
@@ -814,6 +1140,7 @@ class WorkspaceStateProvider:
814
1140
  total_jobs = 0
815
1141
  finished_jobs = 0
816
1142
  failed_jobs = 0
1143
+ hostname = None
817
1144
 
818
1145
  if exp_model.current_run_id:
819
1146
  total_jobs = (
@@ -843,6 +1170,16 @@ class WorkspaceStateProvider:
843
1170
  .count()
844
1171
  )
845
1172
 
1173
+ # Get hostname from run model
1174
+ try:
1175
+ run_model = ExperimentRunModel.get(
1176
+ (ExperimentRunModel.experiment_id == exp_model.experiment_id)
1177
+ & (ExperimentRunModel.run_id == exp_model.current_run_id)
1178
+ )
1179
+ hostname = run_model.hostname
1180
+ except ExperimentRunModel.DoesNotExist:
1181
+ pass
1182
+
846
1183
  # Compute experiment path from workspace_path and experiment_id
847
1184
  exp_path = self.workspace_path / "xp" / exp_model.experiment_id
848
1185
 
@@ -853,6 +1190,7 @@ class WorkspaceStateProvider:
853
1190
  finished_jobs=finished_jobs,
854
1191
  failed_jobs=failed_jobs,
855
1192
  updated_at=exp_model.updated_at.isoformat(),
1193
+ hostname=hostname,
856
1194
  )
857
1195
 
858
1196
  @_with_db_context
@@ -1633,61 +1971,48 @@ class WorkspaceStateProvider:
1633
1971
  # Service operations
1634
1972
 
1635
1973
  @_with_db_context
1636
- def update_service(
1974
+ def register_service(
1637
1975
  self,
1638
1976
  service_id: str,
1639
1977
  experiment_id: str,
1640
1978
  run_id: str,
1641
1979
  description: str,
1642
- state: str,
1643
1980
  state_dict: Optional[str] = None,
1644
1981
  ):
1645
- """Update service information
1982
+ """Register a service in the database
1983
+
1984
+ Services are only added or removed, not updated. Runtime state
1985
+ is managed by the Service object itself.
1646
1986
 
1647
1987
  Args:
1648
1988
  service_id: Service identifier
1649
1989
  experiment_id: Experiment identifier
1650
1990
  run_id: Run identifier
1651
1991
  description: Human-readable description
1652
- state: Service state
1653
1992
  state_dict: JSON serialized state_dict for service recreation
1654
1993
 
1655
1994
  Raises:
1656
1995
  RuntimeError: If in read-only mode
1657
1996
  """
1658
1997
  if self.read_only:
1659
- raise RuntimeError("Cannot update services in read-only mode")
1998
+ raise RuntimeError("Cannot register services in read-only mode")
1660
1999
 
1661
2000
  insert_data = {
1662
2001
  "service_id": service_id,
1663
2002
  "experiment_id": experiment_id,
1664
2003
  "run_id": run_id,
1665
2004
  "description": description,
1666
- "state": state,
1667
2005
  "created_at": datetime.now(),
1668
- "updated_at": datetime.now(),
1669
- }
1670
- update_data = {
1671
- ServiceModel.description: description,
1672
- ServiceModel.state: state,
1673
- ServiceModel.updated_at: datetime.now(),
1674
2006
  }
1675
2007
 
1676
2008
  if state_dict is not None:
1677
2009
  insert_data["state_dict"] = state_dict
1678
- update_data[ServiceModel.state_dict] = state_dict
1679
-
1680
- ServiceModel.insert(**insert_data).on_conflict(
1681
- conflict_target=[
1682
- ServiceModel.service_id,
1683
- ServiceModel.experiment_id,
1684
- ServiceModel.run_id,
1685
- ],
1686
- update=update_data,
1687
- ).execute()
2010
+
2011
+ # Use INSERT OR IGNORE - services are only added, not updated
2012
+ ServiceModel.insert(**insert_data).on_conflict_ignore().execute()
1688
2013
 
1689
2014
  logger.debug(
1690
- "Updated service %s (experiment=%s, run=%s)",
2015
+ "Registered service %s (experiment=%s, run=%s)",
1691
2016
  service_id,
1692
2017
  experiment_id,
1693
2018
  run_id,
@@ -1701,70 +2026,68 @@ class WorkspaceStateProvider:
1701
2026
  "serviceId": service_id,
1702
2027
  "experimentId": experiment_id,
1703
2028
  "runId": run_id,
1704
- "state": state,
1705
2029
  "description": description,
1706
2030
  },
1707
2031
  )
1708
2032
  )
1709
2033
 
1710
- @_with_db_context
1711
- def get_services(
1712
- self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
1713
- ) -> List["Service"]:
1714
- """Get services, optionally filtered by experiment/run
2034
+ def _get_live_services(
2035
+ self, experiment_id: Optional[str], run_id: Optional[str]
2036
+ ) -> Optional[List["Service"]]:
2037
+ """Get live services from scheduler if available.
1715
2038
 
1716
- This method abstracts whether services are live (from scheduler) or
1717
- from the database. It returns actual Service objects in both cases:
1718
- - If a live scheduler has the experiment, return live Service objects
1719
- - Otherwise, recreate Service objects from stored state_dict
2039
+ Returns None if no live services (experiment not in scheduler).
2040
+ """
2041
+ if experiment_id is None:
2042
+ return None
1720
2043
 
1721
- Args:
1722
- experiment_id: Filter by experiment (None = all)
1723
- run_id: Filter by run (None = current run if experiment_id provided)
2044
+ try:
2045
+ from experimaestro.scheduler.base import Scheduler
1724
2046
 
1725
- Returns:
1726
- List of Service objects
2047
+ if not Scheduler.has_instance():
2048
+ return None
2049
+
2050
+ scheduler = Scheduler.instance()
2051
+ if experiment_id not in scheduler.experiments:
2052
+ logger.debug("Experiment %s not in scheduler", experiment_id)
2053
+ return None
2054
+
2055
+ exp = scheduler.experiments[experiment_id]
2056
+ services = list(exp.services.values())
2057
+ logger.debug(
2058
+ "Returning %d live services for experiment %s",
2059
+ len(services),
2060
+ experiment_id,
2061
+ )
2062
+ return services
2063
+
2064
+ except Exception as e:
2065
+ logger.warning("Could not get live services: %s", e)
2066
+ return None
2067
+
2068
@_with_db_context
def _fetch_services_from_storage(
    self, experiment_id: Optional[str], run_id: Optional[str]
) -> List["Service"]:
    """Fetch services from database.

    Called when no live services and cache is empty.

    Each stored row is turned into a Service rebuilt from its JSON
    state_dict. Rows that cannot be rebuilt (no state_dict, a state_dict
    without "__class__", or an error while recreating) are represented by
    a MockService whose description starts with "error:", so every row
    yields exactly one entry in the result.

    Args:
        experiment_id: Restrict to this experiment (None = all services)
        run_id: Run to match when experiment_id is given

    Returns:
        One service object per matching database row
    """
    from experimaestro.scheduler.services import Service

    query = ServiceModel.select()

    if experiment_id is not None:
        query = query.where(
            (ServiceModel.experiment_id == experiment_id)
            & (ServiceModel.run_id == run_id)
        )

    services = []

    for service_model in query:
        service_id = service_model.service_id

        # Reset for every row: a row that cannot be recreated must never
        # reuse (and re-id) the object built for the previous row.
        service = None
        error = None

        # Try to recreate service from state_dict
        state_dict_json = service_model.state_dict
        if state_dict_json and state_dict_json != "{}":
            try:
                state_dict = json.loads(state_dict_json)
                if "__class__" in state_dict:
                    service = Service.from_state_dict(state_dict)
                else:
                    error = "no __class__ in state_dict"
                    logger.warning(
                        "Service %s state_dict has no __class__, cannot recreate",
                        service_id,
                    )
            except Exception as e:
                error = str(e)
                logger.warning(
                    "Failed to recreate service %s from state_dict: %s",
                    service_id,
                    e,
                )
        else:
            error = "no state_dict"
            logger.debug(
                "Service %s has no state_dict for recreation, skipping",
                service_id,
            )

        if service is None:
            # Placeholder carrying the row's own experiment/run, which stay
            # correct even when the query was not filtered by experiment.
            service = MockService(
                service_id,
                f"error: {error}",
                {},
                experiment_id=service_model.experiment_id,
                run_id=service_model.run_id,
            )

        # Set the id from the database record
        service.id = service_id
        services.append(service)

    return services
2132
+
2133
@_with_db_context
def get_services_raw(
    self, experiment_id: Optional[str] = None, run_id: Optional[str] = None
) -> List[Dict]:
    """Return stored service records as plain dictionaries

    No Service object is recreated, which makes the result suitable for
    remote monitoring: the client may not have the same modules installed
    as the server, and the dictionaries can be serialized over JSON-RPC.

    Args:
        experiment_id: Filter by experiment (None = all)
        run_id: Filter by run (None = current run if experiment_id provided)

    Returns:
        List of dictionaries with service data
    """

    def decode_state(raw):
        # Empty, missing or unparsable state is reported as {}
        if not raw or raw == "{}":
            return {}
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return {}

    rows = ServiceModel.select()

    if experiment_id is not None:
        # Fall back to the experiment's current run when none is given
        if run_id is None:
            run_id = self.get_current_run(experiment_id)
        if run_id is None:
            return []

        rows = rows.where(
            (ServiceModel.experiment_id == experiment_id)
            & (ServiceModel.run_id == run_id)
        )

    return [
        {
            "service_id": row.service_id,
            "description": row.description,
            "state_dict": decode_state(row.state_dict),
            "experiment_id": row.experiment_id,
            "run_id": row.run_id,
        }
        for row in rows
    ]
@@ -1855,13 +2247,19 @@ class WorkspaceStateProvider:
1855
2247
  Returns:
1856
2248
  datetime of last sync, or None if never synced
1857
2249
  """
2250
+ from peewee import OperationalError
2251
+
1858
2252
  from .state_db import WorkspaceSyncMetadata
1859
2253
 
1860
- metadata = WorkspaceSyncMetadata.get_or_none(
1861
- WorkspaceSyncMetadata.id == "workspace"
1862
- )
1863
- if metadata and metadata.last_sync_time:
1864
- return metadata.last_sync_time
2254
+ try:
2255
+ metadata = WorkspaceSyncMetadata.get_or_none(
2256
+ WorkspaceSyncMetadata.id == "workspace"
2257
+ )
2258
+ if metadata and metadata.last_sync_time:
2259
+ return metadata.last_sync_time
2260
+ except OperationalError:
2261
+ # Table might not exist in older workspaces opened in read-only mode
2262
+ pass
1865
2263
  return None
1866
2264
 
1867
2265
  @_with_db_context
@@ -2333,13 +2731,36 @@ class SchedulerListener:
2333
2731
  experiment_id: Experiment identifier
2334
2732
  run_id: Run identifier
2335
2733
  """
2734
+ from experimaestro.scheduler.services import Service
2735
+
2336
2736
  try:
2337
- self.state_provider.update_service(
2737
+ # Get state_dict for service recreation
2738
+ state_dict_json = None
2739
+ try:
2740
+ # _full_state_dict includes __class__ automatically
2741
+ state_dict = service._full_state_dict()
2742
+ # Serialize paths automatically
2743
+ serialized = Service.serialize_state_dict(state_dict)
2744
+ state_dict_json = json.dumps(serialized)
2745
+ except Exception as e:
2746
+ # Service cannot be serialized - store unserializable marker
2747
+ logger.warning(
2748
+ "Could not get state_dict for service %s: %s", service.id, e
2749
+ )
2750
+ state_dict_json = json.dumps(
2751
+ {
2752
+ "__class__": f"{service.__class__.__module__}.{service.__class__.__name__}",
2753
+ "__unserializable__": True,
2754
+ "__reason__": f"Cannot serialize: {e}",
2755
+ }
2756
+ )
2757
+
2758
+ self.state_provider.register_service(
2338
2759
  service.id,
2339
2760
  experiment_id,
2340
2761
  run_id,
2341
2762
  service.description(),
2342
- service.state.name,
2763
+ state_dict=state_dict_json,
2343
2764
  )
2344
2765
  except Exception as e:
2345
2766
  logger.exception("Error updating service %s: %s", service.id, e)