experimaestro-2.0.0b4-py3-none-any.whl → experimaestro-2.0.0b8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of experimaestro might be problematic. Click here for more details.

@@ -309,11 +309,98 @@ def experiments(ctx, workdir, workspace):
309
309
  @experiments.command()
310
310
  @pass_cfg
311
311
  def list(workdir: Path):
312
+ """List experiments in the workspace"""
313
+ from experimaestro.scheduler.state_provider import WorkspaceStateProvider
314
+
315
+ # Get experiments from state provider for detailed info
316
+ state_provider = WorkspaceStateProvider.get_instance(
317
+ workdir, read_only=True, sync_on_start=True
318
+ )
319
+ experiments_list = state_provider.get_experiments()
320
+
321
+ # Build lookup by experiment_id
322
+ exp_info = {exp.experiment_id: exp for exp in experiments_list}
323
+
312
324
  for p in (workdir / "xp").iterdir():
325
+ exp_id = p.name
326
+ exp = exp_info.get(exp_id)
327
+
328
+ # Build display string
329
+ display_parts = []
330
+
331
+ if (p / "jobs.bak").exists():
332
+ display_parts.append("[unfinished]")
333
+
334
+ display_parts.append(exp_id)
335
+
336
+ # Add hostname if available
337
+ if exp and getattr(exp, "hostname", None):
338
+ display_parts.append(f"[{exp.hostname}]")
339
+
340
+ # Add job stats if available
341
+ if exp:
342
+ display_parts.append(f"({exp.finished_jobs}/{exp.total_jobs} jobs)")
343
+
344
+ display_str = " ".join(display_parts)
345
+
313
346
  if (p / "jobs.bak").exists():
314
- cprint(f"[unfinished] {p.name}", "yellow")
347
+ cprint(display_str, "yellow")
315
348
  else:
316
- cprint(p.name, "cyan")
349
+ cprint(display_str, "cyan")
350
+
351
+
352
+ def _run_monitor_ui(
353
+ state_provider, workdir: Path, console: bool, port: int, title: str = ""
354
+ ):
355
+ """Shared code for running monitor UI (TUI or web)
356
+
357
+ Args:
358
+ state_provider: StateProvider instance (local or remote)
359
+ workdir: Local workspace/cache directory
360
+ console: If True, use TUI; otherwise use web UI
361
+ port: Port for web server
362
+ title: Optional title for status messages
363
+ """
364
+ try:
365
+ if console:
366
+ # Use Textual TUI
367
+ from experimaestro.tui import ExperimentTUI
368
+
369
+ app = ExperimentTUI(
370
+ workdir, state_provider=state_provider, watch=True, show_logs=True
371
+ )
372
+ app.run()
373
+ else:
374
+ # Use React web server
375
+ from experimaestro.server import Server
376
+
377
+ if title:
378
+ cprint(
379
+ f"Starting experiment monitor for {title} on http://localhost:{port}",
380
+ "green",
381
+ )
382
+ else:
383
+ cprint(
384
+ f"Starting experiment monitor on http://localhost:{port}", "green"
385
+ )
386
+ cprint("Press Ctrl+C to stop", "yellow")
387
+
388
+ settings = ServerSettings()
389
+ settings.port = port
390
+ server = Server.instance(settings, state_provider=state_provider)
391
+ server.start()
392
+
393
+ try:
394
+ import time
395
+
396
+ while True:
397
+ time.sleep(1)
398
+ except KeyboardInterrupt:
399
+ pass
400
+ finally:
401
+ cprint("\nShutting down...", "yellow")
402
+ if state_provider:
403
+ state_provider.close()
317
404
 
318
405
 
319
406
  @experiments.command()
@@ -326,7 +413,7 @@ def list(workdir: Path):
326
413
  )
327
414
  @pass_cfg
328
415
  def monitor(workdir: Path, console: bool, port: int, sync: bool):
329
- """Monitor experiments with web UI or console TUI"""
416
+ """Monitor local experiments with web UI or console TUI"""
330
417
  # Force sync from disk if requested
331
418
  if sync:
332
419
  from experimaestro.scheduler.state_sync import sync_workspace_from_disk
@@ -335,37 +422,96 @@ def monitor(workdir: Path, console: bool, port: int, sync: bool):
335
422
  sync_workspace_from_disk(workdir, write_mode=True, force=True)
336
423
  cprint("Sync complete", "green")
337
424
 
338
- if console:
339
- # Use Textual TUI
340
- from experimaestro.tui import ExperimentTUI
425
+ from experimaestro.scheduler.state_provider import WorkspaceStateProvider
341
426
 
342
- app = ExperimentTUI(workdir, watch=True)
343
- app.run()
344
- else:
345
- # Use React web server
346
- from experimaestro.scheduler.state_provider import WorkspaceStateProvider
347
- from experimaestro.server import Server
348
-
349
- cprint(f"Starting experiment monitor on http://localhost:{port}", "green")
350
- cprint("Press Ctrl+C to stop", "yellow")
351
-
352
- state_provider = WorkspaceStateProvider.get_instance(
353
- workdir,
354
- sync_on_start=not sync, # Skip auto-sync if we just did a forced one
355
- )
356
- settings = ServerSettings()
357
- settings.port = port
358
- server = Server.instance(settings, state_provider=state_provider)
359
- server.start()
427
+ state_provider = WorkspaceStateProvider.get_instance(
428
+ workdir,
429
+ sync_on_start=not sync, # Skip auto-sync if we just did a forced one
430
+ )
360
431
 
361
- try:
362
- import time
432
+ _run_monitor_ui(state_provider, workdir, console, port)
363
433
 
364
- while True:
365
- time.sleep(1)
366
- except KeyboardInterrupt:
367
- cprint("\nShutting down...", "yellow")
368
- state_provider.close()
434
+
435
+ @experiments.command("ssh-monitor")
436
+ @click.argument("host", type=str)
437
+ @click.argument("remote_workdir", type=str)
438
+ @click.option("--console", is_flag=True, help="Use console TUI instead of web UI")
439
+ @click.option(
440
+ "--port", type=int, default=12345, help="Port for web server (default: 12345)"
441
+ )
442
+ @click.option(
443
+ "--remote-xpm",
444
+ type=str,
445
+ default=None,
446
+ help="Path to experimaestro on remote host (default: use 'uv tool run')",
447
+ )
448
+ @click.option(
449
+ "--ssh-option",
450
+ "-o",
451
+ multiple=True,
452
+ help="Additional SSH options (can be repeated, e.g., -o '-p 2222')",
453
+ )
454
+ def ssh_monitor(
455
+ host: str,
456
+ remote_workdir: str,
457
+ console: bool,
458
+ port: int,
459
+ remote_xpm: str,
460
+ ssh_option: tuple,
461
+ ):
462
+ """Monitor experiments on a remote server via SSH
463
+
464
+ HOST is the SSH host (e.g., user@server)
465
+ REMOTE_WORKDIR is the workspace path on the remote server
466
+
467
+ Examples:
468
+ experimaestro experiments ssh-monitor myserver /path/to/workspace
469
+ experimaestro experiments ssh-monitor user@host /workspace --console
470
+ experimaestro experiments ssh-monitor host /workspace --remote-xpm /opt/xpm/bin/experimaestro
471
+ """
472
+ from experimaestro.scheduler.remote.client import SSHStateProviderClient
473
+
474
+ cprint(f"Connecting to {host}...", "yellow")
475
+ state_provider = SSHStateProviderClient(
476
+ host=host,
477
+ remote_workspace=remote_workdir,
478
+ ssh_options=list(ssh_option) if ssh_option else None,
479
+ remote_xpm_path=remote_xpm,
480
+ )
481
+ try:
482
+ state_provider.connect()
483
+ cprint(f"Connected to {host}", "green")
484
+ except Exception as e:
485
+ cprint(f"Failed to connect: {e}", "red")
486
+ raise click.Abort()
487
+
488
+ _run_monitor_ui(
489
+ state_provider,
490
+ state_provider.local_cache_dir,
491
+ console,
492
+ port,
493
+ title=host,
494
+ )
495
+
496
+
497
+ @experiments.command("monitor-server")
498
+ @pass_cfg
499
+ def monitor_server(workdir: Path):
500
+ """Start monitoring server for SSH connections (JSON-RPC over stdio)
501
+
502
+ This command is intended to be run over SSH to provide remote monitoring.
503
+ Communication is via JSON-RPC over stdin/stdout.
504
+
505
+ Example:
506
+ ssh host 'experimaestro experiments --workdir /path monitor-server'
507
+ """
508
+ from experimaestro.scheduler.remote.server import SSHStateProviderServer
509
+
510
+ server = SSHStateProviderServer(workdir)
511
+ try:
512
+ server.start()
513
+ except KeyboardInterrupt:
514
+ server.stop()
369
515
 
370
516
 
371
517
  @experiments.command()
@@ -360,7 +360,12 @@ def experiments_cli( # noqa: C901
360
360
  except HandledException:
361
361
  sys.exit(1)
362
362
 
363
- if console:
363
+ # Console mode is only available in NORMAL run mode
364
+ use_console = console and run_mode == RunMode.NORMAL
365
+ if console and not use_console:
366
+ logging.warning("--console is ignored when run_mode is not NORMAL")
367
+
368
+ if use_console:
364
369
  # Run experiment in background thread, console UI in main thread
365
370
  import threading
366
371
  from experimaestro.tui import ExperimentTUI
@@ -375,7 +380,6 @@ def experiments_cli( # noqa: C901
375
380
  run_experiment_code(xp_holder, xp_ready, register_signals=False)
376
381
  # Add a test message after experiment completes
377
382
  logging.info("Experiment thread completed")
378
- print("Experiment thread print test")
379
383
  except Exception as e:
380
384
  exception_holder["exception"] = e
381
385
  xp_ready.set() # Signal even on error
@@ -197,6 +197,27 @@ class Scheduler(threading.Thread):
197
197
  with self._listeners_lock:
198
198
  self._listeners.clear()
199
199
 
200
+ def wait_for_notifications(self, timeout: float = 5.0) -> bool:
201
+ """Wait for all pending notifications to be processed.
202
+
203
+ This submits a sentinel task and waits for it to complete,
204
+ ensuring all previously submitted notifications have been processed.
205
+
206
+ Args:
207
+ timeout: Maximum time to wait in seconds
208
+
209
+ Returns:
210
+ True if all notifications were processed, False if timeout occurred
211
+ """
212
+ try:
213
+ # Submit a no-op and wait for it to complete
214
+ future = self._notification_executor.submit(lambda: None)
215
+ future.result(timeout=timeout)
216
+ return True
217
+ except concurrent.futures.TimeoutError:
218
+ logger.warning("Timeout waiting for notification queue to drain")
219
+ return False
220
+
200
221
  def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
201
222
  # Check if the job belongs to this scheduler
202
223
  if job.identifier not in self.jobs:
@@ -43,26 +43,22 @@ class DatabaseListener:
43
43
  self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
44
44
 
45
45
  def service_add(self, service):
46
- """Update service in database"""
47
- self.state_provider.update_service(
46
+ """Register service in database"""
47
+ from experimaestro.scheduler.services import Service
48
+
49
+ state_dict = Service.serialize_state_dict(service._full_state_dict())
50
+ self.state_provider.register_service(
48
51
  service.id,
49
52
  self.experiment_id,
50
53
  self.run_id,
51
54
  service.description(),
52
- service.state.name,
53
- state_dict=json.dumps(service.state_dict()),
55
+ state_dict=json.dumps(state_dict),
54
56
  )
55
57
 
56
58
  def service_state_changed(self, service):
57
- """Update service state in database (called by Service when state changes)"""
58
- self.state_provider.update_service(
59
- service.id,
60
- self.experiment_id,
61
- self.run_id,
62
- service.description(),
63
- service.state.name,
64
- state_dict=json.dumps(service.state_dict()),
65
- )
59
+ """Called when service state changes (runtime only, not persisted)"""
60
+ # Service state is managed at runtime, not persisted to DB
61
+ pass
66
62
 
67
63
 
68
64
  class experiment:
@@ -224,10 +220,13 @@ class experiment:
224
220
 
225
221
  def _write_services_json(self):
226
222
  """Write all services to services.json file"""
223
+ from experimaestro.scheduler.services import Service
224
+
227
225
  services_data = {}
228
226
  for service_id, service in self.services.items():
229
227
  # Get state_dict from service (includes __class__ for recreation)
230
- service_state = service.state_dict()
228
+ # and serialize paths to JSON-compatible format
229
+ service_state = Service.serialize_state_dict(service._full_state_dict())
231
230
  # Add runtime state info
232
231
  service_state.update(
233
232
  {
@@ -281,9 +280,10 @@ class experiment:
281
280
  with self.jobs_jsonl_path.open("a") as f:
282
281
  f.write(json.dumps(record) + "\n")
283
282
 
284
- # Also register in database for TUI/monitoring
285
- experiment_id = self.workdir.name
286
- self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
283
+ # Also register in database for TUI/monitoring (only in NORMAL mode)
284
+ if self._db_listener is not None:
285
+ experiment_id = self.workdir.name
286
+ self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
287
287
 
288
288
  def stop(self):
289
289
  """Stop the experiment as soon as possible"""
@@ -403,24 +403,31 @@ class experiment:
403
403
  (self.workspace.path / ".__experimaestro__").touch()
404
404
 
405
405
  # Initialize workspace state provider (singleton per workspace path)
406
+ # Use read_only mode when not in NORMAL run mode to prevent DB changes
406
407
  from .state_provider import WorkspaceStateProvider
407
408
 
409
+ is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
408
410
  self.state_provider = WorkspaceStateProvider.get_instance(
409
411
  self.workspace.path,
410
- read_only=False,
412
+ read_only=not is_normal_mode,
411
413
  sync_on_start=False, # Experiments don't sync on start
412
414
  )
413
415
 
414
- # Register experiment in database and create a run
416
+ # Register experiment in database and create a run (only in NORMAL mode)
415
417
  experiment_id = self.workdir.name
416
- self.state_provider.ensure_experiment(experiment_id)
417
- self.run_id = self.state_provider.create_run(experiment_id)
418
-
419
- # Add database listener to update job state in database
420
- self._db_listener = DatabaseListener(
421
- self.state_provider, experiment_id, self.run_id
422
- )
423
- self.scheduler.addlistener(self._db_listener)
418
+ self._db_listener = None
419
+ if is_normal_mode:
420
+ self.state_provider.ensure_experiment(experiment_id)
421
+ self.run_id = self.state_provider.create_run(experiment_id)
422
+
423
+ # Add database listener to update job state in database
424
+ self._db_listener = DatabaseListener(
425
+ self.state_provider, experiment_id, self.run_id
426
+ )
427
+ self.scheduler.addlistener(self._db_listener)
428
+ else:
429
+ # In non-NORMAL modes, use a placeholder run_id
430
+ self.run_id = None
424
431
 
425
432
  # Number of unfinished jobs
426
433
  self.unfinishedJobs = 0
@@ -461,6 +468,10 @@ class experiment:
461
468
  )
462
469
  else:
463
470
  self.wait()
471
+
472
+ # Wait for all pending notifications to be processed
473
+ # before removing listeners
474
+ self.scheduler.wait_for_notifications()
464
475
  finally:
465
476
  if self._register_signals:
466
477
  SIGNAL_HANDLER.remove(self)
@@ -473,13 +484,14 @@ class experiment:
473
484
  # Unregister experiment from scheduler
474
485
  self.scheduler.unregister_experiment(self)
475
486
 
476
- # Remove database listener
477
- self.scheduler.removelistener(self._db_listener)
487
+ # Remove database listener and mark run as completed (only in NORMAL mode)
488
+ if self._db_listener is not None:
489
+ self.scheduler.removelistener(self._db_listener)
478
490
 
479
- # Mark run as completed in database
480
- experiment_id = self.workdir.name
481
- status = "failed" if exc_type else "completed"
482
- self.state_provider.complete_run(experiment_id, self.run_id, status)
491
+ # Mark run as completed in database
492
+ experiment_id = self.workdir.name
493
+ status = "failed" if exc_type else "completed"
494
+ self.state_provider.complete_run(experiment_id, self.run_id, status)
483
495
 
484
496
  # Note: Don't stop scheduler - it's shared!
485
497
  # Note: Don't stop server - it runs in daemon mode until program exit
@@ -526,10 +538,28 @@ class experiment:
526
538
  """Adds a service (e.g. tensorboard viewer) to the experiment
527
539
 
528
540
  :param service: A service instance
529
- :return: The same service instance
541
+ :return: The same service instance (or existing service if already added)
530
542
  """
543
+ existing = self.services.get(service.id)
544
+ if existing is not None:
545
+ if existing is service:
546
+ # Same service instance added twice - just return it
547
+ logger.debug("Service %s already added, ignoring duplicate", service.id)
548
+ return service
549
+ else:
550
+ # Different service with same id - warn and replace
551
+ logger.warning(
552
+ "Replacing service %s (old id=%s, new id=%s)",
553
+ service.id,
554
+ id(existing),
555
+ id(service),
556
+ )
557
+
531
558
  self.services[service.id] = service
532
559
 
560
+ # Allow service to access experiment context
561
+ service.set_experiment(self)
562
+
533
563
  # Register database listener for state changes
534
564
  service.add_listener(self._db_listener)
535
565
 
@@ -472,3 +472,30 @@ class BaseExperiment:
472
472
  def experiment_id(self) -> str:
473
473
  """Experiment identifier derived from workdir name"""
474
474
  return self.workdir.name
475
+
476
+
477
+ class BaseService:
478
+ """Base interface for service information
479
+
480
+ This class defines the interface for service data. Both live Service instances
481
+ and MockService instances should provide these attributes and methods.
482
+
483
+ Attributes:
484
+ id: Unique identifier for the service
485
+ state: Current service state (ServiceState enum or compatible)
486
+ """
487
+
488
+ id: str
489
+
490
+ @property
491
+ def state(self):
492
+ """Current service state"""
493
+ raise NotImplementedError
494
+
495
+ def description(self) -> str:
496
+ """Human-readable description of the service"""
497
+ raise NotImplementedError
498
+
499
+ def state_dict(self) -> dict:
500
+ """Return dictionary representation for serialization"""
501
+ raise NotImplementedError
@@ -0,0 +1,31 @@
1
+ """Remote monitoring support for experimaestro
2
+
3
+ This package provides SSH-based remote monitoring capabilities for experiments.
4
+
5
+ Main components:
6
+ - SSHStateProviderServer: JSON-RPC server that wraps WorkspaceStateProvider
7
+ - SSHStateProviderClient: Client that connects via SSH and implements StateProvider interface
8
+ - RemoteFileSynchronizer: Rsync-based file synchronization
9
+
10
+ Usage:
11
+ # On remote host (run via SSH):
12
+ from experimaestro.scheduler.remote.server import SSHStateProviderServer
13
+ server = SSHStateProviderServer(workspace_path)
14
+ server.start()
15
+
16
+ # On local host:
17
+ from experimaestro.scheduler.remote.client import SSHStateProviderClient
18
+ client = SSHStateProviderClient(host="server", remote_workspace="/path")
19
+ client.connect()
20
+ experiments = client.get_experiments()
21
+ """
22
+
23
+ from experimaestro.scheduler.remote.server import SSHStateProviderServer
24
+ from experimaestro.scheduler.remote.client import SSHStateProviderClient
25
+ from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
26
+
27
+ __all__ = [
28
+ "SSHStateProviderServer",
29
+ "SSHStateProviderClient",
30
+ "RemoteFileSynchronizer",
31
+ ]