PyPI - experimaestro - Versions diffs - 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl - Mend

experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of experimaestro might be problematic. Click here for more details.

Files changed (24) hide show

experimaestro/cli/__init__.py +177 -31
experimaestro/experiments/cli.py +6 -2
experimaestro/scheduler/base.py +21 -0
experimaestro/scheduler/experiment.py +64 -34
experimaestro/scheduler/interfaces.py +27 -0
experimaestro/scheduler/remote/__init__.py +31 -0
experimaestro/scheduler/remote/client.py +874 -0
experimaestro/scheduler/remote/protocol.py +467 -0
experimaestro/scheduler/remote/server.py +423 -0
experimaestro/scheduler/remote/sync.py +144 -0
experimaestro/scheduler/services.py +158 -32
experimaestro/scheduler/state_db.py +58 -9
experimaestro/scheduler/state_provider.py +512 -91
experimaestro/scheduler/state_sync.py +65 -8
experimaestro/tests/test_cli_jobs.py +3 -3
experimaestro/tests/test_remote_state.py +671 -0
experimaestro/tests/test_state_db.py +8 -8
experimaestro/tui/app.py +100 -8
experimaestro/version.py +2 -2
{experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/METADATA +4 -4
{experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/RECORD +24 -18
{experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/WHEEL +0 -0
{experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/entry_points.txt +0 -0
{experimaestro-2.0.0b4.dist-info → experimaestro-2.0.0b8.dist-info}/licenses/LICENSE +0 -0

experimaestro/cli/__init__.py CHANGED Viewed

@@ -309,11 +309,98 @@ def experiments(ctx, workdir, workspace):
 @experiments.command()
 @pass_cfg
 def list(workdir: Path):
+    """List experiments in the workspace"""
+    from experimaestro.scheduler.state_provider import WorkspaceStateProvider
+    # Get experiments from state provider for detailed info
+    state_provider = WorkspaceStateProvider.get_instance(
+        workdir, read_only=True, sync_on_start=True
+    )
+    experiments_list = state_provider.get_experiments()
+    # Build lookup by experiment_id
+    exp_info = {exp.experiment_id: exp for exp in experiments_list}
     for p in (workdir / "xp").iterdir():
+        exp_id = p.name
+        exp = exp_info.get(exp_id)
+        # Build display string
+        display_parts = []
+        if (p / "jobs.bak").exists():
+            display_parts.append("[unfinished]")
+        display_parts.append(exp_id)
+        # Add hostname if available
+        if exp and getattr(exp, "hostname", None):
+            display_parts.append(f"[{exp.hostname}]")
+        # Add job stats if available
+        if exp:
+            display_parts.append(f"({exp.finished_jobs}/{exp.total_jobs} jobs)")
+        display_str = " ".join(display_parts)
         if (p / "jobs.bak").exists():
-            cprint(f"[unfinished] {p.name}", "yellow")
+            cprint(display_str, "yellow")
         else:
-            cprint(p.name, "cyan")
+            cprint(display_str, "cyan")
+def _run_monitor_ui(
+    state_provider, workdir: Path, console: bool, port: int, title: str = ""
+):
+    """Shared code for running monitor UI (TUI or web)
+    Args:
+        state_provider: StateProvider instance (local or remote)
+        workdir: Local workspace/cache directory
+        console: If True, use TUI; otherwise use web UI
+        port: Port for web server
+        title: Optional title for status messages
+    """
+    try:
+        if console:
+            # Use Textual TUI
+            from experimaestro.tui import ExperimentTUI
+            app = ExperimentTUI(
+                workdir, state_provider=state_provider, watch=True, show_logs=True
+            )
+            app.run()
+        else:
+            # Use React web server
+            from experimaestro.server import Server
+            if title:
+                cprint(
+                    f"Starting experiment monitor for {title} on http://localhost:{port}",
+                    "green",
+                )
+            else:
+                cprint(
+                    f"Starting experiment monitor on http://localhost:{port}", "green"
+                )
+            cprint("Press Ctrl+C to stop", "yellow")
+            settings = ServerSettings()
+            settings.port = port
+            server = Server.instance(settings, state_provider=state_provider)
+            server.start()
+            try:
+                import time
+                while True:
+                    time.sleep(1)
+            except KeyboardInterrupt:
+                pass
+    finally:
+        cprint("\nShutting down...", "yellow")
+        if state_provider:
+            state_provider.close()
 @experiments.command()
@@ -326,7 +413,7 @@ def list(workdir: Path):
 )
 @pass_cfg
 def monitor(workdir: Path, console: bool, port: int, sync: bool):
-    """Monitor experiments with web UI or console TUI"""
+    """Monitor local experiments with web UI or console TUI"""
     # Force sync from disk if requested
     if sync:
         from experimaestro.scheduler.state_sync import sync_workspace_from_disk
@@ -335,37 +422,96 @@ def monitor(workdir: Path, console: bool, port: int, sync: bool):
         sync_workspace_from_disk(workdir, write_mode=True, force=True)
         cprint("Sync complete", "green")
-    if console:
-        # Use Textual TUI
-        from experimaestro.tui import ExperimentTUI
+    from experimaestro.scheduler.state_provider import WorkspaceStateProvider
-        app = ExperimentTUI(workdir, watch=True)
-        app.run()
-    else:
-        # Use React web server
-        from experimaestro.scheduler.state_provider import WorkspaceStateProvider
-        from experimaestro.server import Server
-        cprint(f"Starting experiment monitor on http://localhost:{port}", "green")
-        cprint("Press Ctrl+C to stop", "yellow")
-        state_provider = WorkspaceStateProvider.get_instance(
-            workdir,
-            sync_on_start=not sync,  # Skip auto-sync if we just did a forced one
-        )
-        settings = ServerSettings()
-        settings.port = port
-        server = Server.instance(settings, state_provider=state_provider)
-        server.start()
+    state_provider = WorkspaceStateProvider.get_instance(
+        workdir,
+        sync_on_start=not sync,  # Skip auto-sync if we just did a forced one
+    )
-        try:
-            import time
+    _run_monitor_ui(state_provider, workdir, console, port)
-            while True:
-                time.sleep(1)
-        except KeyboardInterrupt:
-            cprint("\nShutting down...", "yellow")
-            state_provider.close()
+@experiments.command("ssh-monitor")
+@click.argument("host", type=str)
+@click.argument("remote_workdir", type=str)
+@click.option("--console", is_flag=True, help="Use console TUI instead of web UI")
+@click.option(
+    "--port", type=int, default=12345, help="Port for web server (default: 12345)"
+)
+@click.option(
+    "--remote-xpm",
+    type=str,
+    default=None,
+    help="Path to experimaestro on remote host (default: use 'uv tool run')",
+)
+@click.option(
+    "--ssh-option",
+    "-o",
+    multiple=True,
+    help="Additional SSH options (can be repeated, e.g., -o '-p 2222')",
+)
+def ssh_monitor(
+    host: str,
+    remote_workdir: str,
+    console: bool,
+    port: int,
+    remote_xpm: str,
+    ssh_option: tuple,
+):
+    """Monitor experiments on a remote server via SSH
+    HOST is the SSH host (e.g., user@server)
+    REMOTE_WORKDIR is the workspace path on the remote server
+    Examples:
+        experimaestro experiments ssh-monitor myserver /path/to/workspace
+        experimaestro experiments ssh-monitor user@host /workspace --console
+        experimaestro experiments ssh-monitor host /workspace --remote-xpm /opt/xpm/bin/experimaestro
+    """
+    from experimaestro.scheduler.remote.client import SSHStateProviderClient
+    cprint(f"Connecting to {host}...", "yellow")
+    state_provider = SSHStateProviderClient(
+        host=host,
+        remote_workspace=remote_workdir,
+        ssh_options=list(ssh_option) if ssh_option else None,
+        remote_xpm_path=remote_xpm,
+    )
+    try:
+        state_provider.connect()
+        cprint(f"Connected to {host}", "green")
+    except Exception as e:
+        cprint(f"Failed to connect: {e}", "red")
+        raise click.Abort()
+    _run_monitor_ui(
+        state_provider,
+        state_provider.local_cache_dir,
+        console,
+        port,
+        title=host,
+    )
+@experiments.command("monitor-server")
+@pass_cfg
+def monitor_server(workdir: Path):
+    """Start monitoring server for SSH connections (JSON-RPC over stdio)
+    This command is intended to be run over SSH to provide remote monitoring.
+    Communication is via JSON-RPC over stdin/stdout.
+    Example:
+        ssh host 'experimaestro experiments --workdir /path monitor-server'
+    """
+    from experimaestro.scheduler.remote.server import SSHStateProviderServer
+    server = SSHStateProviderServer(workdir)
+    try:
+        server.start()
+    except KeyboardInterrupt:
+        server.stop()
 @experiments.command()

experimaestro/experiments/cli.py CHANGED Viewed

@@ -360,7 +360,12 @@ def experiments_cli(  # noqa: C901
         except HandledException:
             sys.exit(1)
-    if console:
+    # Console mode is only available in NORMAL run mode
+    use_console = console and run_mode == RunMode.NORMAL
+    if console and not use_console:
+        logging.warning("--console is ignored when run_mode is not NORMAL")
+    if use_console:
         # Run experiment in background thread, console UI in main thread
         import threading
         from experimaestro.tui import ExperimentTUI
@@ -375,7 +380,6 @@ def experiments_cli(  # noqa: C901
                 run_experiment_code(xp_holder, xp_ready, register_signals=False)
                 # Add a test message after experiment completes
                 logging.info("Experiment thread completed")
-                print("Experiment thread print test")
             except Exception as e:
                 exception_holder["exception"] = e
                 xp_ready.set()  # Signal even on error

experimaestro/scheduler/base.py CHANGED Viewed

@@ -197,6 +197,27 @@ class Scheduler(threading.Thread):
         with self._listeners_lock:
             self._listeners.clear()
+    def wait_for_notifications(self, timeout: float = 5.0) -> bool:
+        """Wait for all pending notifications to be processed.
+        This submits a sentinel task and waits for it to complete,
+        ensuring all previously submitted notifications have been processed.
+        Args:
+            timeout: Maximum time to wait in seconds
+        Returns:
+            True if all notifications were processed, False if timeout occurred
+        """
+        try:
+            # Submit a no-op and wait for it to complete
+            future = self._notification_executor.submit(lambda: None)
+            future.result(timeout=timeout)
+            return True
+        except concurrent.futures.TimeoutError:
+            logger.warning("Timeout waiting for notification queue to drain")
+            return False
     def getJobState(self, job: Job) -> "concurrent.futures.Future[JobState]":
         # Check if the job belongs to this scheduler
         if job.identifier not in self.jobs:

experimaestro/scheduler/experiment.py CHANGED Viewed

@@ -43,26 +43,22 @@ class DatabaseListener:
         self.state_provider.update_job_state(job, self.experiment_id, self.run_id)
     def service_add(self, service):
-        """Update service in database"""
-        self.state_provider.update_service(
+        """Register service in database"""
+        from experimaestro.scheduler.services import Service
+        state_dict = Service.serialize_state_dict(service._full_state_dict())
+        self.state_provider.register_service(
             service.id,
             self.experiment_id,
             self.run_id,
             service.description(),
-            service.state.name,
-            state_dict=json.dumps(service.state_dict()),
+            state_dict=json.dumps(state_dict),
         )
     def service_state_changed(self, service):
-        """Update service state in database (called by Service when state changes)"""
-        self.state_provider.update_service(
-            service.id,
-            self.experiment_id,
-            self.run_id,
-            service.description(),
-            service.state.name,
-            state_dict=json.dumps(service.state_dict()),
-        )
+        """Called when service state changes (runtime only, not persisted)"""
+        # Service state is managed at runtime, not persisted to DB
+        pass
 class experiment:
@@ -224,10 +220,13 @@ class experiment:
     def _write_services_json(self):
         """Write all services to services.json file"""
+        from experimaestro.scheduler.services import Service
         services_data = {}
         for service_id, service in self.services.items():
             # Get state_dict from service (includes __class__ for recreation)
-            service_state = service.state_dict()
+            # and serialize paths to JSON-compatible format
+            service_state = Service.serialize_state_dict(service._full_state_dict())
             # Add runtime state info
             service_state.update(
                 {
@@ -281,9 +280,10 @@ class experiment:
         with self.jobs_jsonl_path.open("a") as f:
             f.write(json.dumps(record) + "\n")
-        # Also register in database for TUI/monitoring
-        experiment_id = self.workdir.name
-        self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
+        # Also register in database for TUI/monitoring (only in NORMAL mode)
+        if self._db_listener is not None:
+            experiment_id = self.workdir.name
+            self.state_provider.update_job_submitted(job, experiment_id, self.run_id)
     def stop(self):
         """Stop the experiment as soon as possible"""
@@ -403,24 +403,31 @@ class experiment:
         (self.workspace.path / ".__experimaestro__").touch()
         # Initialize workspace state provider (singleton per workspace path)
+        # Use read_only mode when not in NORMAL run mode to prevent DB changes
         from .state_provider import WorkspaceStateProvider
+        is_normal_mode = self.workspace.run_mode == RunMode.NORMAL
         self.state_provider = WorkspaceStateProvider.get_instance(
             self.workspace.path,
-            read_only=False,
+            read_only=not is_normal_mode,
             sync_on_start=False,  # Experiments don't sync on start
         )
-        # Register experiment in database and create a run
+        # Register experiment in database and create a run (only in NORMAL mode)
         experiment_id = self.workdir.name
-        self.state_provider.ensure_experiment(experiment_id)
-        self.run_id = self.state_provider.create_run(experiment_id)
-        # Add database listener to update job state in database
-        self._db_listener = DatabaseListener(
-            self.state_provider, experiment_id, self.run_id
-        )
-        self.scheduler.addlistener(self._db_listener)
+        self._db_listener = None
+        if is_normal_mode:
+            self.state_provider.ensure_experiment(experiment_id)
+            self.run_id = self.state_provider.create_run(experiment_id)
+            # Add database listener to update job state in database
+            self._db_listener = DatabaseListener(
+                self.state_provider, experiment_id, self.run_id
+            )
+            self.scheduler.addlistener(self._db_listener)
+        else:
+            # In non-NORMAL modes, use a placeholder run_id
+            self.run_id = None
         # Number of unfinished jobs
         self.unfinishedJobs = 0
@@ -461,6 +468,10 @@ class experiment:
                 )
             else:
                 self.wait()
+            # Wait for all pending notifications to be processed
+            # before removing listeners
+            self.scheduler.wait_for_notifications()
         finally:
             if self._register_signals:
                 SIGNAL_HANDLER.remove(self)
@@ -473,13 +484,14 @@ class experiment:
             # Unregister experiment from scheduler
             self.scheduler.unregister_experiment(self)
-            # Remove database listener
-            self.scheduler.removelistener(self._db_listener)
+            # Remove database listener and mark run as completed (only in NORMAL mode)
+            if self._db_listener is not None:
+                self.scheduler.removelistener(self._db_listener)
-            # Mark run as completed in database
-            experiment_id = self.workdir.name
-            status = "failed" if exc_type else "completed"
-            self.state_provider.complete_run(experiment_id, self.run_id, status)
+                # Mark run as completed in database
+                experiment_id = self.workdir.name
+                status = "failed" if exc_type else "completed"
+                self.state_provider.complete_run(experiment_id, self.run_id, status)
             # Note: Don't stop scheduler - it's shared!
             # Note: Don't stop server - it runs in daemon mode until program exit
@@ -526,10 +538,28 @@ class experiment:
         """Adds a service (e.g. tensorboard viewer) to the experiment
         :param service: A service instance
-        :return: The same service instance
+        :return: The same service instance (or existing service if already added)
         """
+        existing = self.services.get(service.id)
+        if existing is not None:
+            if existing is service:
+                # Same service instance added twice - just return it
+                logger.debug("Service %s already added, ignoring duplicate", service.id)
+                return service
+            else:
+                # Different service with same id - warn and replace
+                logger.warning(
+                    "Replacing service %s (old id=%s, new id=%s)",
+                    service.id,
+                    id(existing),
+                    id(service),
+                )
         self.services[service.id] = service
+        # Allow service to access experiment context
+        service.set_experiment(self)
         # Register database listener for state changes
         service.add_listener(self._db_listener)

experimaestro/scheduler/interfaces.py CHANGED Viewed

@@ -472,3 +472,30 @@ class BaseExperiment:
     def experiment_id(self) -> str:
         """Experiment identifier derived from workdir name"""
         return self.workdir.name
+class BaseService:
+    """Base interface for service information
+    This class defines the interface for service data. Both live Service instances
+    and MockService instances should provide these attributes and methods.
+    Attributes:
+        id: Unique identifier for the service
+        state: Current service state (ServiceState enum or compatible)
+    """
+    id: str
+    @property
+    def state(self):
+        """Current service state"""
+        raise NotImplementedError
+    def description(self) -> str:
+        """Human-readable description of the service"""
+        raise NotImplementedError
+    def state_dict(self) -> dict:
+        """Return dictionary representation for serialization"""
+        raise NotImplementedError

experimaestro/scheduler/remote/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Remote monitoring support for experimaestro
+This package provides SSH-based remote monitoring capabilities for experiments.
+Main components:
+- SSHStateProviderServer: JSON-RPC server that wraps WorkspaceStateProvider
+- SSHStateProviderClient: Client that connects via SSH and implements StateProvider interface
+- RemoteFileSynchronizer: Rsync-based file synchronization
+Usage:
+    # On remote host (run via SSH):
+    from experimaestro.scheduler.remote.server import SSHStateProviderServer
+    server = SSHStateProviderServer(workspace_path)
+    server.start()
+    # On local host:
+    from experimaestro.scheduler.remote.client import SSHStateProviderClient
+    client = SSHStateProviderClient(host="server", remote_workspace="/path")
+    client.connect()
+    experiments = client.get_experiments()
+"""
+from experimaestro.scheduler.remote.server import SSHStateProviderServer
+from experimaestro.scheduler.remote.client import SSHStateProviderClient
+from experimaestro.scheduler.remote.sync import RemoteFileSynchronizer
+__all__ = [
+    "SSHStateProviderServer",
+    "SSHStateProviderClient",
+    "RemoteFileSynchronizer",
+]

experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl

Potentially problematic release.

experimaestro 2.0.0b4__py3-none-any.whl → 2.0.0b8__py3-none-any.whl