PyPI - plato-sdk-v2 - Versions diffs - 2.3.3__py3-none-any.whl → 2.4.2__py3-none-any.whl - Mend

plato-sdk-v2 2.3.3__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

plato/agents/__init__.py +24 -16
plato/agents/artifacts.py +108 -0
plato/agents/config.py +16 -13
plato/agents/otel.py +261 -0
plato/agents/runner.py +223 -149
plato/chronos/models/__init__.py +9 -1
plato/v1/cli/agent.py +7 -7
plato/v1/cli/chronos.py +767 -0
plato/v1/cli/main.py +2 -0
plato/v1/cli/pm.py +3 -3
plato/v1/cli/sandbox.py +58 -6
plato/v1/cli/ssh.py +21 -14
plato/v1/cli/templates/world-runner.Dockerfile +27 -0
plato/v1/cli/utils.py +32 -12
plato/worlds/README.md +2 -1
plato/worlds/base.py +222 -101
plato/worlds/config.py +5 -3
plato/worlds/runner.py +1 -391
{plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.4.2.dist-info}/METADATA +4 -3
{plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.4.2.dist-info}/RECORD +22 -25
plato/agents/logging.py +0 -515
plato/chronos/api/callback/__init__.py +0 -11
plato/chronos/api/callback/push_agent_logs.py +0 -61
plato/chronos/api/callback/update_agent_status.py +0 -57
plato/chronos/api/callback/upload_artifacts.py +0 -59
plato/chronos/api/callback/upload_logs_zip.py +0 -57
plato/chronos/api/callback/upload_trajectory.py +0 -57
{plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.4.2.dist-info}/WHEEL +0 -0
{plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.4.2.dist-info}/entry_points.txt +0 -0

plato/worlds/base.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import logging
+import os
 import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -16,15 +17,29 @@ if TYPE_CHECKING:
     from plato.v2.async_.environment import Environment
     from plato.v2.async_.session import Session
-from plato.agents.logging import init_logging as _init_chronos_logging
-from plato.agents.logging import log_event as _log_event
-from plato.agents.logging import reset_logging as _reset_chronos_logging
-from plato.agents.logging import span as _span
-from plato.agents.logging import upload_artifact as _upload_artifact
-from plato.agents.logging import upload_checkpoint as _upload_checkpoint
+from plato.agents.artifacts import (
+    upload_artifact as _upload_artifact_raw,
+)
+from plato.agents.otel import (
+    get_tracer,
+    init_tracing,
+    shutdown_tracing,
+)
+from plato.agents.runner import run_agent as _run_agent_raw
 logger = logging.getLogger(__name__)
+def _get_plato_version() -> str:
+    """Get the installed plato SDK version."""
+    try:
+        from importlib.metadata import version
+        return version("plato")
+    except Exception:
+        return "unknown"
 # Global registry of worlds
 _WORLD_REGISTRY: dict[str, type[BaseWorld]] = {}
@@ -111,6 +126,8 @@ class BaseWorld(ABC, Generic[ConfigT]):
         self._step_count: int = 0
         self.plato_session = None
         self._current_step_id: str | None = None
+        self._session_id: str | None = None
+        self._agent_containers: list[str] = []  # Track spawned agent containers for cleanup
     @classmethod
     def get_config_class(cls) -> type[RunConfig]:
@@ -170,7 +187,70 @@ class BaseWorld(ABC, Generic[ConfigT]):
     async def close(self) -> None:
         """Cleanup resources. Called after run completes."""
-        pass
+        await self._cleanup_agent_containers()
+    async def _cleanup_agent_containers(self) -> None:
+        """Stop any agent containers spawned by this world."""
+        import asyncio
+        if not self._agent_containers:
+            return
+        self.logger.info(f"Stopping {len(self._agent_containers)} agent container(s)...")
+        for container_name in self._agent_containers:
+            try:
+                proc = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "stop",
+                    container_name,
+                    stdout=asyncio.subprocess.DEVNULL,
+                    stderr=asyncio.subprocess.DEVNULL,
+                )
+                await proc.wait()
+                self.logger.debug(f"Stopped container: {container_name}")
+            except Exception as e:
+                self.logger.warning(f"Failed to stop container {container_name}: {e}")
+        self._agent_containers.clear()
+        self.logger.info("Agent containers stopped")
+    async def run_agent(
+        self,
+        image: str,
+        config: dict,
+        secrets: dict[str, str],
+        instruction: str,
+        workspace: str | None = None,
+        logs_dir: str | None = None,
+        pull: bool = True,
+    ) -> str:
+        """Run an agent in a Docker container, tracking the container for cleanup.
+        This is a wrapper around plato.agents.runner.run_agent that automatically
+        tracks spawned containers so they can be cleaned up when the world closes.
+        Args:
+            image: Docker image URI
+            config: Agent configuration dict
+            secrets: Secret values (API keys, etc.)
+            instruction: Task instruction for the agent
+            workspace: Docker volume name for workspace
+            logs_dir: Ignored (kept for backwards compatibility)
+            pull: Whether to pull the image first
+        Returns:
+            The container name that was created
+        """
+        container_name = await _run_agent_raw(
+            image=image,
+            config=config,
+            secrets=secrets,
+            instruction=instruction,
+            workspace=workspace,
+            logs_dir=logs_dir,
+            pull=pull,
+        )
+        self._agent_containers.append(container_name)
+        return container_name
     async def _connect_plato_session(self) -> None:
         """Connect to Plato session from config.
@@ -390,17 +470,39 @@ class BaseWorld(ABC, Generic[ConfigT]):
             self.logger.warning(f"Failed to create state bundle: {e.stderr}")
             return None
-    async def _create_and_upload_checkpoint(self) -> dict[str, Any] | None:
+    async def _upload_artifact(
+        self,
+        data: bytes,
+        content_type: str = "application/octet-stream",
+    ) -> bool:
+        """Upload an artifact directly to S3.
+        Args:
+            data: Raw bytes of the artifact
+            content_type: MIME type of the content
+        Returns:
+            True if successful, False otherwise
+        """
+        if not self.config.upload_url:
+            self.logger.warning("Cannot upload artifact: upload_url not set")
+            return False
+        return await _upload_artifact_raw(
+            upload_url=self.config.upload_url,
+            data=data,
+            content_type=content_type,
+        )
+    async def _create_and_upload_checkpoint(self) -> tuple[dict[str, str], bool]:
         """Create a full checkpoint including env snapshots and state bundle.
         This method:
         1. Commits any pending state changes
         2. Creates env snapshots using snapshot_store
-        3. Creates and uploads state bundle as an artifact
-        4. Calls the checkpoint endpoint with all data
+        3. Creates and uploads state bundle to S3
         Returns:
-            Checkpoint result dict if successful, None otherwise.
+            Tuple of (env_snapshots dict, state_bundle_uploaded bool)
         """
         # Commit state changes first
         self._commit_state(f"Checkpoint at step {self._step_count}")
@@ -410,36 +512,24 @@ class BaseWorld(ABC, Generic[ConfigT]):
         if env_snapshots is None:
             env_snapshots = {}
+        state_bundle_uploaded = True  # Default to True if state not enabled
         # Create and upload state bundle
-        state_artifact_id: str | None = None
         if self.config.state.enabled:
             bundle_data = self._create_state_bundle()
             if bundle_data:
-                result = await _upload_artifact(
+                success = await self._upload_artifact(
                     data=bundle_data,
-                    artifact_type="state",
-                    filename=f"state_step_{self._step_count}.bundle",
-                    extra={
-                        "step_number": self._step_count,
-                        "state_path": self.config.state.path,
-                    },
+                    content_type="application/octet-stream",
                 )
-                if result:
-                    state_artifact_id = result.get("artifact_id")
-                    self.logger.info(f"Uploaded state artifact: {state_artifact_id}")
-        # Upload checkpoint with all data
-        checkpoint_result = await _upload_checkpoint(
-            step_number=self._step_count,
-            env_snapshots=env_snapshots,
-            state_artifact_id=state_artifact_id,
-            extra={
-                "world_name": self.name,
-                "world_version": self.get_version(),
-            },
-        )
+                if success:
+                    self.logger.info(f"Uploaded state bundle at step {self._step_count}")
+                    state_bundle_uploaded = True
+                else:
+                    self.logger.warning(f"Failed to upload state bundle at step {self._step_count}")
+                    state_bundle_uploaded = False
-        return checkpoint_result
+        return env_snapshots, state_bundle_uploaded
     def get_env(self, alias: str) -> Environment | None:
         """Get an environment by alias.
@@ -630,81 +720,112 @@ The following services are available for your use:
         # Initialize state directory (creates git repo if needed)
         self._init_state_directory()
-        # Initialize the logging singleton for agents to use
-        if config.callback_url and config.session_id:
-            _init_chronos_logging(
-                callback_url=config.callback_url,
-                session_id=config.session_id,
-            )
+        # Initialize OTel tracing and session info for artifact uploads
+        if config.session_id:
+            self._session_id = config.session_id
+            # Set environment variables for agent runners (which run in Docker)
+            os.environ["SESSION_ID"] = config.session_id
+            if config.otel_url:
+                # For agents in Docker, convert localhost to host.docker.internal
+                # so they can reach the host machine's Chronos instance
+                agent_otel_url = config.otel_url
+                if "localhost" in agent_otel_url or "127.0.0.1" in agent_otel_url:
+                    agent_otel_url = agent_otel_url.replace("localhost", "host.docker.internal")
+                    agent_otel_url = agent_otel_url.replace("127.0.0.1", "host.docker.internal")
+                os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = agent_otel_url
+                os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"
+            if config.upload_url:
+                os.environ["UPLOAD_URL"] = config.upload_url
+            # Initialize OTel tracing for the world itself (runs on host, not in Docker)
+            if config.otel_url:
+                logger.debug(f"Initializing OTel tracing with endpoint: {config.otel_url}")
+                init_tracing(
+                    service_name=f"world-{self.name}",
+                    session_id=config.session_id,
+                    otlp_endpoint=config.otel_url,
+                )
+            else:
+                logger.debug("No otel_url in config - OTel tracing disabled")
+        # Log version info (goes to OTel after init_tracing)
+        plato_version = _get_plato_version()
+        world_version = self.get_version()
+        self.logger.info(f"World version: {world_version}, Plato SDK version: {plato_version}")
         # Connect to Plato session if configured (for heartbeats)
         await self._connect_plato_session()
-        # Log session start
-        await _log_event(
-            span_type="session_start",
-            content=f"World '{self.name}' started",
-            source="world",
-            extra={"world_name": self.name, "world_version": self.get_version()},
-        )
-        try:
-            # Execute reset with automatic span tracking
-            async with _span("reset", span_type="reset", source="world") as reset_span:
-                reset_span.log(f"Resetting world '{self.name}'")
-                obs = await self.reset()
-                reset_span.set_extra({"observation": obs.model_dump() if hasattr(obs, "model_dump") else str(obs)})
-            self.logger.info(f"World reset complete: {obs}")
-            while True:
-                self._step_count += 1
-                # Execute step with automatic span tracking
-                # The span automatically sets itself as the current parent,
-                # so agent trajectories will nest under this step
-                async with _span(
-                    f"step_{self._step_count}",
-                    span_type="step",
-                    source="world",
-                ) as step_span:
-                    self._current_step_id = step_span.event_id
-                    step_span.log(f"Step {self._step_count} started")
-                    result = await self.step()
-                    step_span.set_extra(
-                        {
-                            "done": result.done,
-                            "observation": result.observation.model_dump()
-                            if hasattr(result.observation, "model_dump")
-                            else str(result.observation),
-                            "info": result.info,
-                        }
-                    )
+        # Get tracer for spans
+        tracer = get_tracer("plato.world")
-                self.logger.info(f"Step {self._step_count}: done={result.done}")
+        # Create root session span that encompasses everything
+        # This ensures all child spans share the same trace_id
+        with tracer.start_as_current_span("session") as session_span:
+            session_span.set_attribute("plato.world.name", self.name)
+            session_span.set_attribute("plato.world.version", self.get_version())
+            session_span.set_attribute("plato.session.id", config.session_id)
-                # Create checkpoint if enabled and interval matches
-                # Note: The checkpoint event is created by the callback endpoint,
-                # so we don't need a span wrapper here (would create duplicates)
-                if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
-                    self.logger.info(f"Creating checkpoint after step {self._step_count}")
-                    await self._create_and_upload_checkpoint()
+            try:
+                # Execute reset with OTel span
+                with tracer.start_as_current_span("reset") as reset_span:
+                    obs = await self.reset()
+                    obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
+                    reset_span.set_attribute("plato.observation", str(obs_data)[:1000])
+                self.logger.info(f"World reset complete: {obs}")
-                if result.done:
-                    break
+                while True:
+                    self._step_count += 1
-        finally:
-            await self.close()
-            await self._disconnect_plato_session()
+                    # Execute step with OTel span
+                    with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
+                        step_span.set_attribute("plato.step.number", self._step_count)
-            # Log session end
-            await _log_event(
-                span_type="session_end",
-                content=f"World '{self.name}' completed after {self._step_count} steps",
-                source="world",
-                extra={"total_steps": self._step_count},
-            )
+                        # Store span context for nested agent spans
+                        self._current_step_id = format(step_span.get_span_context().span_id, "016x")
-            # Reset the logging singleton
-            _reset_chronos_logging()
+                        result = await self.step()
-            self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")
+                        step_span.set_attribute("plato.step.done", result.done)
+                        obs_data = (
+                            result.observation.model_dump()
+                            if hasattr(result.observation, "model_dump")
+                            else str(result.observation)
+                        )
+                        step_span.set_attribute("plato.step.observation", str(obs_data)[:1000])
+                    self.logger.info(f"Step {self._step_count}: done={result.done}")
+                    # Create checkpoint if enabled and interval matches
+                    if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
+                        self.logger.info(f"Creating checkpoint after step {self._step_count}")
+                        with tracer.start_as_current_span("checkpoint") as checkpoint_span:
+                            checkpoint_span.set_attribute("plato.checkpoint.step", self._step_count)
+                            env_snapshots, state_bundle_uploaded = await self._create_and_upload_checkpoint()
+                            checkpoint_span.set_attribute("plato.checkpoint.success", len(env_snapshots) > 0)
+                            checkpoint_span.set_attribute(
+                                "plato.checkpoint.state_bundle_uploaded", state_bundle_uploaded
+                            )
+                            if env_snapshots:
+                                checkpoint_span.set_attribute(
+                                    "plato.checkpoint.environments", list(env_snapshots.keys())
+                                )
+                                checkpoint_span.set_attribute(
+                                    "plato.checkpoint.artifact_ids", list(env_snapshots.values())
+                                )
+                    if result.done:
+                        break
+            finally:
+                await self.close()
+                await self._disconnect_plato_session()
+        # Shutdown OTel tracing and clear session info (outside the span)
+        shutdown_tracing()
+        self._session_id = None
+        self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")

plato/worlds/config.py CHANGED Viewed

@@ -126,13 +126,15 @@ class RunConfig(BaseModel):
     Attributes:
         session_id: Unique Chronos session identifier
-        callback_url: Callback URL for status updates
+        otel_url: OTel endpoint URL (e.g., https://chronos.plato.so/api/otel)
+        upload_url: Presigned S3 URL for uploading artifacts (provided by Chronos)
         plato_session: Serialized Plato session for connecting to existing VM session
         checkpoint: Configuration for automatic checkpoints after steps
     """
     session_id: str = ""
-    callback_url: str = ""
+    otel_url: str = ""  # OTel endpoint URL
+    upload_url: str = ""  # Presigned S3 URL for uploads
     all_secrets: dict[str, str] = Field(default_factory=dict)  # All secrets (world + agent)
     # Serialized Plato session for connecting to VM and sending heartbeats
@@ -182,7 +184,7 @@ class RunConfig(BaseModel):
         envs = []
         # Skip runtime fields
-        runtime_fields = {"session_id", "callback_url", "all_secrets", "plato_session", "checkpoint", "state"}
+        runtime_fields = {"session_id", "otel_url", "upload_url", "all_secrets", "plato_session", "checkpoint", "state"}
         for field_name, prop_schema in properties.items():
             if field_name in runtime_fields:

plato-sdk-v2 2.3.3__py3-none-any.whl → 2.4.2__py3-none-any.whl

plato-sdk-v2 2.3.3__py3-none-any.whl → 2.4.2__py3-none-any.whl