plato-sdk-v2 2.3.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plato/agents/__init__.py +25 -13
- plato/agents/artifacts.py +108 -0
- plato/agents/config.py +16 -13
- plato/agents/otel.py +261 -0
- plato/agents/runner.py +226 -122
- plato/chronos/models/__init__.py +9 -1
- plato/v1/cli/chronos.py +788 -0
- plato/v1/cli/main.py +2 -2
- plato/v1/cli/pm.py +3 -3
- plato/v1/cli/sandbox.py +246 -52
- plato/v1/cli/ssh.py +28 -9
- plato/v1/cli/templates/world-runner.Dockerfile +27 -0
- plato/v1/cli/utils.py +32 -12
- plato/v1/cli/verify.py +243 -827
- plato/worlds/README.md +2 -1
- plato/worlds/__init__.py +3 -1
- plato/worlds/base.py +462 -67
- plato/worlds/config.py +42 -3
- plato/worlds/runner.py +1 -339
- {plato_sdk_v2-2.3.0.dist-info → plato_sdk_v2-2.4.1.dist-info}/METADATA +4 -1
- {plato_sdk_v2-2.3.0.dist-info → plato_sdk_v2-2.4.1.dist-info}/RECORD +23 -27
- plato/agents/logging.py +0 -401
- plato/chronos/api/callback/__init__.py +0 -11
- plato/chronos/api/callback/push_agent_logs.py +0 -61
- plato/chronos/api/callback/update_agent_status.py +0 -57
- plato/chronos/api/callback/upload_artifacts.py +0 -59
- plato/chronos/api/callback/upload_logs_zip.py +0 -57
- plato/chronos/api/callback/upload_trajectory.py +0 -57
- plato/v1/cli/sim.py +0 -11
- {plato_sdk_v2-2.3.0.dist-info → plato_sdk_v2-2.4.1.dist-info}/WHEEL +0 -0
- {plato_sdk_v2-2.3.0.dist-info → plato_sdk_v2-2.4.1.dist-info}/entry_points.txt +0 -0
plato/worlds/README.md
CHANGED
plato/worlds/__init__.py
CHANGED
@@ -52,7 +52,7 @@ from plato.worlds.base import (
     get_world,
     register_world,
 )
-from plato.worlds.config import Agent, AgentConfig, Env, EnvConfig, RunConfig, Secret
+from plato.worlds.config import Agent, AgentConfig, CheckpointConfig, Env, EnvConfig, RunConfig, Secret, StateConfig
 from plato.worlds.runner import run_world
 
 __all__ = [
@@ -66,6 +66,8 @@ __all__ = [
     "get_world",
     # Config
     "RunConfig",
+    "CheckpointConfig",
+    "StateConfig",
     "AgentConfig",
     "Agent",
     "Secret",
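The new CheckpointConfig and StateConfig exports are consumed in plato/worlds/base.py as self.config.checkpoint (fields enabled, interval, exclude_envs) and self.config.state (fields enabled, path); see the base.py diff below. A minimal sketch of constructing them, assuming plain Pydantic keyword construction; the interval, path, and the "browser" alias are illustrative placeholders, not values from the package:

    from plato.worlds import CheckpointConfig, StateConfig

    # Field names mirror how base.py reads the config; the values are placeholders.
    checkpoint = CheckpointConfig(enabled=True, interval=5, exclude_envs=["browser"])
    state = StateConfig(enabled=True, path="./state")

    # The run() loop snapshots environments every `interval` steps and
    # git-commits/bundles the state `path` directory alongside each snapshot.
    print(checkpoint.interval, state.path)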
plato/worlds/base.py
CHANGED
@@ -3,7 +3,10 @@
 from __future__ import annotations
 
 import logging
+import os
+import subprocess
 from abc import ABC, abstractmethod
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, get_args, get_origin
 
 from pydantic import BaseModel, Field
@@ -14,13 +17,29 @@ if TYPE_CHECKING:
     from plato.v2.async_.environment import Environment
     from plato.v2.async_.session import Session
 
-from plato.agents.
-
-
-from plato.agents.
+from plato.agents.artifacts import (
+    upload_artifact as _upload_artifact_raw,
+)
+from plato.agents.otel import (
+    get_tracer,
+    init_tracing,
+    shutdown_tracing,
+)
+from plato.agents.runner import run_agent as _run_agent_raw
 
 logger = logging.getLogger(__name__)
 
+
+def _get_plato_version() -> str:
+    """Get the installed plato SDK version."""
+    try:
+        from importlib.metadata import version
+
+        return version("plato")
+    except Exception:
+        return "unknown"
+
+
 # Global registry of worlds
 _WORLD_REGISTRY: dict[str, type[BaseWorld]] = {}
 
@@ -107,6 +126,8 @@ class BaseWorld(ABC, Generic[ConfigT]):
         self._step_count: int = 0
         self.plato_session = None
         self._current_step_id: str | None = None
+        self._session_id: str | None = None
+        self._agent_containers: list[str] = []  # Track spawned agent containers for cleanup
 
     @classmethod
     def get_config_class(cls) -> type[RunConfig]:
@@ -166,7 +187,70 @@ class BaseWorld(ABC, Generic[ConfigT]):
 
     async def close(self) -> None:
         """Cleanup resources. Called after run completes."""
-
+        await self._cleanup_agent_containers()
+
+    async def _cleanup_agent_containers(self) -> None:
+        """Stop any agent containers spawned by this world."""
+        import asyncio
+
+        if not self._agent_containers:
+            return
+
+        self.logger.info(f"Stopping {len(self._agent_containers)} agent container(s)...")
+        for container_name in self._agent_containers:
+            try:
+                proc = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "stop",
+                    container_name,
+                    stdout=asyncio.subprocess.DEVNULL,
+                    stderr=asyncio.subprocess.DEVNULL,
+                )
+                await proc.wait()
+                self.logger.debug(f"Stopped container: {container_name}")
+            except Exception as e:
+                self.logger.warning(f"Failed to stop container {container_name}: {e}")
+        self._agent_containers.clear()
+        self.logger.info("Agent containers stopped")
+
+    async def run_agent(
+        self,
+        image: str,
+        config: dict,
+        secrets: dict[str, str],
+        instruction: str,
+        workspace: str | None = None,
+        logs_dir: str | None = None,
+        pull: bool = True,
+    ) -> str:
+        """Run an agent in a Docker container, tracking the container for cleanup.
+
+        This is a wrapper around plato.agents.runner.run_agent that automatically
+        tracks spawned containers so they can be cleaned up when the world closes.
+
+        Args:
+            image: Docker image URI
+            config: Agent configuration dict
+            secrets: Secret values (API keys, etc.)
+            instruction: Task instruction for the agent
+            workspace: Docker volume name for workspace
+            logs_dir: Ignored (kept for backwards compatibility)
+            pull: Whether to pull the image first
+
+        Returns:
+            The container name that was created
+        """
+        container_name = await _run_agent_raw(
+            image=image,
+            config=config,
+            secrets=secrets,
+            instruction=instruction,
+            workspace=workspace,
+            logs_dir=logs_dir,
+            pull=pull,
+        )
+        self._agent_containers.append(container_name)
+        return container_name
 
     async def _connect_plato_session(self) -> None:
         """Connect to Plato session from config.
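run_agent() above is a thin wrapper over plato.agents.runner.run_agent that records the returned container name so close() can stop it later. A hypothetical call site inside a BaseWorld subclass's step() method; the image URI, config dict, secret, and volume name are placeholders, not values from the package:

    # Illustrative only: arguments follow the run_agent() signature shown above.
    container = await self.run_agent(
        image="registry.example.com/my-agent:latest",  # placeholder image URI
        config={"model": "claude-sonnet"},             # arbitrary agent config dict
        secrets={"MY_API_KEY": "..."},                 # secret values handed to the agent
        instruction="Open the repository and fix the failing test",
        workspace="my-world-workspace",                # optional Docker volume name
    )
    # The returned name is appended to self._agent_containers and the container
    # is stopped by _cleanup_agent_containers() when the world closes.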
@@ -195,6 +279,258 @@ class BaseWorld(ABC, Generic[ConfigT]):
         except Exception as e:
             self.logger.warning(f"Error stopping Plato heartbeat: {e}")
 
+    async def _create_checkpoint(self) -> dict[str, str] | None:
+        """Create a checkpoint snapshot of all environments (excluding configured envs).
+
+        Uses snapshot_store for efficient chunk-based deduplication.
+
+        Returns:
+            Dict mapping environment alias to artifact_id, or None if no session connected.
+        """
+        if not self.plato_session:
+            self.logger.warning("Cannot create checkpoint: Plato session not connected")
+            return None
+
+        exclude_envs = set(self.config.checkpoint.exclude_envs)
+        envs_to_snapshot = [env for env in self.plato_session.envs if env.alias not in exclude_envs]
+
+        if not envs_to_snapshot:
+            self.logger.info("No environments to checkpoint (all excluded)")
+            return {}
+
+        self.logger.info(
+            f"Creating checkpoint for {len(envs_to_snapshot)} environment(s): {[e.alias for e in envs_to_snapshot]}"
+        )
+
+        results: dict[str, str] = {}
+        for env in envs_to_snapshot:
+            try:
+                result = await env.snapshot_store()
+                artifact_id = result.artifact_id
+                results[env.alias] = artifact_id
+
+                # Check for success/error fields (available after SDK regeneration)
+                success = getattr(result, "success", True)
+                error = getattr(result, "error", None)
+
+                if not success or error:
+                    self.logger.error(
+                        f"Checkpoint failed for '{env.alias}': {error or 'unknown error'} (job_id={env.job_id})"
+                    )
+                elif artifact_id:
+                    self.logger.info(f"Checkpoint created for '{env.alias}': {artifact_id}")
+                else:
+                    self.logger.warning(
+                        f"Checkpoint for '{env.alias}' returned empty artifact_id (job_id={env.job_id})"
+                    )
+            except Exception as e:
+                self.logger.error(f"Failed to checkpoint '{env.alias}': {e}")
+
+        return results
+
+    def _init_state_directory(self) -> None:
+        """Initialize the state directory as a git repository.
+
+        Creates the state directory if it doesn't exist and initializes it
+        as a git repository with an initial commit.
+        """
+        if not self.config.state.enabled:
+            return
+
+        state_path = Path(self.config.state.path)
+
+        # Create directory if it doesn't exist
+        if not state_path.exists():
+            state_path.mkdir(parents=True)
+            self.logger.info(f"Created state directory: {state_path}")
+
+        # Check if already a git repo
+        git_dir = state_path / ".git"
+        if git_dir.exists():
+            self.logger.info(f"State directory already initialized: {state_path}")
+            return
+
+        # Initialize git repo
+        try:
+            subprocess.run(
+                ["git", "init"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            # Create initial commit (even if empty)
+            subprocess.run(
+                ["git", "config", "user.email", "plato@plato.so"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            subprocess.run(
+                ["git", "config", "user.name", "Plato"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            # Add all files and create initial commit
+            subprocess.run(
+                ["git", "add", "-A"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            subprocess.run(
+                ["git", "commit", "--allow-empty", "-m", "Initial state"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            self.logger.info(f"Initialized git repo in state directory: {state_path}")
+        except subprocess.CalledProcessError as e:
+            self.logger.warning(f"Failed to initialize state git repo: {e.stderr}")
+
+    def _commit_state(self, message: str) -> bool:
+        """Commit current state directory changes.
+
+        Args:
+            message: Commit message
+
+        Returns:
+            True if commit was created (or no changes), False on error.
+        """
+        if not self.config.state.enabled:
+            return True
+
+        state_path = Path(self.config.state.path)
+        if not state_path.exists():
+            return True
+
+        try:
+            # Add all changes
+            subprocess.run(
+                ["git", "add", "-A"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            # Check if there are changes to commit
+            result = subprocess.run(
+                ["git", "status", "--porcelain"],
+                cwd=state_path,
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            if not result.stdout.strip():
+                self.logger.debug("No state changes to commit")
+                return True
+
+            # Commit changes
+            subprocess.run(
+                ["git", "commit", "-m", message],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            self.logger.info(f"Committed state changes: {message}")
+            return True
+        except subprocess.CalledProcessError as e:
+            self.logger.warning(f"Failed to commit state: {e.stderr}")
+            return False
+
+    def _create_state_bundle(self) -> bytes | None:
+        """Create a git bundle of the state directory.
+
+        Returns:
+            Bundle bytes if successful, None otherwise.
+        """
+        if not self.config.state.enabled:
+            return None
+
+        state_path = Path(self.config.state.path)
+        if not state_path.exists():
+            return None
+
+        git_dir = state_path / ".git"
+        if not git_dir.exists():
+            self.logger.warning("State directory is not a git repository")
+            return None
+
+        try:
+            # Create bundle to stdout
+            result = subprocess.run(
+                ["git", "bundle", "create", "-", "--all"],
+                cwd=state_path,
+                capture_output=True,
+                check=True,
+            )
+            bundle_data = result.stdout
+            self.logger.info(f"Created state bundle: {len(bundle_data)} bytes")
+            return bundle_data
+        except subprocess.CalledProcessError as e:
+            self.logger.warning(f"Failed to create state bundle: {e.stderr}")
+            return None
+
+    async def _upload_artifact(
+        self,
+        data: bytes,
+        content_type: str = "application/octet-stream",
+    ) -> bool:
+        """Upload an artifact directly to S3.
+
+        Args:
+            data: Raw bytes of the artifact
+            content_type: MIME type of the content
+
+        Returns:
+            True if successful, False otherwise
+        """
+        if not self.config.upload_url:
+            self.logger.warning("Cannot upload artifact: upload_url not set")
+            return False
+        return await _upload_artifact_raw(
+            upload_url=self.config.upload_url,
+            data=data,
+            content_type=content_type,
+        )
+
+    async def _create_and_upload_checkpoint(self) -> tuple[dict[str, str], bool]:
+        """Create a full checkpoint including env snapshots and state bundle.
+
+        This method:
+        1. Commits any pending state changes
+        2. Creates env snapshots using snapshot_store
+        3. Creates and uploads state bundle to S3
+
+        Returns:
+            Tuple of (env_snapshots dict, state_bundle_uploaded bool)
+        """
+        # Commit state changes first
+        self._commit_state(f"Checkpoint at step {self._step_count}")
+
+        # Create env snapshots
+        env_snapshots = await self._create_checkpoint()
+        if env_snapshots is None:
+            env_snapshots = {}
+
+        state_bundle_uploaded = True  # Default to True if state not enabled
+
+        # Create and upload state bundle
+        if self.config.state.enabled:
+            bundle_data = self._create_state_bundle()
+            if bundle_data:
+                success = await self._upload_artifact(
+                    data=bundle_data,
+                    content_type="application/octet-stream",
+                )
+                if success:
+                    self.logger.info(f"Uploaded state bundle at step {self._step_count}")
+                    state_bundle_uploaded = True
+                else:
+                    self.logger.warning(f"Failed to upload state bundle at step {self._step_count}")
+                    state_bundle_uploaded = False
+
+        return env_snapshots, state_bundle_uploaded
+
     def get_env(self, alias: str) -> Environment | None:
         """Get an environment by alias.
 
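_create_state_bundle() above streams the whole state repository to stdout via git bundle create - --all. Such a bundle can be checked and unpacked with stock git once the uploaded artifact has been fetched; the sketch below assumes it was saved locally as state.bundle (a placeholder path):

    # Restore a state checkpoint from a downloaded git bundle (illustrative only).
    import subprocess

    subprocess.run(["git", "bundle", "verify", "state.bundle"], check=True)
    subprocess.run(["git", "clone", "state.bundle", "restored-state"], check=True)
    # restored-state/ now contains the committed state history up to the checkpoint.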
@@ -236,6 +572,9 @@ class BaseWorld(ABC, Generic[ConfigT]):
         Returns:
             Dict of environment variable name -> value
 
+        Raises:
+            ImportError: If a sim environment is configured but package is not installed.
+
         Example:
             env_vars = self.get_sim_env_vars()
             # Returns: {"AWS_ENDPOINT_URL": "https://...", "GITEA_URL": "https://...", ...}
@@ -263,7 +602,13 @@ class BaseWorld(ABC, Generic[ConfigT]):
                 env_vars.update(sim_vars)
                 self.logger.info(f"{package_name} env vars: {list(sim_vars.keys())}")
             except ImportError:
-
+                raise ImportError(
+                    f"Environment '{env_alias}' is configured but 'plato.sims.{package_name}' "
+                    f"package is not installed.\n\n"
+                    f"Install sims packages:\n"
+                    f' export INDEX_URL="https://__token__:${{PLATO_API_KEY}}@plato.so/api/v2/pypi/sims/simple/"\n'
+                    f" uv pip install '.[sims]' --extra-index-url $INDEX_URL"
+                ) from None
             except Exception as e:
                 self.logger.warning(f"Failed to get {package_name} env vars: {e}")
 
@@ -278,6 +623,9 @@ class BaseWorld(ABC, Generic[ConfigT]):
         Returns:
             Markdown string with instructions, or empty string if no sims configured.
 
+        Raises:
+            ImportError: If a sim environment is configured but package is not installed.
+
         Example:
             instructions = self.get_sim_instructions()
             # Returns markdown with LocalStack/Gitea setup instructions
@@ -306,7 +654,13 @@ class BaseWorld(ABC, Generic[ConfigT]):
                 instructions_parts.append(instructions)
                 self.logger.info(f"Added {package_name} instructions to prompt")
             except ImportError:
-
+                raise ImportError(
+                    f"Environment '{env_alias}' is configured but 'plato.sims.{package_name}' "
+                    f"package is not installed.\n\n"
+                    f"Install sims packages:\n"
+                    f' export INDEX_URL="https://__token__:${{PLATO_API_KEY}}@plato.so/api/v2/pypi/sims/simple/"\n'
+                    f" uv pip install '.[sims]' --extra-index-url $INDEX_URL"
+                ) from None
             except Exception as e:
                 self.logger.warning(f"Failed to get {package_name} instructions: {e}")
 
@@ -363,74 +717,115 @@ The following services are available for your use:
 
         self.logger.info(f"Starting world '{self.name}'")
 
-        # Initialize
-
-
-
-
-
+        # Initialize state directory (creates git repo if needed)
+        self._init_state_directory()
+
+        # Initialize OTel tracing and session info for artifact uploads
+        if config.session_id:
+            self._session_id = config.session_id
+
+            # Set environment variables for agent runners (which run in Docker)
+            os.environ["SESSION_ID"] = config.session_id
+            if config.otel_url:
+                # For agents in Docker, convert localhost to host.docker.internal
+                # so they can reach the host machine's Chronos instance
+                agent_otel_url = config.otel_url
+                if "localhost" in agent_otel_url or "127.0.0.1" in agent_otel_url:
+                    agent_otel_url = agent_otel_url.replace("localhost", "host.docker.internal")
+                    agent_otel_url = agent_otel_url.replace("127.0.0.1", "host.docker.internal")
+                os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = agent_otel_url
+                os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"
+            if config.upload_url:
+                os.environ["UPLOAD_URL"] = config.upload_url
+
+        # Initialize OTel tracing for the world itself (runs on host, not in Docker)
+        if config.otel_url:
+            logger.debug(f"Initializing OTel tracing with endpoint: {config.otel_url}")
+            init_tracing(
+                service_name=f"world-{self.name}",
+                session_id=config.session_id,
+                otlp_endpoint=config.otel_url,
+            )
+        else:
+            logger.debug("No otel_url in config - OTel tracing disabled")
+
+        # Log version info (goes to OTel after init_tracing)
+        plato_version = _get_plato_version()
+        world_version = self.get_version()
+        self.logger.info(f"World version: {world_version}, Plato SDK version: {plato_version}")
 
         # Connect to Plato session if configured (for heartbeats)
         await self._connect_plato_session()
 
-        #
-
-            span_type="session_start",
-            content=f"World '{self.name}' started",
-            source="world",
-            extra={"world_name": self.name, "world_version": self.get_version()},
-        )
+        # Get tracer for spans
+        tracer = get_tracer("plato.world")
 
-
-
-
-
-
-
-        self.logger.info(f"World reset complete: {obs}")
-
-        while True:
-            self._step_count += 1
-
-            # Execute step with automatic span tracking
-            # The span automatically sets itself as the current parent,
-            # so agent trajectories will nest under this step
-            async with _span(
-                f"step_{self._step_count}",
-                span_type="step",
-                source="world",
-            ) as step_span:
-                self._current_step_id = step_span.event_id
-                step_span.log(f"Step {self._step_count} started")
-                result = await self.step()
-                step_span.set_extra(
-                    {
-                        "done": result.done,
-                        "observation": result.observation.model_dump()
-                        if hasattr(result.observation, "model_dump")
-                        else str(result.observation),
-                        "info": result.info,
-                    }
-                )
+        # Create root session span that encompasses everything
+        # This ensures all child spans share the same trace_id
+        with tracer.start_as_current_span("session") as session_span:
+            session_span.set_attribute("plato.world.name", self.name)
+            session_span.set_attribute("plato.world.version", self.get_version())
+            session_span.set_attribute("plato.session.id", config.session_id)
 
-
+            try:
+                # Execute reset with OTel span
+                with tracer.start_as_current_span("reset") as reset_span:
+                    obs = await self.reset()
+                    obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
+                    reset_span.set_attribute("plato.observation", str(obs_data)[:1000])
+                    self.logger.info(f"World reset complete: {obs}")
 
-
-
+                while True:
+                    self._step_count += 1
 
-
-
-
+                    # Execute step with OTel span
+                    with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
+                        step_span.set_attribute("plato.step.number", self._step_count)
 
-
-
-            span_type="session_end",
-            content=f"World '{self.name}' completed after {self._step_count} steps",
-            source="world",
-            extra={"total_steps": self._step_count},
-        )
+                        # Store span context for nested agent spans
+                        self._current_step_id = format(step_span.get_span_context().span_id, "016x")
 
-
-        _reset_chronos_logging()
+                        result = await self.step()
 
-
+                        step_span.set_attribute("plato.step.done", result.done)
+                        obs_data = (
+                            result.observation.model_dump()
+                            if hasattr(result.observation, "model_dump")
+                            else str(result.observation)
+                        )
+                        step_span.set_attribute("plato.step.observation", str(obs_data)[:1000])
+
+                        self.logger.info(f"Step {self._step_count}: done={result.done}")
+
+                        # Create checkpoint if enabled and interval matches
+                        if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
+                            self.logger.info(f"Creating checkpoint after step {self._step_count}")
+                            with tracer.start_as_current_span("checkpoint") as checkpoint_span:
+                                checkpoint_span.set_attribute("plato.checkpoint.step", self._step_count)
+                                env_snapshots, state_bundle_uploaded = await self._create_and_upload_checkpoint()
+
+                                checkpoint_span.set_attribute("plato.checkpoint.success", len(env_snapshots) > 0)
+                                checkpoint_span.set_attribute(
+                                    "plato.checkpoint.state_bundle_uploaded", state_bundle_uploaded
+                                )
+
+                                if env_snapshots:
+                                    checkpoint_span.set_attribute(
+                                        "plato.checkpoint.environments", list(env_snapshots.keys())
+                                    )
+                                    checkpoint_span.set_attribute(
+                                        "plato.checkpoint.artifact_ids", list(env_snapshots.values())
+                                    )
+
+                    if result.done:
+                        break
+
+            finally:
+                await self.close()
+                await self._disconnect_plato_session()
+
+        # Shutdown OTel tracing and clear session info (outside the span)
+        shutdown_tracing()
+        self._session_id = None
+
+        self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")