plato-sdk-v2 2.3.5__py3-none-any.whl → 2.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plato/agents/otel.py +67 -87
- plato/agents/runner.py +125 -288
- plato/worlds/base.py +87 -80
- plato/worlds/runner.py +40 -17
- {plato_sdk_v2-2.3.5.dist-info → plato_sdk_v2-2.3.7.dist-info}/METADATA +1 -1
- {plato_sdk_v2-2.3.5.dist-info → plato_sdk_v2-2.3.7.dist-info}/RECORD +8 -8
- {plato_sdk_v2-2.3.5.dist-info → plato_sdk_v2-2.3.7.dist-info}/WHEEL +0 -0
- {plato_sdk_v2-2.3.5.dist-info → plato_sdk_v2-2.3.7.dist-info}/entry_points.txt +0 -0
plato/agents/otel.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""OpenTelemetry integration for Plato agents and worlds.
|
|
2
2
|
|
|
3
|
-
Provides tracing
|
|
4
|
-
|
|
3
|
+
Provides tracing utilities using OpenTelemetry SDK. Traces are sent directly
|
|
4
|
+
to the Chronos OTLP endpoint.
|
|
5
5
|
|
|
6
6
|
Usage:
|
|
7
7
|
from plato.agents.otel import init_tracing, get_tracer, shutdown_tracing
|
|
@@ -19,11 +19,6 @@ Usage:
|
|
|
19
19
|
span.set_attribute("key", "value")
|
|
20
20
|
# ... do work ...
|
|
21
21
|
|
|
22
|
-
# All Python logging is automatically sent to Chronos
|
|
23
|
-
import logging
|
|
24
|
-
logger = logging.getLogger(__name__)
|
|
25
|
-
logger.info("This will appear in the trajectory viewer!")
|
|
26
|
-
|
|
27
22
|
# Cleanup
|
|
28
23
|
shutdown_tracing()
|
|
29
24
|
"""
|
|
@@ -39,74 +34,42 @@ _module_logger = logging.getLogger(__name__)
|
|
|
39
34
|
|
|
40
35
|
# Global state
|
|
41
36
|
_tracer_provider = None
|
|
42
|
-
_logging_handler = None
|
|
43
37
|
_initialized = False
|
|
38
|
+
_log_handler = None
|
|
44
39
|
|
|
45
40
|
|
|
46
|
-
class
|
|
47
|
-
"""Logging handler that
|
|
41
|
+
class OTelSpanLogHandler(logging.Handler):
|
|
42
|
+
"""Logging handler that creates OTel spans for log messages.
|
|
48
43
|
|
|
49
|
-
|
|
50
|
-
- span.type: "log"
|
|
51
|
-
- log.level: DEBUG/INFO/WARNING/ERROR/CRITICAL
|
|
52
|
-
- content: the log message
|
|
53
|
-
- source: the logger name
|
|
44
|
+
Converts Python log records to OTel spans with log attributes.
|
|
54
45
|
"""
|
|
55
46
|
|
|
56
|
-
def __init__(self,
|
|
57
|
-
super().__init__()
|
|
58
|
-
self.
|
|
59
|
-
# Filter out noisy loggers
|
|
60
|
-
self._ignored_loggers = {
|
|
61
|
-
"httpx",
|
|
62
|
-
"httpcore",
|
|
63
|
-
"urllib3",
|
|
64
|
-
"asyncio",
|
|
65
|
-
"opentelemetry",
|
|
66
|
-
"plato.agents.otel", # Avoid recursion
|
|
67
|
-
}
|
|
47
|
+
def __init__(self, tracer: Tracer, level: int = logging.INFO):
|
|
48
|
+
super().__init__(level)
|
|
49
|
+
self.tracer = tracer
|
|
68
50
|
|
|
69
51
|
def emit(self, record: logging.LogRecord) -> None:
|
|
70
52
|
"""Emit a log record as an OTel span."""
|
|
71
|
-
# Skip ignored loggers
|
|
72
|
-
logger_name = record.name
|
|
73
|
-
for ignored in self._ignored_loggers:
|
|
74
|
-
if logger_name.startswith(ignored):
|
|
75
|
-
return
|
|
76
|
-
|
|
77
53
|
try:
|
|
78
|
-
tracer = trace.get_tracer(self._tracer_name)
|
|
79
|
-
|
|
80
|
-
# Format the message
|
|
81
|
-
try:
|
|
82
|
-
msg = self.format(record)
|
|
83
|
-
except Exception:
|
|
84
|
-
msg = record.getMessage()
|
|
85
|
-
|
|
86
54
|
# Create a span for the log message
|
|
87
|
-
with tracer.start_as_current_span(
|
|
88
|
-
f"log.{record.levelname.lower()}",
|
|
89
|
-
end_on_exit=True,
|
|
90
|
-
) as span:
|
|
91
|
-
span.set_attribute("span.type", "log")
|
|
55
|
+
with self.tracer.start_as_current_span(f"log.{record.levelname.lower()}") as span:
|
|
92
56
|
span.set_attribute("log.level", record.levelname)
|
|
93
|
-
span.set_attribute("
|
|
94
|
-
span.set_attribute("
|
|
57
|
+
span.set_attribute("log.message", record.getMessage())
|
|
58
|
+
span.set_attribute("log.logger", record.name)
|
|
59
|
+
span.set_attribute("source", "world")
|
|
60
|
+
span.set_attribute("content", record.getMessage()[:1000])
|
|
95
61
|
|
|
96
|
-
# Add extra context if available
|
|
97
62
|
if record.funcName:
|
|
98
63
|
span.set_attribute("log.function", record.funcName)
|
|
99
|
-
if record.pathname:
|
|
100
|
-
span.set_attribute("log.file", record.pathname)
|
|
101
64
|
if record.lineno:
|
|
102
|
-
span.set_attribute("log.
|
|
65
|
+
span.set_attribute("log.lineno", record.lineno)
|
|
103
66
|
|
|
104
|
-
#
|
|
105
|
-
if record.
|
|
106
|
-
span.
|
|
67
|
+
# Mark errors
|
|
68
|
+
if record.levelno >= logging.ERROR:
|
|
69
|
+
span.set_attribute("error", True)
|
|
107
70
|
|
|
108
71
|
except Exception:
|
|
109
|
-
# Don't let logging
|
|
72
|
+
# Don't let logging errors crash the application
|
|
110
73
|
pass
|
|
111
74
|
|
|
112
75
|
|
|
@@ -114,66 +77,77 @@ def init_tracing(
|
|
|
114
77
|
service_name: str,
|
|
115
78
|
session_id: str,
|
|
116
79
|
otlp_endpoint: str,
|
|
117
|
-
|
|
118
|
-
|
|
80
|
+
parent_trace_id: str | None = None,
|
|
81
|
+
parent_span_id: str | None = None,
|
|
119
82
|
) -> None:
|
|
120
|
-
"""Initialize OpenTelemetry tracing
|
|
83
|
+
"""Initialize OpenTelemetry tracing.
|
|
121
84
|
|
|
122
85
|
Args:
|
|
123
86
|
service_name: Name of the service (e.g., world name or agent name)
|
|
124
87
|
session_id: Chronos session ID (added as resource attribute)
|
|
125
88
|
otlp_endpoint: Chronos OTLP endpoint (e.g., http://chronos/api/otel)
|
|
126
|
-
|
|
127
|
-
|
|
89
|
+
parent_trace_id: Optional parent trace ID for linking (hex string)
|
|
90
|
+
parent_span_id: Optional parent span ID for linking (hex string)
|
|
128
91
|
"""
|
|
129
|
-
global _tracer_provider,
|
|
92
|
+
global _tracer_provider, _initialized, _log_handler
|
|
130
93
|
|
|
131
94
|
if _initialized:
|
|
132
95
|
_module_logger.debug("Tracing already initialized")
|
|
133
96
|
return
|
|
134
97
|
|
|
135
98
|
try:
|
|
99
|
+
from opentelemetry import context as context_api
|
|
136
100
|
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
|
|
137
101
|
OTLPSpanExporter,
|
|
138
102
|
)
|
|
139
103
|
from opentelemetry.sdk.resources import Resource
|
|
140
104
|
from opentelemetry.sdk.trace import TracerProvider
|
|
141
|
-
from opentelemetry.sdk.trace.export import
|
|
105
|
+
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
|
|
106
|
+
from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags
|
|
142
107
|
|
|
143
108
|
# Create resource with session ID
|
|
144
109
|
resource = Resource.create(
|
|
145
110
|
{
|
|
146
111
|
"service.name": service_name,
|
|
147
|
-
"session.id": session_id,
|
|
112
|
+
"plato.session.id": session_id,
|
|
148
113
|
}
|
|
149
114
|
)
|
|
150
115
|
|
|
151
116
|
# Create tracer provider
|
|
152
117
|
_tracer_provider = TracerProvider(resource=resource)
|
|
153
118
|
|
|
154
|
-
# Add OTLP exporter pointing to Chronos
|
|
119
|
+
# Add OTLP exporter pointing to Chronos (use SimpleSpanProcessor for immediate export)
|
|
155
120
|
otlp_exporter = OTLPSpanExporter(endpoint=f"{otlp_endpoint.rstrip('/')}/v1/traces")
|
|
156
|
-
_tracer_provider.add_span_processor(
|
|
121
|
+
_tracer_provider.add_span_processor(SimpleSpanProcessor(otlp_exporter))
|
|
157
122
|
|
|
158
123
|
# Set as global tracer provider
|
|
159
124
|
trace.set_tracer_provider(_tracer_provider)
|
|
160
125
|
|
|
161
|
-
|
|
126
|
+
# If parent context is provided, set it as the current context
|
|
127
|
+
# This allows new spans to automatically link to the parent
|
|
128
|
+
if parent_trace_id and parent_span_id:
|
|
129
|
+
parent_context = SpanContext(
|
|
130
|
+
trace_id=int(parent_trace_id, 16),
|
|
131
|
+
span_id=int(parent_span_id, 16),
|
|
132
|
+
is_remote=True,
|
|
133
|
+
trace_flags=TraceFlags(0x01), # Sampled
|
|
134
|
+
)
|
|
135
|
+
parent_span = NonRecordingSpan(parent_context)
|
|
136
|
+
ctx = trace.set_span_in_context(parent_span)
|
|
137
|
+
context_api.attach(ctx)
|
|
138
|
+
print(f"[OTel] Using parent context: trace_id={parent_trace_id}, span_id={parent_span_id}")
|
|
139
|
+
|
|
140
|
+
# Add OTel logging handler to capture world logs
|
|
141
|
+
tracer = trace.get_tracer(service_name)
|
|
142
|
+
_log_handler = OTelSpanLogHandler(tracer, level=logging.INFO)
|
|
143
|
+
|
|
144
|
+
# Add handler to plato.worlds loggers
|
|
145
|
+
plato_worlds_logger = logging.getLogger("plato.worlds")
|
|
146
|
+
plato_worlds_logger.addHandler(_log_handler)
|
|
162
147
|
|
|
163
|
-
|
|
164
|
-
if capture_logging:
|
|
165
|
-
_logging_handler = OTelLoggingHandler()
|
|
166
|
-
_logging_handler.setLevel(log_level)
|
|
167
|
-
# Add to root logger to capture all logs
|
|
168
|
-
logging.getLogger().addHandler(_logging_handler)
|
|
148
|
+
_initialized = True
|
|
169
149
|
|
|
170
|
-
# Use print to ensure this shows regardless of logging config
|
|
171
150
|
print(f"[OTel] Tracing initialized: service={service_name}, session={session_id}, endpoint={otlp_endpoint}")
|
|
172
|
-
_module_logger.info(
|
|
173
|
-
f"OTel tracing initialized: service={service_name}, "
|
|
174
|
-
f"session={session_id}, endpoint={otlp_endpoint}, "
|
|
175
|
-
f"capture_logging={capture_logging}"
|
|
176
|
-
)
|
|
177
151
|
|
|
178
152
|
except ImportError as e:
|
|
179
153
|
print(f"[OTel] OpenTelemetry SDK not installed: {e}")
|
|
@@ -184,16 +158,17 @@ def init_tracing(
|
|
|
184
158
|
|
|
185
159
|
|
|
186
160
|
def shutdown_tracing() -> None:
|
|
187
|
-
"""Shutdown the tracer provider
|
|
188
|
-
global _tracer_provider,
|
|
161
|
+
"""Shutdown the tracer provider and flush spans."""
|
|
162
|
+
global _tracer_provider, _initialized, _log_handler
|
|
189
163
|
|
|
190
|
-
# Remove
|
|
191
|
-
if
|
|
164
|
+
# Remove log handler
|
|
165
|
+
if _log_handler:
|
|
192
166
|
try:
|
|
193
|
-
logging.getLogger(
|
|
167
|
+
plato_worlds_logger = logging.getLogger("plato.worlds")
|
|
168
|
+
plato_worlds_logger.removeHandler(_log_handler)
|
|
194
169
|
except Exception:
|
|
195
170
|
pass
|
|
196
|
-
|
|
171
|
+
_log_handler = None
|
|
197
172
|
|
|
198
173
|
if _tracer_provider:
|
|
199
174
|
try:
|
|
@@ -229,6 +204,8 @@ def instrument(service_name: str = "plato-agent") -> Tracer:
|
|
|
229
204
|
Reads the following env vars:
|
|
230
205
|
- OTEL_EXPORTER_OTLP_ENDPOINT: Chronos OTLP endpoint (required for tracing)
|
|
231
206
|
- SESSION_ID: Chronos session ID (default: "local")
|
|
207
|
+
- OTEL_TRACE_ID: Parent trace ID for linking spans (optional)
|
|
208
|
+
- OTEL_PARENT_SPAN_ID: Parent span ID for linking spans (optional)
|
|
232
209
|
|
|
233
210
|
If OTEL_EXPORTER_OTLP_ENDPOINT is not set, returns a no-op tracer.
|
|
234
211
|
|
|
@@ -242,17 +219,20 @@ def instrument(service_name: str = "plato-agent") -> Tracer:
|
|
|
242
219
|
|
|
243
220
|
otel_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
|
|
244
221
|
session_id = os.environ.get("SESSION_ID", "local")
|
|
222
|
+
parent_trace_id = os.environ.get("OTEL_TRACE_ID")
|
|
223
|
+
parent_span_id = os.environ.get("OTEL_PARENT_SPAN_ID")
|
|
245
224
|
|
|
246
225
|
if not otel_endpoint:
|
|
247
226
|
# Return default tracer (no-op if no provider configured)
|
|
248
227
|
return trace.get_tracer(service_name)
|
|
249
228
|
|
|
250
|
-
# Initialize tracing
|
|
229
|
+
# Initialize tracing with parent context if provided
|
|
251
230
|
init_tracing(
|
|
252
231
|
service_name=service_name,
|
|
253
232
|
session_id=session_id,
|
|
254
233
|
otlp_endpoint=otel_endpoint,
|
|
255
|
-
|
|
234
|
+
parent_trace_id=parent_trace_id,
|
|
235
|
+
parent_span_id=parent_span_id,
|
|
256
236
|
)
|
|
257
237
|
|
|
258
238
|
return trace.get_tracer(service_name)
|
plato/agents/runner.py
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
|
-
"""Agent runner - run agents in Docker containers.
|
|
1
|
+
"""Agent runner - run agents in Docker containers.
|
|
2
|
+
|
|
3
|
+
Agents emit their own OTel spans for trajectory events. This runner:
|
|
4
|
+
1. Runs agents in Docker containers
|
|
5
|
+
2. Streams stdout/stderr for logging
|
|
6
|
+
3. Passes OTel environment variables for trace context propagation
|
|
7
|
+
4. Uploads artifacts to S3 when complete
|
|
8
|
+
"""
|
|
2
9
|
|
|
3
10
|
from __future__ import annotations
|
|
4
11
|
|
|
@@ -8,12 +15,10 @@ import logging
|
|
|
8
15
|
import os
|
|
9
16
|
import platform
|
|
10
17
|
import tempfile
|
|
11
|
-
from pathlib import Path
|
|
12
18
|
|
|
13
19
|
from opentelemetry import trace
|
|
14
20
|
|
|
15
21
|
from plato.agents.artifacts import upload_artifacts
|
|
16
|
-
from plato.agents.otel import get_tracer
|
|
17
22
|
|
|
18
23
|
logger = logging.getLogger(__name__)
|
|
19
24
|
|
|
@@ -37,310 +42,142 @@ async def run_agent(
|
|
|
37
42
|
workspace: Host directory to mount as /workspace
|
|
38
43
|
logs_dir: Host directory for logs (temp dir if None)
|
|
39
44
|
pull: Whether to pull the image first
|
|
45
|
+
|
|
46
|
+
Note: Agents handle their own OTel tracing. This runner only passes
|
|
47
|
+
the trace context (TRACEPARENT) so agent spans link to the parent step.
|
|
40
48
|
"""
|
|
41
49
|
logs_dir = logs_dir or tempfile.mkdtemp(prefix="agent_logs_")
|
|
42
|
-
agent_name = image.split("/")[-1].split(":")[0]
|
|
43
50
|
|
|
44
51
|
# Get session info from environment variables
|
|
45
52
|
session_id = os.environ.get("SESSION_ID")
|
|
46
53
|
otel_url = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
|
|
47
54
|
upload_url = os.environ.get("UPLOAD_URL")
|
|
48
55
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
56
|
+
# Pull image if requested
|
|
57
|
+
if pull:
|
|
58
|
+
pull_proc = await asyncio.create_subprocess_exec(
|
|
59
|
+
"docker",
|
|
60
|
+
"pull",
|
|
61
|
+
image,
|
|
62
|
+
stdout=asyncio.subprocess.PIPE,
|
|
63
|
+
stderr=asyncio.subprocess.STDOUT,
|
|
64
|
+
)
|
|
65
|
+
await pull_proc.wait()
|
|
66
|
+
|
|
67
|
+
# Setup
|
|
68
|
+
os.makedirs(os.path.join(logs_dir, "agent"), exist_ok=True)
|
|
69
|
+
config_file = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
|
|
70
|
+
json.dump(config, config_file)
|
|
71
|
+
config_file.close()
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
# Build docker command
|
|
75
|
+
docker_cmd = ["docker", "run", "--rm", "--privileged"]
|
|
76
|
+
|
|
77
|
+
# Determine if we need host networking
|
|
78
|
+
use_host_network = False
|
|
79
|
+
is_macos = platform.system() == "Darwin"
|
|
80
|
+
|
|
81
|
+
if not is_macos:
|
|
82
|
+
try:
|
|
83
|
+
proc = await asyncio.create_subprocess_exec(
|
|
84
|
+
"iptables",
|
|
85
|
+
"-L",
|
|
86
|
+
"-n",
|
|
87
|
+
stdout=asyncio.subprocess.DEVNULL,
|
|
88
|
+
stderr=asyncio.subprocess.DEVNULL,
|
|
68
89
|
)
|
|
69
|
-
await
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
90
|
+
await proc.wait()
|
|
91
|
+
has_iptables = proc.returncode == 0
|
|
92
|
+
except (FileNotFoundError, PermissionError):
|
|
93
|
+
has_iptables = False
|
|
94
|
+
|
|
95
|
+
use_host_network = not has_iptables
|
|
96
|
+
|
|
97
|
+
if use_host_network:
|
|
98
|
+
docker_cmd.extend(["--network=host", "--add-host=localhost:127.0.0.1"])
|
|
99
|
+
|
|
100
|
+
docker_cmd.extend(
|
|
101
|
+
[
|
|
102
|
+
"-v",
|
|
103
|
+
f"{workspace}:/workspace",
|
|
104
|
+
"-v",
|
|
105
|
+
f"{logs_dir}:/logs",
|
|
106
|
+
"-v",
|
|
107
|
+
f"{config_file.name}:/config.json:ro",
|
|
108
|
+
"-v",
|
|
109
|
+
"/var/run/docker.sock:/var/run/docker.sock",
|
|
110
|
+
"-w",
|
|
111
|
+
"/workspace",
|
|
112
|
+
]
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# Pass session info to agent
|
|
116
|
+
if otel_url:
|
|
117
|
+
traces_endpoint = f"{otel_url.rstrip('/')}/v1/traces"
|
|
118
|
+
docker_cmd.extend(["-e", f"OTEL_EXPORTER_OTLP_ENDPOINT={otel_url}"])
|
|
119
|
+
docker_cmd.extend(["-e", f"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT={traces_endpoint}"])
|
|
120
|
+
docker_cmd.extend(["-e", "OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf"])
|
|
121
|
+
if session_id:
|
|
122
|
+
docker_cmd.extend(["-e", f"SESSION_ID={session_id}"])
|
|
123
|
+
if upload_url:
|
|
124
|
+
docker_cmd.extend(["-e", f"UPLOAD_URL={upload_url}"])
|
|
125
|
+
|
|
126
|
+
# Pass trace context to agent for parent linking
|
|
127
|
+
# Agent spans will be children of the current step span
|
|
128
|
+
current_span = trace.get_current_span()
|
|
129
|
+
span_context = current_span.get_span_context()
|
|
130
|
+
if span_context.is_valid:
|
|
131
|
+
trace_id = format(span_context.trace_id, "032x")
|
|
132
|
+
span_id = format(span_context.span_id, "016x")
|
|
133
|
+
# W3C Trace Context format for TRACEPARENT
|
|
134
|
+
traceparent = f"00-{trace_id}-{span_id}-01"
|
|
104
135
|
docker_cmd.extend(
|
|
105
136
|
[
|
|
106
|
-
"-
|
|
107
|
-
f"{
|
|
108
|
-
"-
|
|
109
|
-
f"{
|
|
110
|
-
"-
|
|
111
|
-
f"{
|
|
112
|
-
"-v",
|
|
113
|
-
"/var/run/docker.sock:/var/run/docker.sock",
|
|
114
|
-
"-w",
|
|
115
|
-
"/workspace",
|
|
137
|
+
"-e",
|
|
138
|
+
f"TRACEPARENT={traceparent}",
|
|
139
|
+
"-e",
|
|
140
|
+
f"OTEL_TRACE_ID={trace_id}",
|
|
141
|
+
"-e",
|
|
142
|
+
f"OTEL_PARENT_SPAN_ID={span_id}",
|
|
116
143
|
]
|
|
117
144
|
)
|
|
118
145
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
docker_cmd.extend(["-e", f"OTEL_EXPORTER_OTLP_ENDPOINT={otel_url}"])
|
|
122
|
-
# Use JSON protocol (not protobuf) for OTLP exports
|
|
123
|
-
docker_cmd.extend(["-e", "OTEL_EXPORTER_OTLP_PROTOCOL=http/json"])
|
|
124
|
-
if session_id:
|
|
125
|
-
docker_cmd.extend(["-e", f"SESSION_ID={session_id}"])
|
|
126
|
-
if upload_url:
|
|
127
|
-
docker_cmd.extend(["-e", f"UPLOAD_URL={upload_url}"])
|
|
128
|
-
|
|
129
|
-
# Pass trace context to agent for parent linking
|
|
130
|
-
current_span = trace.get_current_span()
|
|
131
|
-
span_context = current_span.get_span_context()
|
|
132
|
-
if span_context.is_valid:
|
|
133
|
-
trace_id = format(span_context.trace_id, "032x")
|
|
134
|
-
span_id = format(span_context.span_id, "016x")
|
|
135
|
-
docker_cmd.extend(
|
|
136
|
-
[
|
|
137
|
-
"-e",
|
|
138
|
-
f"OTEL_TRACE_ID={trace_id}",
|
|
139
|
-
"-e",
|
|
140
|
-
f"OTEL_PARENT_SPAN_ID={span_id}",
|
|
141
|
-
]
|
|
142
|
-
)
|
|
143
|
-
|
|
144
|
-
for key, value in secrets.items():
|
|
145
|
-
docker_cmd.extend(["-e", f"{key.upper()}={value}"])
|
|
146
|
-
|
|
147
|
-
docker_cmd.append(image)
|
|
148
|
-
|
|
149
|
-
# Pass instruction via CLI arg
|
|
150
|
-
docker_cmd.extend(["--instruction", instruction])
|
|
151
|
-
|
|
152
|
-
# Run container and stream output
|
|
153
|
-
with tracer.start_as_current_span("agent_execution") as exec_span:
|
|
154
|
-
exec_span.set_attribute("span.type", "agent_execution")
|
|
155
|
-
exec_span.set_attribute("content", f"Running {agent_name}")
|
|
156
|
-
|
|
157
|
-
process = await asyncio.create_subprocess_exec(
|
|
158
|
-
*docker_cmd,
|
|
159
|
-
stdout=asyncio.subprocess.PIPE,
|
|
160
|
-
stderr=asyncio.subprocess.STDOUT,
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
# Stream output line by line
|
|
164
|
-
output_lines: list[str] = []
|
|
165
|
-
turn_count = 0
|
|
166
|
-
assert process.stdout is not None
|
|
167
|
-
while True:
|
|
168
|
-
line = await process.stdout.readline()
|
|
169
|
-
if not line:
|
|
170
|
-
break
|
|
171
|
-
decoded_line = line.decode().rstrip()
|
|
172
|
-
output_lines.append(decoded_line)
|
|
173
|
-
|
|
174
|
-
# Try to parse JSON output from agent for structured trajectory spans
|
|
175
|
-
try:
|
|
176
|
-
data = json.loads(decoded_line)
|
|
177
|
-
event_type = data.get("type", "")
|
|
178
|
-
|
|
179
|
-
if event_type == "assistant":
|
|
180
|
-
# Agent response - create a turn span
|
|
181
|
-
turn_count += 1
|
|
182
|
-
msg = data.get("message", {})
|
|
183
|
-
content_items = msg.get("content", [])
|
|
184
|
-
|
|
185
|
-
# Extract text and tool calls with full details
|
|
186
|
-
text_parts = []
|
|
187
|
-
tool_calls = []
|
|
188
|
-
for item in content_items:
|
|
189
|
-
if item.get("type") == "text":
|
|
190
|
-
text_parts.append(item.get("text", "")[:2000])
|
|
191
|
-
elif item.get("type") == "tool_use":
|
|
192
|
-
tool_input = item.get("input", {})
|
|
193
|
-
# Truncate large inputs
|
|
194
|
-
input_str = json.dumps(tool_input) if tool_input else ""
|
|
195
|
-
if len(input_str) > 2000:
|
|
196
|
-
input_str = input_str[:2000] + "..."
|
|
197
|
-
tool_calls.append(
|
|
198
|
-
{
|
|
199
|
-
"tool": item.get("name"),
|
|
200
|
-
"id": item.get("id"),
|
|
201
|
-
"input": input_str,
|
|
202
|
-
}
|
|
203
|
-
)
|
|
204
|
-
|
|
205
|
-
with tracer.start_as_current_span(f"turn_{turn_count}") as turn_span:
|
|
206
|
-
turn_span.set_attribute("span.type", "agent_turn")
|
|
207
|
-
turn_span.set_attribute("source", "agent")
|
|
208
|
-
turn_span.set_attribute("turn_number", turn_count)
|
|
209
|
-
turn_span.set_attribute("model", msg.get("model", "unknown"))
|
|
210
|
-
|
|
211
|
-
if text_parts:
|
|
212
|
-
turn_span.set_attribute("content", "\n".join(text_parts)[:4000])
|
|
213
|
-
if tool_calls:
|
|
214
|
-
turn_span.set_attribute("tool_calls", json.dumps(tool_calls))
|
|
215
|
-
# If no text content, show tool calls summary
|
|
216
|
-
if not text_parts:
|
|
217
|
-
turn_span.set_attribute(
|
|
218
|
-
"content", f"Tool calls: {', '.join(t['tool'] for t in tool_calls)}"
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
# Usage info
|
|
222
|
-
usage = msg.get("usage", {})
|
|
223
|
-
if usage:
|
|
224
|
-
turn_span.set_attribute("input_tokens", usage.get("input_tokens", 0))
|
|
225
|
-
turn_span.set_attribute("output_tokens", usage.get("output_tokens", 0))
|
|
226
|
-
|
|
227
|
-
elif event_type == "user":
|
|
228
|
-
# Tool result
|
|
229
|
-
tool_results = data.get("message", {}).get("content", [])
|
|
230
|
-
for result in tool_results:
|
|
231
|
-
if result.get("type") == "tool_result":
|
|
232
|
-
tool_id = result.get("tool_use_id", "")
|
|
233
|
-
content = result.get("content", "")
|
|
234
|
-
# Handle content that might be a list of content blocks
|
|
235
|
-
if isinstance(content, list):
|
|
236
|
-
text_parts = []
|
|
237
|
-
for item in content:
|
|
238
|
-
if isinstance(item, dict) and item.get("type") == "text":
|
|
239
|
-
text_parts.append(item.get("text", ""))
|
|
240
|
-
elif isinstance(item, str):
|
|
241
|
-
text_parts.append(item)
|
|
242
|
-
content = "\n".join(text_parts)
|
|
243
|
-
if isinstance(content, str):
|
|
244
|
-
content = content[:2000] # Truncate large results
|
|
245
|
-
with tracer.start_as_current_span("tool_result") as tr_span:
|
|
246
|
-
tr_span.set_attribute("span.type", "tool_result")
|
|
247
|
-
tr_span.set_attribute("source", "agent")
|
|
248
|
-
tr_span.set_attribute("tool_use_id", tool_id)
|
|
249
|
-
tr_span.set_attribute("content", f"Tool result for {tool_id}")
|
|
250
|
-
tr_span.set_attribute("result", content if content else "")
|
|
251
|
-
|
|
252
|
-
elif event_type == "result":
|
|
253
|
-
# Final result
|
|
254
|
-
result_text = data.get("result", "")[:1000]
|
|
255
|
-
is_error = data.get("is_error", False)
|
|
256
|
-
duration_ms = data.get("duration_ms", 0)
|
|
257
|
-
total_cost = data.get("total_cost_usd", 0)
|
|
258
|
-
|
|
259
|
-
with tracer.start_as_current_span("agent_result") as res_span:
|
|
260
|
-
res_span.set_attribute("span.type", "agent_result")
|
|
261
|
-
res_span.set_attribute("source", "agent")
|
|
262
|
-
res_span.set_attribute("content", result_text if result_text else "Agent completed")
|
|
263
|
-
res_span.set_attribute("is_error", is_error)
|
|
264
|
-
res_span.set_attribute("duration_ms", duration_ms)
|
|
265
|
-
res_span.set_attribute("total_cost_usd", total_cost)
|
|
266
|
-
res_span.set_attribute("num_turns", data.get("num_turns", turn_count))
|
|
267
|
-
|
|
268
|
-
elif event_type == "system" and data.get("subtype") == "init":
|
|
269
|
-
# Agent initialization
|
|
270
|
-
with tracer.start_as_current_span("agent_init") as init_span:
|
|
271
|
-
init_span.set_attribute("span.type", "agent_init")
|
|
272
|
-
init_span.set_attribute("source", "agent")
|
|
273
|
-
init_span.set_attribute("model", data.get("model", "unknown"))
|
|
274
|
-
init_span.set_attribute("tools", json.dumps(data.get("tools", [])))
|
|
275
|
-
init_span.set_attribute("content", f"Agent initialized: {data.get('model', 'unknown')}")
|
|
276
|
-
|
|
277
|
-
else:
|
|
278
|
-
# Other output - just log it without creating a span
|
|
279
|
-
logger.debug(f"[agent] {decoded_line}")
|
|
280
|
-
continue
|
|
281
|
-
|
|
282
|
-
except json.JSONDecodeError:
|
|
283
|
-
# Not JSON - just log it
|
|
284
|
-
logger.info(f"[agent] {decoded_line}")
|
|
285
|
-
|
|
286
|
-
await process.wait()
|
|
287
|
-
|
|
288
|
-
exit_code = process.returncode or 0
|
|
289
|
-
if exit_code != 0:
|
|
290
|
-
error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
|
|
291
|
-
|
|
292
|
-
exec_span.set_attribute("error", True)
|
|
293
|
-
exec_span.set_attribute("exit_code", exit_code)
|
|
294
|
-
exec_span.add_event(
|
|
295
|
-
"agent_error",
|
|
296
|
-
{
|
|
297
|
-
"exit_code": exit_code,
|
|
298
|
-
"output": error_context[:4000],
|
|
299
|
-
},
|
|
300
|
-
)
|
|
146
|
+
for key, value in secrets.items():
|
|
147
|
+
docker_cmd.extend(["-e", f"{key.upper()}={value}"])
|
|
301
148
|
|
|
302
|
-
|
|
303
|
-
agent_span.set_attribute("exit_code", exit_code)
|
|
149
|
+
docker_cmd.append(image)
|
|
304
150
|
|
|
305
|
-
|
|
151
|
+
# Pass instruction via CLI arg
|
|
152
|
+
docker_cmd.extend(["--instruction", instruction])
|
|
306
153
|
|
|
307
|
-
|
|
154
|
+
# Run container - agents emit their own OTel spans
|
|
155
|
+
process = await asyncio.create_subprocess_exec(
|
|
156
|
+
*docker_cmd,
|
|
157
|
+
stdout=asyncio.subprocess.PIPE,
|
|
158
|
+
stderr=asyncio.subprocess.STDOUT,
|
|
159
|
+
)
|
|
308
160
|
|
|
309
|
-
|
|
310
|
-
|
|
161
|
+
# Capture output for error reporting
|
|
162
|
+
output_lines: list[str] = []
|
|
163
|
+
assert process.stdout is not None
|
|
164
|
+
while True:
|
|
165
|
+
line = await process.stdout.readline()
|
|
166
|
+
if not line:
|
|
167
|
+
break
|
|
168
|
+
decoded_line = line.decode().rstrip()
|
|
169
|
+
output_lines.append(decoded_line)
|
|
311
170
|
|
|
312
|
-
|
|
313
|
-
trajectory_path = Path(logs_dir) / "agent" / "trajectory.json"
|
|
314
|
-
if trajectory_path.exists():
|
|
315
|
-
try:
|
|
316
|
-
with open(trajectory_path) as f:
|
|
317
|
-
trajectory = json.load(f)
|
|
318
|
-
if isinstance(trajectory, dict) and "schema_version" in trajectory:
|
|
319
|
-
# Add agent image to trajectory
|
|
320
|
-
agent_data = trajectory.get("agent", {})
|
|
321
|
-
extra = agent_data.get("extra") or {}
|
|
322
|
-
extra["image"] = image
|
|
323
|
-
agent_data["extra"] = extra
|
|
324
|
-
trajectory["agent"] = agent_data
|
|
171
|
+
await process.wait()
|
|
325
172
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
traj_span.set_attribute("source", "agent")
|
|
331
|
-
# Store trajectory in span (truncated for OTel limits)
|
|
332
|
-
traj_json = json.dumps(trajectory)
|
|
333
|
-
if len(traj_json) > 10000:
|
|
334
|
-
traj_span.set_attribute("trajectory_truncated", True)
|
|
335
|
-
traj_span.set_attribute("trajectory_size", len(traj_json))
|
|
336
|
-
else:
|
|
337
|
-
traj_span.set_attribute("trajectory", traj_json)
|
|
338
|
-
except Exception as e:
|
|
339
|
-
logger.warning(f"Failed to load trajectory: {e}")
|
|
173
|
+
exit_code = process.returncode or 0
|
|
174
|
+
if exit_code != 0:
|
|
175
|
+
error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
|
|
176
|
+
raise RuntimeError(f"Agent failed with exit code {exit_code}\n\nAgent output:\n{error_context}")
|
|
340
177
|
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
await upload_artifacts(upload_url, logs_dir)
|
|
178
|
+
finally:
|
|
179
|
+
os.unlink(config_file.name)
|
|
344
180
|
|
|
345
|
-
|
|
346
|
-
|
|
181
|
+
# Upload artifacts if we have upload URL configured
|
|
182
|
+
if upload_url:
|
|
183
|
+
await upload_artifacts(upload_url, logs_dir)
|
plato/worlds/base.py
CHANGED
|
@@ -417,7 +417,7 @@ class BaseWorld(ABC, Generic[ConfigT]):
|
|
|
417
417
|
content_type=content_type,
|
|
418
418
|
)
|
|
419
419
|
|
|
420
|
-
async def _create_and_upload_checkpoint(self) -> bool:
|
|
420
|
+
async def _create_and_upload_checkpoint(self) -> tuple[dict[str, str], bool]:
|
|
421
421
|
"""Create a full checkpoint including env snapshots and state bundle.
|
|
422
422
|
|
|
423
423
|
This method:
|
|
@@ -426,7 +426,7 @@ class BaseWorld(ABC, Generic[ConfigT]):
|
|
|
426
426
|
3. Creates and uploads state bundle to S3
|
|
427
427
|
|
|
428
428
|
Returns:
|
|
429
|
-
|
|
429
|
+
Tuple of (env_snapshots dict, state_bundle_uploaded bool)
|
|
430
430
|
"""
|
|
431
431
|
# Commit state changes first
|
|
432
432
|
self._commit_state(f"Checkpoint at step {self._step_count}")
|
|
@@ -436,6 +436,8 @@ class BaseWorld(ABC, Generic[ConfigT]):
|
|
|
436
436
|
if env_snapshots is None:
|
|
437
437
|
env_snapshots = {}
|
|
438
438
|
|
|
439
|
+
state_bundle_uploaded = True # Default to True if state not enabled
|
|
440
|
+
|
|
439
441
|
# Create and upload state bundle
|
|
440
442
|
if self.config.state.enabled:
|
|
441
443
|
bundle_data = self._create_state_bundle()
|
|
@@ -446,12 +448,12 @@ class BaseWorld(ABC, Generic[ConfigT]):
|
|
|
446
448
|
)
|
|
447
449
|
if success:
|
|
448
450
|
self.logger.info(f"Uploaded state bundle at step {self._step_count}")
|
|
449
|
-
|
|
451
|
+
state_bundle_uploaded = True
|
|
450
452
|
else:
|
|
451
453
|
self.logger.warning(f"Failed to upload state bundle at step {self._step_count}")
|
|
452
|
-
|
|
454
|
+
state_bundle_uploaded = False
|
|
453
455
|
|
|
454
|
-
return
|
|
456
|
+
return env_snapshots, state_bundle_uploaded
|
|
455
457
|
|
|
456
458
|
def get_env(self, alias: str) -> Environment | None:
|
|
457
459
|
"""Get an environment by alias.
|
|
@@ -646,25 +648,30 @@ The following services are available for your use:
|
|
|
646
648
|
if config.session_id:
|
|
647
649
|
self._session_id = config.session_id
|
|
648
650
|
|
|
649
|
-
# Set environment variables for agent runners
|
|
651
|
+
# Set environment variables for agent runners (which run in Docker)
|
|
650
652
|
os.environ["SESSION_ID"] = config.session_id
|
|
651
653
|
if config.otel_url:
|
|
652
|
-
|
|
653
|
-
#
|
|
654
|
-
|
|
654
|
+
# For agents in Docker, convert localhost to host.docker.internal
|
|
655
|
+
# so they can reach the host machine's Chronos instance
|
|
656
|
+
agent_otel_url = config.otel_url
|
|
657
|
+
if "localhost" in agent_otel_url or "127.0.0.1" in agent_otel_url:
|
|
658
|
+
agent_otel_url = agent_otel_url.replace("localhost", "host.docker.internal")
|
|
659
|
+
agent_otel_url = agent_otel_url.replace("127.0.0.1", "host.docker.internal")
|
|
660
|
+
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = agent_otel_url
|
|
661
|
+
os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"
|
|
655
662
|
if config.upload_url:
|
|
656
663
|
os.environ["UPLOAD_URL"] = config.upload_url
|
|
657
664
|
|
|
658
|
-
# Initialize OTel tracing
|
|
659
|
-
print(f"[World] OTel URL from config: {config.otel_url!r}")
|
|
665
|
+
# Initialize OTel tracing for the world itself (runs on host, not in Docker)
|
|
660
666
|
if config.otel_url:
|
|
667
|
+
logger.debug(f"Initializing OTel tracing with endpoint: {config.otel_url}")
|
|
661
668
|
init_tracing(
|
|
662
669
|
service_name=f"world-{self.name}",
|
|
663
670
|
session_id=config.session_id,
|
|
664
671
|
otlp_endpoint=config.otel_url,
|
|
665
672
|
)
|
|
666
673
|
else:
|
|
667
|
-
|
|
674
|
+
logger.debug("No otel_url in config - OTel tracing disabled")
|
|
668
675
|
|
|
669
676
|
# Connect to Plato session if configured (for heartbeats)
|
|
670
677
|
await self._connect_plato_session()
|
|
@@ -672,72 +679,72 @@ The following services are available for your use:
|
|
|
672
679
|
# Get tracer for spans
|
|
673
680
|
tracer = get_tracer("plato.world")
|
|
674
681
|
|
|
675
|
-
#
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
span.set_attribute("content", f"World '{self.name}' started")
|
|
682
|
-
|
|
683
|
-
try:
|
|
684
|
-
# Execute reset with OTel span
|
|
685
|
-
with tracer.start_as_current_span("reset") as reset_span:
|
|
686
|
-
reset_span.set_attribute("span.type", "reset")
|
|
687
|
-
reset_span.set_attribute("source", "world")
|
|
688
|
-
reset_span.set_attribute("content", f"Resetting world '{self.name}'")
|
|
689
|
-
obs = await self.reset()
|
|
690
|
-
obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
|
|
691
|
-
reset_span.set_attribute("observation", str(obs_data)[:1000]) # Truncate for OTel
|
|
692
|
-
self.logger.info(f"World reset complete: {obs}")
|
|
693
|
-
|
|
694
|
-
while True:
|
|
695
|
-
self._step_count += 1
|
|
696
|
-
|
|
697
|
-
# Execute step with OTel span
|
|
698
|
-
with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
|
|
699
|
-
step_span.set_attribute("span.type", "step")
|
|
700
|
-
step_span.set_attribute("source", "world")
|
|
701
|
-
step_span.set_attribute("step_number", self._step_count)
|
|
702
|
-
step_span.set_attribute("content", f"Step {self._step_count} started")
|
|
703
|
-
|
|
704
|
-
# Store span context for nested agent spans
|
|
705
|
-
|
|
706
|
-
self._current_step_id = format(step_span.get_span_context().span_id, "016x")
|
|
707
|
-
|
|
708
|
-
result = await self.step()
|
|
709
|
-
|
|
710
|
-
step_span.set_attribute("done", result.done)
|
|
711
|
-
obs_data = (
|
|
712
|
-
result.observation.model_dump()
|
|
713
|
-
if hasattr(result.observation, "model_dump")
|
|
714
|
-
else str(result.observation)
|
|
715
|
-
)
|
|
716
|
-
step_span.set_attribute("observation", str(obs_data)[:1000])
|
|
717
|
-
|
|
718
|
-
self.logger.info(f"Step {self._step_count}: done={result.done}")
|
|
682
|
+
# Create root session span that encompasses everything
|
|
683
|
+
# This ensures all child spans share the same trace_id
|
|
684
|
+
with tracer.start_as_current_span("session") as session_span:
|
|
685
|
+
session_span.set_attribute("plato.world.name", self.name)
|
|
686
|
+
session_span.set_attribute("plato.world.version", self.get_version())
|
|
687
|
+
session_span.set_attribute("plato.session.id", config.session_id)
|
|
719
688
|
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
await self.
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
689
|
+
try:
|
|
690
|
+
# Execute reset with OTel span
|
|
691
|
+
with tracer.start_as_current_span("reset") as reset_span:
|
|
692
|
+
obs = await self.reset()
|
|
693
|
+
obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
|
|
694
|
+
reset_span.set_attribute("plato.observation", str(obs_data)[:1000])
|
|
695
|
+
self.logger.info(f"World reset complete: {obs}")
|
|
696
|
+
|
|
697
|
+
while True:
|
|
698
|
+
self._step_count += 1
|
|
699
|
+
|
|
700
|
+
# Execute step with OTel span
|
|
701
|
+
with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
|
|
702
|
+
step_span.set_attribute("plato.step.number", self._step_count)
|
|
703
|
+
|
|
704
|
+
# Store span context for nested agent spans
|
|
705
|
+
self._current_step_id = format(step_span.get_span_context().span_id, "016x")
|
|
706
|
+
|
|
707
|
+
result = await self.step()
|
|
708
|
+
|
|
709
|
+
step_span.set_attribute("plato.step.done", result.done)
|
|
710
|
+
obs_data = (
|
|
711
|
+
result.observation.model_dump()
|
|
712
|
+
if hasattr(result.observation, "model_dump")
|
|
713
|
+
else str(result.observation)
|
|
714
|
+
)
|
|
715
|
+
step_span.set_attribute("plato.step.observation", str(obs_data)[:1000])
|
|
716
|
+
|
|
717
|
+
self.logger.info(f"Step {self._step_count}: done={result.done}")
|
|
718
|
+
|
|
719
|
+
# Create checkpoint if enabled and interval matches
|
|
720
|
+
if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
|
|
721
|
+
self.logger.info(f"Creating checkpoint after step {self._step_count}")
|
|
722
|
+
with tracer.start_as_current_span("checkpoint") as checkpoint_span:
|
|
723
|
+
checkpoint_span.set_attribute("plato.checkpoint.step", self._step_count)
|
|
724
|
+
env_snapshots, state_bundle_uploaded = await self._create_and_upload_checkpoint()
|
|
725
|
+
|
|
726
|
+
checkpoint_span.set_attribute("plato.checkpoint.success", len(env_snapshots) > 0)
|
|
727
|
+
checkpoint_span.set_attribute(
|
|
728
|
+
"plato.checkpoint.state_bundle_uploaded", state_bundle_uploaded
|
|
729
|
+
)
|
|
730
|
+
|
|
731
|
+
if env_snapshots:
|
|
732
|
+
checkpoint_span.set_attribute(
|
|
733
|
+
"plato.checkpoint.environments", list(env_snapshots.keys())
|
|
734
|
+
)
|
|
735
|
+
checkpoint_span.set_attribute(
|
|
736
|
+
"plato.checkpoint.artifact_ids", list(env_snapshots.values())
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
if result.done:
|
|
740
|
+
break
|
|
741
|
+
|
|
742
|
+
finally:
|
|
743
|
+
await self.close()
|
|
744
|
+
await self._disconnect_plato_session()
|
|
745
|
+
|
|
746
|
+
# Shutdown OTel tracing and clear session info (outside the span)
|
|
747
|
+
shutdown_tracing()
|
|
748
|
+
self._session_id = None
|
|
749
|
+
|
|
750
|
+
self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")
|
plato/worlds/runner.py
CHANGED
|
@@ -6,7 +6,6 @@ import asyncio
|
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
|
-
import platform
|
|
10
9
|
from pathlib import Path
|
|
11
10
|
from typing import Annotated
|
|
12
11
|
|
|
@@ -136,6 +135,28 @@ def list_worlds(
|
|
|
136
135
|
typer.echo(f" {name} (v{version}): {desc}")
|
|
137
136
|
|
|
138
137
|
|
|
138
|
+
def _get_docker_platform() -> str:
|
|
139
|
+
"""Get the appropriate Docker platform for the current system.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
Docker platform string (e.g., "linux/arm64" or "linux/amd64")
|
|
143
|
+
"""
|
|
144
|
+
import platform as plat
|
|
145
|
+
|
|
146
|
+
system = plat.system()
|
|
147
|
+
machine = plat.machine().lower()
|
|
148
|
+
|
|
149
|
+
# On macOS with Apple Silicon (arm64/aarch64), use linux/arm64
|
|
150
|
+
if system == "Darwin" and machine in ("arm64", "aarch64"):
|
|
151
|
+
return "linux/arm64"
|
|
152
|
+
# On Linux ARM
|
|
153
|
+
elif system == "Linux" and machine in ("arm64", "aarch64"):
|
|
154
|
+
return "linux/arm64"
|
|
155
|
+
# Default to amd64 for x86_64 or other architectures
|
|
156
|
+
else:
|
|
157
|
+
return "linux/amd64"
|
|
158
|
+
|
|
159
|
+
|
|
139
160
|
async def _build_agent_image(
|
|
140
161
|
agent_name: str,
|
|
141
162
|
agents_dir: Path,
|
|
@@ -181,9 +202,17 @@ async def _build_agent_image(
|
|
|
181
202
|
target = "prod"
|
|
182
203
|
logger.info(f"Building {image_tag} (prod mode from {build_context})...")
|
|
183
204
|
|
|
205
|
+
# Detect platform for ARM Mac support
|
|
206
|
+
docker_platform = _get_docker_platform()
|
|
207
|
+
logger.info(f"Building for platform: {docker_platform}")
|
|
208
|
+
|
|
184
209
|
cmd = [
|
|
185
210
|
"docker",
|
|
186
211
|
"build",
|
|
212
|
+
"--platform",
|
|
213
|
+
docker_platform,
|
|
214
|
+
"--build-arg",
|
|
215
|
+
f"PLATFORM={docker_platform}",
|
|
187
216
|
"--target",
|
|
188
217
|
target,
|
|
189
218
|
"-t",
|
|
@@ -192,10 +221,6 @@ async def _build_agent_image(
|
|
|
192
221
|
dockerfile_abs,
|
|
193
222
|
]
|
|
194
223
|
|
|
195
|
-
# Use native platform for local dev on ARM Macs (avoids slow emulation)
|
|
196
|
-
if platform.machine() == "arm64":
|
|
197
|
-
cmd.extend(["--build-arg", "PLATFORM=linux/arm64"])
|
|
198
|
-
|
|
199
224
|
cmd.append(build_context)
|
|
200
225
|
|
|
201
226
|
logger.debug(f"Build command: {' '.join(cmd)}")
|
|
@@ -405,6 +430,7 @@ async def _run_dev(
|
|
|
405
430
|
plato = AsyncPlato()
|
|
406
431
|
session = None
|
|
407
432
|
plato_session_id: str | None = None
|
|
433
|
+
chronos_session_id: str | None = None
|
|
408
434
|
|
|
409
435
|
try:
|
|
410
436
|
if env_configs:
|
|
@@ -435,18 +461,10 @@ async def _run_dev(
|
|
|
435
461
|
|
|
436
462
|
# Update run_config with session info from Chronos
|
|
437
463
|
run_config.session_id = chronos_session_id
|
|
438
|
-
|
|
464
|
+
# Use base chronos URL for OTEL endpoint (more reliable than session-provided URL)
|
|
465
|
+
run_config.otel_url = f"{chronos_url.rstrip('/')}/api/otel"
|
|
439
466
|
run_config.upload_url = chronos_session.upload_url
|
|
440
467
|
|
|
441
|
-
# For local dev, override otel_url to use localhost directly
|
|
442
|
-
# (Chronos may return a tunnel URL that's meant for remote VMs)
|
|
443
|
-
if "localhost" in chronos_url or "127.0.0.1" in chronos_url:
|
|
444
|
-
run_config.otel_url = f"{chronos_url.rstrip('/')}/api/otel"
|
|
445
|
-
logger.info(f"Local dev: using OTel URL {run_config.otel_url}")
|
|
446
|
-
|
|
447
|
-
print(f"[Runner] run_config.otel_url = {run_config.otel_url!r}")
|
|
448
|
-
print(f"[Runner] run_config.upload_url = {run_config.upload_url!r}")
|
|
449
|
-
|
|
450
468
|
# Run the world
|
|
451
469
|
logger.info(f"Starting world '{world_name}'...")
|
|
452
470
|
world_instance = world_cls()
|
|
@@ -509,11 +527,16 @@ def dev(
|
|
|
509
527
|
world: Annotated[str, typer.Option("--world", "-w", help="World name to run")],
|
|
510
528
|
config: Annotated[Path, typer.Option("--config", "-c", help="Path to config JSON file")],
|
|
511
529
|
env_timeout: Annotated[
|
|
512
|
-
int,
|
|
530
|
+
int,
|
|
531
|
+
typer.Option("--env-timeout", help="Timeout for environment creation (seconds)"),
|
|
513
532
|
] = 7200,
|
|
514
533
|
agents_dir: Annotated[
|
|
515
534
|
Path | None,
|
|
516
|
-
typer.Option(
|
|
535
|
+
typer.Option(
|
|
536
|
+
"--agents-dir",
|
|
537
|
+
"-a",
|
|
538
|
+
help="Directory containing agent source code (builds local images)",
|
|
539
|
+
),
|
|
517
540
|
] = None,
|
|
518
541
|
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
|
|
519
542
|
) -> None:
|
|
@@ -302,8 +302,8 @@ plato/agents/artifacts.py,sha256=ljeI0wzsp7Q6uKqMb-k7kTb680Vizs54ohtM-d7zvOg,292
|
|
|
302
302
|
plato/agents/base.py,sha256=vUbPQuNSo6Ka2lIB_ZOXgi4EoAjtAD7GIj9LnNotam0,4577
|
|
303
303
|
plato/agents/build.py,sha256=CNMbVQFs2_pYit1dA29Davve28Yi4c7TNK9wBB7odrE,1621
|
|
304
304
|
plato/agents/config.py,sha256=CmRS6vOAg7JeqX4Hgp_KpA1YWBX_LuMicHm7SBjQEbs,5077
|
|
305
|
-
plato/agents/otel.py,sha256=
|
|
306
|
-
plato/agents/runner.py,sha256=
|
|
305
|
+
plato/agents/otel.py,sha256=LI5ZK4lwoDD2AnXhSubbv6ONP2VayOsNIk-F1hQ6968,7991
|
|
306
|
+
plato/agents/runner.py,sha256=Ei20Ib-Fn5XOaS6V1Rtw0UEw34XflEWaXMpazPjmnrE,6061
|
|
307
307
|
plato/agents/trajectory.py,sha256=WdiBmua0KvCrNaM3qgPI7-7B4xmSkfbP4oZ_9_8qHzU,10529
|
|
308
308
|
plato/chronos/__init__.py,sha256=RHMvSrQS_-vkKOyTRuAkp2gKDP1HEuBLDnw8jcZs1Jg,739
|
|
309
309
|
plato/chronos/client.py,sha256=YcOGtHWERyOD9z8LKt8bRMVL0cEwL2hiAP4qQgdZlUI,5495
|
|
@@ -458,11 +458,11 @@ plato/v2/utils/models.py,sha256=PwehSSnIRG-tM3tWL1PzZEH77ZHhIAZ9R0UPs6YknbM,1441
|
|
|
458
458
|
plato/v2/utils/proxy_tunnel.py,sha256=8ZTd0jCGSfIHMvSv1fgEyacuISWnGPHLPbDglWroTzY,10463
|
|
459
459
|
plato/worlds/README.md,sha256=XFOkEA3cNNcrWkk-Cxnsl-zn-y0kvUENKQRSqFKpdqw,5479
|
|
460
460
|
plato/worlds/__init__.py,sha256=ALoou3l5lXvs_YZc5eH6HdMHpvhnpzKWqz__aSC1jFc,2152
|
|
461
|
-
plato/worlds/base.py,sha256=
|
|
461
|
+
plato/worlds/base.py,sha256=_svL9RBp3dTIhHqcvZB1F7qEFrZvAuQ-XjZkTa3L6zo,27750
|
|
462
462
|
plato/worlds/build_hook.py,sha256=KSoW0kqa5b7NyZ7MYOw2qsZ_2FkWuz0M3Ru7AKOP7Qw,3486
|
|
463
463
|
plato/worlds/config.py,sha256=a5frj3mt06rSlT25kE-L8Q2b2MTWkR-8cUoBKpC8tG4,11036
|
|
464
|
-
plato/worlds/runner.py,sha256=
|
|
465
|
-
plato_sdk_v2-2.3.
|
|
466
|
-
plato_sdk_v2-2.3.
|
|
467
|
-
plato_sdk_v2-2.3.
|
|
468
|
-
plato_sdk_v2-2.3.
|
|
464
|
+
plato/worlds/runner.py,sha256=2H5EV77bTYrMyI7qez0kwxOp9EApQxG19Ob9a_GTdbw,19383
|
|
465
|
+
plato_sdk_v2-2.3.7.dist-info/METADATA,sha256=7T1hf9Y8o0lFSrSx35VozfobEdwM097kfZQT6rEIn68,8653
|
|
466
|
+
plato_sdk_v2-2.3.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
467
|
+
plato_sdk_v2-2.3.7.dist-info/entry_points.txt,sha256=upGMbJCx6YWUTKrPoYvYUYfFCqYr75nHDwhA-45m6p8,136
|
|
468
|
+
plato_sdk_v2-2.3.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|