plato-sdk-v2 2.3.3__py3-none-any.whl → 2.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plato/agents/__init__.py +24 -16
- plato/agents/artifacts.py +108 -0
- plato/agents/config.py +2 -7
- plato/agents/otel.py +258 -0
- plato/agents/runner.py +223 -68
- plato/worlds/README.md +2 -1
- plato/worlds/base.py +111 -78
- plato/worlds/config.py +5 -3
- plato/worlds/runner.py +62 -18
- {plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.3.5.dist-info}/METADATA +4 -2
- {plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.3.5.dist-info}/RECORD +13 -18
- plato/agents/logging.py +0 -515
- plato/chronos/api/callback/__init__.py +0 -11
- plato/chronos/api/callback/push_agent_logs.py +0 -61
- plato/chronos/api/callback/update_agent_status.py +0 -57
- plato/chronos/api/callback/upload_artifacts.py +0 -59
- plato/chronos/api/callback/upload_logs_zip.py +0 -57
- plato/chronos/api/callback/upload_trajectory.py +0 -57
- {plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.3.5.dist-info}/WHEEL +0 -0
- {plato_sdk_v2-2.3.3.dist-info → plato_sdk_v2-2.3.5.dist-info}/entry_points.txt +0 -0
plato/agents/runner.py
CHANGED
@@ -10,7 +10,10 @@ import platform
 import tempfile
 from pathlib import Path

-from
+from opentelemetry import trace
+
+from plato.agents.artifacts import upload_artifacts
+from plato.agents.otel import get_tracer

 logger = logging.getLogger(__name__)

@@ -38,20 +41,32 @@ async def run_agent(
     logs_dir = logs_dir or tempfile.mkdtemp(prefix="agent_logs_")
     agent_name = image.split("/")[-1].split(":")[0]

-
-
+    # Get session info from environment variables
+    session_id = os.environ.get("SESSION_ID")
+    otel_url = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
+    upload_url = os.environ.get("UPLOAD_URL")
+
+    tracer = get_tracer("plato.agent")
+
+    with tracer.start_as_current_span(agent_name) as agent_span:
+        agent_span.set_attribute("span.type", "agent")
+        agent_span.set_attribute("source", "agent")
+        agent_span.set_attribute("image", image)
+        agent_span.set_attribute("content", f"Starting agent: {agent_name}")

         # Pull image if requested
         if pull:
-
-
-            "
-
-
-
-
-
-
+            with tracer.start_as_current_span("docker_pull") as pull_span:
+                pull_span.set_attribute("span.type", "docker_pull")
+                pull_span.set_attribute("image", image)
+                pull_proc = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "pull",
+                    image,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
+                )
+                await pull_proc.wait()

         # Setup
         os.makedirs(os.path.join(logs_dir, "agent"), exist_ok=True)

@@ -63,9 +78,7 @@ async def run_agent(
         # Build docker command
         docker_cmd = ["docker", "run", "--rm"]

-        # Determine if we need host networking
-        # - Required on Linux without iptables for connectivity
-        # - Skip on macOS where --network=host doesn't work properly
+        # Determine if we need host networking
         use_host_network = False
         is_macos = platform.system() == "Darwin"

@@ -103,68 +116,200 @@ async def run_agent(
             ]
         )

+        # Pass session info to agent
+        if otel_url:
+            docker_cmd.extend(["-e", f"OTEL_EXPORTER_OTLP_ENDPOINT={otel_url}"])
+            # Use JSON protocol (not protobuf) for OTLP exports
+            docker_cmd.extend(["-e", "OTEL_EXPORTER_OTLP_PROTOCOL=http/json"])
+        if session_id:
+            docker_cmd.extend(["-e", f"SESSION_ID={session_id}"])
+        if upload_url:
+            docker_cmd.extend(["-e", f"UPLOAD_URL={upload_url}"])
+
+        # Pass trace context to agent for parent linking
+        current_span = trace.get_current_span()
+        span_context = current_span.get_span_context()
+        if span_context.is_valid:
+            trace_id = format(span_context.trace_id, "032x")
+            span_id = format(span_context.span_id, "016x")
+            docker_cmd.extend(
+                [
+                    "-e",
+                    f"OTEL_TRACE_ID={trace_id}",
+                    "-e",
+                    f"OTEL_PARENT_SPAN_ID={span_id}",
+                ]
+            )
+
         for key, value in secrets.items():
             docker_cmd.extend(["-e", f"{key.upper()}={value}"])

         docker_cmd.append(image)

-        # Pass instruction via CLI arg
+        # Pass instruction via CLI arg
         docker_cmd.extend(["--instruction", instruction])

         # Run container and stream output
-
-
-
-            stderr=asyncio.subprocess.STDOUT,
-        )
+        with tracer.start_as_current_span("agent_execution") as exec_span:
+            exec_span.set_attribute("span.type", "agent_execution")
+            exec_span.set_attribute("content", f"Running {agent_name}")

-
-
-
-
-            line = await process.stdout.readline()
-            if not line:
-                break
-            decoded_line = line.decode().rstrip()
-            output_lines.append(decoded_line)
-            logger.info(f"[agent] {decoded_line}")
-
-        await process.wait()
-
-        if process.returncode != 0:
-            # Get last N lines of output for error context
-            error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
-
-            # Log error event with container output
-            await log_event(
-                span_type="error",
-                content=f"Agent failed with exit code {process.returncode}",
-                source="agent",
-                extra={
-                    "exit_code": process.returncode,
-                    "image": image,
-                    "agent_name": agent_name,
-                    "output": error_context,
-                    "output_line_count": len(output_lines),
-                },
+            process = await asyncio.create_subprocess_exec(
+                *docker_cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.STDOUT,
             )

-
-
-
-
-
-
-
+            # Stream output line by line
+            output_lines: list[str] = []
+            turn_count = 0
+            assert process.stdout is not None
+            while True:
+                line = await process.stdout.readline()
+                if not line:
+                    break
+                decoded_line = line.decode().rstrip()
+                output_lines.append(decoded_line)
+
+                # Try to parse JSON output from agent for structured trajectory spans
+                try:
+                    data = json.loads(decoded_line)
+                    event_type = data.get("type", "")
+
+                    if event_type == "assistant":
+                        # Agent response - create a turn span
+                        turn_count += 1
+                        msg = data.get("message", {})
+                        content_items = msg.get("content", [])
+
+                        # Extract text and tool calls with full details
+                        text_parts = []
+                        tool_calls = []
+                        for item in content_items:
+                            if item.get("type") == "text":
+                                text_parts.append(item.get("text", "")[:2000])
+                            elif item.get("type") == "tool_use":
+                                tool_input = item.get("input", {})
+                                # Truncate large inputs
+                                input_str = json.dumps(tool_input) if tool_input else ""
+                                if len(input_str) > 2000:
+                                    input_str = input_str[:2000] + "..."
+                                tool_calls.append(
+                                    {
+                                        "tool": item.get("name"),
+                                        "id": item.get("id"),
+                                        "input": input_str,
+                                    }
+                                )
+
+                        with tracer.start_as_current_span(f"turn_{turn_count}") as turn_span:
+                            turn_span.set_attribute("span.type", "agent_turn")
+                            turn_span.set_attribute("source", "agent")
+                            turn_span.set_attribute("turn_number", turn_count)
+                            turn_span.set_attribute("model", msg.get("model", "unknown"))

-
+                            if text_parts:
+                                turn_span.set_attribute("content", "\n".join(text_parts)[:4000])
+                            if tool_calls:
+                                turn_span.set_attribute("tool_calls", json.dumps(tool_calls))
+                                # If no text content, show tool calls summary
+                                if not text_parts:
+                                    turn_span.set_attribute(
+                                        "content", f"Tool calls: {', '.join(t['tool'] for t in tool_calls)}"
+                                    )

-
+                            # Usage info
+                            usage = msg.get("usage", {})
+                            if usage:
+                                turn_span.set_attribute("input_tokens", usage.get("input_tokens", 0))
+                                turn_span.set_attribute("output_tokens", usage.get("output_tokens", 0))
+
+                    elif event_type == "user":
+                        # Tool result
+                        tool_results = data.get("message", {}).get("content", [])
+                        for result in tool_results:
+                            if result.get("type") == "tool_result":
+                                tool_id = result.get("tool_use_id", "")
+                                content = result.get("content", "")
+                                # Handle content that might be a list of content blocks
+                                if isinstance(content, list):
+                                    text_parts = []
+                                    for item in content:
+                                        if isinstance(item, dict) and item.get("type") == "text":
+                                            text_parts.append(item.get("text", ""))
+                                        elif isinstance(item, str):
+                                            text_parts.append(item)
+                                    content = "\n".join(text_parts)
+                                if isinstance(content, str):
+                                    content = content[:2000]  # Truncate large results
+                                with tracer.start_as_current_span("tool_result") as tr_span:
+                                    tr_span.set_attribute("span.type", "tool_result")
+                                    tr_span.set_attribute("source", "agent")
+                                    tr_span.set_attribute("tool_use_id", tool_id)
+                                    tr_span.set_attribute("content", f"Tool result for {tool_id}")
+                                    tr_span.set_attribute("result", content if content else "")
+
+                    elif event_type == "result":
+                        # Final result
+                        result_text = data.get("result", "")[:1000]
+                        is_error = data.get("is_error", False)
+                        duration_ms = data.get("duration_ms", 0)
+                        total_cost = data.get("total_cost_usd", 0)
+
+                        with tracer.start_as_current_span("agent_result") as res_span:
+                            res_span.set_attribute("span.type", "agent_result")
+                            res_span.set_attribute("source", "agent")
+                            res_span.set_attribute("content", result_text if result_text else "Agent completed")
+                            res_span.set_attribute("is_error", is_error)
+                            res_span.set_attribute("duration_ms", duration_ms)
+                            res_span.set_attribute("total_cost_usd", total_cost)
+                            res_span.set_attribute("num_turns", data.get("num_turns", turn_count))
+
+                    elif event_type == "system" and data.get("subtype") == "init":
+                        # Agent initialization
+                        with tracer.start_as_current_span("agent_init") as init_span:
+                            init_span.set_attribute("span.type", "agent_init")
+                            init_span.set_attribute("source", "agent")
+                            init_span.set_attribute("model", data.get("model", "unknown"))
+                            init_span.set_attribute("tools", json.dumps(data.get("tools", [])))
+                            init_span.set_attribute("content", f"Agent initialized: {data.get('model', 'unknown')}")
+
+                    else:
+                        # Other output - just log it without creating a span
+                        logger.debug(f"[agent] {decoded_line}")
+                        continue
+
+                except json.JSONDecodeError:
+                    # Not JSON - just log it
+                    logger.info(f"[agent] {decoded_line}")
+
+            await process.wait()
+
+            exit_code = process.returncode or 0
+            if exit_code != 0:
+                error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
+
+                exec_span.set_attribute("error", True)
+                exec_span.set_attribute("exit_code", exit_code)
+                exec_span.add_event(
+                    "agent_error",
+                    {
+                        "exit_code": exit_code,
+                        "output": error_context[:4000],
+                    },
+                )
+
+                agent_span.set_attribute("error", True)
+                agent_span.set_attribute("exit_code", exit_code)
+
+                raise RuntimeError(f"Agent failed with exit code {exit_code}")
+
+            exec_span.set_attribute("success", True)

         finally:
             os.unlink(config_file.name)

-        # Load trajectory and
+        # Load trajectory and log as event
         trajectory_path = Path(logs_dir) / "agent" / "trajectory.json"
         if trajectory_path.exists():
             try:

@@ -178,14 +323,24 @@ async def run_agent(
                 agent_data["extra"] = extra
                 trajectory["agent"] = agent_data

-                # Log trajectory as
-
-
-                log_type
-
-
-
+                # Log trajectory as span event
+                with tracer.start_as_current_span("trajectory") as traj_span:
+                    traj_span.set_attribute("span.type", "trajectory")
+                    traj_span.set_attribute("log_type", "atif")
+                    traj_span.set_attribute("source", "agent")
+                    # Store trajectory in span (truncated for OTel limits)
+                    traj_json = json.dumps(trajectory)
+                    if len(traj_json) > 10000:
+                        traj_span.set_attribute("trajectory_truncated", True)
+                        traj_span.set_attribute("trajectory_size", len(traj_json))
+                    else:
+                        traj_span.set_attribute("trajectory", traj_json)
             except Exception as e:
                 logger.warning(f"Failed to load trajectory: {e}")

-
+        # Upload artifacts if we have upload URL configured
+        if upload_url:
+            await upload_artifacts(upload_url, logs_dir)
+
+        agent_span.set_attribute("success", True)
+        agent_span.set_attribute("content", f"Agent {agent_name} completed successfully")
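The runner hands the parent trace context to the agent container through the `OTEL_TRACE_ID` and `OTEL_PARENT_SPAN_ID` environment variables. The in-container half of that handshake is not part of this diff; the sketch below shows one way an agent process could rebuild that parent context with the standard OpenTelemetry Python API, so its own spans land under the runner's agent span. The helper name `parent_context_from_env`, the tracer name, and the span name are illustrative, not taken from the package.

```python
# Hypothetical sketch (not part of the SDK): rebuild the parent span context
# inside the agent container from the env vars that run_agent injects.
import os

from opentelemetry import trace
from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags


def parent_context_from_env():
    """Return an OTel Context parented to the runner's span, or None if unset."""
    trace_id = os.environ.get("OTEL_TRACE_ID")
    span_id = os.environ.get("OTEL_PARENT_SPAN_ID")
    if not trace_id or not span_id:
        return None
    parent = SpanContext(
        trace_id=int(trace_id, 16),  # 32 hex chars, as formatted by the runner
        span_id=int(span_id, 16),    # 16 hex chars
        is_remote=True,
        trace_flags=TraceFlags(TraceFlags.SAMPLED),
    )
    return trace.set_span_in_context(NonRecordingSpan(parent))


# Usage: agent-side spans become children of the runner's agent span.
tracer = trace.get_tracer("example-agent")  # tracer name is illustrative
with tracer.start_as_current_span("agent_work", context=parent_context_from_env()):
    ...  # agent logic
```

An equivalent approach is to synthesize a W3C `traceparent` string (`00-<trace_id>-<span_id>-01`) from the same two variables and feed it through the standard trace-context propagator.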
plato/worlds/README.md
CHANGED
plato/worlds/base.py
CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import logging
+import os
 import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path

@@ -16,12 +17,14 @@ if TYPE_CHECKING:
     from plato.v2.async_.environment import Environment
     from plato.v2.async_.session import Session

-from plato.agents.
-
-
-from plato.agents.
-
-
+from plato.agents.artifacts import (
+    upload_artifact as _upload_artifact_raw,
+)
+from plato.agents.otel import (
+    get_tracer,
+    init_tracing,
+    shutdown_tracing,
+)

 logger = logging.getLogger(__name__)

@@ -111,6 +114,7 @@ class BaseWorld(ABC, Generic[ConfigT]):
         self._step_count: int = 0
         self.plato_session = None
         self._current_step_id: str | None = None
+        self._session_id: str | None = None

     @classmethod
     def get_config_class(cls) -> type[RunConfig]:

@@ -390,17 +394,39 @@ class BaseWorld(ABC, Generic[ConfigT]):
             self.logger.warning(f"Failed to create state bundle: {e.stderr}")
             return None

-    async def
+    async def _upload_artifact(
+        self,
+        data: bytes,
+        content_type: str = "application/octet-stream",
+    ) -> bool:
+        """Upload an artifact directly to S3.
+
+        Args:
+            data: Raw bytes of the artifact
+            content_type: MIME type of the content
+
+        Returns:
+            True if successful, False otherwise
+        """
+        if not self.config.upload_url:
+            self.logger.warning("Cannot upload artifact: upload_url not set")
+            return False
+        return await _upload_artifact_raw(
+            upload_url=self.config.upload_url,
+            data=data,
+            content_type=content_type,
+        )
+
+    async def _create_and_upload_checkpoint(self) -> bool:
         """Create a full checkpoint including env snapshots and state bundle.

         This method:
         1. Commits any pending state changes
         2. Creates env snapshots using snapshot_store
-        3. Creates and uploads state bundle
-        4. Calls the checkpoint endpoint with all data
+        3. Creates and uploads state bundle to S3

         Returns:
-
+            True if successful, False otherwise
         """
         # Commit state changes first
         self._commit_state(f"Checkpoint at step {self._step_count}")

@@ -411,35 +437,21 @@ class BaseWorld(ABC, Generic[ConfigT]):
             env_snapshots = {}

         # Create and upload state bundle
-        state_artifact_id: str | None = None
         if self.config.state.enabled:
             bundle_data = self._create_state_bundle()
             if bundle_data:
-
+                success = await self._upload_artifact(
                     data=bundle_data,
-
-                    filename=f"state_step_{self._step_count}.bundle",
-                    extra={
-                        "step_number": self._step_count,
-                        "state_path": self.config.state.path,
-                    },
+                    content_type="application/octet-stream",
                 )
-                if
-
-
-
-
-
-                    step_number=self._step_count,
-                    env_snapshots=env_snapshots,
-                    state_artifact_id=state_artifact_id,
-                    extra={
-                        "world_name": self.name,
-                        "world_version": self.get_version(),
-                    },
-                )
+                if success:
+                    self.logger.info(f"Uploaded state bundle at step {self._step_count}")
+                    return True
+                else:
+                    self.logger.warning(f"Failed to upload state bundle at step {self._step_count}")
+                    return False

-        return
+        return True

     def get_env(self, alias: str) -> Environment | None:
         """Get an environment by alias.

@@ -630,61 +642,82 @@ The following services are available for your use:
         # Initialize state directory (creates git repo if needed)
         self._init_state_directory()

-        # Initialize
-        if config.
-
-
-
-
+        # Initialize OTel tracing and session info for artifact uploads
+        if config.session_id:
+            self._session_id = config.session_id
+
+            # Set environment variables for agent runners
+            os.environ["SESSION_ID"] = config.session_id
+            if config.otel_url:
+                os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = config.otel_url
+                # Use JSON protocol (not protobuf) for the OTLP exporter
+                os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/json"
+            if config.upload_url:
+                os.environ["UPLOAD_URL"] = config.upload_url
+
+        # Initialize OTel tracing if otel_url is provided
+        print(f"[World] OTel URL from config: {config.otel_url!r}")
+        if config.otel_url:
+            init_tracing(
+                service_name=f"world-{self.name}",
+                session_id=config.session_id,
+                otlp_endpoint=config.otel_url,
+            )
+        else:
+            print("[World] No otel_url in config - OTel tracing disabled")

         # Connect to Plato session if configured (for heartbeats)
         await self._connect_plato_session()

+        # Get tracer for spans
+        tracer = get_tracer("plato.world")
+
         # Log session start
-
-
-
-
-
-
+        with tracer.start_as_current_span("session_start") as span:
+            span.set_attribute("span.type", "session_start")
+            span.set_attribute("source", "world")
+            span.set_attribute("world_name", self.name)
+            span.set_attribute("world_version", self.get_version())
+            span.set_attribute("content", f"World '{self.name}' started")

         try:
-            # Execute reset with
-
-                reset_span.
+            # Execute reset with OTel span
+            with tracer.start_as_current_span("reset") as reset_span:
+                reset_span.set_attribute("span.type", "reset")
+                reset_span.set_attribute("source", "world")
+                reset_span.set_attribute("content", f"Resetting world '{self.name}'")
                 obs = await self.reset()
-
+                obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
+                reset_span.set_attribute("observation", str(obs_data)[:1000])  # Truncate for OTel
             self.logger.info(f"World reset complete: {obs}")

             while True:
                 self._step_count += 1

-                # Execute step with
-
-
-
-
-
-
-
-
-                    step_span.
+                # Execute step with OTel span
+                with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
+                    step_span.set_attribute("span.type", "step")
+                    step_span.set_attribute("source", "world")
+                    step_span.set_attribute("step_number", self._step_count)
+                    step_span.set_attribute("content", f"Step {self._step_count} started")
+
+                    # Store span context for nested agent spans
+
+                    self._current_step_id = format(step_span.get_span_context().span_id, "016x")
+
                     result = await self.step()
-
-
-
-
-
-
-                        "info": result.info,
-                    }
+
+                    step_span.set_attribute("done", result.done)
+                    obs_data = (
+                        result.observation.model_dump()
+                        if hasattr(result.observation, "model_dump")
+                        else str(result.observation)
                    )
+                    step_span.set_attribute("observation", str(obs_data)[:1000])

                    self.logger.info(f"Step {self._step_count}: done={result.done}")

                    # Create checkpoint if enabled and interval matches
-                    # Note: The checkpoint event is created by the callback endpoint,
-                    # so we don't need a span wrapper here (would create duplicates)
                    if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
                        self.logger.info(f"Creating checkpoint after step {self._step_count}")
                        await self._create_and_upload_checkpoint()

@@ -697,14 +730,14 @@ The following services are available for your use:
         await self._disconnect_plato_session()

         # Log session end
-
-
-
-
-
-
-
-
-
+        with tracer.start_as_current_span("session_end") as span:
+            span.set_attribute("span.type", "session_end")
+            span.set_attribute("source", "world")
+            span.set_attribute("total_steps", self._step_count)
+            span.set_attribute("content", f"World '{self.name}' completed after {self._step_count} steps")
+
+        # Shutdown OTel tracing and clear session info
+        shutdown_tracing()
+        self._session_id = None

         self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")
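Neither `plato/agents/artifacts.py` nor `plato/agents/otel.py` is expanded in this diff, so only their call signatures are visible above (`upload_artifact(upload_url=..., data=..., content_type=...)`, `init_tracing(...)`, `get_tracer(...)`, `shutdown_tracing()`). As a rough illustration of the upload path, a presigned-URL helper matching that signature could look like the sketch below; the use of `httpx` and every implementation detail in it are assumptions, not the package's actual code.

```python
# Hypothetical sketch of a presigned-URL upload helper with the same signature as
# plato.agents.artifacts.upload_artifact; the httpx-based body is an assumption.
import logging

import httpx

logger = logging.getLogger(__name__)


async def upload_artifact(
    upload_url: str,
    data: bytes,
    content_type: str = "application/octet-stream",
) -> bool:
    """PUT raw bytes to a presigned S3 URL and report success as a bool."""
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.put(
                upload_url,
                content=data,
                headers={"Content-Type": content_type},
            )
        if resp.is_success:
            return True
        logger.warning("Artifact upload failed: HTTP %s", resp.status_code)
        return False
    except httpx.HTTPError as exc:
        logger.warning("Artifact upload failed: %s", exc)
        return False
```

`BaseWorld._upload_artifact` wraps this call and simply returns `False` when `config.upload_url` is empty, which is why the checkpoint path can fail softly with a warning instead of raising.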
plato/worlds/config.py
CHANGED
@@ -126,13 +126,15 @@ class RunConfig(BaseModel):

     Attributes:
         session_id: Unique Chronos session identifier
-
+        otel_url: OTel endpoint URL (e.g., https://chronos.plato.so/api/otel)
+        upload_url: Presigned S3 URL for uploading artifacts (provided by Chronos)
         plato_session: Serialized Plato session for connecting to existing VM session
         checkpoint: Configuration for automatic checkpoints after steps
     """

     session_id: str = ""
-
+    otel_url: str = ""  # OTel endpoint URL
+    upload_url: str = ""  # Presigned S3 URL for uploads
     all_secrets: dict[str, str] = Field(default_factory=dict)  # All secrets (world + agent)

     # Serialized Plato session for connecting to VM and sending heartbeats

@@ -182,7 +184,7 @@ class RunConfig(BaseModel):
         envs = []

         # Skip runtime fields
-        runtime_fields = {"session_id", "
+        runtime_fields = {"session_id", "otel_url", "upload_url", "all_secrets", "plato_session", "checkpoint", "state"}

         for field_name, prop_schema in properties.items():
             if field_name in runtime_fields: