plato-sdk-v2 2.3.4-py3-none-any.whl → 2.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plato/agents/__init__.py +24 -16
- plato/agents/artifacts.py +108 -0
- plato/agents/config.py +2 -7
- plato/agents/otel.py +258 -0
- plato/agents/runner.py +223 -68
- plato/worlds/README.md +2 -1
- plato/worlds/base.py +124 -79
- plato/worlds/config.py +5 -3
- plato/worlds/runner.py +38 -21
- {plato_sdk_v2-2.3.4.dist-info → plato_sdk_v2-2.3.6.dist-info}/METADATA +4 -2
- {plato_sdk_v2-2.3.4.dist-info → plato_sdk_v2-2.3.6.dist-info}/RECORD +13 -18
- plato/agents/logging.py +0 -515
- plato/chronos/api/callback/__init__.py +0 -11
- plato/chronos/api/callback/push_agent_logs.py +0 -61
- plato/chronos/api/callback/update_agent_status.py +0 -57
- plato/chronos/api/callback/upload_artifacts.py +0 -59
- plato/chronos/api/callback/upload_logs_zip.py +0 -57
- plato/chronos/api/callback/upload_trajectory.py +0 -57
- {plato_sdk_v2-2.3.4.dist-info → plato_sdk_v2-2.3.6.dist-info}/WHEEL +0 -0
- {plato_sdk_v2-2.3.4.dist-info → plato_sdk_v2-2.3.6.dist-info}/entry_points.txt +0 -0
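The core of this release is a telemetry rework: plato/agents/logging.py and the plato/chronos/api/callback endpoints are removed, and two new modules take their place, plato/agents/otel.py (OpenTelemetry spans) and plato/agents/artifacts.py (direct uploads). The body of artifacts.py is not shown in this diff; judging only from its call sites below (`upload_artifact(upload_url=..., data=..., content_type=...) -> bool` in plato/worlds/base.py, `upload_artifacts(upload_url, logs_dir)` in plato/agents/runner.py) and the "Upload an artifact directly to S3" docstring, a presigned-URL PUT is one plausible shape. The sketch below illustrates that pattern only and is an assumption, not the module's actual code.

```python
# Sketch only: a plausible shape for plato/agents/artifacts.upload_artifact,
# inferred from its call sites in the diffs below, not taken from the package.
# Assumes upload_url is a presigned S3 PUT URL.
import httpx


async def upload_artifact(
    upload_url: str,
    data: bytes,
    content_type: str = "application/octet-stream",
) -> bool:
    """PUT raw bytes to a presigned URL; report success as a bool."""
    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.put(
            upload_url,
            content=data,
            headers={"Content-Type": content_type},
        )
    return response.is_success
```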
plato/agents/runner.py
CHANGED
```diff
@@ -10,7 +10,10 @@ import platform
 import tempfile
 from pathlib import Path

-from
+from opentelemetry import trace
+
+from plato.agents.artifacts import upload_artifacts
+from plato.agents.otel import get_tracer

 logger = logging.getLogger(__name__)

@@ -38,20 +41,32 @@ async def run_agent(
     logs_dir = logs_dir or tempfile.mkdtemp(prefix="agent_logs_")
     agent_name = image.split("/")[-1].split(":")[0]

-
-
+    # Get session info from environment variables
+    session_id = os.environ.get("SESSION_ID")
+    otel_url = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
+    upload_url = os.environ.get("UPLOAD_URL")
+
+    tracer = get_tracer("plato.agent")
+
+    with tracer.start_as_current_span(agent_name) as agent_span:
+        agent_span.set_attribute("span.type", "agent")
+        agent_span.set_attribute("source", "agent")
+        agent_span.set_attribute("image", image)
+        agent_span.set_attribute("content", f"Starting agent: {agent_name}")

         # Pull image if requested
         if pull:
-
-
-            "
-
-
-
-
-
-
+            with tracer.start_as_current_span("docker_pull") as pull_span:
+                pull_span.set_attribute("span.type", "docker_pull")
+                pull_span.set_attribute("image", image)
+                pull_proc = await asyncio.create_subprocess_exec(
+                    "docker",
+                    "pull",
+                    image,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
+                )
+                await pull_proc.wait()

         # Setup
         os.makedirs(os.path.join(logs_dir, "agent"), exist_ok=True)
@@ -63,9 +78,7 @@ async def run_agent(
             # Build docker command
             docker_cmd = ["docker", "run", "--rm"]

-            # Determine if we need host networking
-            # - Required on Linux without iptables for connectivity
-            # - Skip on macOS where --network=host doesn't work properly
+            # Determine if we need host networking
             use_host_network = False
             is_macos = platform.system() == "Darwin"

@@ -103,68 +116,200 @@ async def run_agent(
                 ]
             )

+            # Pass session info to agent
+            if otel_url:
+                docker_cmd.extend(["-e", f"OTEL_EXPORTER_OTLP_ENDPOINT={otel_url}"])
+                # Use JSON protocol (not protobuf) for OTLP exports
+                docker_cmd.extend(["-e", "OTEL_EXPORTER_OTLP_PROTOCOL=http/json"])
+            if session_id:
+                docker_cmd.extend(["-e", f"SESSION_ID={session_id}"])
+            if upload_url:
+                docker_cmd.extend(["-e", f"UPLOAD_URL={upload_url}"])
+
+            # Pass trace context to agent for parent linking
+            current_span = trace.get_current_span()
+            span_context = current_span.get_span_context()
+            if span_context.is_valid:
+                trace_id = format(span_context.trace_id, "032x")
+                span_id = format(span_context.span_id, "016x")
+                docker_cmd.extend(
+                    [
+                        "-e",
+                        f"OTEL_TRACE_ID={trace_id}",
+                        "-e",
+                        f"OTEL_PARENT_SPAN_ID={span_id}",
+                    ]
+                )
+
             for key, value in secrets.items():
                 docker_cmd.extend(["-e", f"{key.upper()}={value}"])

             docker_cmd.append(image)

-            # Pass instruction via CLI arg
+            # Pass instruction via CLI arg
             docker_cmd.extend(["--instruction", instruction])

             # Run container and stream output
-
-
-
-                stderr=asyncio.subprocess.STDOUT,
-            )
+            with tracer.start_as_current_span("agent_execution") as exec_span:
+                exec_span.set_attribute("span.type", "agent_execution")
+                exec_span.set_attribute("content", f"Running {agent_name}")

-
-
-
-
-                line = await process.stdout.readline()
-                if not line:
-                    break
-                decoded_line = line.decode().rstrip()
-                output_lines.append(decoded_line)
-                logger.info(f"[agent] {decoded_line}")
-
-            await process.wait()
-
-            if process.returncode != 0:
-                # Get last N lines of output for error context
-                error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
-
-                # Log error event with container output
-                await log_event(
-                    span_type="error",
-                    content=f"Agent failed with exit code {process.returncode}",
-                    source="agent",
-                    extra={
-                        "exit_code": process.returncode,
-                        "image": image,
-                        "agent_name": agent_name,
-                        "output": error_context,
-                        "output_line_count": len(output_lines),
-                    },
+                process = await asyncio.create_subprocess_exec(
+                    *docker_cmd,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
                 )

-
-
-
-
-
-
-
+                # Stream output line by line
+                output_lines: list[str] = []
+                turn_count = 0
+                assert process.stdout is not None
+                while True:
+                    line = await process.stdout.readline()
+                    if not line:
+                        break
+                    decoded_line = line.decode().rstrip()
+                    output_lines.append(decoded_line)
+
+                    # Try to parse JSON output from agent for structured trajectory spans
+                    try:
+                        data = json.loads(decoded_line)
+                        event_type = data.get("type", "")
+
+                        if event_type == "assistant":
+                            # Agent response - create a turn span
+                            turn_count += 1
+                            msg = data.get("message", {})
+                            content_items = msg.get("content", [])
+
+                            # Extract text and tool calls with full details
+                            text_parts = []
+                            tool_calls = []
+                            for item in content_items:
+                                if item.get("type") == "text":
+                                    text_parts.append(item.get("text", "")[:2000])
+                                elif item.get("type") == "tool_use":
+                                    tool_input = item.get("input", {})
+                                    # Truncate large inputs
+                                    input_str = json.dumps(tool_input) if tool_input else ""
+                                    if len(input_str) > 2000:
+                                        input_str = input_str[:2000] + "..."
+                                    tool_calls.append(
+                                        {
+                                            "tool": item.get("name"),
+                                            "id": item.get("id"),
+                                            "input": input_str,
+                                        }
+                                    )
+
+                            with tracer.start_as_current_span(f"turn_{turn_count}") as turn_span:
+                                turn_span.set_attribute("span.type", "agent_turn")
+                                turn_span.set_attribute("source", "agent")
+                                turn_span.set_attribute("turn_number", turn_count)
+                                turn_span.set_attribute("model", msg.get("model", "unknown"))

-
+                                if text_parts:
+                                    turn_span.set_attribute("content", "\n".join(text_parts)[:4000])
+                                if tool_calls:
+                                    turn_span.set_attribute("tool_calls", json.dumps(tool_calls))
+                                    # If no text content, show tool calls summary
+                                    if not text_parts:
+                                        turn_span.set_attribute(
+                                            "content", f"Tool calls: {', '.join(t['tool'] for t in tool_calls)}"
+                                        )

-
+                                # Usage info
+                                usage = msg.get("usage", {})
+                                if usage:
+                                    turn_span.set_attribute("input_tokens", usage.get("input_tokens", 0))
+                                    turn_span.set_attribute("output_tokens", usage.get("output_tokens", 0))
+
+                        elif event_type == "user":
+                            # Tool result
+                            tool_results = data.get("message", {}).get("content", [])
+                            for result in tool_results:
+                                if result.get("type") == "tool_result":
+                                    tool_id = result.get("tool_use_id", "")
+                                    content = result.get("content", "")
+                                    # Handle content that might be a list of content blocks
+                                    if isinstance(content, list):
+                                        text_parts = []
+                                        for item in content:
+                                            if isinstance(item, dict) and item.get("type") == "text":
+                                                text_parts.append(item.get("text", ""))
+                                            elif isinstance(item, str):
+                                                text_parts.append(item)
+                                        content = "\n".join(text_parts)
+                                    if isinstance(content, str):
+                                        content = content[:2000]  # Truncate large results
+                                    with tracer.start_as_current_span("tool_result") as tr_span:
+                                        tr_span.set_attribute("span.type", "tool_result")
+                                        tr_span.set_attribute("source", "agent")
+                                        tr_span.set_attribute("tool_use_id", tool_id)
+                                        tr_span.set_attribute("content", f"Tool result for {tool_id}")
+                                        tr_span.set_attribute("result", content if content else "")
+
+                        elif event_type == "result":
+                            # Final result
+                            result_text = data.get("result", "")[:1000]
+                            is_error = data.get("is_error", False)
+                            duration_ms = data.get("duration_ms", 0)
+                            total_cost = data.get("total_cost_usd", 0)
+
+                            with tracer.start_as_current_span("agent_result") as res_span:
+                                res_span.set_attribute("span.type", "agent_result")
+                                res_span.set_attribute("source", "agent")
+                                res_span.set_attribute("content", result_text if result_text else "Agent completed")
+                                res_span.set_attribute("is_error", is_error)
+                                res_span.set_attribute("duration_ms", duration_ms)
+                                res_span.set_attribute("total_cost_usd", total_cost)
+                                res_span.set_attribute("num_turns", data.get("num_turns", turn_count))
+
+                        elif event_type == "system" and data.get("subtype") == "init":
+                            # Agent initialization
+                            with tracer.start_as_current_span("agent_init") as init_span:
+                                init_span.set_attribute("span.type", "agent_init")
+                                init_span.set_attribute("source", "agent")
+                                init_span.set_attribute("model", data.get("model", "unknown"))
+                                init_span.set_attribute("tools", json.dumps(data.get("tools", [])))
+                                init_span.set_attribute("content", f"Agent initialized: {data.get('model', 'unknown')}")
+
+                        else:
+                            # Other output - just log it without creating a span
+                            logger.debug(f"[agent] {decoded_line}")
+                            continue
+
+                    except json.JSONDecodeError:
+                        # Not JSON - just log it
+                        logger.info(f"[agent] {decoded_line}")
+
+                await process.wait()
+
+                exit_code = process.returncode or 0
+                if exit_code != 0:
+                    error_context = "\n".join(output_lines[-50:]) if output_lines else "No output captured"
+
+                    exec_span.set_attribute("error", True)
+                    exec_span.set_attribute("exit_code", exit_code)
+                    exec_span.add_event(
+                        "agent_error",
+                        {
+                            "exit_code": exit_code,
+                            "output": error_context[:4000],
+                        },
+                    )
+
+                    agent_span.set_attribute("error", True)
+                    agent_span.set_attribute("exit_code", exit_code)
+
+                    raise RuntimeError(f"Agent failed with exit code {exit_code}")
+
+                exec_span.set_attribute("success", True)

         finally:
             os.unlink(config_file.name)

-        # Load trajectory and
+        # Load trajectory and log as event
         trajectory_path = Path(logs_dir) / "agent" / "trajectory.json"
         if trajectory_path.exists():
             try:
@@ -178,14 +323,24 @@ async def run_agent(
                 agent_data["extra"] = extra
                 trajectory["agent"] = agent_data

-                # Log trajectory as
-
-
-                    log_type
-
-
-
+                # Log trajectory as span event
+                with tracer.start_as_current_span("trajectory") as traj_span:
+                    traj_span.set_attribute("span.type", "trajectory")
+                    traj_span.set_attribute("log_type", "atif")
+                    traj_span.set_attribute("source", "agent")
+                    # Store trajectory in span (truncated for OTel limits)
+                    traj_json = json.dumps(trajectory)
+                    if len(traj_json) > 10000:
+                        traj_span.set_attribute("trajectory_truncated", True)
+                        traj_span.set_attribute("trajectory_size", len(traj_json))
+                    else:
+                        traj_span.set_attribute("trajectory", traj_json)
             except Exception as e:
                 logger.warning(f"Failed to load trajectory: {e}")

-
+        # Upload artifacts if we have upload URL configured
+        if upload_url:
+            await upload_artifacts(upload_url, logs_dir)
+
+        agent_span.set_attribute("success", True)
+        agent_span.set_attribute("content", f"Agent {agent_name} completed successfully")
```
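run_agent now injects OTEL_TRACE_ID and OTEL_PARENT_SPAN_ID into the container so the agent process can attach its spans under the runner's span. The consuming side is not part of this diff; a minimal sketch of how an agent could rebuild that parent context with the standard opentelemetry-api is shown below (the helper name and wiring are illustrative assumptions, not plato SDK API).

```python
# Sketch: rebuild the parent span context from the environment variables that
# run_agent injects (OTEL_TRACE_ID / OTEL_PARENT_SPAN_ID). Illustrative only;
# this helper is not part of the plato SDK.
import os

from opentelemetry import trace
from opentelemetry.context import Context
from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags


def parent_context_from_env() -> Context | None:
    trace_id = os.environ.get("OTEL_TRACE_ID")
    span_id = os.environ.get("OTEL_PARENT_SPAN_ID")
    if not trace_id or not span_id:
        return None
    # The ids arrive as hex strings (032x / 016x); mark the context remote and sampled.
    parent = SpanContext(
        trace_id=int(trace_id, 16),
        span_id=int(span_id, 16),
        is_remote=True,
        trace_flags=TraceFlags(TraceFlags.SAMPLED),
    )
    return trace.set_span_in_context(NonRecordingSpan(parent))


tracer = trace.get_tracer("agent")
with tracer.start_as_current_span("agent_work", context=parent_context_from_env()):
    pass  # agent logic runs here as a child of the runner's span
```

With OTEL_EXPORTER_OTLP_ENDPOINT also injected above, an exporter configured inside the container would then report these spans into the same trace as the world and runner spans.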
plato/worlds/README.md
CHANGED
plato/worlds/base.py
CHANGED
```diff
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import logging
+import os
 import subprocess
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -16,12 +17,14 @@ if TYPE_CHECKING:
     from plato.v2.async_.environment import Environment
     from plato.v2.async_.session import Session

-from plato.agents.
-
-
-from plato.agents.
-
-
+from plato.agents.artifacts import (
+    upload_artifact as _upload_artifact_raw,
+)
+from plato.agents.otel import (
+    get_tracer,
+    init_tracing,
+    shutdown_tracing,
+)

 logger = logging.getLogger(__name__)

@@ -111,6 +114,7 @@ class BaseWorld(ABC, Generic[ConfigT]):
         self._step_count: int = 0
         self.plato_session = None
         self._current_step_id: str | None = None
+        self._session_id: str | None = None

     @classmethod
     def get_config_class(cls) -> type[RunConfig]:
@@ -390,17 +394,39 @@ class BaseWorld(ABC, Generic[ConfigT]):
             self.logger.warning(f"Failed to create state bundle: {e.stderr}")
             return None

-    async def
+    async def _upload_artifact(
+        self,
+        data: bytes,
+        content_type: str = "application/octet-stream",
+    ) -> bool:
+        """Upload an artifact directly to S3.
+
+        Args:
+            data: Raw bytes of the artifact
+            content_type: MIME type of the content
+
+        Returns:
+            True if successful, False otherwise
+        """
+        if not self.config.upload_url:
+            self.logger.warning("Cannot upload artifact: upload_url not set")
+            return False
+        return await _upload_artifact_raw(
+            upload_url=self.config.upload_url,
+            data=data,
+            content_type=content_type,
+        )
+
+    async def _create_and_upload_checkpoint(self) -> dict[str, str] | None:
         """Create a full checkpoint including env snapshots and state bundle.

         This method:
         1. Commits any pending state changes
         2. Creates env snapshots using snapshot_store
-        3. Creates and uploads state bundle
-        4. Calls the checkpoint endpoint with all data
+        3. Creates and uploads state bundle to S3

         Returns:
-
+            Dict mapping env alias to artifact_id if successful, None otherwise
         """
         # Commit state changes first
         self._commit_state(f"Checkpoint at step {self._step_count}")
@@ -411,35 +437,20 @@ class BaseWorld(ABC, Generic[ConfigT]):
         env_snapshots = {}

         # Create and upload state bundle
-        state_artifact_id: str | None = None
         if self.config.state.enabled:
             bundle_data = self._create_state_bundle()
             if bundle_data:
-
+                success = await self._upload_artifact(
                     data=bundle_data,
-
-                    filename=f"state_step_{self._step_count}.bundle",
-                    extra={
-                        "step_number": self._step_count,
-                        "state_path": self.config.state.path,
-                    },
+                    content_type="application/octet-stream",
                 )
-                if
-
-
-
-
-        checkpoint_result = await _upload_checkpoint(
-            step_number=self._step_count,
-            env_snapshots=env_snapshots,
-            state_artifact_id=state_artifact_id,
-            extra={
-                "world_name": self.name,
-                "world_version": self.get_version(),
-            },
-        )
+                if success:
+                    self.logger.info(f"Uploaded state bundle at step {self._step_count}")
+                else:
+                    self.logger.warning(f"Failed to upload state bundle at step {self._step_count}")
+                    return None

-        return
+        return env_snapshots

     def get_env(self, alias: str) -> Environment | None:
         """Get an environment by alias.
@@ -630,64 +641,98 @@ The following services are available for your use:
         # Initialize state directory (creates git repo if needed)
         self._init_state_directory()

-        # Initialize
-        if config.
-
-
-
-
+        # Initialize OTel tracing and session info for artifact uploads
+        if config.session_id:
+            self._session_id = config.session_id
+
+            # Set environment variables for agent runners
+            os.environ["SESSION_ID"] = config.session_id
+            if config.otel_url:
+                os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = config.otel_url
+                # Use JSON protocol (not protobuf) for the OTLP exporter
+                os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/json"
+            if config.upload_url:
+                os.environ["UPLOAD_URL"] = config.upload_url
+
+        # Initialize OTel tracing if otel_url is provided
+        print(f"[World] OTel URL from config: {config.otel_url!r}")
+        if config.otel_url:
+            init_tracing(
+                service_name=f"world-{self.name}",
+                session_id=config.session_id,
+                otlp_endpoint=config.otel_url,
+            )
+        else:
+            print("[World] No otel_url in config - OTel tracing disabled")

         # Connect to Plato session if configured (for heartbeats)
         await self._connect_plato_session()

+        # Get tracer for spans
+        tracer = get_tracer("plato.world")
+
         # Log session start
-
-
-
-
-
-
+        with tracer.start_as_current_span("session_start") as span:
+            span.set_attribute("span.type", "session_start")
+            span.set_attribute("source", "world")
+            span.set_attribute("world_name", self.name)
+            span.set_attribute("world_version", self.get_version())
+            span.set_attribute("content", f"World '{self.name}' started")

         try:
-            # Execute reset with
-
-            reset_span.
+            # Execute reset with OTel span
+            with tracer.start_as_current_span("reset") as reset_span:
+                reset_span.set_attribute("span.type", "reset")
+                reset_span.set_attribute("source", "world")
+                reset_span.set_attribute("content", f"Resetting world '{self.name}'")
                 obs = await self.reset()
-
+                obs_data = obs.model_dump() if hasattr(obs, "model_dump") else str(obs)
+                reset_span.set_attribute("observation", str(obs_data)[:1000])  # Truncate for OTel
             self.logger.info(f"World reset complete: {obs}")

             while True:
                 self._step_count += 1

-                # Execute step with
-
-
-
-
-
-
-
-
-                step_span.
+                # Execute step with OTel span
+                with tracer.start_as_current_span(f"step_{self._step_count}") as step_span:
+                    step_span.set_attribute("span.type", "step")
+                    step_span.set_attribute("source", "world")
+                    step_span.set_attribute("step_number", self._step_count)
+                    step_span.set_attribute("content", f"Step {self._step_count} started")
+
+                    # Store span context for nested agent spans
+
+                    self._current_step_id = format(step_span.get_span_context().span_id, "016x")
+
                     result = await self.step()
-
-
-
-
-
-
-                        "info": result.info,
-                    }
+
+                    step_span.set_attribute("done", result.done)
+                    obs_data = (
+                        result.observation.model_dump()
+                        if hasattr(result.observation, "model_dump")
+                        else str(result.observation)
                     )
+                    step_span.set_attribute("observation", str(obs_data)[:1000])

                     self.logger.info(f"Step {self._step_count}: done={result.done}")

                     # Create checkpoint if enabled and interval matches
-                    # Note: The checkpoint event is created by the callback endpoint,
-                    # so we don't need a span wrapper here (would create duplicates)
                     if self.config.checkpoint.enabled and self._step_count % self.config.checkpoint.interval == 0:
                         self.logger.info(f"Creating checkpoint after step {self._step_count}")
-                        await self._create_and_upload_checkpoint()
+                        env_snapshots = await self._create_and_upload_checkpoint()
+
+                        # Emit checkpoint span for UI visibility
+                        with tracer.start_as_current_span(f"checkpoint_{self._step_count}") as checkpoint_span:
+                            checkpoint_span.set_attribute("span.type", "checkpoint")
+                            checkpoint_span.set_attribute("source", "world")
+                            checkpoint_span.set_attribute("step_number", self._step_count)
+                            checkpoint_span.set_attribute("content", f"Checkpoint created at step {self._step_count}")
+                            if env_snapshots:
+                                # Serialize env_snapshots for OTel attribute
+                                import json
+
+                                checkpoint_span.set_attribute("env_snapshots", json.dumps(env_snapshots))
+                            checkpoint_span.set_attribute("success", env_snapshots is not None)

                 if result.done:
                     break
@@ -697,14 +742,14 @@ The following services are available for your use:
             await self._disconnect_plato_session()

             # Log session end
-
-
-
-
-
-
-
-
-
+            with tracer.start_as_current_span("session_end") as span:
+                span.set_attribute("span.type", "session_end")
+                span.set_attribute("source", "world")
+                span.set_attribute("total_steps", self._step_count)
+                span.set_attribute("content", f"World '{self.name}' completed after {self._step_count} steps")
+
+            # Shutdown OTel tracing and clear session info
+            shutdown_tracing()
+            self._session_id = None

             self.logger.info(f"World '{self.name}' completed after {self._step_count} steps")
```
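plato/agents/otel.py (+258 lines) is likewise new and not shown; the call sites above fix its surface: init_tracing(service_name=..., session_id=..., otlp_endpoint=...), get_tracer(name), and shutdown_tracing(). The comments in the diff say OTLP is exported as JSON (OTEL_EXPORTER_OTLP_PROTOCOL=http/json), which the stock Python exporter does not emit, so the real module presumably ships its own exporter. The sketch below only illustrates the init/get/shutdown lifecycle with the standard protobuf-over-HTTP exporter; it is an assumption, not the package's implementation.

```python
# Sketch only: an init_tracing / get_tracer / shutdown_tracing lifecycle using
# the standard OpenTelemetry SDK. The real plato.agents.otel module appears to
# export OTLP as JSON instead; treat every detail here as an assumption.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

_provider: TracerProvider | None = None


def init_tracing(service_name: str, session_id: str | None, otlp_endpoint: str) -> None:
    """Install a tracer provider that batches spans to an OTLP/HTTP endpoint."""
    global _provider
    resource = Resource.create(
        {"service.name": service_name, "session.id": session_id or ""}
    )
    exporter = OTLPSpanExporter(endpoint=f"{otlp_endpoint.rstrip('/')}/v1/traces")
    _provider = TracerProvider(resource=resource)
    _provider.add_span_processor(BatchSpanProcessor(exporter))
    trace.set_tracer_provider(_provider)


def get_tracer(name: str) -> trace.Tracer:
    return trace.get_tracer(name)


def shutdown_tracing() -> None:
    # Flush queued spans and stop the batch processor.
    if _provider is not None:
        _provider.shutdown()
```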