fred-runtime 2.0.0__tar.gz → 2.0.2__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (100)
  1. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/PKG-INFO +1 -1
  2. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/agent_app.py +251 -39
  3. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/__init__.py +8 -0
  4. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/completion.py +1 -1
  5. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/history_display.py +149 -0
  6. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/pod_client.py +35 -0
  7. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/repl.py +68 -53
  8. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/repl_helpers.py +30 -16
  9. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/client.py +8 -0
  10. fred_runtime-2.0.2/fred_runtime/eval/__init__.py +13 -0
  11. fred_runtime-2.0.2/fred_runtime/eval/collector.py +143 -0
  12. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/PKG-INFO +1 -1
  13. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/SOURCES.txt +3 -0
  14. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/pyproject.toml +1 -1
  15. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_agent_app.py +150 -0
  16. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_client.py +6 -4
  17. fred_runtime-2.0.2/tests/test_eval_trace.py +314 -0
  18. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/README.md +0 -0
  19. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/__init__.py +0 -0
  20. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/__init__.py +0 -0
  21. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/_catalogs.py +0 -0
  22. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/config.py +0 -0
  23. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/config_loader.py +0 -0
  24. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/container.py +0 -0
  25. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/context.py +0 -0
  26. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/dependencies.py +0 -0
  27. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/mcp_config.py +0 -0
  28. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/observability_factory.py +0 -0
  29. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/openai_compat_router.py +0 -0
  30. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/entrypoint.py +0 -0
  31. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/kpi_display.py +0 -0
  32. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/url_helpers.py +0 -0
  33. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/__init__.py +0 -0
  34. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/context_aware_tool.py +0 -0
  35. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_base_client.py +0 -0
  36. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_fast_text_client.py +0 -0
  37. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_http_client.py +0 -0
  38. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_logs_client.py +0 -0
  39. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_markdown_media_client.py +0 -0
  40. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_vectorsearch_client.py +0 -0
  41. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_workspace_client.py +0 -0
  42. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_interceptors.py +0 -0
  43. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_runtime.py +0 -0
  44. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_toolkit.py +0 -0
  45. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_utils.py +0 -0
  46. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/structures.py +0 -0
  47. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/token_expiry.py +0 -0
  48. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/tool_node_utils.py +0 -0
  49. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/deep/__init__.py +0 -0
  50. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/deep/deep_runtime.py +0 -0
  51. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/graph/__init__.py +0 -0
  52. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/graph/graph_runtime.py +0 -0
  53. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/__init__.py +0 -0
  54. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/v2_runtime/__init__.py +0 -0
  55. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/v2_runtime/adapters.py +0 -0
  56. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/__init__.py +0 -0
  57. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/catalog.py +0 -0
  58. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/contracts.py +0 -0
  59. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/provider.py +0 -0
  60. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/resolver.py +0 -0
  61. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/__init__.py +0 -0
  62. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_langchain_adapter.py +0 -0
  63. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_message_codec.py +0 -0
  64. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_model_adapter.py +0 -0
  65. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_prompting.py +0 -0
  66. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_runtime.py +0 -0
  67. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_stream_adapter.py +0 -0
  68. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_binding.py +0 -0
  69. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_loop.py +0 -0
  70. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_rendering.py +0 -0
  71. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_resolution.py +0 -0
  72. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_utils.py +0 -0
  73. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tracing.py +0 -0
  74. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_context.py +0 -0
  75. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/__init__.py +0 -0
  76. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/checkpoints.py +0 -0
  77. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/model_metadata.py +0 -0
  78. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/request_context_helpers.py +0 -0
  79. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/sql_checkpointer.py +0 -0
  80. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/user_token_refresher.py +0 -0
  81. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/__init__.py +0 -0
  82. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/filesystem_context.py +0 -0
  83. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/tool_approval.py +0 -0
  84. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/tool_loop.py +0 -0
  85. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/dependency_links.txt +0 -0
  86. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/entry_points.txt +0 -0
  87. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/requires.txt +0 -0
  88. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/top_level.txt +0 -0
  89. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/setup.cfg +0 -0
  90. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_config_loader.py +0 -0
  91. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_context.py +0 -0
  92. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_graph_runtime_observability.py +0 -0
  93. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_history.py +0 -0
  94. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_kf_workspace_client.py +0 -0
  95. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_kpi_display.py +0 -0
  96. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_mcp_config.py +0 -0
  97. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_openai_compat_router.py +0 -0
  98. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_smoke.py +0 -0
  99. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_url_helpers.py +0 -0
  100. {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_user_token_refresher.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fred-runtime
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: Runtime adapters and infrastructure wiring for Fred v2 agents.
5
5
  Author-email: Thales <noreply@thalesgroup.com>
6
6
  License: Apache-2.0
@@ -62,6 +62,7 @@ from fred_core.logs.log_setup import log_setup
62
62
  from fred_core.logs.memory_log_store import RamLogStore
63
63
  from fred_core.security.oidc import get_keycloak_client_id, get_keycloak_url
64
64
  from fred_core.security.structure import KeycloakUser
65
+ from fred_sdk.contracts.eval import EvalStep, EvalTrace
65
66
  from fred_sdk.contracts.context import (
66
67
  AgentInvocationRequest,
67
68
  AgentInvocationResult,
@@ -558,11 +559,13 @@ class LocalRegistryAgentInvoker(AgentInvokerPort):
558
559
  is_error=True,
559
560
  )
560
561
 
562
+ context_dict = request.context.model_dump(mode="json")
563
+ context_dict.setdefault("execution_action", ExecutionGrantAction.EXECUTE.value)
561
564
  execute_request = _AgentExecuteRequest.model_construct(
562
565
  agent_id=request.agent_id,
563
566
  agent_instance_id=None,
564
567
  message=request.message,
565
- context=request.context.model_dump(mode="json"),
568
+ context=context_dict,
566
569
  resume_payload=None,
567
570
  )
568
571
 
@@ -838,17 +841,19 @@ def _apply_runtime_tuning(
838
841
  - `definition = _apply_runtime_tuning(template_definition, resolution.tuning)`
839
842
  """
840
843
 
841
- return definition.model_copy(
842
- update={
843
- "role": tuning.role,
844
- "description": tuning.description,
845
- "tags": tuple(tuning.tags),
846
- "fields": tuple(field.model_copy(deep=True) for field in tuning.fields),
847
- "default_mcp_servers": tuple(
848
- server.model_copy(deep=True) for server in tuning.mcp_servers
849
- ),
850
- }
851
- )
844
+ update: dict[str, object] = {
845
+ "role": tuning.role,
846
+ "description": tuning.description,
847
+ "tags": tuple(tuning.tags),
848
+ "fields": tuple(field.model_copy(deep=True) for field in tuning.fields),
849
+ "default_mcp_servers": tuple(
850
+ server.model_copy(deep=True) for server in tuning.mcp_servers
851
+ ),
852
+ }
853
+ system_prompt = tuning.values.get("prompts.system")
854
+ if isinstance(system_prompt, str) and system_prompt.strip():
855
+ update["system_prompt_template"] = system_prompt
856
+ return definition.model_copy(update=update)
852
857
 
853
858
 
854
859
  def _available_mcp_servers_for_definition(
@@ -1377,6 +1382,120 @@ def _sse(payload: str) -> str:
1377
1382
  return f"data: {payload}\n\n"
1378
1383
 
1379
1384
 
1385
+ @dataclass(frozen=True)
1386
+ class _TurnOutcome:
1387
+ model_name: str | None
1388
+ finish_reason: str
1389
+ token_usage: dict[str, Any] | None
1390
+ input_tokens: int | None
1391
+ output_tokens: int | None
1392
+ tool_count: int
1393
+ is_error: bool
1394
+ total_ms: int
1395
+ final_content: str | None
1396
+
1397
+
1398
+ def _parse_turn_outcome(
1399
+ payloads: list[dict[str, Any]],
1400
+ turn_start: float,
1401
+ ) -> _TurnOutcome:
1402
+ total_ms = int((time.monotonic() - turn_start) * 1000)
1403
+ tool_count = sum(1 for p in payloads if p.get("kind") == "tool_call")
1404
+ final = next((p for p in reversed(payloads) if p.get("kind") == "final"), None)
1405
+ is_error = any(p.get("kind") == "execution_error" for p in payloads)
1406
+ token_usage: dict[str, Any] | None = final.get("token_usage") if final else None
1407
+ return _TurnOutcome(
1408
+ model_name=final.get("model_name") if final else None,
1409
+ finish_reason="error"
1410
+ if is_error
1411
+ else ((final.get("finish_reason") or "") if final else ""),
1412
+ token_usage=token_usage,
1413
+ input_tokens=token_usage.get("input_tokens") if token_usage else None,
1414
+ output_tokens=token_usage.get("output_tokens") if token_usage else None,
1415
+ tool_count=tool_count,
1416
+ is_error=is_error,
1417
+ total_ms=total_ms,
1418
+ final_content=(final.get("content") or None) if final else None,
1419
+ )
1420
+
1421
+
1422
+ def _build_eval_trace(
1423
+ payloads: list[dict[str, Any]],
1424
+ input_text: str,
1425
+ agent_id: str,
1426
+ session_id: str,
1427
+ turn_start: float,
1428
+ ) -> EvalTrace:
1429
+ outcome = _parse_turn_outcome(payloads, turn_start)
1430
+ steps: list[EvalStep] = []
1431
+ retrieval_context: list[str] = []
1432
+ tools_called: list[str] = []
1433
+ error: str | None = None
1434
+
1435
+ for p in payloads:
1436
+ kind = p.get("kind")
1437
+ if kind == "tool_call":
1438
+ steps.append(
1439
+ EvalStep(
1440
+ kind="tool_call",
1441
+ tool_name=p.get("tool_name"),
1442
+ call_id=p.get("call_id"),
1443
+ arguments=p.get("arguments") or {},
1444
+ )
1445
+ )
1446
+ if p.get("tool_name"):
1447
+ tools_called.append(p["tool_name"])
1448
+ elif kind == "tool_result":
1449
+ content = p.get("content", "")
1450
+ is_err = p.get("is_error", False)
1451
+ steps.append(
1452
+ EvalStep(
1453
+ kind="tool_result",
1454
+ tool_name=p.get("tool_name"),
1455
+ call_id=p.get("call_id"),
1456
+ content=content,
1457
+ is_error=is_err,
1458
+ )
1459
+ )
1460
+ if not is_err:
1461
+ sources = p.get("sources") or []
1462
+ if sources:
1463
+ retrieval_context.extend(
1464
+ s["content"] for s in sources if s.get("content")
1465
+ )
1466
+ elif content:
1467
+ retrieval_context.append(content)
1468
+ elif kind == "final":
1469
+ steps.append(EvalStep(kind="final", content=p.get("content")))
1470
+ elif kind == "node_error":
1471
+ steps.append(
1472
+ EvalStep(
1473
+ kind="node_error",
1474
+ node_id=p.get("node_id"),
1475
+ error_message=p.get("error_message"),
1476
+ )
1477
+ )
1478
+ elif kind == "awaiting_human":
1479
+ steps.append(EvalStep(kind="awaiting_human"))
1480
+ elif kind == "execution_error":
1481
+ error = p.get("message")
1482
+
1483
+ return EvalTrace(
1484
+ session_id=session_id,
1485
+ agent_id=agent_id,
1486
+ input=input_text,
1487
+ output=outcome.final_content,
1488
+ error=error,
1489
+ latency_ms=outcome.total_ms,
1490
+ model_name=outcome.model_name,
1491
+ token_usage=outcome.token_usage,
1492
+ finish_reason=outcome.finish_reason or None,
1493
+ steps=tuple(steps),
1494
+ retrieval_context=tuple(retrieval_context),
1495
+ tools_called=tuple(tools_called),
1496
+ )
1497
+
1498
+
1380
1499
  def _emit_turn_completed(
1381
1500
  container: PodApplicationContext,
1382
1501
  *,
@@ -1408,21 +1527,7 @@ def _emit_turn_completed(
1408
1527
  """
1409
1528
  try:
1410
1529
  kpi = get_runtime_context().get_kpi_writer()
1411
- total_ms = int((time.monotonic() - turn_start) * 1000)
1412
- tool_count = sum(1 for p in payloads if p.get("kind") == "tool_call")
1413
- final = next((p for p in reversed(payloads) if p.get("kind") == "final"), None)
1414
- is_error = any(p.get("kind") == "execution_error" for p in payloads)
1415
- model_name: str | None = final.get("model_name") if final else None
1416
- finish_reason: str = (
1417
- "error" if is_error else (final.get("finish_reason") or "") if final else ""
1418
- )
1419
- token_usage: dict[str, Any] | None = final.get("token_usage") if final else None
1420
- input_tokens: int | None = (
1421
- token_usage.get("input_tokens") if token_usage else None
1422
- )
1423
- output_tokens: int | None = (
1424
- token_usage.get("output_tokens") if token_usage else None
1425
- )
1530
+ outcome = _parse_turn_outcome(payloads, turn_start)
1426
1531
  runtime_id = get_runtime_context().config.service_name
1427
1532
 
1428
1533
  # Prometheus-safe dims: low-cardinality only.
@@ -1433,25 +1538,25 @@ def _emit_turn_completed(
1433
1538
  "team_id": team_id,
1434
1539
  "template_agent_id": template_agent_id,
1435
1540
  "runtime_id": runtime_id,
1436
- "model_name": model_name,
1437
- "finish_reason": finish_reason,
1541
+ "model_name": outcome.model_name,
1542
+ "finish_reason": outcome.finish_reason,
1438
1543
  }
1439
1544
 
1440
1545
  kpi.emit(
1441
1546
  name="agent.turn_completed",
1442
1547
  type="timer",
1443
- value=total_ms,
1548
+ value=outcome.total_ms,
1444
1549
  unit="ms",
1445
1550
  dims=prom_dims,
1446
1551
  quantities={
1447
- "tool_count": tool_count,
1448
- "input_tokens": input_tokens,
1449
- "output_tokens": output_tokens,
1552
+ "tool_count": outcome.tool_count,
1553
+ "input_tokens": outcome.input_tokens,
1554
+ "output_tokens": outcome.output_tokens,
1450
1555
  },
1451
1556
  actor=KPIActor(type="system"),
1452
1557
  )
1453
1558
 
1454
- if is_error:
1559
+ if outcome.is_error:
1455
1560
  kpi.emit(
1456
1561
  name="agent.turn_error_total",
1457
1562
  type="counter",
@@ -1468,12 +1573,12 @@ def _emit_turn_completed(
1468
1573
  "session_id": session_id,
1469
1574
  "exchange_id": exchange_id,
1470
1575
  "user_id": user_id,
1471
- "total_ms": total_ms,
1472
- "is_error": is_error,
1576
+ "total_ms": outcome.total_ms,
1577
+ "is_error": outcome.is_error,
1473
1578
  **prom_dims,
1474
- "tool_count": tool_count,
1475
- "input_tokens": input_tokens,
1476
- "output_tokens": output_tokens,
1579
+ "tool_count": outcome.tool_count,
1580
+ "input_tokens": outcome.input_tokens,
1581
+ "output_tokens": outcome.output_tokens,
1477
1582
  },
1478
1583
  )
1479
1584
  with container._kpi_turns_lock:
@@ -2328,6 +2433,113 @@ def _build_agent_router(
2328
2433
  )
2329
2434
  return _terminal_execute_payload(payloads)
2330
2435
 
2436
+ @router.post(
2437
+ "/evaluate",
2438
+ response_model=EvalTrace,
2439
+ )
2440
+ async def evaluate(
2441
+ request: RuntimeExecuteRequest,
2442
+ http_request: Request,
2443
+ authenticated_user: KeycloakUser | None = Depends(_authenticated_user),
2444
+ container: PodApplicationContext = Depends(get_pod_container),
2445
+ ) -> EvalTrace:
2446
+ """
2447
+ Execute one agent turn and return a complete EvalTrace as JSON.
2448
+
2449
+ POST <configured base_url>/agents/evaluate
2450
+ Authorization: Bearer <user JWT>
2451
+ Body: RuntimeExecuteRequest
2452
+ Response: EvalTrace — synchronous, no SSE, no Langfuse dependency
2453
+
2454
+ Intended for evaluation harnesses (DeepEval, Promptfoo) that need
2455
+ input, output, retrieval_context, tools_called, and steps in one response.
2456
+ """
2457
+ auth = http_request.headers.get("Authorization", "")
2458
+ access_token = auth.removeprefix("Bearer ").strip() or None
2459
+
2460
+ expected_action = _expected_execution_action(request)
2461
+
2462
+ try:
2463
+ validate_execution_grant(request, expected_action=expected_action)
2464
+ except ExecutionGrantViolation as exc:
2465
+ _emit_audit_event(
2466
+ container,
2467
+ "warning",
2468
+ "grant_validation_failed",
2469
+ agent_instance_id=request.agent_instance_id,
2470
+ user_id=request.effective_user_id(),
2471
+ action=expected_action.value,
2472
+ reason=str(exc),
2473
+ )
2474
+ raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=str(exc))
2475
+ if request.execution_grant is not None:
2476
+ _emit_audit_event(
2477
+ container,
2478
+ "info",
2479
+ "grant_validated",
2480
+ agent_instance_id=request.agent_instance_id,
2481
+ user_id=request.effective_user_id(),
2482
+ action=expected_action.value,
2483
+ )
2484
+ _validate_grant_user_correlation(request, authenticated_user, container)
2485
+ await _validate_session_checkpoint_access(request)
2486
+
2487
+ exchange_id = str(uuid4())
2488
+ turn_start = time.monotonic()
2489
+ internal_req = _to_internal_request(request)
2490
+ target = await _resolve_agent_instance(
2491
+ request=internal_req,
2492
+ registry=registry,
2493
+ access_token=access_token,
2494
+ control_plane_url=get_runtime_context().config.control_plane_url,
2495
+ )
2496
+ payloads = [
2497
+ payload
2498
+ async for payload in _iterate_runtime_event_payloads(
2499
+ target.definition,
2500
+ internal_req,
2501
+ access_token=access_token,
2502
+ team_id=target.team_id,
2503
+ registry=registry,
2504
+ exchange_id=exchange_id,
2505
+ )
2506
+ ]
2507
+ session_id: str | None = request.effective_session_id()
2508
+ eval_session_id = session_id or str(uuid4())
2509
+ user_id_str = request.effective_user_id() or "unknown"
2510
+ _emit_turn_completed(
2511
+ container,
2512
+ session_id=session_id,
2513
+ exchange_id=exchange_id,
2514
+ user_id=user_id_str,
2515
+ team_id=target.team_id,
2516
+ agent_instance_id=request.agent_instance_id,
2517
+ template_agent_id=target.definition.agent_id,
2518
+ payloads=payloads,
2519
+ turn_start=turn_start,
2520
+ )
2521
+ if session_id:
2522
+ history_store = get_runtime_context().config.history_store
2523
+ if history_store is not None:
2524
+ await _write_turn_history(
2525
+ session_id=session_id,
2526
+ user_id=user_id_str,
2527
+ request_message=request.input,
2528
+ payloads=payloads,
2529
+ history_store=history_store,
2530
+ team_id=target.team_id,
2531
+ agent_instance_id=request.agent_instance_id,
2532
+ exchange_id=exchange_id,
2533
+ resume_payload=request.resume_payload,
2534
+ )
2535
+ return _build_eval_trace(
2536
+ payloads=payloads,
2537
+ input_text=request.input or "",
2538
+ agent_id=target.definition.agent_id,
2539
+ session_id=eval_session_id,
2540
+ turn_start=turn_start,
2541
+ )
2542
+
2331
2543
  @router.post(
2332
2544
  "/execute/stream",
2333
2545
  )
@@ -2,8 +2,10 @@ from .completion import completion_candidates
2
2
  from .entrypoint import build_parser, main
3
3
  from .history_display import (
4
4
  build_hitl_resume_payload,
5
+ print_eval_trace,
5
6
  print_history,
6
7
  print_runtime_event,
8
+ run_eval_turn,
7
9
  run_single_turn,
8
10
  )
9
11
  from .kpi_display import (
@@ -19,6 +21,8 @@ from .kpi_display import (
19
21
  from .pod_client import DEFAULT_AGENT_POD_BASE_URL, AgentPodClient
20
22
  from .repl import run_interactive_chat
21
23
  from .repl_helpers import (
24
+ ExecutionMode,
25
+ execution_mode_color,
22
26
  execution_mode_label,
23
27
  fmt_bytes,
24
28
  parse_mode_command,
@@ -33,6 +37,7 @@ from .url_helpers import (
33
37
  __all__ = [
34
38
  "AgentPodClient",
35
39
  "DEFAULT_AGENT_POD_BASE_URL",
40
+ "ExecutionMode",
36
41
  "HistogramSeriesSummary",
37
42
  "PrometheusSample",
38
43
  "build_hitl_resume_payload",
@@ -40,7 +45,9 @@ __all__ = [
40
45
  "completion_candidates",
41
46
  "default_agent_metrics_url",
42
47
  "default_agent_pod_base_url",
48
+ "execution_mode_color",
43
49
  "execution_mode_label",
50
+ "print_eval_trace",
44
51
  "filter_prometheus_samples",
45
52
  "fmt_bytes",
46
53
  "format_metric_value",
@@ -53,6 +60,7 @@ __all__ = [
53
60
  "print_history",
54
61
  "print_runtime_event",
55
62
  "render_kpi_report",
63
+ "run_eval_turn",
56
64
  "run_interactive_chat",
57
65
  "run_single_turn",
58
66
  "summarize_prometheus_histograms",
@@ -49,7 +49,7 @@ def completion_candidates(
49
49
  return [sid for sid in session_ids if sid.startswith(prefix)]
50
50
  if stripped.startswith("/mode "):
51
51
  prefix = stripped.removeprefix("/mode ").strip()
52
- return [mode for mode in ("final", "stream") if mode.startswith(prefix)]
52
+ return [mode for mode in ("eval", "final", "stream") if mode.startswith(prefix)]
53
53
  if stripped.startswith("/"):
54
54
  return complete_slash_commands(stripped, commands=_COMMANDS)
55
55
  return []
@@ -408,3 +408,152 @@ def build_hitl_resume_payload(
408
408
  if 0 <= idx < len(choices):
409
409
  selected_choice_id = str(choices[idx].get("id", raw_response))
410
410
  return {"choice_id": selected_choice_id}
411
+
412
+
413
+ def print_eval_trace(trace: dict[str, Any], *, color_enabled: bool) -> None:
414
+ """Render one EvalTrace dict (from POST /agents/evaluate) to the terminal."""
415
+ sep = colorize(" " + "─" * 62, color=ANSI_DIM, enabled=color_enabled)
416
+ print(colorize(" EvalTrace", color=ANSI_CYAN, enabled=color_enabled, bold=True))
417
+ print(sep)
418
+
419
+ def _field(label: str, value: str, color: str = ANSI_DIM) -> None:
420
+ print(
421
+ colorize(f" {label:<18}", color=ANSI_DIM, enabled=color_enabled)
422
+ + colorize(value, color=color, enabled=color_enabled)
423
+ )
424
+
425
+ _field("agent", trace.get("agent_id") or "-", ANSI_CYAN)
426
+ _field("session", trace.get("session_id") or "-")
427
+ _field("latency", f"{trace.get('latency_ms', 0)} ms")
428
+ _field("model", trace.get("model_name") or "-")
429
+ _field("finish", trace.get("finish_reason") or "-")
430
+
431
+ tu = trace.get("token_usage") or {}
432
+ if tu:
433
+ _field(
434
+ "tokens",
435
+ f"{tu.get('input_tokens', 0)}↑ in {tu.get('output_tokens', 0)}↓ out",
436
+ )
437
+
438
+ tools_called: list[str] = list(trace.get("tools_called") or [])
439
+ if tools_called:
440
+ _field("tools_called", " ".join(tools_called), ANSI_YELLOW)
441
+
442
+ retrieval_ctx: list[str] = list(trace.get("retrieval_context") or [])
443
+ _field("retrieval_ctx", str(len(retrieval_ctx)) + " chunk(s)")
444
+
445
+ steps: list[dict[str, Any]] = list(trace.get("steps") or [])
446
+ _field("steps", str(len(steps)))
447
+
448
+ err = trace.get("error")
449
+ _field("error", err or "none", ANSI_RED if err else ANSI_DIM)
450
+
451
+ if steps:
452
+ print()
453
+ print(colorize(" Steps:", color=ANSI_DIM, enabled=color_enabled, bold=True))
454
+ for i, step in enumerate(steps, 1):
455
+ kind = step.get("kind", "?")
456
+ name = step.get("tool_name") or ""
457
+ if kind == "tool_call":
458
+ raw_args = step.get("arguments")
459
+ args_str = (
460
+ json.dumps(raw_args, ensure_ascii=False)
461
+ if raw_args is not None
462
+ else ""
463
+ )
464
+ args_str = (args_str[:80] + "…") if len(args_str) > 80 else args_str
465
+ print(
466
+ colorize(f" {i:>3} ", color=ANSI_DIM, enabled=color_enabled)
467
+ + colorize(
468
+ "[tool_call] ", color=ANSI_YELLOW, enabled=color_enabled
469
+ )
470
+ + colorize(
471
+ name, color=ANSI_YELLOW, enabled=color_enabled, bold=True
472
+ )
473
+ )
474
+ if args_str:
475
+ print(
476
+ colorize(
477
+ f" {args_str}", color=ANSI_DIM, enabled=color_enabled
478
+ )
479
+ )
480
+ elif kind == "tool_result":
481
+ is_err = step.get("is_error", False)
482
+ rc = ANSI_RED if is_err else ANSI_GREEN
483
+ content = str(step.get("content") or "")
484
+ content = (content[:80] + "…") if len(content) > 80 else content
485
+ print(
486
+ colorize(f" {i:>3} ", color=ANSI_DIM, enabled=color_enabled)
487
+ + colorize("[tool_result] ", color=rc, enabled=color_enabled)
488
+ + colorize(name, color=rc, enabled=color_enabled, bold=True)
489
+ + colorize(
490
+ " (error)" if is_err else " (ok)",
491
+ color=rc,
492
+ enabled=color_enabled,
493
+ )
494
+ )
495
+ if content:
496
+ print(
497
+ colorize(
498
+ f" {content}", color=ANSI_DIM, enabled=color_enabled
499
+ )
500
+ )
501
+ elif kind == "node_error":
502
+ msg = str(step.get("error_message") or "")
503
+ print(
504
+ colorize(f" {i:>3} ", color=ANSI_DIM, enabled=color_enabled)
505
+ + colorize("[node_error] ", color=ANSI_RED, enabled=color_enabled)
506
+ + colorize(
507
+ step.get("node_id") or "",
508
+ color=ANSI_RED,
509
+ enabled=color_enabled,
510
+ bold=True,
511
+ )
512
+ )
513
+ if msg:
514
+ print(
515
+ colorize(f" {msg}", color=ANSI_DIM, enabled=color_enabled)
516
+ )
517
+ elif kind == "final":
518
+ print(
519
+ colorize(f" {i:>3} ", color=ANSI_DIM, enabled=color_enabled)
520
+ + colorize(
521
+ "[final]", color=ANSI_GREEN, enabled=color_enabled, bold=True
522
+ )
523
+ )
524
+ else:
525
+ print(
526
+ colorize(f" {i:>3} ", color=ANSI_DIM, enabled=color_enabled)
527
+ + colorize(f"[{kind}]", color=ANSI_DIM, enabled=color_enabled)
528
+ )
529
+
530
+ output = trace.get("output") or ""
531
+ if output:
532
+ print()
533
+ print(colorize(" Output:", color=ANSI_DIM, enabled=color_enabled, bold=True))
534
+ print(sep)
535
+ print(output)
536
+
537
+ print(sep)
538
+
539
+
540
+ def run_eval_turn(
541
+ *,
542
+ client: AgentPodClient,
543
+ agent_id: str,
544
+ message: str,
545
+ session_id: str,
546
+ user_id: str,
547
+ team_id: str | None,
548
+ color_enabled: bool,
549
+ ) -> int:
550
+ """Call /agents/evaluate and pretty-print the EvalTrace."""
551
+ result = client.evaluate(
552
+ agent_id=agent_id,
553
+ message=message,
554
+ session_id=session_id,
555
+ user_id=user_id,
556
+ team_id=team_id,
557
+ )
558
+ print_eval_trace(result, color_enabled=color_enabled)
559
+ return 0 if result.get("error") is None else 1
@@ -96,6 +96,41 @@ class AgentPodClient:
96
96
  raise RuntimeError("Execute response must be a JSON object.")
97
97
  return result
98
98
 
99
+ def evaluate(
100
+ self,
101
+ *,
102
+ agent_id: str,
103
+ message: str,
104
+ session_id: str,
105
+ user_id: str,
106
+ team_id: str | None = None,
107
+ agent_instance_id: str | None = None,
108
+ checkpoint_id: str | None = None,
109
+ ) -> dict[str, Any]:
110
+ runtime_context: dict[str, Any] = {"user_id": user_id}
111
+ if team_id:
112
+ runtime_context["team_id"] = team_id
113
+ payload: dict[str, Any] = {
114
+ "agent_id": agent_id,
115
+ "input": message,
116
+ "session_id": session_id,
117
+ "runtime_context": runtime_context,
118
+ }
119
+ if agent_instance_id is not None:
120
+ payload["agent_instance_id"] = agent_instance_id
121
+ if checkpoint_id is not None:
122
+ payload["checkpoint_id"] = checkpoint_id
123
+ response = self.http_client.post(
124
+ f"{self.base_url}/agents/evaluate",
125
+ json=payload,
126
+ headers=self._auth_headers(),
127
+ )
128
+ response.raise_for_status()
129
+ result = response.json()
130
+ if not isinstance(result, dict):
131
+ raise RuntimeError("Evaluate response must be a JSON object.")
132
+ return result
133
+
99
134
  def stream_events(
100
135
  self,
101
136
  *,