fred-runtime 2.0.0__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/PKG-INFO +1 -1
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/agent_app.py +251 -39
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/__init__.py +8 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/completion.py +1 -1
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/history_display.py +149 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/pod_client.py +35 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/repl.py +68 -53
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/repl_helpers.py +30 -16
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/client.py +8 -0
- fred_runtime-2.0.2/fred_runtime/eval/__init__.py +13 -0
- fred_runtime-2.0.2/fred_runtime/eval/collector.py +143 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/PKG-INFO +1 -1
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/SOURCES.txt +3 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/pyproject.toml +1 -1
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_agent_app.py +150 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_client.py +6 -4
- fred_runtime-2.0.2/tests/test_eval_trace.py +314 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/README.md +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/_catalogs.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/config.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/config_loader.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/container.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/context.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/dependencies.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/mcp_config.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/observability_factory.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/app/openai_compat_router.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/entrypoint.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/kpi_display.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/cli/url_helpers.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/context_aware_tool.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_base_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_fast_text_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_http_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_logs_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_markdown_media_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_vectorsearch_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/kf_workspace_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_interceptors.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_runtime.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_toolkit.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/mcp_utils.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/structures.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/token_expiry.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/common/tool_node_utils.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/deep/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/deep/deep_runtime.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/graph/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/graph/graph_runtime.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/v2_runtime/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/integrations/v2_runtime/adapters.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/catalog.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/contracts.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/provider.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/model_routing/resolver.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_langchain_adapter.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_message_codec.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_model_adapter.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_prompting.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_runtime.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_stream_adapter.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_binding.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_loop.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_rendering.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_resolution.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tool_utils.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/react/react_tracing.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_context.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/checkpoints.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/model_metadata.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/request_context_helpers.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/sql_checkpointer.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/runtime_support/user_token_refresher.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/__init__.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/filesystem_context.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/tool_approval.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime/support/tool_loop.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/dependency_links.txt +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/entry_points.txt +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/requires.txt +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/fred_runtime.egg-info/top_level.txt +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/setup.cfg +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_config_loader.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_context.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_graph_runtime_observability.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_history.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_kf_workspace_client.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_kpi_display.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_mcp_config.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_openai_compat_router.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_smoke.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_url_helpers.py +0 -0
- {fred_runtime-2.0.0 → fred_runtime-2.0.2}/tests/test_user_token_refresher.py +0 -0
|
@@ -62,6 +62,7 @@ from fred_core.logs.log_setup import log_setup
|
|
|
62
62
|
from fred_core.logs.memory_log_store import RamLogStore
|
|
63
63
|
from fred_core.security.oidc import get_keycloak_client_id, get_keycloak_url
|
|
64
64
|
from fred_core.security.structure import KeycloakUser
|
|
65
|
+
from fred_sdk.contracts.eval import EvalStep, EvalTrace
|
|
65
66
|
from fred_sdk.contracts.context import (
|
|
66
67
|
AgentInvocationRequest,
|
|
67
68
|
AgentInvocationResult,
|
|
@@ -558,11 +559,13 @@ class LocalRegistryAgentInvoker(AgentInvokerPort):
|
|
|
558
559
|
is_error=True,
|
|
559
560
|
)
|
|
560
561
|
|
|
562
|
+
context_dict = request.context.model_dump(mode="json")
|
|
563
|
+
context_dict.setdefault("execution_action", ExecutionGrantAction.EXECUTE.value)
|
|
561
564
|
execute_request = _AgentExecuteRequest.model_construct(
|
|
562
565
|
agent_id=request.agent_id,
|
|
563
566
|
agent_instance_id=None,
|
|
564
567
|
message=request.message,
|
|
565
|
-
context=
|
|
568
|
+
context=context_dict,
|
|
566
569
|
resume_payload=None,
|
|
567
570
|
)
|
|
568
571
|
|
|
@@ -838,17 +841,19 @@ def _apply_runtime_tuning(
|
|
|
838
841
|
- `definition = _apply_runtime_tuning(template_definition, resolution.tuning)`
|
|
839
842
|
"""
|
|
840
843
|
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
)
|
|
844
|
+
update: dict[str, object] = {
|
|
845
|
+
"role": tuning.role,
|
|
846
|
+
"description": tuning.description,
|
|
847
|
+
"tags": tuple(tuning.tags),
|
|
848
|
+
"fields": tuple(field.model_copy(deep=True) for field in tuning.fields),
|
|
849
|
+
"default_mcp_servers": tuple(
|
|
850
|
+
server.model_copy(deep=True) for server in tuning.mcp_servers
|
|
851
|
+
),
|
|
852
|
+
}
|
|
853
|
+
system_prompt = tuning.values.get("prompts.system")
|
|
854
|
+
if isinstance(system_prompt, str) and system_prompt.strip():
|
|
855
|
+
update["system_prompt_template"] = system_prompt
|
|
856
|
+
return definition.model_copy(update=update)
|
|
852
857
|
|
|
853
858
|
|
|
854
859
|
def _available_mcp_servers_for_definition(
|
|
@@ -1377,6 +1382,120 @@ def _sse(payload: str) -> str:
|
|
|
1377
1382
|
return f"data: {payload}\n\n"
|
|
1378
1383
|
|
|
1379
1384
|
|
|
1385
|
+
@dataclass(frozen=True)
|
|
1386
|
+
class _TurnOutcome:
|
|
1387
|
+
model_name: str | None
|
|
1388
|
+
finish_reason: str
|
|
1389
|
+
token_usage: dict[str, Any] | None
|
|
1390
|
+
input_tokens: int | None
|
|
1391
|
+
output_tokens: int | None
|
|
1392
|
+
tool_count: int
|
|
1393
|
+
is_error: bool
|
|
1394
|
+
total_ms: int
|
|
1395
|
+
final_content: str | None
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
def _parse_turn_outcome(
    payloads: list[dict[str, Any]],
    turn_start: float,
) -> _TurnOutcome:
    """Summarise the event payloads of one turn into a ``_TurnOutcome``.

    Args:
        payloads: Event payload dicts of the turn; each is inspected through
            its ``"kind"`` key.
        turn_start: ``time.monotonic()`` reading taken when the turn began.

    Returns:
        A ``_TurnOutcome`` holding the elapsed time, the tool-call count, the
        error flag and the fields copied from the last ``"final"`` payload.
    """
    # Measure first so the scan below is not counted in the latency.
    elapsed_ms = int((time.monotonic() - turn_start) * 1000)

    tool_calls = 0
    failed = False
    last_final: dict[str, Any] | None = None
    for event in payloads:
        kind = event.get("kind")
        if kind == "tool_call":
            tool_calls += 1
        if kind == "final":
            # Keep overwriting: the last "final" payload wins.
            last_final = event
        if kind == "execution_error":
            failed = True

    if last_final:
        model_name = last_final.get("model_name")
        usage: dict[str, Any] | None = last_final.get("token_usage")
        reported_finish = last_final.get("finish_reason") or ""
        content = last_final.get("content") or None
    else:
        model_name = None
        usage = None
        reported_finish = ""
        content = None

    # An execution error overrides whatever the final payload reported.
    finish_reason = "error" if failed else reported_finish

    if usage:
        tokens_in = usage.get("input_tokens")
        tokens_out = usage.get("output_tokens")
    else:
        tokens_in = None
        tokens_out = None

    return _TurnOutcome(
        model_name=model_name,
        finish_reason=finish_reason,
        token_usage=usage,
        input_tokens=tokens_in,
        output_tokens=tokens_out,
        tool_count=tool_calls,
        is_error=failed,
        total_ms=elapsed_ms,
        final_content=content,
    )
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
def _build_eval_trace(
    payloads: list[dict[str, Any]],
    input_text: str,
    agent_id: str,
    session_id: str,
    turn_start: float,
) -> EvalTrace:
    """Assemble the ``EvalTrace`` for one turn from its event payloads.

    Args:
        payloads: Event payload dicts of the turn, in emission order.
        input_text: The user input that started the turn.
        agent_id: Identifier recorded as the trace's agent.
        session_id: Identifier recorded as the trace's session.
        turn_start: ``time.monotonic()`` reading taken when the turn began.

    Returns:
        An ``EvalTrace`` carrying the ordered steps, the tool names that were
        called, the retrieval context gathered from successful tool results,
        and the summary produced by ``_parse_turn_outcome``.
    """
    outcome = _parse_turn_outcome(payloads, turn_start)

    trace_steps: list[EvalStep] = []
    context_chunks: list[str] = []
    tool_names: list[str] = []
    failure: str | None = None

    for event in payloads:
        kind = event.get("kind")

        if kind == "tool_call":
            trace_steps.append(
                EvalStep(
                    kind="tool_call",
                    tool_name=event.get("tool_name"),
                    call_id=event.get("call_id"),
                    arguments=event.get("arguments") or {},
                )
            )
            # Unnamed calls still produce a step but are not listed by name.
            if event.get("tool_name"):
                tool_names.append(event["tool_name"])
            continue

        if kind == "tool_result":
            body = event.get("content", "")
            errored = event.get("is_error", False)
            trace_steps.append(
                EvalStep(
                    kind="tool_result",
                    tool_name=event.get("tool_name"),
                    call_id=event.get("call_id"),
                    content=body,
                    is_error=errored,
                )
            )
            if errored:
                # Failed tool results never feed the retrieval context.
                continue
            sources = event.get("sources") or []
            if sources:
                # Structured sources take precedence over the raw content.
                context_chunks.extend(
                    src["content"] for src in sources if src.get("content")
                )
            elif body:
                context_chunks.append(body)
            continue

        if kind == "final":
            trace_steps.append(EvalStep(kind="final", content=event.get("content")))
            continue

        if kind == "node_error":
            trace_steps.append(
                EvalStep(
                    kind="node_error",
                    node_id=event.get("node_id"),
                    error_message=event.get("error_message"),
                )
            )
            continue

        if kind == "awaiting_human":
            trace_steps.append(EvalStep(kind="awaiting_human"))
            continue

        if kind == "execution_error":
            # No step is recorded; the last error message becomes trace.error.
            failure = event.get("message")

    return EvalTrace(
        session_id=session_id,
        agent_id=agent_id,
        input=input_text,
        output=outcome.final_content,
        error=failure,
        latency_ms=outcome.total_ms,
        model_name=outcome.model_name,
        token_usage=outcome.token_usage,
        # An empty finish reason is reported as None.
        finish_reason=outcome.finish_reason or None,
        steps=tuple(trace_steps),
        retrieval_context=tuple(context_chunks),
        tools_called=tuple(tool_names),
    )
|
|
1497
|
+
|
|
1498
|
+
|
|
1380
1499
|
def _emit_turn_completed(
|
|
1381
1500
|
container: PodApplicationContext,
|
|
1382
1501
|
*,
|
|
@@ -1408,21 +1527,7 @@ def _emit_turn_completed(
|
|
|
1408
1527
|
"""
|
|
1409
1528
|
try:
|
|
1410
1529
|
kpi = get_runtime_context().get_kpi_writer()
|
|
1411
|
-
|
|
1412
|
-
tool_count = sum(1 for p in payloads if p.get("kind") == "tool_call")
|
|
1413
|
-
final = next((p for p in reversed(payloads) if p.get("kind") == "final"), None)
|
|
1414
|
-
is_error = any(p.get("kind") == "execution_error" for p in payloads)
|
|
1415
|
-
model_name: str | None = final.get("model_name") if final else None
|
|
1416
|
-
finish_reason: str = (
|
|
1417
|
-
"error" if is_error else (final.get("finish_reason") or "") if final else ""
|
|
1418
|
-
)
|
|
1419
|
-
token_usage: dict[str, Any] | None = final.get("token_usage") if final else None
|
|
1420
|
-
input_tokens: int | None = (
|
|
1421
|
-
token_usage.get("input_tokens") if token_usage else None
|
|
1422
|
-
)
|
|
1423
|
-
output_tokens: int | None = (
|
|
1424
|
-
token_usage.get("output_tokens") if token_usage else None
|
|
1425
|
-
)
|
|
1530
|
+
outcome = _parse_turn_outcome(payloads, turn_start)
|
|
1426
1531
|
runtime_id = get_runtime_context().config.service_name
|
|
1427
1532
|
|
|
1428
1533
|
# Prometheus-safe dims: low-cardinality only.
|
|
@@ -1433,25 +1538,25 @@ def _emit_turn_completed(
|
|
|
1433
1538
|
"team_id": team_id,
|
|
1434
1539
|
"template_agent_id": template_agent_id,
|
|
1435
1540
|
"runtime_id": runtime_id,
|
|
1436
|
-
"model_name": model_name,
|
|
1437
|
-
"finish_reason": finish_reason,
|
|
1541
|
+
"model_name": outcome.model_name,
|
|
1542
|
+
"finish_reason": outcome.finish_reason,
|
|
1438
1543
|
}
|
|
1439
1544
|
|
|
1440
1545
|
kpi.emit(
|
|
1441
1546
|
name="agent.turn_completed",
|
|
1442
1547
|
type="timer",
|
|
1443
|
-
value=total_ms,
|
|
1548
|
+
value=outcome.total_ms,
|
|
1444
1549
|
unit="ms",
|
|
1445
1550
|
dims=prom_dims,
|
|
1446
1551
|
quantities={
|
|
1447
|
-
"tool_count": tool_count,
|
|
1448
|
-
"input_tokens": input_tokens,
|
|
1449
|
-
"output_tokens": output_tokens,
|
|
1552
|
+
"tool_count": outcome.tool_count,
|
|
1553
|
+
"input_tokens": outcome.input_tokens,
|
|
1554
|
+
"output_tokens": outcome.output_tokens,
|
|
1450
1555
|
},
|
|
1451
1556
|
actor=KPIActor(type="system"),
|
|
1452
1557
|
)
|
|
1453
1558
|
|
|
1454
|
-
if is_error:
|
|
1559
|
+
if outcome.is_error:
|
|
1455
1560
|
kpi.emit(
|
|
1456
1561
|
name="agent.turn_error_total",
|
|
1457
1562
|
type="counter",
|
|
@@ -1468,12 +1573,12 @@ def _emit_turn_completed(
|
|
|
1468
1573
|
"session_id": session_id,
|
|
1469
1574
|
"exchange_id": exchange_id,
|
|
1470
1575
|
"user_id": user_id,
|
|
1471
|
-
"total_ms": total_ms,
|
|
1472
|
-
"is_error": is_error,
|
|
1576
|
+
"total_ms": outcome.total_ms,
|
|
1577
|
+
"is_error": outcome.is_error,
|
|
1473
1578
|
**prom_dims,
|
|
1474
|
-
"tool_count": tool_count,
|
|
1475
|
-
"input_tokens": input_tokens,
|
|
1476
|
-
"output_tokens": output_tokens,
|
|
1579
|
+
"tool_count": outcome.tool_count,
|
|
1580
|
+
"input_tokens": outcome.input_tokens,
|
|
1581
|
+
"output_tokens": outcome.output_tokens,
|
|
1477
1582
|
},
|
|
1478
1583
|
)
|
|
1479
1584
|
with container._kpi_turns_lock:
|
|
@@ -2328,6 +2433,113 @@ def _build_agent_router(
|
|
|
2328
2433
|
)
|
|
2329
2434
|
return _terminal_execute_payload(payloads)
|
|
2330
2435
|
|
|
2436
|
+
@router.post(
    "/evaluate",
    response_model=EvalTrace,
)
async def evaluate(
    request: RuntimeExecuteRequest,
    http_request: Request,
    authenticated_user: KeycloakUser | None = Depends(_authenticated_user),
    container: PodApplicationContext = Depends(get_pod_container),
) -> EvalTrace:
    """
    Run a single agent turn and answer with the full EvalTrace as JSON.

    POST <configured base_url>/agents/evaluate
    Authorization: Bearer <user JWT>
    Body:     RuntimeExecuteRequest
    Response: EvalTrace — synchronous, no SSE, no Langfuse dependency

    Meant for evaluation harnesses (DeepEval, Promptfoo) that need
    input, output, retrieval_context, tools_called, and steps in one response.

    Raises HTTPException (403) when the execution grant is rejected.
    """
    # NOTE(review): only the exact "Bearer " prefix is stripped; any other
    # header value is forwarded unchanged as the token. Confirm this is
    # intended.
    authorization = http_request.headers.get("Authorization", "")
    access_token = authorization.removeprefix("Bearer ").strip() or None

    expected_action = _expected_execution_action(request)

    # Grant validation: a rejected grant is audited, then surfaced as 403.
    try:
        validate_execution_grant(request, expected_action=expected_action)
    except ExecutionGrantViolation as exc:
        _emit_audit_event(
            container,
            "warning",
            "grant_validation_failed",
            agent_instance_id=request.agent_instance_id,
            user_id=request.effective_user_id(),
            action=expected_action.value,
            reason=str(exc),
        )
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=str(exc))
    if request.execution_grant is not None:
        _emit_audit_event(
            container,
            "info",
            "grant_validated",
            agent_instance_id=request.agent_instance_id,
            user_id=request.effective_user_id(),
            action=expected_action.value,
        )
    _validate_grant_user_correlation(request, authenticated_user, container)
    await _validate_session_checkpoint_access(request)

    # Execute the turn and buffer every event payload it emits.
    exchange_id = str(uuid4())
    turn_start = time.monotonic()
    internal_req = _to_internal_request(request)
    target = await _resolve_agent_instance(
        request=internal_req,
        registry=registry,
        access_token=access_token,
        control_plane_url=get_runtime_context().config.control_plane_url,
    )
    payloads = []
    async for emitted in _iterate_runtime_event_payloads(
        target.definition,
        internal_req,
        access_token=access_token,
        team_id=target.team_id,
        registry=registry,
        exchange_id=exchange_id,
    ):
        payloads.append(emitted)

    session_id: str | None = request.effective_session_id()
    # The trace always carries a session id, so one is invented when the
    # request has none; KPI and history below keep using the real value.
    eval_session_id = session_id or str(uuid4())
    user_id_str = request.effective_user_id() or "unknown"

    _emit_turn_completed(
        container,
        session_id=session_id,
        exchange_id=exchange_id,
        user_id=user_id_str,
        team_id=target.team_id,
        agent_instance_id=request.agent_instance_id,
        template_agent_id=target.definition.agent_id,
        payloads=payloads,
        turn_start=turn_start,
    )

    # History is written only for real sessions with a configured store.
    if (
        session_id
        and (history_store := get_runtime_context().config.history_store)
        is not None
    ):
        await _write_turn_history(
            session_id=session_id,
            user_id=user_id_str,
            request_message=request.input,
            payloads=payloads,
            history_store=history_store,
            team_id=target.team_id,
            agent_instance_id=request.agent_instance_id,
            exchange_id=exchange_id,
            resume_payload=request.resume_payload,
        )

    return _build_eval_trace(
        payloads=payloads,
        input_text=request.input or "",
        agent_id=target.definition.agent_id,
        session_id=eval_session_id,
        turn_start=turn_start,
    )
|
|
2542
|
+
|
|
2331
2543
|
@router.post(
|
|
2332
2544
|
"/execute/stream",
|
|
2333
2545
|
)
|
|
@@ -2,8 +2,10 @@ from .completion import completion_candidates
|
|
|
2
2
|
from .entrypoint import build_parser, main
|
|
3
3
|
from .history_display import (
|
|
4
4
|
build_hitl_resume_payload,
|
|
5
|
+
print_eval_trace,
|
|
5
6
|
print_history,
|
|
6
7
|
print_runtime_event,
|
|
8
|
+
run_eval_turn,
|
|
7
9
|
run_single_turn,
|
|
8
10
|
)
|
|
9
11
|
from .kpi_display import (
|
|
@@ -19,6 +21,8 @@ from .kpi_display import (
|
|
|
19
21
|
from .pod_client import DEFAULT_AGENT_POD_BASE_URL, AgentPodClient
|
|
20
22
|
from .repl import run_interactive_chat
|
|
21
23
|
from .repl_helpers import (
|
|
24
|
+
ExecutionMode,
|
|
25
|
+
execution_mode_color,
|
|
22
26
|
execution_mode_label,
|
|
23
27
|
fmt_bytes,
|
|
24
28
|
parse_mode_command,
|
|
@@ -33,6 +37,7 @@ from .url_helpers import (
|
|
|
33
37
|
__all__ = [
|
|
34
38
|
"AgentPodClient",
|
|
35
39
|
"DEFAULT_AGENT_POD_BASE_URL",
|
|
40
|
+
"ExecutionMode",
|
|
36
41
|
"HistogramSeriesSummary",
|
|
37
42
|
"PrometheusSample",
|
|
38
43
|
"build_hitl_resume_payload",
|
|
@@ -40,7 +45,9 @@ __all__ = [
|
|
|
40
45
|
"completion_candidates",
|
|
41
46
|
"default_agent_metrics_url",
|
|
42
47
|
"default_agent_pod_base_url",
|
|
48
|
+
"execution_mode_color",
|
|
43
49
|
"execution_mode_label",
|
|
50
|
+
"print_eval_trace",
|
|
44
51
|
"filter_prometheus_samples",
|
|
45
52
|
"fmt_bytes",
|
|
46
53
|
"format_metric_value",
|
|
@@ -53,6 +60,7 @@ __all__ = [
|
|
|
53
60
|
"print_history",
|
|
54
61
|
"print_runtime_event",
|
|
55
62
|
"render_kpi_report",
|
|
63
|
+
"run_eval_turn",
|
|
56
64
|
"run_interactive_chat",
|
|
57
65
|
"run_single_turn",
|
|
58
66
|
"summarize_prometheus_histograms",
|
|
@@ -49,7 +49,7 @@ def completion_candidates(
|
|
|
49
49
|
return [sid for sid in session_ids if sid.startswith(prefix)]
|
|
50
50
|
if stripped.startswith("/mode "):
|
|
51
51
|
prefix = stripped.removeprefix("/mode ").strip()
|
|
52
|
-
return [mode for mode in ("final", "stream") if mode.startswith(prefix)]
|
|
52
|
+
return [mode for mode in ("eval", "final", "stream") if mode.startswith(prefix)]
|
|
53
53
|
if stripped.startswith("/"):
|
|
54
54
|
return complete_slash_commands(stripped, commands=_COMMANDS)
|
|
55
55
|
return []
|
|
@@ -408,3 +408,152 @@ def build_hitl_resume_payload(
|
|
|
408
408
|
if 0 <= idx < len(choices):
|
|
409
409
|
selected_choice_id = str(choices[idx].get("id", raw_response))
|
|
410
410
|
return {"choice_id": selected_choice_id}
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def print_eval_trace(trace: dict[str, Any], *, color_enabled: bool) -> None:
    """Pretty-print one EvalTrace dict (from POST /agents/evaluate).

    Output order: header, summary fields, the numbered step list (when there
    are steps), the agent output (when non-empty), and a closing separator.

    Args:
        trace: EvalTrace payload as a plain dict.
        color_enabled: Passed to ``colorize`` as ``enabled`` for every
            fragment that is printed.
    """

    def paint(text: str, tone: str, **style: Any) -> str:
        # Single place that threads color_enabled into colorize.
        return colorize(text, color=tone, enabled=color_enabled, **style)

    def _field(label: str, value: str, color: str = ANSI_DIM) -> None:
        # Label column is left-aligned and padded to 18 characters.
        print(paint(f" {label:<18}", ANSI_DIM) + paint(value, color))

    def clip(text: str) -> str:
        # Keep detail lines short: at most 80 characters plus an ellipsis.
        return (text[:80] + "…") if len(text) > 80 else text

    rule = paint(" " + "─" * 62, ANSI_DIM)
    print(paint(" EvalTrace", ANSI_CYAN, bold=True))
    print(rule)

    _field("agent", trace.get("agent_id") or "-", ANSI_CYAN)
    _field("session", trace.get("session_id") or "-")
    _field("latency", f"{trace.get('latency_ms', 0)} ms")
    _field("model", trace.get("model_name") or "-")
    _field("finish", trace.get("finish_reason") or "-")

    usage = trace.get("token_usage") or {}
    if usage:
        _field(
            "tokens",
            f"{usage.get('input_tokens', 0)}↑ in {usage.get('output_tokens', 0)}↓ out",
        )

    tool_names: list[str] = list(trace.get("tools_called") or [])
    if tool_names:
        _field("tools_called", " ".join(tool_names), ANSI_YELLOW)

    chunks: list[str] = list(trace.get("retrieval_context") or [])
    _field("retrieval_ctx", str(len(chunks)) + " chunk(s)")

    steps: list[dict[str, Any]] = list(trace.get("steps") or [])
    _field("steps", str(len(steps)))

    failure = trace.get("error")
    _field("error", failure or "none", ANSI_RED if failure else ANSI_DIM)

    if steps:
        print()
        print(paint(" Steps:", ANSI_DIM, bold=True))
        for index, step in enumerate(steps, 1):
            kind = step.get("kind", "?")
            name = step.get("tool_name") or ""
            prefix = paint(f" {index:>3} ", ANSI_DIM)
            detail = ""  # optional second line printed under the headline

            if kind == "tool_call":
                raw_args = step.get("arguments")
                detail = clip(
                    ""
                    if raw_args is None
                    else json.dumps(raw_args, ensure_ascii=False)
                )
                headline = (
                    prefix
                    + paint("[tool_call] ", ANSI_YELLOW)
                    + paint(name, ANSI_YELLOW, bold=True)
                )
            elif kind == "tool_result":
                failed = step.get("is_error", False)
                tone = ANSI_RED if failed else ANSI_GREEN
                detail = clip(str(step.get("content") or ""))
                headline = (
                    prefix
                    + paint("[tool_result] ", tone)
                    + paint(name, tone, bold=True)
                    + paint(" (error)" if failed else " (ok)", tone)
                )
            elif kind == "node_error":
                detail = str(step.get("error_message") or "")
                headline = (
                    prefix
                    + paint("[node_error] ", ANSI_RED)
                    + paint(step.get("node_id") or "", ANSI_RED, bold=True)
                )
            elif kind == "final":
                headline = prefix + paint("[final]", ANSI_GREEN, bold=True)
            else:
                # Unknown kinds are still listed so nothing is hidden.
                headline = prefix + paint(f"[{kind}]", ANSI_DIM)

            print(headline)
            if detail:
                print(paint(f" {detail}", ANSI_DIM))

    output = trace.get("output") or ""
    if output:
        print()
        print(paint(" Output:", ANSI_DIM, bold=True))
        print(rule)
        print(output)

    print(rule)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def run_eval_turn(
    *,
    client: AgentPodClient,
    agent_id: str,
    message: str,
    session_id: str,
    user_id: str,
    team_id: str | None,
    color_enabled: bool,
) -> int:
    """Run one turn through /agents/evaluate and print the resulting trace.

    Args:
        client: Pod client whose ``evaluate`` method performs the call.
        agent_id: Agent to evaluate.
        message: User input for the turn.
        session_id: Session the turn belongs to.
        user_id: User on whose behalf the turn runs.
        team_id: Optional team scope, forwarded unchanged.
        color_enabled: Whether the printed trace uses ANSI colours.

    Returns:
        ``0`` when the trace's ``error`` is ``None``, otherwise ``1``.
    """
    trace = client.evaluate(
        agent_id=agent_id,
        message=message,
        session_id=session_id,
        user_id=user_id,
        team_id=team_id,
    )
    print_eval_trace(trace, color_enabled=color_enabled)
    if trace.get("error") is None:
        return 0
    return 1
|
|
@@ -96,6 +96,41 @@ class AgentPodClient:
|
|
|
96
96
|
raise RuntimeError("Execute response must be a JSON object.")
|
|
97
97
|
return result
|
|
98
98
|
|
|
99
|
+
def evaluate(
|
|
100
|
+
self,
|
|
101
|
+
*,
|
|
102
|
+
agent_id: str,
|
|
103
|
+
message: str,
|
|
104
|
+
session_id: str,
|
|
105
|
+
user_id: str,
|
|
106
|
+
team_id: str | None = None,
|
|
107
|
+
agent_instance_id: str | None = None,
|
|
108
|
+
checkpoint_id: str | None = None,
|
|
109
|
+
) -> dict[str, Any]:
|
|
110
|
+
runtime_context: dict[str, Any] = {"user_id": user_id}
|
|
111
|
+
if team_id:
|
|
112
|
+
runtime_context["team_id"] = team_id
|
|
113
|
+
payload: dict[str, Any] = {
|
|
114
|
+
"agent_id": agent_id,
|
|
115
|
+
"input": message,
|
|
116
|
+
"session_id": session_id,
|
|
117
|
+
"runtime_context": runtime_context,
|
|
118
|
+
}
|
|
119
|
+
if agent_instance_id is not None:
|
|
120
|
+
payload["agent_instance_id"] = agent_instance_id
|
|
121
|
+
if checkpoint_id is not None:
|
|
122
|
+
payload["checkpoint_id"] = checkpoint_id
|
|
123
|
+
response = self.http_client.post(
|
|
124
|
+
f"{self.base_url}/agents/evaluate",
|
|
125
|
+
json=payload,
|
|
126
|
+
headers=self._auth_headers(),
|
|
127
|
+
)
|
|
128
|
+
response.raise_for_status()
|
|
129
|
+
result = response.json()
|
|
130
|
+
if not isinstance(result, dict):
|
|
131
|
+
raise RuntimeError("Evaluate response must be a JSON object.")
|
|
132
|
+
return result
|
|
133
|
+
|
|
99
134
|
def stream_events(
|
|
100
135
|
self,
|
|
101
136
|
*,
|