bat-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. add/__init__.py +3 -0
  2. add/client.py +16 -0
  3. bat_cli-0.1.0.dist-info/METADATA +231 -0
  4. bat_cli-0.1.0.dist-info/RECORD +47 -0
  5. bat_cli-0.1.0.dist-info/WHEEL +5 -0
  6. bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
  7. bat_cli-0.1.0.dist-info/top_level.txt +8 -0
  8. build/__init__.py +3 -0
  9. build/build.py +79 -0
  10. cli.py +260 -0
  11. create/__init__.py +3 -0
  12. create/agent.py +312 -0
  13. create/templates/agent/.dockerignore +3 -0
  14. create/templates/agent/.env.template +4 -0
  15. create/templates/agent/.python-version +1 -0
  16. create/templates/agent/Dockerfile +37 -0
  17. create/templates/agent/Makefile +34 -0
  18. create/templates/agent/README.md +1 -0
  19. create/templates/agent/__main__.py +2 -0
  20. create/templates/agent/agent.json.template +12 -0
  21. create/templates/agent/agent.spec +45 -0
  22. create/templates/agent/config.yaml +1 -0
  23. create/templates/agent/llm_client.py.template +36 -0
  24. create/templates/agent/pyproject.toml.template +9 -0
  25. create/templates/agent/src/__init__.py +0 -0
  26. create/templates/agent/src/graph.py +50 -0
  27. create/templates/agent/src/llm_clients/__init__.py +0 -0
  28. create/templates/agent/tests/__init__.py +0 -0
  29. eval/__init__.py +1 -0
  30. eval/commands.py +562 -0
  31. eval/engine/__init__.py +1 -0
  32. eval/engine/adapter.py +251 -0
  33. eval/engine/bench_runner.py +149 -0
  34. eval/engine/contracts.py +115 -0
  35. eval/engine/eval_config.py +294 -0
  36. eval/engine/evaluator.py +85 -0
  37. eval/engine/metrics/__init__.py +1 -0
  38. eval/engine/metrics/llm_evaluators.py +383 -0
  39. eval/engine/metrics/metrics.py +135 -0
  40. eval/engine/metrics/qualitative_helpers.py +64 -0
  41. eval/engine/orchestrator.py +157 -0
  42. eval/engine/plotter.py +347 -0
  43. image_defaults.py +80 -0
  44. push/__init__.py +3 -0
  45. push/push.py +58 -0
  46. set/__init__.py +3 -0
  47. set/env.py +50 -0
@@ -0,0 +1,383 @@
1
+ from __future__ import annotations
2
+
3
+ import concurrent.futures
4
+ import json
5
+ import os
6
+ from typing import Any
7
+
8
+ from langchain_core.messages import HumanMessage
9
+
10
+ from bat.chat_model_client import ChatModelClient, ChatModelClientConfig
11
+ from bat.logging import create_logger
12
+ from ..contracts import QualitativeScores
13
+
14
+ logger = create_logger(__name__, level="info")
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Judge ChatModelClient (bat-adk based)
19
+ # ---------------------------------------------------------------------------
20
+
21
+ _JUDGE_SYSTEM_BASE = "You are a precise evaluator. Always respond with valid JSON only."
22
+
23
+ _judge_clients: dict[str, ChatModelClient] = {}
24
+
25
+
26
def _compose_system(custom: str | None) -> str:
    """Build the system message handed to a judge client.

    When *custom* is empty or ``None`` the shared base instruction is returned
    unchanged. Otherwise *custom* is appended below the base instruction under
    an "AGENT-SPECIFIC CONTEXT" heading.
    """
    if custom:
        heading = (
            "AGENT-SPECIFIC CONTEXT (operator-supplied; use to disambiguate, "
            "do not override the scoring rubric):"
        )
        return f"{_JUDGE_SYSTEM_BASE}\n\n{heading}\n{custom}"
    return _JUDGE_SYSTEM_BASE
34
+
35
+
36
def _get_judge_client(judge_name: str = "default") -> ChatModelClient:
    """
    Lazy-init a ChatModelClient configured for the judge model.

    Environment variables, in order of precedence (empty values count as unset):

    * provider: JUDGE_PROVIDER, then MODEL_PROVIDER, then ``"openai"``
    * model:    JUDGE_MODEL, then MODEL, then ``"gpt-4.1-mini"``
    * base URL: JUDGE_BASE_URL, then BASE_URL, else ``None``

    For named judges (anything other than ``"default"``), JUDGE_PROMPT_<NAME> is
    also read as an optional system-message suffix.

    The created client is stored in the module-level ``_judge_clients`` dict and
    reused on later calls with the same ``judge_name``.
    """
    if judge_name in _judge_clients:
        return _judge_clients[judge_name]

    # ``or`` chains (rather than getenv defaults) so that a variable set to ""
    # falls through to the next candidate.
    provider = os.getenv("JUDGE_PROVIDER") or os.getenv("MODEL_PROVIDER") or "openai"
    # Fall back to MODEL so the model follows the provider when only the agent's
    # MODEL_PROVIDER / MODEL pair is configured.
    model = os.getenv("JUDGE_MODEL") or os.getenv("MODEL") or "gpt-4.1-mini"
    base_url = os.getenv("JUDGE_BASE_URL") or os.getenv("BASE_URL") or None

    custom = None
    if judge_name != "default":
        custom = os.getenv(f"JUDGE_PROMPT_{judge_name.upper()}")

    config = ChatModelClientConfig(
        model=model,
        model_provider=provider,
        base_url=base_url,
        client_name=f"LLMJudge[{judge_name}]",
    )

    client = ChatModelClient(
        chat_model_config=config,
        system_instructions=_compose_system(custom),
    )
    _judge_clients[judge_name] = client
    logger.info(f"LLM Judge initialized: {provider}:{model} [{judge_name}]")
    return client
65
+
66
def _coerce_judge_text(content: Any) -> str:
    """Flatten a judge reply's ``content`` into a single string."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): presumably a list of content blocks (plain strings or
        # dicts carrying a "text" entry) — confirm against ChatModelClient.invoke.
        pieces: list[str] = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return str(content)


def _parse_judge_json(text: str) -> dict[str, Any]:
    """Extract the JSON object from a judge reply.

    Tries, in order: the whole reply, the contents of the first Markdown code
    fence (with an optional ``json`` tag), and the span from the first ``{`` to
    the last ``}``.

    Raises:
        ValueError: if none of the candidates decodes to a JSON object.
    """
    stripped = text.strip()
    # The raw reply goes first so that valid JSON whose strings happen to
    # contain backticks is never mangled by the fence handling.
    candidates = [stripped]

    fence_at = stripped.find("```")
    if fence_at != -1:
        fenced = stripped[fence_at + 3:].split("```", 1)[0].strip()
        if fenced[:4].lower() == "json":
            fenced = fenced[4:].strip()
        candidates.append(fenced)

    first_brace, last_brace = stripped.find("{"), stripped.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(stripped[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    raise ValueError("judge reply did not contain a JSON object")


def _call_llm_judge(prompt: str, judge_name: str = "default", max_retries: int = 2) -> dict[str, Any]:
    """Send *prompt* to the named judge and return its parsed JSON verdict.

    Args:
        prompt: Fully formatted evaluation prompt.
        judge_name: Key passed to ``_get_judge_client`` to select the client.
        max_retries: Total number of attempts (not additional retries).

    Returns:
        The JSON object decoded from the judge's reply. If every attempt fails,
        ``{"score": None, "reasoning": "Error: <last exception>"}``; if
        ``max_retries`` is zero or negative, ``{"score": None, "reasoning":
        "Max retries exceeded"}``.

    Errors raised while creating the client propagate to the caller; errors
    during invocation or parsing are retried and never raised.
    """
    client = _get_judge_client(judge_name)
    for attempt in range(max_retries):
        try:
            logger.info(f"Calling LLM judge (attempt {attempt + 1}/{max_retries}) ")
            response = client.invoke(HumanMessage(content=prompt))
            return _parse_judge_json(_coerce_judge_text(response.content))
        except Exception as exc:
            if attempt == max_retries - 1:
                logger.error(
                    f"LLM judge [{judge_name}] failed after {max_retries} attempt(s): {exc}"
                )
                return {"score": None, "reasoning": f"Error: {exc}"}
            # Log the failure before retrying so it is not lost silently.
            logger.info(
                f"LLM judge [{judge_name}] attempt {attempt + 1}/{max_retries} failed, retrying: {exc}"
            )
    return {"score": None, "reasoning": "Max retries exceeded"}
82
+ # ---------------------------------------------------------------------------
83
+ # Prompt templates
84
+ # ---------------------------------------------------------------------------
85
+
86
+ RESPONSE_RELEVANCE_PROMPT = """You are an evaluator of CONVERSATIONAL RELEVANCE.
87
+
88
+ **User Queries:**
89
+ {query}
90
+
91
+ **Full Conversation:**
92
+ {context}
93
+
94
+ **Final Response:**
95
+ {response}
96
+
97
+ Your job is to judge whether the agent stays on the topic the user raised and avoids detours. There are two axes, scored together on one scale:
98
+
99
+ 1. **On-topic / no detours (primary axis, 0.0–0.8).** Does the agent keep the conversation on the subject the user actually asked about? Penalize:
100
+ - drifting to unrelated subjects mid-conversation
101
+ - addressing something the user never asked about
102
+ - going off on tangents and not coming back
103
+ Reward staying consistently on the user's subject across all turns, even if intermediate steps don't immediately resolve the question.
104
+
105
+ 2. **Response craft (refinement axis, 0.8–1.0).** Of the responses that are on-topic, refine the score based on shape:
106
+ - heavy padding, restating system errors verbatim as new analysis, hedging instead of answering → stay at 0.8
107
+ - clean, direct, proportionate response → 0.9–1.0
108
+ This axis only matters once the on-topic floor of 0.8 is reached. Do NOT lower an on-topic response below 0.8 for padding or verbosity alone — mild verbosity is acceptable.
109
+
110
+ Score bands:
111
+ 1.0 — On-topic throughout, no detours, AND a clean direct response.
112
+ 0.9 — On-topic throughout, no detours, with light padding or one small hedge.
113
+ 0.8 — On-topic throughout, no detours, but noticeable padding / restating / verbosity. This is the floor for "the agent did not go off-topic".
114
+ 0.6 — Mostly on-topic with one meaningful detour that the agent recovered from, OR briefly drifted before returning to the subject.
115
+ 0.4 — Significant off-topic content — a real portion of the conversation is about something the user didn't ask.
116
+ 0.2 — Mostly off-topic, only a small thread relates to the user's actual subject.
117
+ 0.0 — Wrong topic entirely, non-sequitur, raw error dump with no engagement.
118
+
119
+ Do NOT score based on whether the agent's answer is factually correct, whether the deployment succeeded, or whether the expected outcome was reached. Those are scored by other evaluators. An on-topic wrong answer scores at least 0.8 here.
120
+
121
+ Return JSON:
122
+ {{
123
+ "reasoning": "1-2 sentences: first whether the agent stayed on topic / had any detours, then briefly note the response shape if it affected the 0.8–1.0 band.",
124
+ "score": float
125
+ }}
126
+ """
127
+
128
+ TASK_COMPLETION_PROMPT = """You are an evaluator of TASK COMPLETION. The score is driven first by whether the expected outcome was actually reached, then refined by how well the agent executed along the way.
129
+
130
+ **User Queries:**
131
+ {query}
132
+
133
+ **Expected Behavior:**
134
+ {expected_desc}
135
+
136
+ **Conversation:**
137
+ {context}
138
+
139
+ **Final Response:**
140
+ {response}
141
+
142
+ **Actual Final Status:** {status}
143
+
144
+ Start by establishing what was expected and what actually happened. The expected_desc tells you what the final status should be and what the outcome should look like — compare that to the actual final status and the concrete deliverable in the response. This match or mismatch is the dominant factor in your score.
145
+
146
+ If expected status is "completed", the task was meant to finish with a real, concrete result — anything else is a failure. If expected is "input-required", stopping to ask for missing info IS the success condition. If expected is "error", a clean refusal or failure IS the success.
147
+
148
+ Use this as your base score:
149
+
150
+ 1.0 — actual matches expected, with a complete concrete result fully satisfying the stated expectations.
151
+ 0.8 — actual matches expected, minor gaps in the result (small omission, slightly incomplete).
152
+ 0.6 — actual matches expected in status, but the deliverable is shallow or barely meets the bar.
153
+ 0.4 — actual does NOT match expected, but the agent did substantial relevant work and came close.
154
+ 0.2 — actual does NOT match expected, the work was shallow or went off-track early.
155
+ 0.0 — actual does NOT match expected, no meaningful work, refusal, or total failure.
156
+
157
+ When the expected outcome was not reached, the score must be at most 0.4 regardless of effort. Reaching the wrong terminal state is a failure — do not reward process over outcome.
158
+
159
+ After establishing the base score, look at the intermediate steps in the conversation. Even when the agent reached the right terminal status, check whether it made significant errors, unnecessary detours, or wrong turns along the way. If the path had clear missteps (e.g. tried an invalid value multiple times, looped on the same error, went in circles), adjust down by up to 0.2. If the execution was clean, direct, and correct, nudge up by 0.1. For cases where the expected outcome was not reached, intermediate steps can still lift the score from 0.2 to 0.4 if the agent made genuine meaningful progress before diverging.
160
+
161
+ Return JSON:
162
+ {{
163
+ "reasoning": "1-2 sentences summarizing how the actual outcome compared to expected, and noting any significant execution flaws or merits that influenced the score. State the adjustment from the base score based on the execution quality.",
164
+ "score": float
165
+ }}
166
+ """
167
+
168
+ HALLUCINATION_DETECTION_PROMPT = """You are an evaluator of GROUNDEDNESS. Your job is to score how closely the agent's response stays anchored to what the user actually said. Hallucination happens when the agent introduces specifics the user never provided, or silently alters something the user did provide.
169
+
170
+ **User Queries:**
171
+ {query}
172
+
173
+ **All facts the user explicitly stated (ground truth — use this as your checklist):**
174
+ {user_facts}
175
+
176
+ **Expected Behavior:**
177
+ {expected_desc}
178
+
179
+ **Full Conversation:**
180
+ {context}
181
+
182
+ **Final Response:**
183
+ {response}
184
+
185
+ The conversation uses three event labels:
186
+ - [USER] — explicit user input. This is the only ground truth for what the user stated.
187
+ - [AGENT OUTPUT] — content the agent generated and sent to the user. Specific values here are agent assertions, not ground truth — any concrete value here that does not trace back to a [USER] line is a candidate for hallucination.
188
+ - [SYSTEM] — internal runtime status messages. Values returned here are legitimate non-user input and may be used by the agent without being hallucination.
189
+
190
+ Walk through every specific claim the agent makes (concrete value, name, number, identifier, field) and compare it against the user's stated facts. Classify each significant claim:
191
+
192
+ - GROUNDED — the user stated or requested this, and the agent reproduced it faithfully.
193
+ - FABRICATED — the agent introduced a specific user-domain fact or value the user never mentioned.
194
+ - ALTERED — the agent changed something the user did specify (user said X, agent produced Y, where X ≠ Y as written).
195
+
196
+ **Policy: echoed user values.** If the user explicitly stated a value in a [USER] line — valid or invalid for any underlying schema — and the agent reproduces it faithfully, it is GROUNDED, never hallucination. ALTERED applies only when the agent silently substitutes a different value for one the user provided.
197
+
198
+ **Policy: legitimate non-user sources.** Not every specific value an agent emits is a fabrication. Information the agent obtained from a legitimate source other than the user — tool outputs, [SYSTEM] messages, backend responses the agent is relaying, or widely-known public facts — is not hallucination. Flag a claim as FABRICATED only when it is user-domain content (something the user would have had to provide) and the user never provided it.
199
+
200
+ Score bands:
201
+ 1.0 — Every specific claim is GROUNDED or comes from a legitimate non-user source.
202
+ 0.8 — One minor FABRICATED detail, harmless, no effect on outcome.
203
+ 0.6 — One or two non-trivial FABRICATED or ALTERED claims the user would notice.
204
+ 0.4 — Several FABRICATED or ALTERED claims, or a single one that caused a concrete failure or wrong outcome.
205
+ 0.2 — The agent substantially filled in specifics the user never provided; most claims are FABRICATED.
206
+ 0.0 — Almost nothing in the response corresponds to what the user said; the answer is largely the agent's invention.
207
+
208
+ If the response makes no specific claims (only clarifying questions or acknowledged uncertainty), score 1.0. A hallucination that directly caused a task failure weighs more than a harmless one.
209
+
210
+ Return JSON:
211
+ {{
212
+ "reasoning": "List the FABRICATED or ALTERED claims with their impact. If everything is grounded, write 'Fully grounded.'",
213
+ "score": float
214
+ }}
215
+ """
216
+
217
+ TOOL_CALL_APPROPRIATENESS_PROMPT = """You are an evaluator of TOOL USAGE.
218
+
219
+ **User Queries:**
220
+ {query}
221
+
222
+ **Expected Behavior:**
223
+ {expected_desc}
224
+
225
+ **Conversation:**
226
+ {context}
227
+
228
+ **Tool Calls Made (source of truth):**
229
+ {tool_calls}
230
+
231
+ **Final Response:**
232
+ {response}
233
+
234
+ TASK: Evaluate whether the agent's tool usage was appropriate for what was expected.
235
+
236
+ CRITICAL RULE: Score using ONLY the `tool_calls` JSON above as evidence. NEVER infer tool usage from prose in the response — if a tool isn't in `tool_calls`, it wasn't called.
237
+
238
+ SCORE BANDS — use ALL of them:
239
+
240
+ 1.0 — All expected tools called, with correct arguments, in a sensible order. No redundant calls. Results clearly drive the response.
241
+ 0.8 — Right tools called with minor flaws: one redundant call, slightly off arguments that still work, or mild inefficiency in ordering.
242
+ 0.6 — Right idea, flawed execution: most expected tools called but one missing or extra, OR arguments partially wrong but partially recoverable.
243
+ 0.4 — Significant gaps: roughly half the expected tool usage is correct; the other half is missing, wrong, or used with broken arguments.
244
+ 0.2 — Largely wrong: wrong tools selected, OR correct tools called with mostly broken arguments.
245
+ 0.0 — No tool calls when tools were required, OR every tool call is wrong/fabricated, OR the agent claims tool usage in prose with no actual calls in `tool_calls`.
246
+
247
+ ANTI-CLUSTERING: A single missing expected tool call is 0.6, not 0.8. Wrong arguments on the right tool is 0.4–0.6 depending on severity.
248
+
249
+ REASON FIRST, SCORE SECOND. Name the expected tools, mark each as called/missing/wrong, then score.
250
+
251
+ Return JSON:
252
+ {{
253
+ "reasoning": "Map expected tools to actual tool_calls — what's present, missing, wrong",
254
+ "score": float
255
+ }}
256
+ """
257
+
258
+
259
def evaluate_response_relevance(
    query: str,
    response: str,
    context: str = "",
) -> dict[str, Any]:
    """Score conversational relevance with the ``relevance`` judge.

    Fills ``RESPONSE_RELEVANCE_PROMPT`` with the arguments (an empty *context*
    is replaced by a placeholder sentence) and returns whatever
    ``_call_llm_judge`` returns for that prompt.
    """
    fields = {
        "query": query,
        "response": response,
        "context": context if context else "No conversation history available",
    }
    filled = RESPONSE_RELEVANCE_PROMPT.format(**fields)
    return _call_llm_judge(filled, judge_name="relevance")
270
+
271
+
272
def evaluate_task_completion(
    query: str,
    response: str,
    status: str,
    expected_desc: str = "Task should complete successfully",
    context: str = ""
) -> dict[str, Any]:
    """Score task completion with the ``task_completion`` judge.

    Fills ``TASK_COMPLETION_PROMPT`` with the arguments (an empty *context* is
    replaced by a placeholder sentence) and returns whatever
    ``_call_llm_judge`` returns for that prompt.
    """
    fields = {
        "query": query,
        "response": response,
        "status": status,
        "expected_desc": expected_desc,
        "context": context if context else "No conversation history available",
    }
    filled = TASK_COMPLETION_PROMPT.format(**fields)
    return _call_llm_judge(filled, judge_name="task_completion")
288
+
289
+
290
def evaluate_hallucination(
    query: str,
    response: str,
    context: str = "",
    expected_desc: str = "No specific expectations defined.",
    user_facts: str = "No explicit user statements recorded.",
) -> dict[str, Any]:
    """Score groundedness with the ``hallucination`` judge.

    Fills ``HALLUCINATION_DETECTION_PROMPT`` with the arguments (an empty
    *context* is replaced by a placeholder sentence) and returns whatever
    ``_call_llm_judge`` returns for that prompt.
    """
    fields = {
        "query": query,
        "response": response,
        "context": context if context else "No additional context provided",
        "expected_desc": expected_desc,
        "user_facts": user_facts,
    }
    filled = HALLUCINATION_DETECTION_PROMPT.format(**fields)
    return _call_llm_judge(filled, judge_name="hallucination")
305
+
306
+
307
def evaluate_tool_call_appropriateness(
    query: str,
    response: str,
    context: str = "",
    tool_calls: str = "[]",
    expected_desc: str = "Task should complete successfully",
) -> dict[str, Any]:
    """Score tool usage with the ``tool_call`` judge.

    Fills ``TOOL_CALL_APPROPRIATENESS_PROMPT`` with the arguments (an empty
    *context* becomes a placeholder sentence, an empty *tool_calls* becomes
    ``"[]"``) and returns whatever ``_call_llm_judge`` returns for that prompt.
    """
    fields = {
        "query": query,
        "response": response,
        "context": context if context else "No conversation history available",
        "tool_calls": tool_calls if tool_calls else "[]",
        "expected_desc": expected_desc,
    }
    filled = TOOL_CALL_APPROPRIATENESS_PROMPT.format(**fields)
    return _call_llm_judge(filled, judge_name="tool_call")
322
+
323
+
324
def evaluate_episode_quality(
    query: str,
    response: str,
    status: str,
    context: str = "",
    expected_desc: str = "Task should complete successfully",
    tool_calls: str = "[]",
    has_expected_tools: bool = False,
    user_facts: str = "No explicit user statements recorded.",
) -> QualitativeScores:
    """Run the qualitative judges for one episode and collect their scores.

    The relevance, completion and hallucination judges are always submitted to
    a thread pool. The tool-call judge is submitted only when
    *has_expected_tools* is true; otherwise a "skipped" note is stored under
    ``judge_reasoning["tool_call_appropriateness"]``.

    A judge's score and reasoning are recorded only when it returns a dict with
    a non-``None`` ``"score"``. Any exception while collecting a result is
    logged and leaves the corresponding fields as ``QualitativeScores()``
    created them.
    """
    scores = QualitativeScores()

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        pending: dict[str, Any] = {}
        pending["relevance"] = pool.submit(evaluate_response_relevance, query, response, context)
        pending["completion"] = pool.submit(
            evaluate_task_completion, query, response, status, expected_desc, context
        )
        pending["hallucination"] = pool.submit(
            evaluate_hallucination, query, response, context, expected_desc, user_facts
        )
        if not has_expected_tools:
            scores.judge_reasoning["tool_call_appropriateness"] = "skipped: no tool calls expected for this task"
        else:
            pending["tool_calls"] = pool.submit(
                evaluate_tool_call_appropriateness,
                query, response, context, tool_calls, expected_desc,
            )

        # (future key, score attribute, judge_reasoning key, label used in the error log)
        collection_plan = (
            ("relevance", "response_relevance", "relevance", "Response relevance"),
            ("completion", "task_completion_quality", "completion", "Task completion"),
            ("hallucination", "hallucination_score", "hallucination", "Hallucination"),
            ("tool_calls", "tool_call_appropriateness", "tool_call_appropriateness", "Tool-call appropriateness"),
        )
        for future_key, score_attr, reasoning_key, label in collection_plan:
            if future_key not in pending:
                continue
            try:
                verdict = pending[future_key].result()
                if isinstance(verdict, dict) and verdict.get("score") is not None:
                    setattr(scores, score_attr, float(verdict["score"]))
                    scores.judge_reasoning[reasoning_key] = verdict.get("reasoning", "")
            except Exception as e:
                logger.error(f"{label} evaluation failed: {e}")

    return scores
@@ -0,0 +1,135 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from ..contracts import EpisodeResult
6
+
7
+
8
+ def _extract_wall_ms(ep: EpisodeResult) -> float:
9
+ return float(ep.trace.timings.get("wall_ms", 0.0))
10
+
11
+
12
+ def _extract_tokens_from_usage(usage: dict[str, Any]) -> tuple[int, int, int]:
13
+ if not isinstance(usage, dict) or not usage:
14
+ return 0, 0, 0
15
+
16
+ prompt = usage.get("input_tokens")
17
+ completion = usage.get("output_tokens")
18
+ total = usage.get("total_tokens")
19
+
20
+ if prompt is not None or completion is not None or total is not None:
21
+ p = int(prompt or 0)
22
+ c = int(completion or 0)
23
+ t = int(total) if total is not None else p + c
24
+ return p, c, t
25
+
26
+ by_model = usage.get("by_model") or usage.get("models")
27
+ if isinstance(by_model, dict):
28
+ prompt_sum = 0
29
+ completion_sum = 0
30
+ total_sum = 0
31
+ for _, model_usage in by_model.items():
32
+ if isinstance(model_usage, dict):
33
+ p, c, t = _extract_tokens_from_usage(model_usage)
34
+ prompt_sum += p
35
+ completion_sum += c
36
+ total_sum += t
37
+ return prompt_sum, completion_sum, total_sum
38
+
39
+ return 0, 0, 0
40
+
41
+
42
def episode_metrics(ep: EpisodeResult) -> dict[str, Any]:
    """Flatten one episode result into a plain metrics dict.

    Always includes the task id, expected outcome, final status, success flag,
    wall time and token counts. A ``"verdict"`` entry is added when
    ``ep.verdict`` is truthy and a ``"qualitative"`` entry when
    ``ep.qualitative_scores`` is truthy.
    """
    elapsed_ms = _extract_wall_ms(ep)
    n_prompt, n_completion, n_total = _extract_tokens_from_usage(ep.trace.usage)

    token_block = {
        "prompt_tokens": n_prompt,
        "completion_tokens": n_completion,
        "total_tokens": n_total,
    }
    # Without a verdict the episode counts as not successful.
    succeeded = False if ep.verdict is None else ep.verdict.passed

    metrics: dict[str, Any] = {
        "task_id": ep.task_id,
        "expected_outcome": ep.expected_outcome,
        "status": ep.final_status,
        "success": succeeded,
        "time": {"wall_ms": elapsed_ms},
        "tokens": token_block,
    }

    if ep.verdict:
        metrics["verdict"] = {
            "passed": ep.verdict.passed,
            "reason": ep.verdict.reason,
        }

    if ep.qualitative_scores:
        qual = ep.qualitative_scores
        metrics["qualitative"] = {
            "response_relevance": qual.response_relevance,
            "task_completion_quality": qual.task_completion_quality,
            "hallucination_score": qual.hallucination_score,
            "tool_call_appropriateness": qual.tool_call_appropriateness,
            "judge_reasoning": qual.judge_reasoning,
        }

    return metrics
75
+
76
+
77
def summarize_episode_metrics(results: list[EpisodeResult], k: int = 1) -> dict[str, Any]:
    """Aggregate per-episode metrics into a run summary.

    Returns ``{"per_episode": [...], "summary": {...}}``. The summary holds
    pass/fail counts, wall-time and token totals with avg/min/max, and the
    value of *k* under ``"k_attempts"``. A ``"qualitative"`` section is added
    with avg/min/max/count for each score that is non-``None`` in at least one
    episode. With no results, every average, minimum and maximum is ``0.0``.
    """
    rows = [episode_metrics(item) for item in results]
    count = len(rows)

    durations = [row["time"]["wall_ms"] for row in rows]
    duration_total = sum(durations)

    prompt_counts = [row["tokens"]["prompt_tokens"] for row in rows]
    completion_counts = [row["tokens"]["completion_tokens"] for row in rows]
    total_counts = [row["tokens"]["total_tokens"] for row in rows]
    grand_total = sum(total_counts)

    n_passed = len([row for row in rows if row["success"]])

    time_block = {
        "total_wall_ms": duration_total,
        "avg_wall_ms": duration_total / count if count else 0.0,
        "min_wall_ms": min(durations) if durations else 0.0,
        "max_wall_ms": max(durations) if durations else 0.0,
    }
    token_block = {
        "prompt_tokens_total": sum(prompt_counts),
        "completion_tokens_total": sum(completion_counts),
        "total_tokens_total": grand_total,
        "avg_total_tokens": grand_total / count if count else 0.0,
        "min_total_tokens": min(total_counts) if total_counts else 0.0,
        "max_total_tokens": max(total_counts) if total_counts else 0.0,
    }

    summary: dict[str, Any] = {
        "episodes": count,
        "k_attempts": k,
        "total_runs": count,
        "passed": n_passed,
        "failed": count - n_passed,
        "pass_rate": n_passed / count if count else 0.0,
        "time": time_block,
        "tokens": token_block,
    }

    judged = [row["qualitative"] for row in rows if row.get("qualitative")]
    if judged:
        score_fields = (
            "response_relevance",
            "task_completion_quality",
            "hallucination_score",
            "tool_call_appropriateness",
        )
        stats: dict[str, Any] = {}
        for field in score_fields:
            present = [entry[field] for entry in judged if entry.get(field) is not None]
            if not present:
                continue
            stats[field] = {
                "avg": sum(present) / len(present),
                "min": min(present),
                "max": max(present),
                "count": len(present),
            }
        if stats:
            summary["qualitative"] = stats

    return {"per_episode": rows, "summary": summary}
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Iterable
4
+
5
+
6
def build_context_from_events(events: Iterable[dict[str, Any]]) -> str:
    """Render trace events as a labelled, newline-separated transcript.

    Labels:
      [USER]         — the event carries a non-empty ``user_input``
      [AGENT OUTPUT] — ``task_status`` is ``"input-required"``; the preview is
                       agent-generated content, NOT trusted system feedback
      [SYSTEM]       — every other event (internal runtime status updates)

    When an event has ``t_ms``, the label is prefixed with the rounded
    timestamp, e.g. ``[12ms | USER]``. Returns ``"No events"`` for an empty
    input.
    """

    def _render(event: dict[str, Any]) -> str:
        t_ms = event.get("t_ms")
        stamp = "" if t_ms is None else f"{float(t_ms):.0f}ms | "

        said = event.get("user_input")
        if said:
            return f"[{stamp}USER] {said}"

        is_agent = event.get("task_status", "") == "input-required"
        label = "AGENT OUTPUT" if is_agent else "SYSTEM"
        body = event.get("content_preview", "")
        return f"[{stamp}{label}] {body}"

    rendered = [_render(event) for event in events]
    if not rendered:
        return "No events"
    return "\n".join(rendered)
32
+
33
+
34
def build_user_facts_summary(events: Iterable[dict[str, Any]]) -> str:
    """Return one ``- `` bullet per event that carries a non-empty ``user_input``.

    Each input is stripped of surrounding whitespace. A fixed placeholder
    sentence is returned when no event has user input.
    """
    bullets: list[str] = []
    for event in events:
        if not event.get("user_input"):
            continue
        statement = event["user_input"].strip()
        bullets.append(f"- {statement}")

    if not bullets:
        return "No explicit user statements recorded."
    return "\n".join(bullets)
42
+
43
+
44
def build_expected_desc(
    status: str | None = "completed",
    expected_outcome: str | None = None,
    output_must_contain: list[str] | None = None,
    expected_tool_calls: list[Any] | None = None,
) -> str:
    """Compose a one-line description of what a task is expected to do.

    Sentences are emitted in this order, each only when its argument is set:
    expected outcome, final status, required output substrings, expected tool
    calls. Each tool-call entry is read through its ``name`` and ``times``
    attributes; the "at least N×" suffix appears only when ``times`` exceeds 1.
    A fixed placeholder sentence is returned when nothing is specified.
    """

    def _describe(call: Any) -> str:
        if call.times > 1:
            return f"'{call.name}' (at least {call.times}×)"
        return f"'{call.name}'"

    sentences: list[str] = []

    if expected_outcome:
        sentences.append(f"Expected outcome: {expected_outcome.strip()}")

    if status is not None:
        sentences.append(f"The task should reach final status '{status}'.")

    if output_must_contain:
        quoted = ", ".join([f'"{needle}"' for needle in output_must_contain])
        sentences.append(f"Output must contain: {quoted}.")

    if expected_tool_calls:
        listed = ", ".join([_describe(call) for call in expected_tool_calls])
        sentences.append(f"Expected tool calls: {listed}.")

    if not sentences:
        return "No specific expectations defined."
    return " ".join(sentences)