onetool_mcp-1.0.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
bench/py.typed ADDED
File without changes
bench/reporter.py ADDED
@@ -0,0 +1,629 @@
+ """Console reporter for progress events during benchmark runs."""
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from typing import TYPE_CHECKING, Any
+
+ from rich import box
+ from rich.rule import Rule
+ from rich.table import Table
+
+ from ot.config import get_config
+ from ot.logging import LogSpan
+
+ # Patterns to detect in MCP tool responses that indicate LLM retry behavior
+ TOOL_RESPONSE_ERROR_PATTERNS = [
+     r"Code validation failed",
+ ]
+
+ if TYPE_CHECKING:
+     from rich.console import Console
+
+     from bench.harness.config import HarnessConfig
+     from bench.harness.metrics import TaskResult
+
+
+ def _extract_result(response: str) -> str:
+     """Extract result from JSON response if present."""
+     try:
+         parsed = json.loads(response)
+         if isinstance(parsed, dict) and "result" in parsed:
+             return str(parsed["result"])
+     except json.JSONDecodeError:
+         pass
+     return response
+
+
+ class SpanPrinter:
+     """Formats and prints spans to console as name=value pairs.
+
+     Verbose mode:
+     - Non-verbose: newlines replaced with \\n, values truncated at compact_max_length
+     - Verbose: full content with actual newlines
+     """
+
+     def __init__(self, console: Console, verbose: bool = False) -> None:
+         self.console = console
+         self.verbose = verbose
+         self.compact_max_length = get_config().compact_max_length
+
+     def _format_value(self, value: Any) -> str:
+         """Format a value for console output based on verbose mode."""
+         # Format lists as comma-separated
+         if isinstance(value, list):
+             result = ", ".join(str(v) for v in value) if value else ""
+         else:
+             result = str(value)
+
+         # Non-verbose: truncate and replace newlines
+         if not self.verbose:
+             result = result.replace("\n", "\\n")
+             if len(result) > self.compact_max_length:
+                 result = result[: self.compact_max_length] + "..."
+
+         return result
+
+     def print_span(self, data: dict[str, Any]) -> None:
+         """Print all span fields to console.
+
+         Special handling for taskRequest and taskResponse - highlighted in green
+         to match evaluation output style.
+         """
+         self.console.print()
+         for key, value in data.items():
+             formatted = self._format_value(value)
+             # Highlight taskRequest and taskResponse in green (like evaluation)
+             if key in ("taskRequest", "taskResponse"):
+                 self.console.print(f" [bold green]{key}[/bold green]={formatted}")
+             else:
+                 self.console.print(f" [cyan]{key}[/cyan]={formatted}")
+
+
+ class ConsoleReporter:
+     """Handles progress reporting for benchmark runs.
+
+     Encapsulates all console output logic, supporting verbose mode
+     and trace mode for debugging.
+     """
+
+     def __init__(
+         self,
+         console: Console,
+         config: HarnessConfig,
+         *,
+         verbose: bool = False,
+         trace: bool = False,
+         no_color: bool = False,
+     ) -> None:
+         """Initialize the reporter.
+
+         Args:
+             console: Rich console for output.
+             config: Harness configuration (for looking up prompts).
+             verbose: Enable verbose output with full content.
+             trace: Enable timestamped trace output for debugging.
+             no_color: Disable color output.
+         """
+         self.console = console
+         self.config = config
+         self.verbose = verbose
+         self.trace = trace
+         self.no_color = no_color
+         self.span_printer = SpanPrinter(console, verbose=verbose)
+
+         # Track LLM call count per task for [call=N] prefix
+         self.llm_call_counts: dict[str, int] = {}
+
+         # Track tool call count per task
+         self.tool_call_counts: dict[str, int] = {}
+
+         # Track connected servers per task (for inferring server in tool calls)
+         self.connected_servers: dict[str, list[str]] = {}
+
+         # Track current task for server connection events
+         self.current_task: str | None = None
+
+         # Current tool call LogSpan (captures timing and data)
+         self.current_tool_span: LogSpan | None = None
+
+         # Current harness LogSpan (captures timing and data)
+         self.current_harness_span: LogSpan | None = None
+
+         # Track code validation errors for summary
+         self.validation_errors: list[dict[str, str]] = []
+
+         # Track current scenario name
+         self.current_scenario: str | None = None
+
+     def _get_server_for_task(self, task: str | None) -> str:
+         """Get the server name for a task.
+
+         Args:
+             task: Task name.
+
+         Returns:
+             Server name or "mcp" if unknown.
+         """
+         if task and task in self.connected_servers and self.connected_servers[task]:
+             servers = self.connected_servers[task]
+             return servers[0] if len(servers) == 1 else ",".join(servers)
+         return "mcp"
+
+     def on_event(
+         self,
+         event: str,
+         *,
+         scenario: str | None = None,
+         task: str | None = None,
+         result: TaskResult | None = None,
+         server: str | None = None,
+         server_status: str | None = None,  # noqa: ARG002 - part of event interface
+         tool_count: int | None = None,
+         error: str | None = None,
+         tool_name: str | None = None,
+         tool_args: dict[str, Any] | None = None,
+         tool_result: str | None = None,
+         llm_request: list[dict[str, Any]] | None = None,
+         llm_response: str | None = None,
+     ) -> None:
+         """Handle a progress event from the runner.
+
+         Args:
+             event: Event type (scenario_start, task_start, task_complete, etc.).
+             scenario: Scenario name (if applicable).
+             task: Task name (if applicable).
+             result: TaskResult (for task_complete events).
+             server: Server name (for server_* events).
+             server_status: Status message (for server_* events).
+             tool_count: Number of tools available (for server_connected events).
+             error: Error message (for server_failed events).
+             tool_name: Name of tool being called (for tool_call/tool_response events).
+             tool_args: Arguments passed to tool (for tool_call events).
+             tool_result: Result from tool (for tool_response events).
+             llm_request: Messages sent to LLM (for llm_request events).
+             llm_response: Final LLM response text (for llm_response events).
+         """
+         if event == "scenario_start":
+             self._on_scenario_start(scenario)
+         elif event == "task_start":
+             self._on_task_start(task)
+         elif event == "server_connecting":
+             self._on_server_connecting(server)
+         elif event == "server_connected":
+             self._on_server_connected(task, server, tool_count)
+         elif event == "server_failed":
+             self._on_server_failed(server, error)
+         elif event == "tool_call":
+             self._on_tool_call(task, tool_name, tool_args)
+         elif event == "tool_response":
+             self._on_tool_response(task, tool_name, tool_result)
+         elif event == "llm_request":
+             self._on_llm_request(task, llm_request)
+         elif event == "llm_response":
+             self._on_llm_response(task, llm_response)
+         elif event == "task_complete":
+             self._on_task_complete(task, result)
+         elif event == "task_evaluated":
+             self._on_task_evaluated(result)
+
+     def _on_scenario_start(self, scenario: str | None) -> None:
+         """Handle scenario_start event."""
+         self.current_scenario = scenario
+         self.console.print(f"\n[yellow]Scenario[/yellow]: {scenario}")
+
+     def _on_task_start(self, task: str | None) -> None:
+         """Handle task_start event."""
+         if task:
+             self.llm_call_counts[task] = 0
+             self.tool_call_counts[task] = 0
+             self.connected_servers[task] = []
+             self.current_task = task
+
+         # Visual separator between tasks
+         self.console.print()
+         self.console.print(Rule(style="dim"))
+         self.console.print(f" [bold cyan]Task[/bold cyan]: {task}")
+
+     def _on_server_connecting(self, _server: str | None) -> None:
+         """Handle server_connecting event."""
+         # Server connection logged but not displayed (too noisy)
+
+     def _on_server_connected(
+         self,
+         task: str | None,
+         server: str | None,
+         _tool_count: int | None,
+     ) -> None:
+         """Handle server_connected event."""
+         # Track connected server for this task
+         task_key = task or self.current_task
+         if task_key and server:
+             if task_key not in self.connected_servers:
+                 self.connected_servers[task_key] = []
+             self.connected_servers[task_key].append(server)
+
+         # Server connection logged but not displayed (too noisy)
+
+     def _on_server_failed(self, server: str | None, error: str | None) -> None:
+         """Handle server_failed event."""
+         self.console.print(
+             f" [yellow]mcpFailed[/yellow]: [red]{server}[/red], error: {error}"
+         )
+
+     def _on_tool_call(
+         self,
+         task: str | None,
+         tool_name: str | None,
+         tool_args: dict[str, Any] | None,
+     ) -> None:
+         """Handle tool_call event - start a new tool span."""
+         # Increment tool call counter
+         if task and task in self.tool_call_counts:
+             self.tool_call_counts[task] += 1
+
+         server = self._get_server_for_task(task)
+         call_num = self.tool_call_counts.get(task, 1) if task else 1
+
+         # Format request as tool_name(args) - this is what the LLM sent to MCP
+         request_llm = ""
+         if tool_name and tool_args:
+             if tool_name == "run" and "command" in tool_args:
+                 request_llm = str(tool_args["command"])
+             else:
+                 request_llm = (
+                     f"{tool_name}({json.dumps(tool_args, separators=(',', ':'))})"
+                 )
+
+         # Create LogSpan - it captures timing automatically
+         self.current_tool_span = LogSpan(
+             span="bench.tool_call",
+             task=task or "",
+             call=call_num,
+             server=server,
+             tool=tool_name or "",
+             requestLLM=request_llm,
+         )
+
+     def _on_tool_response(
+         self,
+         task: str | None,
+         tool_name: str | None,  # noqa: ARG002
+         tool_result: str | None,
+     ) -> None:
+         """Handle tool_response event - complete and emit tool span."""
+         if not self.current_tool_span:
+             return
+
+         # Extract result for cleaner output
+         response_mcp = _extract_result(tool_result or "")
+
+         # Add response to span
+         self.current_tool_span.add("responseMCP", response_mcp)
+
+         # Check for error patterns and save for summary
+         for pattern in TOOL_RESPONSE_ERROR_PATTERNS:
+             if re.search(pattern, response_mcp):
+                 # Get requestLLM from the span (dict-style access)
+                 try:
+                     request_llm = self.current_tool_span["requestLLM"]
+                 except KeyError:
+                     request_llm = ""
+                 self.validation_errors.append(
+                     {
+                         "scenario": self.current_scenario or "",
+                         "task": task or "",
+                         "requestLLM": request_llm,
+                         "responseMCP": response_mcp,
+                     }
+                 )
+                 break
+
+         # Get data from span for console output (includes duration)
+         span_data = self.current_tool_span.to_dict()
+
+         # Complete the LogSpan (logs to file)
+         self.current_tool_span.__exit__(None, None, None)
+
+         # Print to console
+         self.span_printer.print_span(span_data)
+
+         self.current_tool_span = None
+
+     def _on_llm_request(
+         self, task: str | None, llm_request: list[dict[str, Any]] | None
+     ) -> None:
+         """Handle llm_request event - start harness span on first call."""
+         if not llm_request:
+             return
+
+         # Increment LLM call counter
+         if task:
+             if task not in self.llm_call_counts:
+                 self.llm_call_counts[task] = 0
+             self.llm_call_counts[task] += 1
+
+         # Only create harness span on first LLM call
+         call_count = self.llm_call_counts.get(task, 0) if task else 0
+         if call_count == 1:
+             system_prompt = ""
+             user_request = ""
+             for msg in llm_request:
+                 role = msg.get("role", "")
+                 content = msg.get("content", "")
+                 if role == "system" and content:
+                     system_prompt = content
+                 elif role == "user" and content:
+                     user_request = content
+
+             # Create LogSpan - it captures timing automatically
+             self.current_harness_span = LogSpan(
+                 span="bench.llm_call",
+                 task=task or "",
+                 systemPrompt=system_prompt,
+                 taskRequest=user_request,
+             )
+
+     def _on_llm_response(self, task: str | None, llm_response: str | None) -> None:  # noqa: ARG002
+         """Handle llm_response event - capture final response for harness span."""
+         # Update harness span with final response
+         if self.current_harness_span and llm_response:
+             self.current_harness_span.add("taskResponse", llm_response)
+
+     def _on_task_complete(self, task: str | None, result: TaskResult | None) -> None:
+         """Handle task_complete event - emit harness span."""
+         if not result:
+             return
+
+         # Complete and emit harness span
+         if self.current_harness_span:
+             # Add metrics and status to the span
+             self.current_harness_span.add(
+                 tokensIn=result.input_tokens,
+                 tokensOut=result.output_tokens,
+                 llmCalls=result.llm_calls,
+                 toolCalls=result.tool_calls,
+                 cost=round(result.cost_usd, 6),
+                 taskStatus="error" if result.error else "complete",
+                 toolsUsed=result.tools_used or [],
+             )
+             if result.error:
+                 self.current_harness_span.add(error=result.error)
+
+             # Get data from span for console output (includes duration)
+             span_data = self.current_harness_span.to_dict()
+
+             # Complete the LogSpan (logs to file)
+             self.current_harness_span.__exit__(None, None, None)
+
+             # Print to console
+             self.span_printer.print_span(span_data)
+
+             self.current_harness_span = None
+
+         # Clean up task state
+         if task:
+             self.connected_servers.pop(task, None)
+
+     def _on_task_evaluated(self, result: TaskResult | None) -> None:
+         """Handle task_evaluated event - display evaluation result."""
+         if not result or not result.evaluation:
+             return
+
+         eval_result = result.evaluation
+         self.console.print()
+
+         if eval_result.eval_type == "pass_fail":
+             # Pass/fail evaluation - show PASS or FAIL
+             if eval_result.passed:
+                 status_style = "bold green"
+                 status = "PASS"
+             else:
+                 status_style = "bold red"
+                 status = "FAIL"
+
+             self.console.print(
+                 f" [{status_style}]evaluation[/{status_style}]: {status}"
+             )
+
+             # In verbose mode, show expected vs actual
+             if self.verbose:
+                 if eval_result.expected:
+                     self.console.print(
+                         f" [dim]expected[/dim]: {eval_result.expected}"
+                     )
+                 if eval_result.actual:
+                     self.console.print(f" [dim]actual[/dim]: {eval_result.actual}")
+                 if eval_result.reason:
+                     self.console.print(f" [dim]reason[/dim]: {eval_result.reason}")
+             elif not eval_result.passed:
+                 # For failures, always show expected vs actual for debugging
+                 if eval_result.expected:
+                     self.console.print(
+                         f" [dim]expected[/dim]: {eval_result.expected}"
+                     )
+                 if eval_result.actual:
+                     self.console.print(f" [dim]actual[/dim]: {eval_result.actual}")
+                 if eval_result.reason:
+                     self.console.print(f" [dim]reason[/dim]: {eval_result.reason}")
+         else:
+             # Scored evaluation - show numeric score
+             score = eval_result.score
+             if score >= 80:
+                 score_style = "bold green"
+             elif score >= 50:
+                 score_style = "bold yellow"
+             else:
+                 score_style = "bold red"
+
+             self.console.print(
+                 f" [{score_style}]evaluation[/{score_style}]: "
+                 f"score={score}/100, reason={eval_result.reason}"
+             )
+
+     def print_results_header(self) -> None:
+         """Print the BENCHMARK RESULTS header with a double-line separator."""
+         self.console.print()
+         self.console.print(Rule("BENCHMARK RESULTS", style="bold cyan", characters="═"))
+         self.console.print()
+
+     def _format_eval_result(self, task_result: Any) -> str:
+         """Format evaluation result for table display.
+
+         Returns PASS/FAIL for pass_fail evaluations, numeric score for scored.
+         """
+         if not task_result.evaluation:
+             return "-"
+
+         eval_result = task_result.evaluation
+         if eval_result.eval_type == "pass_fail":
+             return "PASS" if eval_result.passed else "FAIL"
+         else:
+             return str(eval_result.score)
+
+     def _style_eval_result(self, task_result: Any) -> str:
+         """Get style for evaluation result based on pass/fail or score."""
+         if not task_result.evaluation:
+             return ""
+
+         eval_result = task_result.evaluation
+         if eval_result.eval_type == "pass_fail":
+             return "bold green" if eval_result.passed else "bold red"
+         else:
+             score = eval_result.score
+             if score >= 80:
+                 return "bold green"
+             elif score >= 50:
+                 return "bold yellow"
+             else:
+                 return "bold red"
+
+     def print_results_table(
+         self, scenario_result: Any, *, show_header: bool = False
+     ) -> None:
+         """Print a results table for a scenario.
+
+         Args:
+             scenario_result: ScenarioResult with tasks to display.
+             show_header: If True, print the BENCHMARK RESULTS header first.
+         """
+         if show_header:
+             self.print_results_header()
+
+         self.console.print(f"[yellow]Scenario[/yellow]: {scenario_result.name}")
+
+         # Check if any task has per-call metrics (for context columns)
+         has_call_metrics = any(
+             task.llm_call_metrics for task in scenario_result.tasks
+         )
+
+         # Create comparison table with ROUNDED box style
+         table = Table(show_header=True, header_style="bold", box=box.ROUNDED)
+         table.add_column("Task", min_width=16, no_wrap=True)
+         table.add_column("in", justify="right", no_wrap=True)
+         table.add_column("out", justify="right", no_wrap=True)
+         table.add_column("tools", justify="right", no_wrap=True)
+         table.add_column("time", justify="right", no_wrap=True)
+         table.add_column("cost", justify="right", no_wrap=True)
+         table.add_column("result", justify="right", no_wrap=True)
+
+         for task_result in scenario_result.tasks:
+             eval_display = self._format_eval_result(task_result)
+             eval_style = self._style_eval_result(task_result)
+             cost_cents = task_result.cost_usd * 100
+
+             # Apply style to evaluation result
+             if eval_style:
+                 eval_display = f"[{eval_style}]{eval_display}[/{eval_style}]"
+
+             row = [
+                 task_result.name,
+                 str(task_result.input_tokens),
+                 str(task_result.output_tokens),
+                 str(task_result.tool_calls),
+             ]
+
+             row.extend([
+                 f"{task_result.duration_seconds:.0f}s",
+                 f"{cost_cents:.2f}¢",
+                 eval_display,
+             ])
+
+             table.add_row(*row)
+
+         self.console.print(table)
+
+         # Show per-call breakdown in verbose mode
+         if self.verbose and has_call_metrics:
+             self.console.print("\n [dim]Per-call breakdown:[/dim]")
+             for task_result in scenario_result.tasks:
+                 if task_result.llm_call_metrics:
+                     self.console.print(f" [cyan]{task_result.name}[/cyan]:")
+                     for m in task_result.llm_call_metrics:
+                         self.console.print(
+                             f" call{m.call_number}: "
+                             f"in={m.input_tokens}, out={m.output_tokens}, "
+                             f"tools={m.tool_calls_made}, "
+                             f"cumulative={m.cumulative_input}, "
+                             f"latency={m.latency_ms}ms"
+                         )
+
+         # Show totals with camelCase labels
+         totals = scenario_result.calculate_totals()
+         cost_cents = totals["total_cost_usd"] * 100
+
+         # Build totals line
+         totals_parts = [
+             f"tokensIn={totals['total_input_tokens']}",
+             f"tokensOut={totals['total_output_tokens']}",
+             f"llmCalls={totals['total_llm_calls']}",
+             f"toolCalls={totals['total_tool_calls']}",
+             f"cost={cost_cents:.2f}¢",
+         ]
+
+         # Add evaluation summary
+         if "pass_count" in totals or "fail_count" in totals:
+             pass_count = totals.get("pass_count", 0)
+             fail_count = totals.get("fail_count", 0)
+             if fail_count == 0:
+                 totals_parts.append(f"[bold green]{pass_count} passed[/bold green]")
+             else:
+                 totals_parts.append(
+                     f"[bold green]{pass_count} passed[/bold green], "
+                     f"[bold red]{fail_count} failed[/bold red]"
+                 )
+
+         if "avg_score" in totals:
+             avg = totals["avg_score"]
+             if avg >= 80:
+                 style = "bold green"
+             elif avg >= 50:
+                 style = "bold yellow"
+             else:
+                 style = "bold red"
+             totals_parts.append(f"[{style}]avgScore={avg}[/{style}]")
+
+         self.console.print(f"\n totals: {', '.join(totals_parts)}")
+
+     def print_validation_errors(self) -> None:
+         """Print summary of validation errors detected during the run."""
+         if not self.validation_errors:
+             return
+
+         self.console.print()
+         self.console.print(
+             Rule("VALIDATION ERRORS", style="bold yellow", characters="─")
+         )
+         self.console.print()
+         self.console.print(
+             f"[yellow]{len(self.validation_errors)} validation error(s) detected "
+             "(LLM retried after these):[/yellow]"
+         )
+
+         for error in self.validation_errors:
+             self.console.print()
+             self.console.print(f" [cyan]Scenario[/cyan]: {error['scenario']}")
+             self.console.print(f" [cyan]Task[/cyan]: {error['task']}")
+             self.console.print(f" [cyan]requestLLM[/cyan]= {error['requestLLM']}")
+             self.console.print(f" [cyan]responseMCP[/cyan]= {error['responseMCP']}")