onetool-mcp 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
bench/harness/runner.py
@@ -0,0 +1,899 @@
+"""Agentic loop runner for executing prompts with MCP servers."""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import json
+import time
+from typing import TYPE_CHECKING, Any, Protocol
+
+from loguru import logger
+from openai import OpenAI
+
+from bench.harness.client import (
+    ServerConnectionCallback,
+    call_tool,
+    connect_to_servers,
+    multi_server_tools_to_openai,
+)
+from bench.harness.evaluate import evaluate_task, resolve_evaluator
+from bench.harness.metrics import (
+    EvaluationResult,
+    LLMCallMetrics,
+    ScenarioResult,
+    TaskResult,
+    calculate_cost,
+)
+from bench.secrets import get_bench_secret
+from ot.logging import LogSpan
+from ot.utils import flatten_exception_group
+
+# Delay between tasks to avoid rate limits on external APIs (OpenRouter, etc.)
+TASK_DELAY_SECONDS = 3.0
+
+# Delimiter for multi-prompt tasks
+PROMPT_DELIMITER = "---PROMPT---"
+
+
+def split_prompts(prompt: str) -> list[str]:
+    """Split a prompt into multiple sequential prompts.
+
+    Uses the `---PROMPT---` delimiter to split a single prompt field
+    into multiple prompts for controlled benchmarking.
+
+    Args:
+        prompt: The prompt string (may contain delimiters).
+
+    Returns:
+        List of prompt strings. Single element if no delimiter found.
+    """
+    if not prompt:
+        return [""]
+    parts = prompt.split(PROMPT_DELIMITER)
+    return [p.strip() for p in parts if p.strip()]
+
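As a quick aside on the delimiter contract above (illustrative calls, not part of the diff):

    split_prompts("Create the file.\n---PROMPT---\nNow summarize it.")
    # -> ["Create the file.", "Now summarize it."]
    split_prompts("")  # -> [""]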
+if TYPE_CHECKING:
+    from bench.harness.config import HarnessConfig, TaskConfig
+
+
+class ProgressCallback(Protocol):
+    """Protocol for progress callbacks."""
+
+    def __call__(
+        self,
+        event: str,
+        *,
+        scenario: str | None = None,
+        task: str | None = None,
+        result: TaskResult | None = None,
+        server: str | None = None,
+        server_status: str | None = None,
+        tool_count: int | None = None,
+        error: str | None = None,
+        tool_name: str | None = None,
+        tool_args: dict[str, Any] | None = None,
+        tool_result: str | None = None,
+        llm_request: list[dict[str, Any]] | None = None,
+        llm_response: str | None = None,
+    ) -> None:
+        """Called when progress is made.
+
+        Args:
+            event: Event type (scenario_start, task_start, task_complete,
+                server_connecting, server_connected, server_failed,
+                tool_call, tool_response, llm_request, llm_response).
+            scenario: Scenario name (if applicable).
+            task: Task name (if applicable).
+            result: TaskResult (for task_complete events).
+            server: Server name (for server_* events).
+            server_status: Status message (for server_* events).
+            tool_count: Number of tools available (for server_connected events).
+            error: Error message (for server_failed events).
+            tool_name: Name of tool being called (for tool_call/tool_response events).
+            tool_args: Arguments passed to tool (for tool_call events).
+            tool_result: Result from tool (for tool_response events).
+            llm_request: Messages sent to LLM (for llm_request events).
+            llm_response: Final LLM response text (for llm_response events).
+        """
+        ...
+
+
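Since ProgressCallback is a Protocol, any callable with a matching keyword signature conforms; a minimal sketch (name and body hypothetical, reusing the module's imports):

    def print_progress(event: str, **kwargs: Any) -> None:
        # Print each event; a real callback might drive a progress display.
        print(f"[{event}] {kwargs.get('task') or kwargs.get('scenario') or ''}")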
+class AgenticRunner:
+    """Runner that executes prompts with optional MCP server integration."""
+
+    def __init__(
+        self,
+        config: HarnessConfig,
+        dry_run: bool = False,
+        verbose: bool = False,
+        on_progress: ProgressCallback | None = None,
+    ) -> None:
+        """Initialize the runner.
+
+        Args:
+            config: Harness configuration.
+            dry_run: If True, validate config without making API calls.
+            verbose: If True, log detailed MCP tool call info.
+            on_progress: Optional callback for progress updates.
+        """
+        self.config = config
+        self.dry_run = dry_run
+        self.verbose = verbose
+        self.on_progress = on_progress
+        # Partial results accumulated during run (for interrupt handling)
+        self.partial_results: list[ScenarioResult] = []
+        self.client = OpenAI(
+            api_key=get_bench_secret("OPENAI_API_KEY"),
+            base_url=get_bench_secret("OPENAI_BASE_URL"),
+        )
+
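Wiring it up is then one call (assuming a HarnessConfig loaded elsewhere, and the sketch callback above):

    runner = AgenticRunner(config, dry_run=True, on_progress=print_progress)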
+    def _emit(
+        self,
+        event: str,
+        *,
+        scenario: str | None = None,
+        task: str | None = None,
+        result: TaskResult | None = None,
+        server: str | None = None,
+        server_status: str | None = None,
+        tool_count: int | None = None,
+        error: str | None = None,
+        tool_name: str | None = None,
+        tool_args: dict[str, Any] | None = None,
+        tool_result: str | None = None,
+        llm_request: list[dict[str, Any]] | None = None,
+        llm_response: str | None = None,
+    ) -> None:
+        """Emit a progress event if callback is set."""
+        if self.on_progress:
+            self.on_progress(
+                event,
+                scenario=scenario,
+                task=task,
+                result=result,
+                server=server,
+                server_status=server_status,
+                tool_count=tool_count,
+                error=error,
+                tool_name=tool_name,
+                tool_args=tool_args,
+                tool_result=tool_result,
+                llm_request=llm_request,
+                llm_response=llm_response,
+            )
+
+    def _update_partial_results(
+        self,
+        completed_scenarios: list[ScenarioResult],
+        current_scenario_name: str,
+        current_model: str,
+        current_tasks: list[TaskResult],
+    ) -> None:
+        """Update partial_results with completed scenarios plus current progress.
+
+        Called after each task to enable interrupt handling with full visibility.
+        """
+        # Start with completed scenarios
+        self.partial_results = completed_scenarios.copy()
+        # Add current scenario's progress if any tasks completed
+        if current_tasks:
+            self.partial_results.append(
+                ScenarioResult(
+                    name=current_scenario_name,
+                    model=current_model,
+                    tasks=current_tasks.copy(),
+                )
+            )
+
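The partial_results attribute exists so a caller can salvage output on interrupt; a plausible pattern (not shown in this package):

    try:
        results = asyncio.run(runner.run_scenario())
    except KeyboardInterrupt:
        results = runner.partial_results  # scenarios/tasks completed so far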
+    def _get_server_names(self, server: str | list[str] | None) -> list[str]:
+        """Get list of server names from task config.
+
+        Args:
+            server: Server name, list of names, or None.
+
+        Returns:
+            List of server names (empty if None).
+        """
+        if server is None:
+            return []
+        if isinstance(server, list):
+            return server
+        return [server]
+
+    async def run_task(
+        self,
+        task: TaskConfig,
+        default_model: str,
+        default_timeout: int,
+    ) -> TaskResult:
+        """Run a single task (direct or harness).
+
+        Args:
+            task: Task configuration.
+            default_model: Default model from scenario/config.
+            default_timeout: Default timeout from scenario/config.
+
+        Returns:
+            TaskResult with metrics and response.
+        """
+        if task.type == "direct":
+            return await self._run_direct_task(task, default_timeout)
+        return await self._run_harness_task(task, default_model, default_timeout)
+
+    async def _run_direct_task(
+        self,
+        task: TaskConfig,
+        default_timeout: int,
+    ) -> TaskResult:
+        """Run a direct MCP tool invocation task.
+
+        Args:
+            task: Task configuration (type: direct).
+            default_timeout: Default timeout from scenario/config.
+
+        Returns:
+            TaskResult with tool result.
+        """
+        timeout = task.timeout or default_timeout
+        start_time = time.time()
+        response_text = ""
+        error_msg: str | None = None
+
+        if self.dry_run:
+            logger.info(f"[dry-run] Would call tool: {task.tool}")
+            return TaskResult(
+                name=task.name,
+                server=task.server,
+                model="direct",
+                prompt=f"Tool: {task.tool}",
+                response="[dry-run] No API call made",
+                input_tokens=0,
+                output_tokens=0,
+                llm_calls=0,
+                tool_calls=0,
+                tools_used=[],
+                duration_seconds=0.0,
+                cost_usd=0.0,
+                tags=task.tags,
+            )
+
+        server_names = self._get_server_names(task.server)
+
+        try:
+            async with asyncio.timeout(timeout):
+                async with connect_to_servers(
+                    self.config.servers,
+                    server_names,
+                    timeout=timeout,
+                ) as multi:
+                    # Find the session for this tool
+                    session = multi.get_session_for_tool(task.tool)  # type: ignore[arg-type]
+                    if not session:
+                        error_msg = f"Tool '{task.tool}' not found in any server"
+                        logger.error(f"[{task.name}] {error_msg}")
+                    else:
+                        self._emit(
+                            "tool_call",
+                            task=task.name,
+                            tool_name=task.tool,
+                            tool_args=task.arguments,
+                        )
+                        response_text = await call_tool(
+                            session,
+                            task.tool,
+                            task.arguments,
+                            timeout=timeout,  # type: ignore[arg-type]
+                        )
+                        self._emit(
+                            "tool_response",
+                            task=task.name,
+                            tool_name=task.tool,
+                            tool_result=response_text,
+                        )
+        except TimeoutError:
+            error_msg = f"Task timed out after {timeout}s"
+            logger.error(f"[{task.name}] {error_msg}")
+        except BaseExceptionGroup as eg:
+            leaf_exceptions = flatten_exception_group(eg)
+            error_msg = "; ".join(str(e) for e in leaf_exceptions)
+            logger.error(f"Error running task {task.name}: {error_msg}")
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"Error running task {task.name}: {e}")
+
+        duration = time.time() - start_time
+
+        return TaskResult(
+            name=task.name,
+            server=task.server,
+            model="direct",
+            prompt=f"Tool: {task.tool}",
+            response=response_text,
+            input_tokens=0,
+            output_tokens=0,
+            llm_calls=0,
+            tool_calls=1 if not error_msg else 0,
+            tools_used=[task.tool] if task.tool and not error_msg else [],
+            tool_results=[response_text] if response_text else [],
+            duration_seconds=duration,
+            cost_usd=0.0,
+            error=error_msg,
+            tags=task.tags,
+        )
+
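Judging by the fields read above (name, server, tool, arguments, timeout, tags), a direct task would be declared roughly like this; the real schema lives in bench/harness/config.py, so the field values here are hypothetical:

    task = TaskConfig(
        type="direct",
        name="list-tables",            # hypothetical
        server="db",                   # hypothetical
        tool="query",                  # hypothetical
        arguments={"sql": "SELECT 1"},
    )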
+    async def _run_harness_task(
+        self,
+        task: TaskConfig,
+        default_model: str,
+        default_timeout: int,
+    ) -> TaskResult:
+        """Run an agent benchmark task.
+
+        Args:
+            task: Task configuration (type: harness).
+            default_model: Default model from scenario/config.
+            default_timeout: Default timeout from scenario/config.
+
+        Returns:
+            TaskResult with metrics and response.
+        """
+        model = task.model or default_model
+        timeout = task.timeout or default_timeout
+
+        if self.dry_run:
+            logger.info(f"[dry-run] Would run task: {task.name}")
+            return TaskResult(
+                name=task.name,
+                server=task.server,
+                model=model,
+                prompt=task.prompt or "",
+                response="[dry-run] No API call made",
+                input_tokens=0,
+                output_tokens=0,
+                llm_calls=0,
+                tool_calls=0,
+                tools_used=[],
+                duration_seconds=0.0,
+                cost_usd=0.0,
+                tags=task.tags,
+            )
+
+        start_time = time.time()
+        input_tokens = 0
+        output_tokens = 0
+        llm_call_count = 0
+        tool_call_count = 0
+        tools_used: list[str] = []
+        tool_results: list[str] = []
+        response_text = ""
+        error_msg: str | None = None
+        # Per-call metrics tracking
+        llm_call_metrics: list[LLMCallMetrics] = []
+        cumulative_input = 0
+
+        # Get list of servers to connect to
+        server_names = self._get_server_names(task.server)
+
+        # Create a callback to emit progress events for server connections
+        class ConnectionProgress(ServerConnectionCallback):
+            def __init__(inner_self) -> None:
+                inner_self.runner = self
+                inner_self.task_name = task.name
+
+            def on_connecting(inner_self, name: str) -> None:
+                inner_self.runner._emit(
+                    "server_connecting",
+                    task=inner_self.task_name,
+                    server=name,
+                )
+
+            def on_connected(inner_self, name: str, tool_count: int) -> None:
+                inner_self.runner._emit(
+                    "server_connected",
+                    task=inner_self.task_name,
+                    server=name,
+                    tool_count=tool_count,
+                )
+
+            def on_failed(inner_self, name: str, error: str) -> None:
+                inner_self.runner._emit(
+                    "server_failed",
+                    task=inner_self.task_name,
+                    server=name,
+                    error=error,
+                )
+
+        try:
+            # Timeout covers entire task: server connections + LLM calls + tool calls
+            async with asyncio.timeout(timeout):
+                async with connect_to_servers(
+                    self.config.servers,
+                    server_names,
+                    timeout=timeout,
+                    on_progress=ConnectionProgress() if self.on_progress else None,
+                ) as multi:
+                    # Log MCP connection summary
+                    logger.info(
+                        f"  mcpConnected={multi.healthy_count}, "
+                        f"toolCount={len(multi.all_tools)}"
+                    )
+                    if multi.all_tools:
+                        tool_names = [t.name for t in multi.all_tools]
+                        logger.info(f"  tools={tool_names}")
+
+                    # Build system message with server instructions
+                    system_parts: list[str] = []
+                    if self.config.defaults.system_prompt:
+                        system_parts.append(self.config.defaults.system_prompt.strip())
+
+                    # Include instructions from all connected MCP servers
+                    for server_name, instructions in multi.all_instructions:
+                        if len(multi.connections) > 1:
+                            system_parts.append(
+                                f"## {server_name} Instructions\n{instructions.strip()}"
+                            )
+                        else:
+                            system_parts.append(instructions.strip())
+
+                    # Include prompts from all connected MCP servers
+                    if multi.all_prompts:
+                        prompts_text = "## Available Prompts\n"
+                        for server_name, prompt in multi.all_prompts:
+                            prefix = (
+                                f"[{server_name}] "
+                                if len(multi.connections) > 1
+                                else ""
+                            )
+                            desc = (
+                                f" - {prompt.description}" if prompt.description else ""
+                            )
+                            prompts_text += f"- {prefix}{prompt.name}{desc}\n"
+                        system_parts.append(prompts_text.strip())
+
+                    # Include resources from all connected MCP servers
+                    if multi.all_resources:
+                        resources_text = "## Available Resources\n"
+                        for server_name, resource in multi.all_resources:
+                            prefix = (
+                                f"[{server_name}] "
+                                if len(multi.connections) > 1
+                                else ""
+                            )
+                            desc = (
+                                f" - {resource.description}"
+                                if resource.description
+                                else ""
+                            )
+                            resources_text += f"- {prefix}{resource.uri}{desc}\n"
+                        system_parts.append(resources_text.strip())
+
+                    if multi.healthy_count > 0:
+                        system_parts.append(
+                            f"You have access to {len(multi.all_tools)} tools from "
+                            f"{multi.healthy_count} MCP server(s)."
+                        )
+                    if multi.failed_count > 0:
+                        failed_names = [h.name for h in multi.health if not h.healthy]
+                        system_parts.append(
+                            f"Note: {multi.failed_count} server(s) failed to start: "
+                            f"{', '.join(failed_names)}"
+                        )
+
+                    messages: list[dict[str, Any]] = []
+                    if system_parts:
+                        messages.append(
+                            {"role": "system", "content": "\n".join(system_parts)}
+                        )
+
+                    # Get combined tools from all servers (with prefixed names if multiple)
+                    tools = None
+                    tool_mapping: dict[str, tuple[str, str]] = {}
+                    if multi.all_tools:
+                        tools, tool_mapping = multi_server_tools_to_openai(multi)
+
+                    # Split prompts for multi-prompt tasks
+                    prompts = split_prompts(task.prompt or "")
+
+                    # Process each prompt sequentially (conversation accumulates)
+                    for prompt_text in prompts:
+                        messages.append({"role": "user", "content": prompt_text})
+
+                        while True:
+                            # Emit LLM request event before calling
+                            self._emit(
+                                "llm_request",
+                                task=task.name,
+                                llm_request=messages,
+                            )
+
+                            # Track per-call timing
+                            call_start = time.time()
+
+                            # Run sync LLM call in thread so asyncio.timeout can cancel it
+                            with LogSpan(
+                                span="bench.llm.request",
+                                model=model,
+                                call=llm_call_count + 1,
+                            ) as llm_span:
+                                response = await asyncio.to_thread(
+                                    self.client.chat.completions.create,  # type: ignore[arg-type]
+                                    model=model,
+                                    messages=messages,
+                                    tools=tools,
+                                    timeout=timeout,
+                                )
+                                if response.usage:
+                                    llm_span.add(
+                                        inputTokens=response.usage.prompt_tokens,
+                                        outputTokens=response.usage.completion_tokens,
+                                    )
+
+                            call_latency_ms = int((time.time() - call_start) * 1000)
+                            llm_call_count += 1
+
+                            # Track token usage
+                            call_input_tokens = 0
+                            call_output_tokens = 0
+                            if response.usage:
+                                call_input_tokens = response.usage.prompt_tokens
+                                call_output_tokens = response.usage.completion_tokens
+                            input_tokens += call_input_tokens
+                            output_tokens += call_output_tokens
+                            cumulative_input += call_input_tokens
+
+                            assistant_msg = response.choices[0].message
+
+                            # Count tool calls in this response
+                            call_tool_count = (
+                                len(assistant_msg.tool_calls)
+                                if assistant_msg.tool_calls
+                                else 0
+                            )
+
+                            # Create per-call metrics
+                            llm_call_metrics.append(
+                                LLMCallMetrics(
+                                    call_number=llm_call_count,
+                                    input_tokens=call_input_tokens,
+                                    output_tokens=call_output_tokens,
+                                    tool_calls_made=call_tool_count,
+                                    cumulative_input=cumulative_input,
+                                    latency_ms=call_latency_ms,
+                                )
+                            )
+
+                            if assistant_msg.tool_calls and multi.all_tools:
+                                # Add assistant message with tool calls
+                                messages.append(
+                                    {
+                                        "role": "assistant",
+                                        "content": assistant_msg.content,
+                                        "tool_calls": [
+                                            {
+                                                "id": tc.id,
+                                                "type": "function",
+                                                "function": {
+                                                    "name": tc.function.name,  # type: ignore[union-attr]
+                                                    "arguments": tc.function.arguments,  # type: ignore[union-attr]
+                                                },
+                                            }
+                                            for tc in assistant_msg.tool_calls
+                                        ],
+                                    }
+                                )
+
+                                # Execute each tool call
+                                for tc in assistant_msg.tool_calls:
+                                    tool_call_count += 1
+                                    prefixed_name = tc.function.name  # type: ignore[union-attr]
+                                    tool_args = json.loads(tc.function.arguments)  # type: ignore[union-attr]
+
+                                    # Look up server and original tool name from mapping
+                                    if prefixed_name in tool_mapping:
+                                        server_name, original_tool_name = tool_mapping[
+                                            prefixed_name
+                                        ]
+                                    else:
+                                        # Fallback: tool name not prefixed (single server)
+                                        server_name = ""
+                                        original_tool_name = prefixed_name
+
+                                    # Track unique tools used (use prefixed name for display)
+                                    if prefixed_name not in tools_used:
+                                        tools_used.append(prefixed_name)
+
+                                    # Emit tool_call event for progress callback
+                                    self._emit(
+                                        "tool_call",
+                                        task=task.name,
+                                        tool_name=prefixed_name,
+                                        tool_args=tool_args,
+                                    )
+
+                                    # Find the session for this tool
+                                    if server_name and server_name in multi.connections:
+                                        session = multi.connections[server_name].session
+                                    else:
+                                        # Fallback: search by original name
+                                        session = multi.get_session_for_tool(
+                                            original_tool_name
+                                        )
+
+                                    if not session:
+                                        result = f"Error: Tool '{prefixed_name}' not found in any server"
+                                        logger.error(f"[{task.name}] {result}")
+                                    else:
+                                        try:
+                                            # Call with original (unprefixed) tool name
+                                            result = await call_tool(
+                                                session,
+                                                original_tool_name,
+                                                tool_args,
+                                                timeout=timeout,
+                                            )
+                                        except TimeoutError:
+                                            result = f"Error: Tool '{prefixed_name}' timed out after {timeout}s"
+                                            logger.error(
+                                                f"[{task.name}] Tool timeout | "
+                                                f"tool={prefixed_name} | timeout={timeout}s"
+                                            )
+                                        except RuntimeError as e:
+                                            # Tool returned an error - pass to LLM
+                                            result = str(e)
+                                            logger.warning(
+                                                f"[{task.name}] Tool error | "
+                                                f"tool={prefixed_name} | error={str(e)[:200]}"
+                                            )
+                                        except Exception as e:
+                                            # Unexpected error - log and pass to LLM
+                                            result = (
+                                                f"Error: Tool '{prefixed_name}' failed: {e}"
+                                            )
+                                            logger.error(
+                                                f"[{task.name}] Tool exception | "
+                                                f"tool={prefixed_name} | type={type(e).__name__} | error={e}"
+                                            )
+
+                                    # Emit tool_response event for progress callback
+                                    self._emit(
+                                        "tool_response",
+                                        task=task.name,
+                                        tool_name=prefixed_name,
+                                        tool_result=result,
+                                    )
+
+                                    # Capture tool result for evaluation
+                                    tool_results.append(result)
+
+                                    messages.append(
+                                        {
+                                            "role": "tool",
+                                            "tool_call_id": tc.id,
+                                            "content": result,
+                                        }
+                                    )
+
+                            else:
+                                # No tool calls, done with this prompt
+                                response_text = assistant_msg.content or ""
+                                self._emit(
+                                    "llm_response",
+                                    task=task.name,
+                                    llm_response=response_text,
+                                )
+                                # Add assistant response to messages for next prompt
+                                messages.append(
+                                    {"role": "assistant", "content": response_text}
+                                )
+                                break
+
+        except TimeoutError:
+            error_msg = f"Task timed out after {timeout}s"
+            logger.error(f"[{task.name}] {error_msg}")
+        except BaseExceptionGroup as eg:
+            # Extract underlying exceptions from nested TaskGroups
+            leaf_exceptions = flatten_exception_group(eg)
+            error_msg = "; ".join(str(e) for e in leaf_exceptions)
+            logger.error(f"Error running task {task.name}: {error_msg}")
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"Error running task {task.name}: {e}")
+
+        duration = time.time() - start_time
+        cost = calculate_cost(model, input_tokens, output_tokens)
+
+        return TaskResult(
+            name=task.name,
+            server=task.server,
+            model=model,
+            prompt=task.prompt or "",
+            response=response_text,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            llm_calls=llm_call_count,
+            tool_calls=tool_call_count,
+            tools_used=tools_used,
+            tool_results=tool_results,
+            duration_seconds=duration,
+            cost_usd=cost,
+            error=error_msg,
+            tags=task.tags,
+            llm_call_metrics=llm_call_metrics,
+        )
+
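The conversation this loop accumulates is the standard OpenAI tool-calling shape; a condensed sketch of one round-trip, with the tool name and contents invented:

    messages = [
        {"role": "user", "content": "List the tables"},
        {"role": "assistant", "content": None, "tool_calls": [
            {"id": "call_1", "type": "function",
             "function": {"name": "db_query", "arguments": "{\"sql\": \"...\"}"}},
        ]},
        {"role": "tool", "tool_call_id": "call_1", "content": "users, orders"},
        {"role": "assistant", "content": "The tables are users and orders."},
    ]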
+    async def run_scenario(
+        self,
+        scenario_name: str | None = None,
+        task_name: str | None = None,
+        tags: list[str] | None = None,
+    ) -> list[ScenarioResult]:
+        """Run benchmark scenarios.
+
+        Args:
+            scenario_name: Filter scenarios by pattern with wildcard support (optional).
+            task_name: Filter tasks by pattern with wildcard support (optional).
+            tags: Filter tasks with any of these tags (optional).
+
+        Returns:
+            List of ScenarioResult objects.
+        """
+        results: list[ScenarioResult] = []
+        default_model = self.config.defaults.model
+        default_timeout = self.config.defaults.timeout
+
+        for scenario in self.config.scenarios:
+            if scenario_name and not fnmatch.fnmatch(scenario.name, scenario_name):
+                continue
+
+            self._emit("scenario_start", scenario=scenario.name)
+            task_results: list[TaskResult] = []
+
+            for task in scenario.tasks:
+                if task_name and not fnmatch.fnmatch(task.name, task_name):
+                    continue
+
+                # Filter by tags on tasks (supports wildcards like "focus*")
+                if tags:
+                    task_matches_tags = any(
+                        fnmatch.fnmatch(task_tag, pattern)
+                        for pattern in tags
+                        for task_tag in task.tags
+                    )
+                    if not task_matches_tags:
+                        continue
+
+                self._emit("task_start", scenario=scenario.name, task=task.name)
+                # Pre-compute display values for error handling
+                prompt_display = task.prompt or f"Tool: {task.tool}"
+                model_display = (
+                    "direct" if task.type == "direct" else (task.model or default_model)
+                )
+                try:
+                    result = await self.run_task(task, default_model, default_timeout)
+                except asyncio.CancelledError:
+                    # Task was cancelled (e.g., timeout) - create error result
+                    # Note: CancelledError is BaseException, must come before Exception
+                    logger.error(f"Task {task.name} was cancelled (timeout)")
+                    result = TaskResult(
+                        name=task.name,
+                        server=task.server,
+                        model=model_display,
+                        prompt=prompt_display,
+                        response="",
+                        input_tokens=0,
+                        output_tokens=0,
+                        llm_calls=0,
+                        tool_calls=0,
+                        tools_used=[],
+                        duration_seconds=0.0,
+                        cost_usd=0.0,
+                        error="Task timed out",
+                        tags=task.tags,
+                    )
+                except BaseExceptionGroup as eg:
+                    # Task crashed with exception group - create error result
+                    leaf_exceptions = flatten_exception_group(eg)
+                    error_msg = "; ".join(str(e) for e in leaf_exceptions)
+                    logger.error(f"Task {task.name} crashed: {error_msg}")
+                    result = TaskResult(
+                        name=task.name,
+                        server=task.server,
+                        model=model_display,
+                        prompt=prompt_display,
+                        response="",
+                        input_tokens=0,
+                        output_tokens=0,
+                        llm_calls=0,
+                        tool_calls=0,
+                        tools_used=[],
+                        duration_seconds=0.0,
+                        cost_usd=0.0,
+                        error=f"Task crashed: {error_msg}",
+                        tags=task.tags,
+                    )
+                except Exception as e:
+                    # Task crashed with regular exception - create error result
+                    error_msg = str(e)
+                    logger.error(f"Task {task.name} crashed: {error_msg}")
+                    result = TaskResult(
+                        name=task.name,
+                        server=task.server,
+                        model=model_display,
+                        prompt=prompt_display,
+                        response="",
+                        input_tokens=0,
+                        output_tokens=0,
+                        llm_calls=0,
+                        tool_calls=0,
+                        tools_used=[],
+                        duration_seconds=0.0,
+                        cost_usd=0.0,
+                        error=f"Task crashed: {error_msg}",
+                        tags=task.tags,
+                    )
+                task_results.append(result)
+                # Update partial results with current scenario's progress
+                self._update_partial_results(
+                    results, scenario.name, default_model, task_results
+                )
+                # Emit task_complete BEFORE evaluation so LogSpan duration is accurate
+                self._emit(
+                    "task_complete",
+                    scenario=scenario.name,
+                    task=task.name,
+                    result=result,
+                )
+
+                # Evaluate task after task_complete (so span duration excludes evaluation)
+                if not self.dry_run:
+                    with LogSpan(span="bench.evaluate", task=task.name) as eval_span:
+                        if result.error:
+                            # Check if this test expects an error (e.g., timeout tests)
+                            # Must resolve evaluator first since task.evaluate can be a string
+                            eval_config = resolve_evaluator(task, self.config)
+                            if eval_config and eval_config.expect_error:
+                                # Use error message as response for evaluation
+                                result.response = result.error
+                                evaluation = evaluate_task(result, task, self.config)
+                                if evaluation:
+                                    result.evaluation = evaluation
+                                    eval_span.add(
+                                        passed=evaluation.passed,
+                                        evalType=evaluation.eval_type,
+                                    )
+                            else:
+                                result.evaluation = EvaluationResult(
+                                    score=0,
+                                    reason=f"Skipped due to error: {result.error}",
+                                    eval_type="pass_fail",
+                                    passed=False,
+                                )
+                                eval_span.add(skipped=True, error=result.error)
+                        else:
+                            evaluation = evaluate_task(result, task, self.config)
+                            if evaluation:
+                                result.evaluation = evaluation
+                                eval_span.add(
+                                    passed=evaluation.passed,
+                                    evalType=evaluation.eval_type,
+                                )
+                    # Emit separate event for evaluation display
+                    self._emit(
+                        "task_evaluated",
+                        scenario=scenario.name,
+                        task=task.name,
+                        result=result,
+                    )
+
+                await asyncio.sleep(TASK_DELAY_SECONDS)
+
+            if task_results:
+                scenario_result = ScenarioResult(
+                    name=scenario.name,
+                    model=default_model,
+                    tasks=task_results,
+                )
+                results.append(scenario_result)
+
+        return results
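
Putting the runner together end to end, a minimal driver might look like this (how the HarnessConfig is loaded isn't shown in this file, so that step is assumed):

    import asyncio

    runner = AgenticRunner(config)  # config: HarnessConfig, loaded elsewhere
    results = asyncio.run(runner.run_scenario(scenario_name="smoke*", tags=["focus*"]))
    for scenario in results:
        passed = sum(1 for t in scenario.tasks if t.evaluation and t.evaluation.passed)
        print(f"{scenario.name}: {passed}/{len(scenario.tasks)} passed")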