onetool-mcp 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
bench/harness/config.py
ADDED
@@ -0,0 +1,397 @@
"""YAML configuration loading for harness scenarios and tasks.

Loads ot-dev.yaml with test scenarios and harness configuration.
Supports variable expansion from bench-secrets.yaml in the format ${VAR_NAME}.

Example ot-dev.yaml:

    defaults:
      model: openai/gpt-5-mini
      timeout: 120

    servers:
      onetool:
        type: stdio
        command: uv
        args: ["run", "onetool"]

    scenarios:
      - name: Basic Tests
        tasks:
          - name: hello world
            prompt: Say hello
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Any, Literal

import yaml
from pydantic import BaseModel, Field, field_validator

from bench.secrets import get_bench_secret

if TYPE_CHECKING:
    from pathlib import Path


def expand_secrets(value: str) -> str:
    """Expand variables in a string using bench-secrets.yaml only.

    Supports ${VAR_NAME} and ${VAR_NAME:-default} syntax.
    Reads from bench-secrets.yaml only - does NOT read from os.environ.
    Raises error if variable not found and no default provided.

    Args:
        value: String potentially containing ${VAR} patterns.

    Returns:
        String with variables expanded from bench secrets.

    Raises:
        ValueError: If variable not found in secrets and no default.
    """
    pattern = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")
    missing_vars: list[str] = []

    def replace(match: re.Match[str]) -> str:
        var_name = match.group(1)
        default_value = match.group(2)
        # Read from bench secrets only - no os.environ
        secret_value = get_bench_secret(var_name)
        if secret_value:
            return secret_value
        if default_value is not None:
            return default_value
        missing_vars.append(var_name)
        return match.group(0)

    result = pattern.sub(replace, value)

    if missing_vars:
        raise ValueError(
            f"Missing variables in bench-secrets.yaml: {', '.join(missing_vars)}. "
            f"Add them to .onetool/config/bench-secrets.yaml or use ${{VAR:-default}} syntax."
        )

    return result
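
# Usage sketch, assuming a hypothetical bench-secrets.yaml entry
# BRAVE_API_KEY: abc123 (key name and value are illustrative only):
#
#     expand_secrets("Bearer ${BRAVE_API_KEY}")  # -> "Bearer abc123"
#     expand_secrets("${MISSING:-fallback}")     # -> "fallback" (default used)
#     expand_secrets("${MISSING}")               # raises ValueError naming MISSING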


def expand_subprocess_env(value: str) -> str:
    """Expand ${VAR} for subprocess environment values.

    Reads from bench secrets first, then os.environ for pass-through.
    This is the ONLY place where reading os.environ is allowed for bench,
    enabling explicit env var pass-through to subprocesses.

    Args:
        value: String potentially containing ${VAR} patterns.

    Returns:
        String with variables expanded. Empty string if not found.
    """
    import os

    pattern = re.compile(r"\$\{([^}:]+)(?::-([^}]*))?\}")

    def replace(match: re.Match[str]) -> str:
        var_name = match.group(1)
        default_value = match.group(2)
        # Bench secrets first
        secret_value = get_bench_secret(var_name)
        if secret_value:
            return secret_value
        # Then os.environ (for pass-through like ${HOME})
        env_val = os.environ.get(var_name)
        if env_val is not None:
            return env_val
        # Use default if provided
        if default_value is not None:
            return default_value
        # Empty string if not found
        return ""

    return pattern.sub(replace, value)


def expand_secrets_in_dict(data: Any, skip_keys: set[str] | None = None) -> Any:
    """Recursively expand secrets in a dict/list structure.

    Args:
        data: Dict, list, or scalar value.
        skip_keys: Set of dict keys whose values should not be expanded.
            Used to skip 'env' values which are expanded later by subprocess.

    Returns:
        Structure with all string values expanded.
    """
    if skip_keys is None:
        skip_keys = {"env"}

    if isinstance(data, dict):
        result = {}
        for k, v in data.items():
            if k in skip_keys:
                # Don't expand these - they're handled by expand_subprocess_env later
                result[k] = v
            else:
                result[k] = expand_secrets_in_dict(v, skip_keys)
        return result
    elif isinstance(data, list):
        return [expand_secrets_in_dict(v, skip_keys) for v in data]
    elif isinstance(data, str):
        return expand_secrets(data)
    return data
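
# Behaviour sketch: values under the default skip key "env" survive untouched
# for later subprocess expansion, everything else is expanded now (API_KEY is
# a hypothetical secret name):
#
#     expand_secrets_in_dict({"url": "${API_KEY}", "env": {"TOKEN": "${API_KEY}"}})
#     # -> {"url": "<secret value>", "env": {"TOKEN": "${API_KEY}"}}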


class ServerConfig(BaseModel):
    """Configuration for an MCP server connection."""

    type: Literal["http", "stdio"] = Field(description="Server connection type")
    url: str | None = Field(default=None, description="URL for HTTP servers")
    headers: dict[str, str] = Field(
        default_factory=dict, description="Headers for HTTP servers"
    )
    command: str | None = Field(default=None, description="Command for stdio servers")
    args: list[str] = Field(
        default_factory=list, description="Arguments for stdio command"
    )
    env: dict[str, str] = Field(
        default_factory=dict, description="Environment variables for stdio servers"
    )
    timeout: int | None = Field(
        default=None, description="Connection timeout in seconds (overrides default)"
    )

    @field_validator("url", "command", mode="before")
    @classmethod
    def expand_secrets_validator(cls, v: str | None) -> str | None:
        """Expand secrets in URL and command."""
        if v is None:
            return None
        return expand_secrets(v)
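
# Example http server entry this model accepts (server name, endpoint, and
# header key are hypothetical); header values get expanded at load time by
# expand_secrets_in_dict, url/command additionally by the validator above:
#
#     servers:
#       docs:
#         type: http
#         url: https://example.com/mcp
#         headers:
#           Authorization: Bearer ${API_KEY}
#         timeout: 60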


class EvaluateConfig(BaseModel):
    """Configuration for evaluation (LLM or deterministic)."""

    # For deterministic checks - can be string, list, dict, number, bool
    expected: str | list[Any] | dict[str, Any] | int | float | bool | None = Field(
        default=None,
        description="Expected value(s) for deterministic evaluation",
    )

    # For regex pattern matching
    regex: str | None = Field(
        default=None,
        description="Regex pattern to match against response",
    )
    expect_match: bool = Field(
        default=True,
        description="If True, regex must match. If False, regex must NOT match.",
    )

    # For error tests - when True, test expects an error response
    # If LLM "fixes" the code and it succeeds, that's a failure
    expect_error: bool = Field(
        default=False,
        description="When True, test expects error/failure. Success without error pattern is a failure.",
    )

    # For LLM-as-judge evaluation
    prompt: str | None = Field(
        default=None,
        description="Evaluation prompt template with {response} and {expected}",
    )
    model: str | None = Field(
        default=None,
        description="Model to use for LLM evaluation (required if using LLM-as-judge)",
    )
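
# Two alternative evaluator shapes this model supports (pattern and prompt
# text are hypothetical): `regex` is a deterministic check, while
# `prompt` + `model` selects LLM-as-judge:
#
#     evaluate:
#       regex: "(?i)hello"
#       expect_match: true
#
#     evaluate:
#       prompt: "Does {response} satisfy {expected}? Answer PASS or FAIL."
#       model: openai/gpt-5-mini
#       expected: a friendly greeting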


class TaskConfig(BaseModel):
    """Configuration for a single task (direct or harness).

    Task types:
    - direct: Direct MCP tool invocation without LLM
    - harness: LLM benchmark with optional MCP servers (default)
    """

    name: str = Field(description="Task name")
    type: Literal["direct", "harness"] = Field(
        default="harness",
        description="Task type: 'direct' for MCP tool call, 'harness' for LLM benchmark",
    )

    # Common fields
    server: str | list[str] | None = Field(
        default=None, description="Server name(s) from servers - single or list"
    )
    timeout: int | None = Field(default=None, description="Timeout in seconds")
    tags: list[str] = Field(
        default_factory=list, description="Tags for filtering tasks"
    )

    # Harness-specific fields (type: harness)
    prompt: str | None = Field(
        default=None, description="Prompt to send to LLM (required for harness type)"
    )
    model: str | None = Field(default=None, description="Model override (harness only)")
    evaluate: str | EvaluateConfig | None = Field(
        default=None,
        description="Evaluation config (harness only)",
    )

    # Direct-specific fields (type: direct)
    tool: str | None = Field(
        default=None, description="Tool name to call (required for direct type)"
    )
    arguments: dict[str, Any] = Field(
        default_factory=dict, description="Tool arguments (direct only)"
    )

    @field_validator("tags", mode="before")
    @classmethod
    def tags_default_empty(cls, v: list[str] | None) -> list[str]:
        """Convert None to empty list for tags."""
        return v if v is not None else []

    def model_post_init(self, __context: Any) -> None:
        """Validate type-specific required fields."""
        if self.type == "direct":
            if not self.tool:
                raise ValueError(
                    f"Task '{self.name}': type 'direct' requires 'tool' field"
                )
            if not self.server:
                raise ValueError(
                    f"Task '{self.name}': type 'direct' requires 'server' field"
                )
        elif self.type == "harness":
            if not self.prompt:
                raise ValueError(
                    f"Task '{self.name}': type 'harness' requires 'prompt' field"
                )
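
# Task shapes enforced by model_post_init (the tool name `file.list` is
# hypothetical): a direct task needs `tool` and `server`, a harness task
# needs `prompt`:
#
#     tasks:
#       - name: list files
#         type: direct
#         server: onetool
#         tool: file.list
#         arguments: {path: "."}
#       - name: greet          # type defaults to harness
#         prompt: Say hello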


class ScenarioConfig(BaseModel):
    """Configuration for a benchmark scenario."""

    name: str = Field(description="Scenario name")
    description: str = Field(default="", description="Scenario description")
    tasks: list[TaskConfig] = Field(description="List of tasks in the scenario")


class DefaultsConfig(BaseModel):
    """Default configuration values."""

    timeout: int = Field(default=120, description="Default timeout in seconds")
    model: str = Field(default="openai/gpt-5-mini", description="Default model")
    system_prompt: str | None = Field(
        default=None, description="System prompt to prepend to all tasks"
    )


class HarnessConfig(BaseModel):
    """Root configuration for harness YAML files."""

    defaults: DefaultsConfig = Field(default_factory=DefaultsConfig)
    servers: dict[str, ServerConfig] = Field(default_factory=dict)
    evaluators: dict[str, EvaluateConfig] = Field(
        default_factory=dict,
        description="Named evaluators that can be referenced by tasks",
    )
    evaluate: EvaluateConfig | None = Field(
        default=None,
        description="Legacy: default evaluator (deprecated, use evaluators)",
    )
    scenarios: list[ScenarioConfig] = Field(default_factory=list)


def load_harness_config(path: Path) -> HarnessConfig:
    """Load and validate a harness YAML configuration.

    Args:
        path: Path to the YAML file.

    Returns:
        Validated HarnessConfig.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the YAML is invalid.
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    with path.open() as f:
        raw_data = yaml.safe_load(f)

    if raw_data is None:
        raw_data = {}

    # Expand secrets
    data = expand_secrets_in_dict(raw_data)

    return HarnessConfig.model_validate(data)


def _convert_legacy_tools_config(data: dict[str, Any]) -> dict[str, Any]:
    """Convert legacy tools config to unified format.

    Legacy tools configs have tasks with 'tool' field but no 'type' field.
    This adds 'type: direct' to such tasks.

    Args:
        data: Parsed YAML data.

    Returns:
        Data with legacy tasks converted to unified format.
    """
    scenarios = data.get("scenarios", [])
    for scenario in scenarios:
        if not isinstance(scenario, dict):
            continue
        tasks = scenario.get("tasks", [])
        for task in tasks:
            if not isinstance(task, dict):
                continue
            # If task has 'tool' but no 'type', it's a legacy direct task
            if "tool" in task and "type" not in task:
                task["type"] = "direct"
    return data


def load_config(path: Path) -> HarnessConfig:
    """Load and validate a YAML configuration file.

    Supports both unified configs (with explicit type field) and
    legacy configs (auto-detects direct vs harness based on content).

    Args:
        path: Path to the YAML file.

    Returns:
        Validated HarnessConfig.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the YAML is invalid.
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    with path.open() as f:
        raw_data = yaml.safe_load(f)

    if raw_data is None:
        raw_data = {}

    # Expand secrets
    data = expand_secrets_in_dict(raw_data)

    # Convert legacy tools config format if needed
    data = _convert_legacy_tools_config(data)

    return HarnessConfig.model_validate(data)
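
A minimal end-to-end sketch of using this module, assuming an ot-dev.yaml shaped like the example in the module docstring:

    from pathlib import Path

    from bench.harness.config import load_config

    config = load_config(Path("ot-dev.yaml"))  # expands secrets, upgrades legacy tasks
    for scenario in config.scenarios:
        print(scenario.name, [task.name for task in scenario.tasks])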
bench/harness/csv_writer.py
ADDED
@@ -0,0 +1,109 @@
"""CSV export for benchmark results with per-call metrics."""

from __future__ import annotations

import csv
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING

from ot.logging import LogSpan
from ot.paths import get_effective_cwd

if TYPE_CHECKING:
    from bench.harness.metrics import ScenarioResult


def write_results_csv(
    results: list[ScenarioResult],
    output_dir: Path | str | None = None,
) -> Path:
    """Write benchmark results to CSV with per-call breakdown.

    Generates a CSV file with dynamic columns based on the maximum number
    of LLM calls across all tasks. Each task row includes scenario info,
    totals, and per-call metrics.

    Args:
        results: List of ScenarioResult objects to export.
        output_dir: Directory for output file (default: {cwd}/tmp/).

    Returns:
        Path to the generated CSV file.
    """
    if output_dir is None:
        output_dir = get_effective_cwd() / "tmp"
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M")
    output_path = output_dir / f"result-{timestamp}.csv"

    task_count = sum(len(s.tasks) for s in results)

    with LogSpan(span="bench.results.write", path=str(output_path), tasks=task_count):
        _write_csv_file(results, output_path)

    return output_path


def _write_csv_file(results: list[ScenarioResult], output_path: Path) -> None:
    """Write benchmark results to CSV file."""
    headers = [
        "scenario",
        "task",
        "model",
        "server",
        "result",
        "total_input",
        "total_output",
        "llm_calls",
        "tool_calls",
        "duration_s",
        "cost_usd",
        "base_context",
        "context_growth_avg",
    ]

    # Write CSV
    with output_path.open("w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        for scenario in results:
            for task in scenario.tasks:
                # Determine result value
                if task.error:
                    result_val = "ERROR"
                elif task.evaluation:
                    if task.evaluation.eval_type == "pass_fail":
                        result_val = "PASS" if task.evaluation.passed else "FAIL"
                    else:
                        result_val = str(task.evaluation.score)
                else:
                    result_val = "-"

                # Format server as comma-separated if list
                server_val = (
                    ",".join(task.server)
                    if isinstance(task.server, list)
                    else (task.server or "-")
                )

                row = [
                    scenario.name,
                    task.name,
                    task.model,
                    server_val,
                    result_val,
                    task.input_tokens,
                    task.output_tokens,
                    task.llm_calls,
                    task.tool_calls,
                    round(task.duration_seconds, 2),
                    round(task.cost_usd, 6),
                    task.base_context,
                    round(task.context_growth_avg, 1),
                ]

                writer.writerow(row)
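
A brief usage sketch, assuming `results` holds ScenarioResult objects from a completed harness run:

    from bench.harness.csv_writer import write_results_csv

    # results: list[ScenarioResult] from a harness run (assumed available)
    csv_path = write_results_csv(results, output_dir="reports")
    print(f"Results written to {csv_path}")  # e.g. reports/result-20250101-1200.csv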