k-eval 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k_eval-0.3.0/MANIFEST.in +5 -0
- k_eval-0.3.0/PKG-INFO +188 -0
- k_eval-0.3.0/README.md +158 -0
- k_eval-0.3.0/k_eval/__init__.py +0 -0
- k_eval-0.3.0/k_eval/agent/__init__.py +0 -0
- k_eval-0.3.0/k_eval/agent/domain/__init__.py +0 -0
- k_eval-0.3.0/k_eval/agent/domain/agent.py +14 -0
- k_eval-0.3.0/k_eval/agent/domain/factory.py +18 -0
- k_eval-0.3.0/k_eval/agent/domain/observer.py +27 -0
- k_eval-0.3.0/k_eval/agent/domain/result.py +18 -0
- k_eval-0.3.0/k_eval/agent/domain/usage.py +12 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/__init__.py +0 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/claude_sdk.py +222 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/errors.py +19 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/factory.py +32 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/observer.py +50 -0
- k_eval-0.3.0/k_eval/agent/infrastructure/registry.py +21 -0
- k_eval-0.3.0/k_eval/cli/__init__.py +0 -0
- k_eval-0.3.0/k_eval/cli/main.py +505 -0
- k_eval-0.3.0/k_eval/cli/output/__init__.py +0 -0
- k_eval-0.3.0/k_eval/cli/output/aggregator.py +89 -0
- k_eval-0.3.0/k_eval/cli/output/eee.py +224 -0
- k_eval-0.3.0/k_eval/config/__init__.py +0 -0
- k_eval-0.3.0/k_eval/config/domain/__init__.py +0 -0
- k_eval-0.3.0/k_eval/config/domain/agent.py +8 -0
- k_eval-0.3.0/k_eval/config/domain/condition.py +12 -0
- k_eval-0.3.0/k_eval/config/domain/condition_mcp_server.py +13 -0
- k_eval-0.3.0/k_eval/config/domain/config.py +26 -0
- k_eval-0.3.0/k_eval/config/domain/dataset.py +11 -0
- k_eval-0.3.0/k_eval/config/domain/execution.py +15 -0
- k_eval-0.3.0/k_eval/config/domain/judge.py +8 -0
- k_eval-0.3.0/k_eval/config/domain/mcp_server.py +39 -0
- k_eval-0.3.0/k_eval/config/domain/observer.py +9 -0
- k_eval-0.3.0/k_eval/config/infrastructure/__init__.py +0 -0
- k_eval-0.3.0/k_eval/config/infrastructure/env_interpolation.py +50 -0
- k_eval-0.3.0/k_eval/config/infrastructure/errors.py +37 -0
- k_eval-0.3.0/k_eval/config/infrastructure/observer.py +23 -0
- k_eval-0.3.0/k_eval/config/infrastructure/yaml_loader.py +120 -0
- k_eval-0.3.0/k_eval/core/__init__.py +0 -0
- k_eval-0.3.0/k_eval/core/errors.py +9 -0
- k_eval-0.3.0/k_eval/dataset/__init__.py +0 -0
- k_eval-0.3.0/k_eval/dataset/domain/__init__.py +0 -0
- k_eval-0.3.0/k_eval/dataset/domain/load_result.py +16 -0
- k_eval-0.3.0/k_eval/dataset/domain/loader.py +12 -0
- k_eval-0.3.0/k_eval/dataset/domain/observer.py +15 -0
- k_eval-0.3.0/k_eval/dataset/domain/sample.py +13 -0
- k_eval-0.3.0/k_eval/dataset/infrastructure/__init__.py +0 -0
- k_eval-0.3.0/k_eval/dataset/infrastructure/errors.py +10 -0
- k_eval-0.3.0/k_eval/dataset/infrastructure/jsonl_loader.py +130 -0
- k_eval-0.3.0/k_eval/dataset/infrastructure/observer.py +36 -0
- k_eval-0.3.0/k_eval/evaluation/__init__.py +0 -0
- k_eval-0.3.0/k_eval/evaluation/application/__init__.py +0 -0
- k_eval-0.3.0/k_eval/evaluation/application/runner.py +232 -0
- k_eval-0.3.0/k_eval/evaluation/domain/__init__.py +0 -0
- k_eval-0.3.0/k_eval/evaluation/domain/observer.py +71 -0
- k_eval-0.3.0/k_eval/evaluation/domain/run.py +20 -0
- k_eval-0.3.0/k_eval/evaluation/domain/summary.py +18 -0
- k_eval-0.3.0/k_eval/evaluation/infrastructure/__init__.py +0 -0
- k_eval-0.3.0/k_eval/evaluation/infrastructure/composite_observer.py +128 -0
- k_eval-0.3.0/k_eval/evaluation/infrastructure/observer.py +129 -0
- k_eval-0.3.0/k_eval/evaluation/infrastructure/progress_observer.py +370 -0
- k_eval-0.3.0/k_eval/judge/__init__.py +0 -0
- k_eval-0.3.0/k_eval/judge/domain/__init__.py +0 -0
- k_eval-0.3.0/k_eval/judge/domain/factory.py +15 -0
- k_eval-0.3.0/k_eval/judge/domain/judge.py +16 -0
- k_eval-0.3.0/k_eval/judge/domain/observer.py +26 -0
- k_eval-0.3.0/k_eval/judge/domain/score.py +21 -0
- k_eval-0.3.0/k_eval/judge/infrastructure/__init__.py +0 -0
- k_eval-0.3.0/k_eval/judge/infrastructure/errors.py +10 -0
- k_eval-0.3.0/k_eval/judge/infrastructure/factory.py +30 -0
- k_eval-0.3.0/k_eval/judge/infrastructure/litellm.py +205 -0
- k_eval-0.3.0/k_eval/judge/infrastructure/observer.py +53 -0
- k_eval-0.3.0/k_eval.egg-info/PKG-INFO +188 -0
- k_eval-0.3.0/k_eval.egg-info/SOURCES.txt +78 -0
- k_eval-0.3.0/k_eval.egg-info/dependency_links.txt +1 -0
- k_eval-0.3.0/k_eval.egg-info/entry_points.txt +2 -0
- k_eval-0.3.0/k_eval.egg-info/requires.txt +13 -0
- k_eval-0.3.0/k_eval.egg-info/top_level.txt +1 -0
- k_eval-0.3.0/pyproject.toml +69 -0
- k_eval-0.3.0/setup.cfg +4 -0
k_eval-0.3.0/MANIFEST.in
ADDED
k_eval-0.3.0/PKG-INFO
ADDED
Metadata-Version: 2.4
Name: k-eval
Version: 0.3.0
Summary: Context-aware evaluation framework for AI agents using MCP.
Author: jsell-rh
License-Expression: Apache-2.0
Project-URL: Homepage, https://github.com/jsell-rh/k-eval
Project-URL: Repository, https://github.com/jsell-rh/k-eval
Project-URL: Issues, https://github.com/jsell-rh/k-eval/issues
Keywords: evaluation,ai,agents,mcp,llm
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Testing
Requires-Python: >=3.13
Description-Content-Type: text/markdown
Requires-Dist: claude-agent-sdk>=0.1.39
Requires-Dist: litellm
Requires-Dist: pydantic>=2.12.5
Requires-Dist: pyyaml>=6.0.3
Requires-Dist: rich>=14.3.3
Requires-Dist: structlog>=25.5.0
Requires-Dist: typer>=0.24.1
Provides-Extra: vertex-ai
Requires-Dist: google-cloud-aiplatform>=1.138.0; extra == "vertex-ai"
Provides-Extra: all
Requires-Dist: k-eval[vertex_ai]; extra == "all"
The remainder of PKG-INFO is the package long description, identical to k_eval-0.3.0/README.md as reproduced in full below.
k_eval-0.3.0/README.md
ADDED
# k-eval

Context-aware evaluation framework for AI agents using MCP.

## Quick Start

k-eval uses [uv](https://docs.astral.sh/uv/) for dependency management. Install it first if you don't have it:

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```

### Install `k-eval`

```bash
git clone https://github.com/jsell-rh/k-eval.git
cd k-eval/src/k-eval

# Core dependencies
uv sync

# With Vertex AI provider support
uv sync --extra vertex_ai

# All provider dependencies
uv sync --extra all
```

### Run `k-eval`

`k-eval` runs are configured using `yaml` configuration files (see [Configuration](#Configuration)).

Once an evaluation is defined in a `yaml` file, you can invoke `k-eval` like:

```bash
cd src/k-eval
uv run python -m k_eval.cli.main /path/to/config.yaml
```

See [docs/run-configuration.md](docs/run-configuration.md) for authentication setup and all CLI options.

#### CLI Options

```bash
src/k-eval$ uv run python -m k_eval.cli.main --help

 Usage: python -m cli.main [OPTIONS] CONFIG_PATH

 Run a k-eval evaluation from a YAML config file.

╭─ Arguments ──────────────────────────────────────────────────────────────────╮
│ *    config_path      PATH  Path to evaluation config YAML [required]        │
╰──────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --output-dir  -o  PATH  Directory for output files [default: results]        │
│ --log-format      TEXT  Log format: 'console' or 'json' [default: console]   │
│ --quiet       -q        Suppress debug and info logs; show only the progress bar plus warnings/errors. │
│ --help                  Show this message and exit.                          │
╰──────────────────────────────────────────────────────────────────────────────╯
```

### Understanding the Output

Each run produces two files in `./results/` (or wherever you point `--output-dir`):

```
results/
  my-eval_20260225_a1b2c3d4.json            # aggregate scores per condition
  my-eval_20260225_a1b2c3d4.detailed.jsonl  # one line per (question, condition) pair
```

**`{name}_{date}_{run_id}.json`** — the summary. One entry per condition with mean and standard deviation for each of the three metrics across all questions and repetitions. Use this to compare conditions at a glance.

This file is intended to be mostly compliant with the [Every Eval Ever](https://evalevalai.com/projects/every-eval-ever/) schema. Notably, `k-eval` does not aggregate the three metrics into a single score. Thus, the individual metrics are written to `score_details.details`, and `score_details.score` is left `null`.

**`{name}_{date}_{run_id}.detailed.jsonl`** — the full record. One JSON object per `(question, condition)` pair containing the agent's raw responses for every repetition, per-repetition judge scores and reasoning, unverified claims, and token usage. Use this if you want to dig into why a condition scored the way it did.

The three metrics are scored 1-5 by the judge model:

| Metric | What it measures |
|---|---|
| `factual_adherence` | Does the response stick to facts in the golden answer? |
| `completeness` | Does it cover all the essential points? |
| `helpfulness_and_clarity` | Is it well-structured and easy to act on? |

See [evaluation-methodology](docs/evaluation-methodology.md) for more details.

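Both output files are plain JSON/JSONL, so they can be post-processed with nothing but the standard library. A minimal sketch is shown below; the file names are taken from the example tree above, and the per-record fields of the detailed file depend on your run and are not spelled out here:

```python
import json
from pathlib import Path

results_dir = Path("results")

# Summary file: aggregate scores per condition (mostly Every-Eval-Ever compliant).
summary = json.loads((results_dir / "my-eval_20260225_a1b2c3d4.json").read_text())

# Detailed file: one JSON object per line, one line per (question, condition) pair.
detailed_path = results_dir / "my-eval_20260225_a1b2c3d4.detailed.jsonl"
with detailed_path.open() as fh:
    detailed = [json.loads(line) for line in fh if line.strip()]

print(f"Loaded {len(detailed)} (question, condition) records")
```
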
### Configuration

A config file defines your dataset, agent, judge, MCP servers, and the conditions you want to compare:

> [!Important]
>
> For MCP servers that require authentication,
> please reference [docs/run-configuration.md](docs/run-configuration.md).

```yaml
name: "my-eval"
version: "1"

dataset:
  # JSONL file with your questions and golden answers
  path: "./questions.jsonl"
  # The name of the key used to reference the question within the JSONL file.
  question_key: "question"
  # The key used to reference the golden "reference" or answer within the JSONL file.
  answer_key: "answer"

agent:
  type: "claude_code_sdk"  # currently the only supported type
  model: "claude-sonnet-4-5"

judge:
  model: "vertex_ai/claude-opus-4-5"  # any LiteLLM-compatible model string (See: https://models.litellm.ai/)
  temperature: 0.0

mcp_servers:
  graph:
    type: "stdio"
    command: "python"
    args: ["-m", "my_mcp_server"]

conditions:
  baseline:
    mcp_servers: []
    system_prompt: |
      Answer using your own knowledge.
  with_graph:
    mcp_servers: [graph]
    system_prompt: |
      Use the graph tool to answer the question.

execution:
  # How many times each (question, condition) pair is evaluated.
  # This is useful for managing variance in agent responses. Standard
  # deviation between scores will be reported if num_repetitions >= 3.
  num_repetitions: 3
  # (question, condition, repetition) tuples can be evaluated concurrently
  # to reduce total evaluation time. The upper bound of this number is determined
  # only by the resources on your computer and by the rate limit configuration
  # of the agent and model providers.
  #
  # In practice, numbers even as high as 50 seem to be well tolerated
  # when using Vertex AI.
  max_concurrent: 5
```

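For reference, a `questions.jsonl` matching the `dataset` block above holds one JSON object per line, keyed by `question_key` and `answer_key`. The entries below are illustrative placeholders, not part of the package:

```jsonl
{"question": "Which component exposes the graph MCP server?", "answer": "The graph MCP server is exposed by the my_mcp_server module."}
{"question": "What port does the ingestion service listen on?", "answer": "The ingestion service listens on port 8080."}
```
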
See [docs/run-configuration.md](docs/run-configuration.md) for the full reference including authentication setup.
k_eval-0.3.0/k_eval/__init__.py
File without changes

k_eval-0.3.0/k_eval/agent/__init__.py
File without changes

k_eval-0.3.0/k_eval/agent/domain/__init__.py
File without changes
|
|
|
1
|
+
"""Agent Protocol — structural interface for all agent implementations."""
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
from k_eval.agent.domain.result import AgentResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Agent(Protocol):
|
|
9
|
+
"""Structural interface satisfied by any agent implementation.
|
|
10
|
+
|
|
11
|
+
Each instance is constructed once per (condition, sample) evaluation.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
async def ask(self, question: str) -> AgentResult: ...
|
|
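Because `Agent` is a structural `Protocol`, anything with a matching async `ask` method satisfies it; no inheritance is required. A hypothetical stub, handy as a test double and not part of the package, could look like this:

```python
from k_eval.agent.domain.result import AgentResult


class CannedAgent:
    """Test double satisfying the Agent protocol with a fixed response."""

    def __init__(self, response: str) -> None:
        self._response = response

    async def ask(self, question: str) -> AgentResult:
        # Populate every AgentResult field so callers exercise the same
        # paths as with a real agent; the numeric values are placeholders.
        return AgentResult(
            response=self._response,
            cost_usd=None,
            duration_ms=0,
            duration_api_ms=0,
            num_turns=1,
            usage=None,
        )
```
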
k_eval-0.3.0/k_eval/agent/domain/factory.py
ADDED

```python
"""AgentFactory Protocol — structural interface for constructing Agent instances."""

from typing import Protocol

from k_eval.agent.domain.agent import Agent
from k_eval.config.domain.condition_mcp_server import ConditionMcpServer


class AgentFactory(Protocol):
    """Constructs a new Agent instance for a given (condition, sample) pair."""

    def create(
        self,
        condition: str,
        sample_idx: str,
        system_prompt: str,
        mcp_servers: list[ConditionMcpServer],
    ) -> Agent: ...
```
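`AgentFactory` is the companion protocol: callers ask it for a fresh `Agent` per (condition, sample) pair. A hypothetical test double, with illustrative names and not part of the package, might ignore the MCP wiring entirely:

```python
from k_eval.agent.domain.agent import Agent
from k_eval.agent.domain.result import AgentResult
from k_eval.config.domain.condition_mcp_server import ConditionMcpServer


class StubAgentFactory:
    """Satisfies AgentFactory by handing back agents that echo their context."""

    def create(
        self,
        condition: str,
        sample_idx: str,
        system_prompt: str,
        mcp_servers: list[ConditionMcpServer],
    ) -> Agent:
        class _StubAgent:
            async def ask(self, question: str) -> AgentResult:
                # Echo the evaluation context so tests can assert on it.
                return AgentResult(
                    response=f"[{condition}/{sample_idx}] {question}",
                    cost_usd=None,
                    duration_ms=0,
                    duration_api_ms=0,
                    num_turns=1,
                    usage=None,
                )

        return _StubAgent()
```
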
k_eval-0.3.0/k_eval/agent/domain/observer.py
ADDED

```python
"""AgentObserver port — domain events emitted during agent invocations."""

from typing import Protocol


class AgentObserver(Protocol):
    """Observer port for agent domain events.

    Implementations may log to structlog, record for tests, or emit metrics.
    """

    def agent_invocation_started(
        self, condition: str, sample_idx: str, model: str
    ) -> None: ...

    def agent_invocation_completed(
        self,
        condition: str,
        sample_idx: str,
        duration_ms: int,
        num_turns: int,
        cost_usd: float | None,
    ) -> None: ...

    def agent_invocation_failed(
        self, condition: str, sample_idx: str, reason: str
    ) -> None: ...
```
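The docstring above names three typical implementations; the simplest for tests is one that just records events. A hypothetical in-memory recorder, not part of the package, could look like this:

```python
class RecordingAgentObserver:
    """Satisfies AgentObserver by appending each event to an in-memory list."""

    def __init__(self) -> None:
        self.events: list[tuple] = []

    def agent_invocation_started(
        self, condition: str, sample_idx: str, model: str
    ) -> None:
        self.events.append(("started", condition, sample_idx, model))

    def agent_invocation_completed(
        self,
        condition: str,
        sample_idx: str,
        duration_ms: int,
        num_turns: int,
        cost_usd: float | None,
    ) -> None:
        self.events.append(
            ("completed", condition, sample_idx, duration_ms, num_turns, cost_usd)
        )

    def agent_invocation_failed(
        self, condition: str, sample_idx: str, reason: str
    ) -> None:
        self.events.append(("failed", condition, sample_idx, reason))
```
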
k_eval-0.3.0/k_eval/agent/domain/result.py
ADDED

```python
"""AgentResult value object — the outcome of a single agent invocation."""

from pydantic import BaseModel, ConfigDict

from k_eval.agent.domain.usage import UsageMetrics


class AgentResult(BaseModel, frozen=True):
    """Immutable value object capturing all outcome data from one agent invocation."""

    model_config = ConfigDict(frozen=True)

    response: str
    cost_usd: float | None
    duration_ms: int
    duration_api_ms: int
    num_turns: int
    usage: UsageMetrics | None
```
k_eval-0.3.0/k_eval/agent/domain/usage.py
ADDED

```python
"""UsageMetrics value object — token usage from an agent invocation."""

from pydantic import BaseModel, ConfigDict


class UsageMetrics(BaseModel, frozen=True):
    """Immutable value object capturing token usage from a single agent invocation."""

    model_config = ConfigDict(frozen=True)

    input_tokens: int | None
    output_tokens: int | None
```
k_eval-0.3.0/k_eval/agent/infrastructure/__init__.py
File without changes
k_eval-0.3.0/k_eval/agent/infrastructure/claude_sdk.py
ADDED

```python
"""ClaudeAgentSDKAgent — agent implementation using the Claude Agent SDK."""

from typing import Any

from claude_agent_sdk import query
from claude_agent_sdk._errors import ClaudeSDKError
from claude_agent_sdk.types import (
    ClaudeAgentOptions,
    McpHttpServerConfig,
    McpSdkServerConfig,
    McpSSEServerConfig,
    McpStdioServerConfig,
    ResultMessage,
)

from k_eval.agent.domain.observer import AgentObserver
from k_eval.agent.domain.result import AgentResult
from k_eval.agent.domain.usage import UsageMetrics
from k_eval.agent.infrastructure.errors import AgentInvocationError
from k_eval.config.domain.agent import AgentConfig
from k_eval.config.domain.condition_mcp_server import ConditionMcpServer
from k_eval.config.domain.mcp_server import HttpMcpServer, SseMcpServer, StdioMcpServer

type McpServerConfigMap = dict[
    str,
    McpStdioServerConfig
    | McpSSEServerConfig
    | McpHttpServerConfig
    | McpSdkServerConfig,
]


class ClaudeAgentSDKAgent:
    """Agent implementation that delegates to the Claude Agent SDK.

    One instance is constructed per (condition, sample) evaluation run.
    The condition and sample_idx are injected at construction time so that
    observer events carry full context without polluting the ask() signature.
    """

    def __init__(
        self,
        config: AgentConfig,
        condition: str,
        sample_idx: str,
        system_prompt: str,
        mcp_servers: list[ConditionMcpServer],
        observer: AgentObserver,
    ) -> None:
        self._config = config
        self._condition = condition
        self._sample_idx = sample_idx
        self._system_prompt = system_prompt
        self._mcp_servers = mcp_servers
        self._observer = observer

    async def ask(self, question: str) -> AgentResult:
        """Invoke the agent with a question and return the structured result.

        Opens a new SDK session per call — correct for independent eval samples.

        Raises:
            AgentInvocationError: if the SDK raises, the agent returns an error,
                or no ResultMessage is present in the response stream.
        """
        self._observer.agent_invocation_started(
            condition=self._condition,
            sample_idx=self._sample_idx,
            model=self._config.model,
        )

        try:
            options = ClaudeAgentOptions(
                model=self._config.model,
                system_prompt=self._system_prompt,
                mcp_servers=self._build_mcp_servers(),
                disallowed_tools=self._build_disallowed_tools(),
                permission_mode="bypassPermissions",
                setting_sources=[],
            )

            result_message = await self._collect_result(
                prompt=question, options=options
            )
        except AgentInvocationError as exc:
            reason = str(exc).removeprefix("Failed to invoke agent: ")
            self._observer.agent_invocation_failed(
                condition=self._condition,
                sample_idx=self._sample_idx,
                reason=reason,
            )
            raise

        self._observer.agent_invocation_completed(
            condition=self._condition,
            sample_idx=self._sample_idx,
            duration_ms=result_message.duration_ms,
            num_turns=result_message.num_turns,
            cost_usd=result_message.total_cost_usd,
        )

        assert result_message.result is not None  # guaranteed by _collect_result
        return AgentResult(
            response=result_message.result,
            cost_usd=result_message.total_cost_usd,
            duration_ms=result_message.duration_ms,
            duration_api_ms=result_message.duration_api_ms,
            num_turns=result_message.num_turns,
            usage=self._map_usage(raw=result_message.usage),
        )

    async def _collect_result(
        self, prompt: str, options: ClaudeAgentOptions
    ) -> ResultMessage:
        """Run the SDK query and extract the single ResultMessage.

        Raises:
            AgentInvocationError: on SDK errors or missing/error ResultMessage.
        """
        result_message: ResultMessage | None = None

        try:
            async for message in query(prompt=prompt, options=options):
                if isinstance(message, ResultMessage):
                    result_message = message
        except ClaudeSDKError as exc:
            raise AgentInvocationError(reason=str(exc), retriable=True) from exc
        except Exception as exc:
            # The SDK internally raises a bare Exception (not ClaudeSDKError) when
            # its message reader encounters a fatal error (e.g. subprocess exit).
            raise AgentInvocationError(reason=str(exc), retriable=True) from exc

        if result_message is None:
            raise AgentInvocationError(reason="no ResultMessage in response stream")

        if result_message.is_error:
            raise AgentInvocationError(
                reason=f"agent returned error response: {result_message.result}"
            )

        if result_message.result is None:
            raise AgentInvocationError(reason="ResultMessage has no result text")

        return result_message

    def _build_mcp_servers(self) -> McpServerConfigMap:
        """Convert ConditionMcpServer list to the SDK's TypedDict format."""
        servers: McpServerConfigMap = {}

        for server in self._mcp_servers:
            config = server.config

            if isinstance(config, StdioMcpServer):
                servers[server.name] = self._build_stdio_server(config=config)
            elif isinstance(config, SseMcpServer):
                servers[server.name] = self._build_sse_server(config=config)
            elif isinstance(config, HttpMcpServer):
                servers[server.name] = self._build_http_server(config=config)
            else:
                raise AgentInvocationError(
                    reason=f"unsupported MCP server type for server '{server.name}'"
                )

        return servers

    def _build_stdio_server(self, config: StdioMcpServer) -> McpStdioServerConfig:
        """Build a McpStdioServerConfig TypedDict from a StdioMcpServer model."""
        server: McpStdioServerConfig = McpStdioServerConfig(command=config.command)
        if config.args:
            server["args"] = list(config.args)
        if config.env:
            server["env"] = dict(config.env)
        return server

    def _build_sse_server(self, config: SseMcpServer) -> McpSSEServerConfig:
        """Build a McpSSEServerConfig TypedDict from a SseMcpServer model."""
        server: McpSSEServerConfig = McpSSEServerConfig(type="sse", url=config.url)
        if config.headers:
            server["headers"] = dict(config.headers)
        return server

    def _build_http_server(self, config: HttpMcpServer) -> McpHttpServerConfig:
        """Build a McpHttpServerConfig TypedDict from an HttpMcpServer model."""
        server: McpHttpServerConfig = McpHttpServerConfig(type="http", url=config.url)
        if config.headers:
            server["headers"] = dict(config.headers)
        return server

    def _build_disallowed_tools(self) -> list[str]:
        """Build the disallowed tools list — all Claude built-in tools.

        allowed_tools alone does not remove built-in tools from the agent's
        context; it only controls approval requirements. Explicitly disallowing
        all built-in tools ensures the agent cannot use web search, file I/O,
        or any other built-in capability regardless of permission_mode.
        """
        return [
            "Bash",
            "Edit",
            "Glob",
            "Grep",
            "LS",
            "MultiEdit",
            "NotebookEdit",
            "NotebookRead",
            "Read",
            "Task",
            "TodoRead",
            "TodoWrite",
            "WebFetch",
            "WebSearch",
            "Write",
        ]

    def _map_usage(self, raw: dict[str, Any] | None) -> UsageMetrics | None:
        """Map the SDK's raw usage dict to a typed UsageMetrics value object."""
        if raw is None:
            return None
        return UsageMetrics(
            input_tokens=raw.get("input_tokens"),
            output_tokens=raw.get("output_tokens"),
        )
```
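For a single ad-hoc invocation outside the evaluation runner, wiring the pieces together looks roughly like the sketch below. It assumes `AgentConfig` accepts a `model` field (only `config.model` is read above) and uses a no-op observer; a real run goes through the CLI and runner instead:

```python
import asyncio

from k_eval.agent.infrastructure.claude_sdk import ClaudeAgentSDKAgent
from k_eval.config.domain.agent import AgentConfig


class _NullObserver:
    """Minimal AgentObserver: ignore every event."""

    def agent_invocation_started(self, condition: str, sample_idx: str, model: str) -> None:
        pass

    def agent_invocation_completed(
        self,
        condition: str,
        sample_idx: str,
        duration_ms: int,
        num_turns: int,
        cost_usd: float | None,
    ) -> None:
        pass

    def agent_invocation_failed(self, condition: str, sample_idx: str, reason: str) -> None:
        pass


async def main() -> None:
    agent = ClaudeAgentSDKAgent(
        config=AgentConfig(model="claude-sonnet-4-5"),  # assumption: AgentConfig takes `model`
        condition="baseline",
        sample_idx="0",
        system_prompt="Answer using your own knowledge.",
        mcp_servers=[],  # no MCP servers for this sketch
        observer=_NullObserver(),
    )
    result = await agent.ask("What does k-eval measure?")
    print(result.response)


asyncio.run(main())
```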