k-eval 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. k_eval-0.3.0/MANIFEST.in +5 -0
  2. k_eval-0.3.0/PKG-INFO +188 -0
  3. k_eval-0.3.0/README.md +158 -0
  4. k_eval-0.3.0/k_eval/__init__.py +0 -0
  5. k_eval-0.3.0/k_eval/agent/__init__.py +0 -0
  6. k_eval-0.3.0/k_eval/agent/domain/__init__.py +0 -0
  7. k_eval-0.3.0/k_eval/agent/domain/agent.py +14 -0
  8. k_eval-0.3.0/k_eval/agent/domain/factory.py +18 -0
  9. k_eval-0.3.0/k_eval/agent/domain/observer.py +27 -0
  10. k_eval-0.3.0/k_eval/agent/domain/result.py +18 -0
  11. k_eval-0.3.0/k_eval/agent/domain/usage.py +12 -0
  12. k_eval-0.3.0/k_eval/agent/infrastructure/__init__.py +0 -0
  13. k_eval-0.3.0/k_eval/agent/infrastructure/claude_sdk.py +222 -0
  14. k_eval-0.3.0/k_eval/agent/infrastructure/errors.py +19 -0
  15. k_eval-0.3.0/k_eval/agent/infrastructure/factory.py +32 -0
  16. k_eval-0.3.0/k_eval/agent/infrastructure/observer.py +50 -0
  17. k_eval-0.3.0/k_eval/agent/infrastructure/registry.py +21 -0
  18. k_eval-0.3.0/k_eval/cli/__init__.py +0 -0
  19. k_eval-0.3.0/k_eval/cli/main.py +505 -0
  20. k_eval-0.3.0/k_eval/cli/output/__init__.py +0 -0
  21. k_eval-0.3.0/k_eval/cli/output/aggregator.py +89 -0
  22. k_eval-0.3.0/k_eval/cli/output/eee.py +224 -0
  23. k_eval-0.3.0/k_eval/config/__init__.py +0 -0
  24. k_eval-0.3.0/k_eval/config/domain/__init__.py +0 -0
  25. k_eval-0.3.0/k_eval/config/domain/agent.py +8 -0
  26. k_eval-0.3.0/k_eval/config/domain/condition.py +12 -0
  27. k_eval-0.3.0/k_eval/config/domain/condition_mcp_server.py +13 -0
  28. k_eval-0.3.0/k_eval/config/domain/config.py +26 -0
  29. k_eval-0.3.0/k_eval/config/domain/dataset.py +11 -0
  30. k_eval-0.3.0/k_eval/config/domain/execution.py +15 -0
  31. k_eval-0.3.0/k_eval/config/domain/judge.py +8 -0
  32. k_eval-0.3.0/k_eval/config/domain/mcp_server.py +39 -0
  33. k_eval-0.3.0/k_eval/config/domain/observer.py +9 -0
  34. k_eval-0.3.0/k_eval/config/infrastructure/__init__.py +0 -0
  35. k_eval-0.3.0/k_eval/config/infrastructure/env_interpolation.py +50 -0
  36. k_eval-0.3.0/k_eval/config/infrastructure/errors.py +37 -0
  37. k_eval-0.3.0/k_eval/config/infrastructure/observer.py +23 -0
  38. k_eval-0.3.0/k_eval/config/infrastructure/yaml_loader.py +120 -0
  39. k_eval-0.3.0/k_eval/core/__init__.py +0 -0
  40. k_eval-0.3.0/k_eval/core/errors.py +9 -0
  41. k_eval-0.3.0/k_eval/dataset/__init__.py +0 -0
  42. k_eval-0.3.0/k_eval/dataset/domain/__init__.py +0 -0
  43. k_eval-0.3.0/k_eval/dataset/domain/load_result.py +16 -0
  44. k_eval-0.3.0/k_eval/dataset/domain/loader.py +12 -0
  45. k_eval-0.3.0/k_eval/dataset/domain/observer.py +15 -0
  46. k_eval-0.3.0/k_eval/dataset/domain/sample.py +13 -0
  47. k_eval-0.3.0/k_eval/dataset/infrastructure/__init__.py +0 -0
  48. k_eval-0.3.0/k_eval/dataset/infrastructure/errors.py +10 -0
  49. k_eval-0.3.0/k_eval/dataset/infrastructure/jsonl_loader.py +130 -0
  50. k_eval-0.3.0/k_eval/dataset/infrastructure/observer.py +36 -0
  51. k_eval-0.3.0/k_eval/evaluation/__init__.py +0 -0
  52. k_eval-0.3.0/k_eval/evaluation/application/__init__.py +0 -0
  53. k_eval-0.3.0/k_eval/evaluation/application/runner.py +232 -0
  54. k_eval-0.3.0/k_eval/evaluation/domain/__init__.py +0 -0
  55. k_eval-0.3.0/k_eval/evaluation/domain/observer.py +71 -0
  56. k_eval-0.3.0/k_eval/evaluation/domain/run.py +20 -0
  57. k_eval-0.3.0/k_eval/evaluation/domain/summary.py +18 -0
  58. k_eval-0.3.0/k_eval/evaluation/infrastructure/__init__.py +0 -0
  59. k_eval-0.3.0/k_eval/evaluation/infrastructure/composite_observer.py +128 -0
  60. k_eval-0.3.0/k_eval/evaluation/infrastructure/observer.py +129 -0
  61. k_eval-0.3.0/k_eval/evaluation/infrastructure/progress_observer.py +370 -0
  62. k_eval-0.3.0/k_eval/judge/__init__.py +0 -0
  63. k_eval-0.3.0/k_eval/judge/domain/__init__.py +0 -0
  64. k_eval-0.3.0/k_eval/judge/domain/factory.py +15 -0
  65. k_eval-0.3.0/k_eval/judge/domain/judge.py +16 -0
  66. k_eval-0.3.0/k_eval/judge/domain/observer.py +26 -0
  67. k_eval-0.3.0/k_eval/judge/domain/score.py +21 -0
  68. k_eval-0.3.0/k_eval/judge/infrastructure/__init__.py +0 -0
  69. k_eval-0.3.0/k_eval/judge/infrastructure/errors.py +10 -0
  70. k_eval-0.3.0/k_eval/judge/infrastructure/factory.py +30 -0
  71. k_eval-0.3.0/k_eval/judge/infrastructure/litellm.py +205 -0
  72. k_eval-0.3.0/k_eval/judge/infrastructure/observer.py +53 -0
  73. k_eval-0.3.0/k_eval.egg-info/PKG-INFO +188 -0
  74. k_eval-0.3.0/k_eval.egg-info/SOURCES.txt +78 -0
  75. k_eval-0.3.0/k_eval.egg-info/dependency_links.txt +1 -0
  76. k_eval-0.3.0/k_eval.egg-info/entry_points.txt +2 -0
  77. k_eval-0.3.0/k_eval.egg-info/requires.txt +13 -0
  78. k_eval-0.3.0/k_eval.egg-info/top_level.txt +1 -0
  79. k_eval-0.3.0/pyproject.toml +69 -0
  80. k_eval-0.3.0/setup.cfg +4 -0
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include pyproject.toml
3
+ recursive-exclude results *
4
+ exclude out.json
5
+ exclude uv.lock
k_eval-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,188 @@
1
+ Metadata-Version: 2.4
2
+ Name: k-eval
3
+ Version: 0.3.0
4
+ Summary: Context-aware evaluation framework for AI agents using MCP.
5
+ Author: jsell-rh
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/jsell-rh/k-eval
8
+ Project-URL: Repository, https://github.com/jsell-rh/k-eval
9
+ Project-URL: Issues, https://github.com/jsell-rh/k-eval/issues
10
+ Keywords: evaluation,ai,agents,mcp,llm
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Software Development :: Testing
17
+ Requires-Python: >=3.13
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: claude-agent-sdk>=0.1.39
20
+ Requires-Dist: litellm
21
+ Requires-Dist: pydantic>=2.12.5
22
+ Requires-Dist: pyyaml>=6.0.3
23
+ Requires-Dist: rich>=14.3.3
24
+ Requires-Dist: structlog>=25.5.0
25
+ Requires-Dist: typer>=0.24.1
26
+ Provides-Extra: vertex-ai
27
+ Requires-Dist: google-cloud-aiplatform>=1.138.0; extra == "vertex-ai"
28
+ Provides-Extra: all
29
+ Requires-Dist: k-eval[vertex_ai]; extra == "all"
30
+
31
+ # k-eval
32
+
33
+ Context-aware evaluation framework for AI agents using MCP.
34
+
35
+ ## Quick Start
36
+
37
+ k-eval uses [uv](https://docs.astral.sh/uv/) for dependency management. Install it first if you don't have it:
38
+
39
+ ```bash
40
+ curl -LsSf https://astral.sh/uv/install.sh | sh
41
+ ```
42
+
43
+ ### Install `k-eval`
44
+
45
+ ```bash
46
+ git clone https://github.com/jsell-rh/k-eval.git
47
+ cd k-eval/src/k-eval
48
+
49
+ # Core dependencies
50
+ uv sync
51
+
52
+ # With Vertex AI provider support
53
+ uv sync --extra vertex_ai
54
+
55
+ # All provider dependencies
56
+ uv sync --extra all
57
+ ```
58
+
59
+ ### Run `k-eval`
60
+
61
+ `k-eval` runs are configured using `yaml` configuration files (see [Configuration](#configuration)).
62
+
63
+ Once an evaluation is defined in a `yaml` file, you can invoke
64
+ `k-eval` like:
65
+
66
+ ```bash
67
+ cd src/k-eval
68
+ uv run python -m k_eval.cli.main /path/to/config.yaml
69
+ ```
70
+
71
+ See [docs/run-configuration.md](docs/run-configuration.md) for authentication setup and all CLI options.
72
+
73
+ #### CLI Options
74
+
75
+ ```bash
76
+ src/k-eval$ uv run python -m k_eval.cli.main --help
77
+
78
+ Usage: python -m cli.main [OPTIONS] CONFIG_PATH
79
+
80
+ Run a k-eval evaluation from a YAML config file.
81
+
82
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
83
+ │ * config_path PATH Path to evaluation config YAML [required] │
84
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
85
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
86
+ │ --output-dir -o PATH Directory for output files [default: results] │
87
+ │ --log-format TEXT Log format: 'console' or 'json' [default: console] │
88
+ │ --quiet -q Suppress debug and info logs; show only the progress bar plus warnings/errors. │
89
+ │ --help Show this message and exit. │
90
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
91
+ ```
92
+
93
+ ### Understanding the Output
94
+
95
+ Each run produces two files in `./results/` (or wherever you point `--output-dir`):
96
+
97
+ ```
98
+ results/
99
+ my-eval_20260225_a1b2c3d4.json # aggregate scores per condition
100
+ my-eval_20260225_a1b2c3d4.detailed.jsonl # one line per (question, condition) pair
101
+ ```
102
+
103
+ **`{name}_{date}_{run_id}.json`** — the summary. One entry per condition with
104
+ mean and standard deviation for each of the three metrics across all questions
105
+ and repetitions. Use this to compare conditions at a glance.
106
+
107
+ This file is intended to be mostly compliant with the [Every Eval Ever](https://evalevalai.com/projects/every-eval-ever/) schema.
108
+ Notably, `k-eval` does not aggregate the three metrics into a single score.
109
+ Thus, the individual metrics are written to `score_details.details`, and
110
+ `score_details.score` is left `null`.
111
+
112
+ **`{name}_{date}_{run_id}.detailed.jsonl`** — the full record. One JSON object per
113
+ `(question, condition)` pair containing the agent's raw responses for every
114
+ repetition, per-repetition judge scores and reasoning, unverified claims, and
115
+ token usage. Use this if you want to dig into why a condition scored the way it did.
116
+
117
+ The three metrics are scored 1-5 by the judge model:
118
+
119
+ | Metric | What it measures |
120
+ |---|---|
121
+ | `factual_adherence` | Does the response stick to facts in the golden answer? |
122
+ | `completeness` | Does it cover all the essential points? |
123
+ | `helpfulness_and_clarity` | Is it well-structured and easy to act on? |
124
+
125
+ See [evaluation-methodology](docs/evaluation-methodology.md) for more details.
126
+
127
+ ### Configuration
128
+
129
+ A config file defines your dataset, agent, judge, MCP servers, and the conditions you want to compare:
130
+
131
+ > [!Important]
132
+ >
133
+ > For MCP servers that require authentication,
134
+ > please reference [docs/run-configuration.md](docs/run-configuration.md).
135
+
136
+ ```yaml
137
+ name: "my-eval"
138
+ version: "1"
139
+
140
+ dataset:
141
+ # JSONL file with your questions and golden answers
142
+ path: "./questions.jsonl"
143
+ # The name of the key used to reference the question within the JSONL file.
144
+ question_key: "question"
145
+ # The key used to reference the golden "reference" or answer within the JSONL file.
146
+ answer_key: "answer"
147
+
148
+ agent:
149
+ type: "claude_code_sdk" # currently the only supported type
150
+ model: "claude-sonnet-4-5"
151
+
152
+ judge:
153
+ model: "vertex_ai/claude-opus-4-5" # any LiteLLM-compatible model string (See: https://models.litellm.ai/)
154
+ temperature: 0.0
155
+
156
+ mcp_servers:
157
+ graph:
158
+ type: "stdio"
159
+ command: "python"
160
+ args: ["-m", "my_mcp_server"]
161
+
162
+ conditions:
163
+ baseline:
164
+ mcp_servers: []
165
+ system_prompt: |
166
+ Answer using your own knowledge.
167
+ with_graph:
168
+ mcp_servers: [graph]
169
+ system_prompt: |
170
+ Use the graph tool to answer the question.
171
+
172
+ execution:
173
+ # How many times each (question, condition) pair is evaluated.
174
+ # This is useful for managing variance in agent responses. Standard
175
+ # deviation between scores will be reported if num_repetitions >= 3
176
+ num_repetitions: 3
177
+ # (question, condition, repetition) tuples can be evaluated concurrently
178
+ # to reduce total evaluation time. The upper bound of this number is determined
179
+ # only by the resources on your computer and by the rate limit configuration
180
+ # of the agent and model providers.
181
+ #
182
+ # In practice, numbers even as high as 50 seem to be well tolerated
183
+ # when using Vertex AI.
184
+ max_concurrent: 5
185
+ ```
186
+
187
+ See [docs/run-configuration.md](docs/run-configuration.md) for the full reference including authentication setup.
188
+
k_eval-0.3.0/README.md ADDED
@@ -0,0 +1,158 @@
1
+ # k-eval
2
+
3
+ Context-aware evaluation framework for AI agents using MCP.
4
+
5
+ ## Quick Start
6
+
7
+ k-eval uses [uv](https://docs.astral.sh/uv/) for dependency management. Install it first if you don't have it:
8
+
9
+ ```bash
10
+ curl -LsSf https://astral.sh/uv/install.sh | sh
11
+ ```
12
+
13
+ ### Install `k-eval`
14
+
15
+ ```bash
16
+ git clone https://github.com/jsell-rh/k-eval.git
17
+ cd k-eval/src/k-eval
18
+
19
+ # Core dependencies
20
+ uv sync
21
+
22
+ # With Vertex AI provider support
23
+ uv sync --extra vertex_ai
24
+
25
+ # All provider dependencies
26
+ uv sync --extra all
27
+ ```
28
+
29
+ ### Run `k-eval`
30
+
31
+ `k-eval` runs are configured using `yaml` configuration files (see [Configuration](#configuration)).
32
+
33
+ Once an evaluation is defined in a `yaml` file, you can invoke
34
+ `k-eval` like:
35
+
36
+ ```bash
37
+ cd src/k-eval
38
+ uv run python -m k_eval.cli.main /path/to/config.yaml
39
+ ```
40
+
41
+ See [docs/run-configuration.md](docs/run-configuration.md) for authentication setup and all CLI options.
42
+
43
+ #### CLI Options
44
+
45
+ ```bash
46
+ src/k-eval$ uv run python -m k_eval.cli.main --help
47
+
48
+ Usage: python -m cli.main [OPTIONS] CONFIG_PATH
49
+
50
+ Run a k-eval evaluation from a YAML config file.
51
+
52
+ ╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
53
+ │ * config_path PATH Path to evaluation config YAML [required] │
54
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
55
+ ╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
56
+ │ --output-dir -o PATH Directory for output files [default: results] │
57
+ │ --log-format TEXT Log format: 'console' or 'json' [default: console] │
58
+ │ --quiet -q Suppress debug and info logs; show only the progress bar plus warnings/errors. │
59
+ │ --help Show this message and exit. │
60
+ ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
61
+ ```
62
+
63
+ ### Understanding the Output
64
+
65
+ Each run produces two files in `./results/` (or wherever you point `--output-dir`):
66
+
67
+ ```
68
+ results/
69
+ my-eval_20260225_a1b2c3d4.json # aggregate scores per condition
70
+ my-eval_20260225_a1b2c3d4.detailed.jsonl # one line per (question, condition) pair
71
+ ```
72
+
73
+ **`{name}_{date}_{run_id}.json`** — the summary. One entry per condition with
74
+ mean and standard deviation for each of the three metrics across all questions
75
+ and repetitions. Use this to compare conditions at a glance.
76
+
77
+ This file is intended to be mostly compliant with the [Every Eval Ever](https://evalevalai.com/projects/every-eval-ever/) schema.
78
+ Notably, `k-eval` does not aggregate the three metrics into a single score.
79
+ Thus, the individual metrics are written to `score_details.details`, and
80
+ `score_details.score` is left `null`.
81
+
82
+ **`{name}_{date}_{run_id}.detailed.jsonl`** — the full record. One JSON object per
83
+ `(question, condition)` pair containing the agent's raw responses for every
84
+ repetition, per-repetition judge scores and reasoning, unverified claims, and
85
+ token usage. Use this if you want to dig into why a condition scored the way it did.
86
+
87
+ The three metrics are scored 1-5 by the judge model:
88
+
89
+ | Metric | What it measures |
90
+ |---|---|
91
+ | `factual_adherence` | Does the response stick to facts in the golden answer? |
92
+ | `completeness` | Does it cover all the essential points? |
93
+ | `helpfulness_and_clarity` | Is it well-structured and easy to act on? |
94
+
95
+ See [evaluation-methodology](docs/evaluation-methodology.md) for more details.
96
+
97
+ ### Configuration
98
+
99
+ A config file defines your dataset, agent, judge, MCP servers, and the conditions you want to compare:
100
+
101
+ > [!Important]
102
+ >
103
+ > For MCP servers that require authentication,
104
+ > please reference [docs/run-configuration.md](docs/run-configuration.md).
105
+
106
+ ```yaml
107
+ name: "my-eval"
108
+ version: "1"
109
+
110
+ dataset:
111
+ # JSONL file with your questions and golden answers
112
+ path: "./questions.jsonl"
113
+ # The name of the key used to reference the question within the JSONL file.
114
+ question_key: "question"
115
+ # The key used to reference the golden "reference" or answer within the JSONL file.
116
+ answer_key: "answer"
117
+
118
+ agent:
119
+ type: "claude_code_sdk" # currently the only supported type
120
+ model: "claude-sonnet-4-5"
121
+
122
+ judge:
123
+ model: "vertex_ai/claude-opus-4-5" # any LiteLLM-compatible model string (See: https://models.litellm.ai/)
124
+ temperature: 0.0
125
+
126
+ mcp_servers:
127
+ graph:
128
+ type: "stdio"
129
+ command: "python"
130
+ args: ["-m", "my_mcp_server"]
131
+
132
+ conditions:
133
+ baseline:
134
+ mcp_servers: []
135
+ system_prompt: |
136
+ Answer using your own knowledge.
137
+ with_graph:
138
+ mcp_servers: [graph]
139
+ system_prompt: |
140
+ Use the graph tool to answer the question.
141
+
142
+ execution:
143
+ # How many times each (question, condition) pair is evaluated.
144
+ # This is useful for managing variance in agent responses. Standard
145
+ # deviation between scores will be reported if num_repetitions >= 3
146
+ num_repetitions: 3
147
+ # (question, condition, repetition) tuples can be evaluated concurrently
148
+ # to reduce total evaluation time. The upper bound of this number is determined
149
+ # only by the resources on your computer and by the rate limit configuration
150
+ # of the agent and model providers.
151
+ #
152
+ # In practice, numbers even as high as 50 seem to be well tolerated
153
+ # when using Vertex AI.
154
+ max_concurrent: 5
155
+ ```
156
+
157
+ See [docs/run-configuration.md](docs/run-configuration.md) for the full reference including authentication setup.
158
+
File without changes
File without changes
File without changes
@@ -0,0 +1,14 @@
1
+ """Agent Protocol — structural interface for all agent implementations."""
2
+
3
+ from typing import Protocol
4
+
5
+ from k_eval.agent.domain.result import AgentResult
6
+
7
+
8
+ class Agent(Protocol):
9
+ """Structural interface satisfied by any agent implementation.
10
+
11
+ Each instance is constructed once per (condition, sample) evaluation.
12
+ """
13
+
14
+ async def ask(self, question: str) -> AgentResult: ...
@@ -0,0 +1,18 @@
1
+ """AgentFactory Protocol — structural interface for constructing Agent instances."""
2
+
3
+ from typing import Protocol
4
+
5
+ from k_eval.agent.domain.agent import Agent
6
+ from k_eval.config.domain.condition_mcp_server import ConditionMcpServer
7
+
8
+
9
+ class AgentFactory(Protocol):
10
+ """Constructs a new Agent instance for a given (condition, sample) pair."""
11
+
12
+ def create(
13
+ self,
14
+ condition: str,
15
+ sample_idx: str,
16
+ system_prompt: str,
17
+ mcp_servers: list[ConditionMcpServer],
18
+ ) -> Agent: ...
@@ -0,0 +1,27 @@
1
+ """AgentObserver port — domain events emitted during agent invocations."""
2
+
3
+ from typing import Protocol
4
+
5
+
6
+ class AgentObserver(Protocol):
7
+ """Observer port for agent domain events.
8
+
9
+ Implementations may log to structlog, record for tests, or emit metrics.
10
+ """
11
+
12
+ def agent_invocation_started(
13
+ self, condition: str, sample_idx: str, model: str
14
+ ) -> None: ...
15
+
16
+ def agent_invocation_completed(
17
+ self,
18
+ condition: str,
19
+ sample_idx: str,
20
+ duration_ms: int,
21
+ num_turns: int,
22
+ cost_usd: float | None,
23
+ ) -> None: ...
24
+
25
+ def agent_invocation_failed(
26
+ self, condition: str, sample_idx: str, reason: str
27
+ ) -> None: ...
@@ -0,0 +1,18 @@
1
+ """AgentResult value object — the outcome of a single agent invocation."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+ from k_eval.agent.domain.usage import UsageMetrics
6
+
7
+
8
+ class AgentResult(BaseModel, frozen=True):
9
+ """Immutable value object capturing all outcome data from one agent invocation."""
10
+
11
+ model_config = ConfigDict(frozen=True)
12
+
13
+ response: str
14
+ cost_usd: float | None
15
+ duration_ms: int
16
+ duration_api_ms: int
17
+ num_turns: int
18
+ usage: UsageMetrics | None
@@ -0,0 +1,12 @@
1
+ """UsageMetrics value object — token usage from an agent invocation."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+
6
+ class UsageMetrics(BaseModel, frozen=True):
7
+ """Immutable value object capturing token usage from a single agent invocation."""
8
+
9
+ model_config = ConfigDict(frozen=True)
10
+
11
+ input_tokens: int | None
12
+ output_tokens: int | None
File without changes
@@ -0,0 +1,222 @@
1
+ """ClaudeAgentSDKAgent — agent implementation using the Claude Agent SDK."""
2
+
3
+ from typing import Any
4
+
5
+ from claude_agent_sdk import query
6
+ from claude_agent_sdk._errors import ClaudeSDKError
7
+ from claude_agent_sdk.types import (
8
+ ClaudeAgentOptions,
9
+ McpHttpServerConfig,
10
+ McpSdkServerConfig,
11
+ McpSSEServerConfig,
12
+ McpStdioServerConfig,
13
+ ResultMessage,
14
+ )
15
+
16
+ from k_eval.agent.domain.observer import AgentObserver
17
+ from k_eval.agent.domain.result import AgentResult
18
+ from k_eval.agent.domain.usage import UsageMetrics
19
+ from k_eval.agent.infrastructure.errors import AgentInvocationError
20
+ from k_eval.config.domain.agent import AgentConfig
21
+ from k_eval.config.domain.condition_mcp_server import ConditionMcpServer
22
+ from k_eval.config.domain.mcp_server import HttpMcpServer, SseMcpServer, StdioMcpServer
23
+
24
+ type McpServerConfigMap = dict[
25
+ str,
26
+ McpStdioServerConfig
27
+ | McpSSEServerConfig
28
+ | McpHttpServerConfig
29
+ | McpSdkServerConfig,
30
+ ]
31
+
32
+
33
+ class ClaudeAgentSDKAgent:
34
+ """Agent implementation that delegates to the Claude Agent SDK.
35
+
36
+ One instance is constructed per (condition, sample) evaluation run.
37
+ The condition and sample_idx are injected at construction time so that
38
+ observer events carry full context without polluting the ask() signature.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ config: AgentConfig,
44
+ condition: str,
45
+ sample_idx: str,
46
+ system_prompt: str,
47
+ mcp_servers: list[ConditionMcpServer],
48
+ observer: AgentObserver,
49
+ ) -> None:
50
+ self._config = config
51
+ self._condition = condition
52
+ self._sample_idx = sample_idx
53
+ self._system_prompt = system_prompt
54
+ self._mcp_servers = mcp_servers
55
+ self._observer = observer
56
+
57
+ async def ask(self, question: str) -> AgentResult:
58
+ """Invoke the agent with a question and return the structured result.
59
+
60
+ Opens a new SDK session per call — correct for independent eval samples.
61
+
62
+ Raises:
63
+ AgentInvocationError: if the SDK raises, the agent returns an error,
64
+ or no ResultMessage is present in the response stream.
65
+ """
66
+ self._observer.agent_invocation_started(
67
+ condition=self._condition,
68
+ sample_idx=self._sample_idx,
69
+ model=self._config.model,
70
+ )
71
+
72
+ try:
73
+ options = ClaudeAgentOptions(
74
+ model=self._config.model,
75
+ system_prompt=self._system_prompt,
76
+ mcp_servers=self._build_mcp_servers(),
77
+ disallowed_tools=self._build_disallowed_tools(),
78
+ permission_mode="bypassPermissions",
79
+ setting_sources=[],
80
+ )
81
+
82
+ result_message = await self._collect_result(
83
+ prompt=question, options=options
84
+ )
85
+ except AgentInvocationError as exc:
86
+ reason = str(exc).removeprefix("Failed to invoke agent: ")
87
+ self._observer.agent_invocation_failed(
88
+ condition=self._condition,
89
+ sample_idx=self._sample_idx,
90
+ reason=reason,
91
+ )
92
+ raise
93
+
94
+ self._observer.agent_invocation_completed(
95
+ condition=self._condition,
96
+ sample_idx=self._sample_idx,
97
+ duration_ms=result_message.duration_ms,
98
+ num_turns=result_message.num_turns,
99
+ cost_usd=result_message.total_cost_usd,
100
+ )
101
+
102
+ assert result_message.result is not None # guaranteed by _collect_result
103
+ return AgentResult(
104
+ response=result_message.result,
105
+ cost_usd=result_message.total_cost_usd,
106
+ duration_ms=result_message.duration_ms,
107
+ duration_api_ms=result_message.duration_api_ms,
108
+ num_turns=result_message.num_turns,
109
+ usage=self._map_usage(raw=result_message.usage),
110
+ )
111
+
112
+ async def _collect_result(
113
+ self, prompt: str, options: ClaudeAgentOptions
114
+ ) -> ResultMessage:
115
+ """Run the SDK query and extract the single ResultMessage.
116
+
117
+ Raises:
118
+ AgentInvocationError: on SDK errors or missing/error ResultMessage.
119
+ """
120
+ result_message: ResultMessage | None = None
121
+
122
+ try:
123
+ async for message in query(prompt=prompt, options=options):
124
+ if isinstance(message, ResultMessage):
125
+ result_message = message
126
+ except ClaudeSDKError as exc:
127
+ raise AgentInvocationError(reason=str(exc), retriable=True) from exc
128
+ except Exception as exc:
129
+ # The SDK internally raises a bare Exception (not ClaudeSDKError) when
130
+ # its message reader encounters a fatal error (e.g. subprocess exit).
131
+ raise AgentInvocationError(reason=str(exc), retriable=True) from exc
132
+
133
+ if result_message is None:
134
+ raise AgentInvocationError(reason="no ResultMessage in response stream")
135
+
136
+ if result_message.is_error:
137
+ raise AgentInvocationError(
138
+ reason=f"agent returned error response: {result_message.result}"
139
+ )
140
+
141
+ if result_message.result is None:
142
+ raise AgentInvocationError(reason="ResultMessage has no result text")
143
+
144
+ return result_message
145
+
146
+ def _build_mcp_servers(self) -> McpServerConfigMap:
147
+ """Convert ConditionMcpServer list to the SDK's TypedDict format."""
148
+ servers: McpServerConfigMap = {}
149
+
150
+ for server in self._mcp_servers:
151
+ config = server.config
152
+
153
+ if isinstance(config, StdioMcpServer):
154
+ servers[server.name] = self._build_stdio_server(config=config)
155
+ elif isinstance(config, SseMcpServer):
156
+ servers[server.name] = self._build_sse_server(config=config)
157
+ elif isinstance(config, HttpMcpServer):
158
+ servers[server.name] = self._build_http_server(config=config)
159
+ else:
160
+ raise AgentInvocationError(
161
+ reason=f"unsupported MCP server type for server '{server.name}'"
162
+ )
163
+
164
+ return servers
165
+
166
+ def _build_stdio_server(self, config: StdioMcpServer) -> McpStdioServerConfig:
167
+ """Build a McpStdioServerConfig TypedDict from a StdioMcpServer model."""
168
+ server: McpStdioServerConfig = McpStdioServerConfig(command=config.command)
169
+ if config.args:
170
+ server["args"] = list(config.args)
171
+ if config.env:
172
+ server["env"] = dict(config.env)
173
+ return server
174
+
175
+ def _build_sse_server(self, config: SseMcpServer) -> McpSSEServerConfig:
176
+ """Build a McpSSEServerConfig TypedDict from a SseMcpServer model."""
177
+ server: McpSSEServerConfig = McpSSEServerConfig(type="sse", url=config.url)
178
+ if config.headers:
179
+ server["headers"] = dict(config.headers)
180
+ return server
181
+
182
+ def _build_http_server(self, config: HttpMcpServer) -> McpHttpServerConfig:
183
+ """Build a McpHttpServerConfig TypedDict from an HttpMcpServer model."""
184
+ server: McpHttpServerConfig = McpHttpServerConfig(type="http", url=config.url)
185
+ if config.headers:
186
+ server["headers"] = dict(config.headers)
187
+ return server
188
+
189
+ def _build_disallowed_tools(self) -> list[str]:
190
+ """Build the disallowed tools list — all Claude built-in tools.
191
+
192
+ allowed_tools alone does not remove built-in tools from the agent's
193
+ context; it only controls approval requirements. Explicitly disallowing
194
+ all built-in tools ensures the agent cannot use web search, file I/O,
195
+ or any other built-in capability regardless of permission_mode.
196
+ """
197
+ return [
198
+ "Bash",
199
+ "Edit",
200
+ "Glob",
201
+ "Grep",
202
+ "LS",
203
+ "MultiEdit",
204
+ "NotebookEdit",
205
+ "NotebookRead",
206
+ "Read",
207
+ "Task",
208
+ "TodoRead",
209
+ "TodoWrite",
210
+ "WebFetch",
211
+ "WebSearch",
212
+ "Write",
213
+ ]
214
+
215
+ def _map_usage(self, raw: dict[str, Any] | None) -> UsageMetrics | None:
216
+ """Map the SDK's raw usage dict to a typed UsageMetrics value object."""
217
+ if raw is None:
218
+ return None
219
+ return UsageMetrics(
220
+ input_tokens=raw.get("input_tokens"),
221
+ output_tokens=raw.get("output_tokens"),
222
+ )