cane-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cane_eval-0.1.0/.gitignore +21 -0
- cane_eval-0.1.0/LICENSE +21 -0
- cane_eval-0.1.0/PKG-INFO +469 -0
- cane_eval-0.1.0/README.md +425 -0
- cane_eval-0.1.0/cane_eval/__init__.py +44 -0
- cane_eval-0.1.0/cane_eval/cli.py +610 -0
- cane_eval-0.1.0/cane_eval/engine.py +358 -0
- cane_eval-0.1.0/cane_eval/export.py +250 -0
- cane_eval-0.1.0/cane_eval/integrations/__init__.py +26 -0
- cane_eval-0.1.0/cane_eval/integrations/_base.py +143 -0
- cane_eval-0.1.0/cane_eval/integrations/fastapi_agent.py +216 -0
- cane_eval-0.1.0/cane_eval/integrations/langchain.py +151 -0
- cane_eval-0.1.0/cane_eval/integrations/llamaindex.py +122 -0
- cane_eval-0.1.0/cane_eval/integrations/openai_compat.py +234 -0
- cane_eval-0.1.0/cane_eval/judge.py +268 -0
- cane_eval-0.1.0/cane_eval/mining.py +348 -0
- cane_eval-0.1.0/cane_eval/rca.py +425 -0
- cane_eval-0.1.0/cane_eval/suite.py +249 -0
- cane_eval-0.1.0/examples/quickstart.ipynb +285 -0
- cane_eval-0.1.0/examples/run_eval.py +70 -0
- cane_eval-0.1.0/examples/support_agent.yaml +58 -0
- cane_eval-0.1.0/pyproject.toml +76 -0
- cane_eval-0.1.0/tests/__init__.py +0 -0
- cane_eval-0.1.0/tests/test_engine.py +109 -0
- cane_eval-0.1.0/tests/test_export.py +154 -0
- cane_eval-0.1.0/tests/test_judge.py +152 -0
- cane_eval-0.1.0/tests/test_mining.py +100 -0
- cane_eval-0.1.0/tests/test_suite.py +184 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
*.egg
|
|
8
|
+
.eggs/
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
env/
|
|
12
|
+
.env
|
|
13
|
+
*.jsonl
|
|
14
|
+
*.json
|
|
15
|
+
!examples/*.yaml
|
|
16
|
+
!examples/*.json
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.ruff_cache/
|
|
19
|
+
.mypy_cache/
|
|
20
|
+
htmlcov/
|
|
21
|
+
.coverage
|
cane_eval-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Cane
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cane_eval-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cane-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-as-Judge evaluation for AI agents. YAML test suites, Claude-powered judging, failure mining, and training data export.
|
|
5
|
+
Project-URL: Homepage, https://github.com/colingfly/cane-eval
|
|
6
|
+
Project-URL: Documentation, https://github.com/colingfly/cane-eval#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/colingfly/cane-eval
|
|
8
|
+
Project-URL: Issues, https://github.com/colingfly/cane-eval/issues
|
|
9
|
+
Author: Cane
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai-agents,dpo,eval,evaluation,judge,llm,training-data
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: anthropic>=0.39.0
|
|
23
|
+
Requires-Dist: pyyaml>=6.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
28
|
+
Provides-Extra: fastapi
|
|
29
|
+
Requires-Dist: fastapi>=0.100.0; extra == 'fastapi'
|
|
30
|
+
Requires-Dist: httpx>=0.24.0; extra == 'fastapi'
|
|
31
|
+
Provides-Extra: integrations
|
|
32
|
+
Requires-Dist: fastapi>=0.100.0; extra == 'integrations'
|
|
33
|
+
Requires-Dist: httpx>=0.24.0; extra == 'integrations'
|
|
34
|
+
Requires-Dist: langchain-core>=0.1.0; extra == 'integrations'
|
|
35
|
+
Requires-Dist: llama-index-core>=0.10.0; extra == 'integrations'
|
|
36
|
+
Requires-Dist: openai>=1.0.0; extra == 'integrations'
|
|
37
|
+
Provides-Extra: langchain
|
|
38
|
+
Requires-Dist: langchain-core>=0.1.0; extra == 'langchain'
|
|
39
|
+
Provides-Extra: llamaindex
|
|
40
|
+
Requires-Dist: llama-index-core>=0.10.0; extra == 'llamaindex'
|
|
41
|
+
Provides-Extra: openai
|
|
42
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# cane-eval
|
|
46
|
+
|
|
47
|
+
LLM-as-Judge evaluation for AI agents. Define test suites in YAML, score responses with Claude, analyze failure root causes, and mine failures into training data.
|
|
48
|
+
|
|
49
|
+
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/colingfly/cane-eval/blob/main/examples/quickstart.ipynb)
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
pip install cane-eval
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
**1. Define a test suite** (`tests.yaml`):
|
|
58
|
+
|
|
59
|
+
```yaml
|
|
60
|
+
name: Support Agent
|
|
61
|
+
model: claude-sonnet-4-5-20250929
|
|
62
|
+
|
|
63
|
+
criteria:
|
|
64
|
+
- key: accuracy
|
|
65
|
+
label: Accuracy
|
|
66
|
+
weight: 40
|
|
67
|
+
- key: completeness
|
|
68
|
+
label: Completeness
|
|
69
|
+
weight: 30
|
|
70
|
+
- key: hallucination
|
|
71
|
+
label: Hallucination Check
|
|
72
|
+
weight: 30
|
|
73
|
+
|
|
74
|
+
tests:
|
|
75
|
+
- question: What is the return policy?
|
|
76
|
+
expected_answer: 30-day return policy for unused items with receipt
|
|
77
|
+
tags: [policy]
|
|
78
|
+
|
|
79
|
+
- question: How do I reset my password?
|
|
80
|
+
expected_answer: Go to Settings > Security > Reset Password
|
|
81
|
+
tags: [account]
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**2. Run it**:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
cane-eval run tests.yaml
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
cane-eval Support Agent
|
|
92
|
+
2 test cases | model: claude-sonnet-4-5-20250929
|
|
93
|
+
|
|
94
|
+
PASS 1/2 [================----] 82 What is the return policy?
|
|
95
|
+
FAIL 2/2 [=========-----------] 45 How do I reset my password?
|
|
96
|
+
Agent fabricated a non-existent Settings page
|
|
97
|
+
|
|
98
|
+
========================================
|
|
99
|
+
|
|
100
|
+
Support Agent 4.2s
|
|
101
|
+
|
|
102
|
+
Overall: [==============----------------] 63.5
|
|
103
|
+
|
|
104
|
+
1 passed 0 warned 1 failed (2 total)
|
|
105
|
+
Pass rate: 50%
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**3. Mine failures into training data**:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
cane-eval run tests.yaml --mine --export dpo
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Usage
|
|
115
|
+
|
|
116
|
+
### CLI
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Run eval suite
|
|
120
|
+
cane-eval run tests.yaml
|
|
121
|
+
|
|
122
|
+
# Filter by tags
|
|
123
|
+
cane-eval run tests.yaml --tags policy,account
|
|
124
|
+
|
|
125
|
+
# Export training data (dpo, sft, openai, raw)
|
|
126
|
+
cane-eval run tests.yaml --export dpo --output training.jsonl
|
|
127
|
+
|
|
128
|
+
# Mine failures and generate improved answers
|
|
129
|
+
cane-eval run tests.yaml --mine --mine-threshold 60
|
|
130
|
+
|
|
131
|
+
# Root cause analysis on failures
|
|
132
|
+
cane-eval rca tests.yaml --threshold 60
|
|
133
|
+
|
|
134
|
+
# RCA from existing results (skip re-running eval)
|
|
135
|
+
cane-eval rca tests.yaml --results results.json
|
|
136
|
+
|
|
137
|
+
# RCA with targeted deep dives on worst failures
|
|
138
|
+
cane-eval rca tests.yaml --targeted --targeted-max 5
|
|
139
|
+
|
|
140
|
+
# Compare two runs (regression diff)
|
|
141
|
+
cane-eval diff results_v1.json results_v2.json
|
|
142
|
+
|
|
143
|
+
# Validate suite YAML
|
|
144
|
+
cane-eval validate tests.yaml
|
|
145
|
+
|
|
146
|
+
# CI mode: exit 1 on any failure
|
|
147
|
+
cane-eval run tests.yaml --quiet
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Python
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from cane_eval import TestSuite, EvalRunner, Exporter, FailureMiner, RootCauseAnalyzer
|
|
154
|
+
|
|
155
|
+
# Load suite
|
|
156
|
+
suite = TestSuite.from_yaml("tests.yaml")
|
|
157
|
+
|
|
158
|
+
# Run eval with your agent
|
|
159
|
+
runner = EvalRunner()
|
|
160
|
+
summary = runner.run(suite, agent=lambda q: my_agent.ask(q))
|
|
161
|
+
|
|
162
|
+
print(f"Score: {summary.overall_score}")
|
|
163
|
+
print(f"Pass rate: {summary.pass_rate:.0f}%")
|
|
164
|
+
|
|
165
|
+
# Root cause analysis on failures
|
|
166
|
+
analyzer = RootCauseAnalyzer()
|
|
167
|
+
rca = analyzer.analyze(summary, max_score=60)
|
|
168
|
+
print(rca.summary)
|
|
169
|
+
for rc in rca.root_causes:
|
|
170
|
+
print(f" [{rc.severity}] {rc.title} -- {rc.recommendation}")
|
|
171
|
+
|
|
172
|
+
# Export failures as DPO training pairs
|
|
173
|
+
exporter = Exporter(summary)
|
|
174
|
+
exporter.to_dpo("training_dpo.jsonl", max_score=60)
|
|
175
|
+
|
|
176
|
+
# Or mine failures with LLM-generated improvements
|
|
177
|
+
miner = FailureMiner()
|
|
178
|
+
mined = miner.mine(summary, max_score=60)
|
|
179
|
+
mined.to_file("mined_dpo.jsonl")
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Framework Integrations
|
|
183
|
+
|
|
184
|
+
One-liner eval for popular frameworks. No boilerplate needed.
|
|
185
|
+
|
|
186
|
+
**LangChain:**
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from cane_eval import evaluate_langchain
|
|
190
|
+
|
|
191
|
+
chain = prompt | llm | parser # any LCEL chain
|
|
192
|
+
results = evaluate_langchain(chain, suite="qa.yaml")
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
**LlamaIndex:**
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from cane_eval import evaluate_llamaindex
|
|
199
|
+
|
|
200
|
+
query_engine = index.as_query_engine()
|
|
201
|
+
results = evaluate_llamaindex(query_engine, suite="qa.yaml")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**OpenAI-compatible endpoints** (OpenAI, vLLM, Ollama, LiteLLM):
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
from cane_eval import evaluate_openai
|
|
208
|
+
|
|
209
|
+
# Any OpenAI-compatible endpoint
|
|
210
|
+
results = evaluate_openai(
|
|
211
|
+
"http://localhost:11434/v1/chat/completions",
|
|
212
|
+
suite="qa.yaml",
|
|
213
|
+
openai_model="llama3",
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
# Or with the openai SDK client
|
|
217
|
+
from openai import OpenAI
|
|
218
|
+
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
|
|
219
|
+
results = evaluate_openai(client, suite="qa.yaml", openai_model="llama3")
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**FastAPI agents:**
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
from cane_eval import evaluate_fastapi
|
|
226
|
+
|
|
227
|
+
# Running server
|
|
228
|
+
results = evaluate_fastapi("http://localhost:8000/ask", suite="qa.yaml")
|
|
229
|
+
|
|
230
|
+
# Or test in-process (no server needed)
|
|
231
|
+
from fastapi import FastAPI
|
|
232
|
+
app = FastAPI()
|
|
233
|
+
results = evaluate_fastapi(app, suite="qa.yaml", endpoint="/ask")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
All integrations support mining, RCA, and Cane Cloud push:
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
results = evaluate_langchain(
|
|
240
|
+
chain,
|
|
241
|
+
suite="qa.yaml",
|
|
242
|
+
mine=True, # mine failures into training data
|
|
243
|
+
rca=True, # root cause analysis
|
|
244
|
+
cloud="https://app.cane.dev", # push results to Cane Cloud
|
|
245
|
+
cloud_api_key="sk-...",
|
|
246
|
+
environment_id="env_abc123",
|
|
247
|
+
)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Install integration dependencies:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
pip install cane-eval[langchain] # LangChain
|
|
254
|
+
pip install cane-eval[llamaindex] # LlamaIndex
|
|
255
|
+
pip install cane-eval[openai] # OpenAI SDK
|
|
256
|
+
pip install cane-eval[fastapi] # FastAPI TestClient
|
|
257
|
+
pip install cane-eval[integrations] # all of the above
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### HTTP Agent Target
|
|
261
|
+
|
|
262
|
+
Point eval at any HTTP endpoint:
|
|
263
|
+
|
|
264
|
+
```yaml
|
|
265
|
+
name: Production Agent Eval
|
|
266
|
+
|
|
267
|
+
target:
|
|
268
|
+
type: http
|
|
269
|
+
url: https://my-agent.com/api/ask
|
|
270
|
+
method: POST
|
|
271
|
+
payload_template: '{"query": "{{question}}"}'
|
|
272
|
+
response_path: data.answer
|
|
273
|
+
headers:
|
|
274
|
+
Authorization: Bearer ${AGENT_API_KEY}
|
|
275
|
+
|
|
276
|
+
tests:
|
|
277
|
+
- question: What are your business hours?
|
|
278
|
+
expected_answer: Monday through Friday, 9am to 5pm EST
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### CLI Agent Target
|
|
282
|
+
|
|
283
|
+
Eval any command-line tool:
|
|
284
|
+
|
|
285
|
+
```yaml
|
|
286
|
+
name: CLI Agent Eval
|
|
287
|
+
|
|
288
|
+
target:
|
|
289
|
+
type: command
|
|
290
|
+
command: python my_agent.py --query "{{question}}"
|
|
291
|
+
|
|
292
|
+
tests:
|
|
293
|
+
- question: Summarize the Q4 report
|
|
294
|
+
expected_answer: Revenue grew 15% year-over-year
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
### Regression Diff
|
|
298
|
+
|
|
299
|
+
Compare runs to catch regressions:
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
# Save results from each run
|
|
303
|
+
cane-eval run tests.yaml --output-json results_v1.json
|
|
304
|
+
# ... make changes ...
|
|
305
|
+
cane-eval run tests.yaml --output-json results_v2.json
|
|
306
|
+
|
|
307
|
+
# Diff
|
|
308
|
+
cane-eval diff results_v1.json results_v2.json
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
```
|
|
312
|
+
Regression Diff
|
|
313
|
+
------------------------------------------------------------
|
|
314
|
+
|
|
315
|
+
2 Regressions
|
|
316
|
+
-25 85 -> 60 How do I cancel my subscription?
|
|
317
|
+
-12 72 -> 60 What payment methods do you accept?
|
|
318
|
+
|
|
319
|
+
1 Improvements
|
|
320
|
+
+30 45 -> 75 What is the return policy?
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### Failure Mining
|
|
324
|
+
|
|
325
|
+
Automatically classify failures and generate improved training data:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from cane_eval import EvalRunner, TestSuite, FailureMiner
|
|
329
|
+
|
|
330
|
+
suite = TestSuite.from_yaml("tests.yaml")
|
|
331
|
+
runner = EvalRunner()
|
|
332
|
+
summary = runner.run(suite, agent=my_agent)
|
|
333
|
+
|
|
334
|
+
# Mine all failures scoring below 60
|
|
335
|
+
miner = FailureMiner()
|
|
336
|
+
result = miner.mine(summary, max_score=60)
|
|
337
|
+
|
|
338
|
+
print(result.failure_distribution)
|
|
339
|
+
# {"hallucination": 3, "incomplete": 5, "factual_error": 2}
|
|
340
|
+
|
|
341
|
+
# Export as DPO training pairs
|
|
342
|
+
result.to_file("mined_dpo.jsonl", format="dpo")
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Root Cause Analysis
|
|
346
|
+
|
|
347
|
+
Go beyond "what failed" to understand "why it failed" with AI-powered root cause analysis:
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
from cane_eval import EvalRunner, TestSuite, RootCauseAnalyzer
|
|
351
|
+
|
|
352
|
+
suite = TestSuite.from_yaml("tests.yaml")
|
|
353
|
+
runner = EvalRunner()
|
|
354
|
+
summary = runner.run(suite, agent=my_agent)
|
|
355
|
+
|
|
356
|
+
# Batch analysis: find patterns across all failures
|
|
357
|
+
analyzer = RootCauseAnalyzer()
|
|
358
|
+
rca = analyzer.analyze(summary, max_score=60)
|
|
359
|
+
|
|
360
|
+
print(rca.summary)
|
|
361
|
+
# "Agent consistently fails on refund-related questions due to missing policy documentation"
|
|
362
|
+
|
|
363
|
+
print(rca.top_recommendation)
|
|
364
|
+
# "Add refund policy documents to the agent's knowledge base"
|
|
365
|
+
|
|
366
|
+
for rc in rca.root_causes:
|
|
367
|
+
print(f"[{rc.severity}] {rc.category}: {rc.title}")
|
|
368
|
+
print(f" {rc.recommendation}")
|
|
369
|
+
# [critical] knowledge_gap: Missing refund policy documentation
|
|
370
|
+
# Add refund policy documents to the agent's knowledge base
|
|
371
|
+
# [high] prompt_issue: No instruction to cite sources
|
|
372
|
+
# Update system prompt to require source citations
|
|
373
|
+
|
|
374
|
+
# Deep dive on a single failure
|
|
375
|
+
targeted = analyzer.analyze_result(summary.results[0])
|
|
376
|
+
print(targeted.diagnosis)
|
|
377
|
+
print(targeted.likely_cause) # "knowledge_gap", "hallucination", etc.
|
|
378
|
+
for fix in targeted.fix_actions:
|
|
379
|
+
print(f" [{fix.priority}] {fix.action} ({fix.effort})")
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
RCA categories: `knowledge_gap`, `prompt_issue`, `source_gap`, `behavior_pattern`, `data_quality`
|
|
383
|
+
|
|
384
|
+
Severity levels: `critical`, `high`, `medium`, `low`
|
|
385
|
+
|
|
386
|
+
### Custom Criteria
|
|
387
|
+
|
|
388
|
+
```yaml
|
|
389
|
+
criteria:
|
|
390
|
+
- key: accuracy
|
|
391
|
+
label: Factual Accuracy
|
|
392
|
+
description: Response matches expected answer on key facts
|
|
393
|
+
weight: 40
|
|
394
|
+
|
|
395
|
+
- key: tone
|
|
396
|
+
label: Professional Tone
|
|
397
|
+
description: Appropriate, helpful, non-condescending language
|
|
398
|
+
weight: 20
|
|
399
|
+
|
|
400
|
+
- key: citations
|
|
401
|
+
label: Source Citations
|
|
402
|
+
description: Claims are backed by referenced documents
|
|
403
|
+
weight: 25
|
|
404
|
+
|
|
405
|
+
- key: hallucination
|
|
406
|
+
label: Hallucination Check
|
|
407
|
+
description: No fabricated or unsupported information
|
|
408
|
+
weight: 15
|
|
409
|
+
|
|
410
|
+
custom_rules:
|
|
411
|
+
- Never recommend competitor products
|
|
412
|
+
- Always include a link to the help center when relevant
|
|
413
|
+
- Responses must be under 200 words
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
### Export Formats
|
|
417
|
+
|
|
418
|
+
| Format | Use Case | Structure |
|
|
419
|
+
|--------|----------|-----------|
|
|
420
|
+
| `dpo` | Direct Preference Optimization | `{prompt, chosen, rejected}` |
|
|
421
|
+
| `sft` | Supervised Fine-Tuning | `{prompt, completion, metadata}` |
|
|
422
|
+
| `openai` | OpenAI fine-tuning API | `{messages: [{role, content}]}` |
|
|
423
|
+
| `raw` | Analysis and debugging | Full eval result with all scores |
|
|
424
|
+
|
|
425
|
+
## CI Integration
|
|
426
|
+
|
|
427
|
+
```yaml
|
|
428
|
+
# .github/workflows/eval.yml
|
|
429
|
+
name: Agent Eval
|
|
430
|
+
on: [push]
|
|
431
|
+
jobs:
|
|
432
|
+
eval:
|
|
433
|
+
runs-on: ubuntu-latest
|
|
434
|
+
steps:
|
|
435
|
+
- uses: actions/checkout@v4
|
|
436
|
+
- uses: actions/setup-python@v5
|
|
437
|
+
with:
|
|
438
|
+
python-version: "3.12"
|
|
439
|
+
- run: pip install cane-eval
|
|
440
|
+
- run: cane-eval run tests/eval_suite.yaml --quiet
|
|
441
|
+
env:
|
|
442
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
## How It Works
|
|
446
|
+
|
|
447
|
+
```
|
|
448
|
+
YAML Suite Your Agent Claude Judge Output
|
|
449
|
+
----------- ---------- ------------ ------
|
|
450
|
+
questions ---> get answers ---> score 0-100 ---> DPO / SFT / OpenAI
|
|
451
|
+
expected per test per criteria
|
|
452
|
+
criteria pass/warn/fail
|
|
453
|
+
custom rules |
|
|
454
|
+
v
|
|
455
|
+
Root Cause Analysis (optional)
|
|
456
|
+
find patterns across failures
|
|
457
|
+
identify knowledge gaps, prompt issues
|
|
458
|
+
generate actionable recommendations
|
|
459
|
+
|
|
|
460
|
+
v
|
|
461
|
+
Failure Mining (optional)
|
|
462
|
+
classify failure type
|
|
463
|
+
LLM rewrite bad answers
|
|
464
|
+
generate improved training pairs
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
## License
|
|
468
|
+
|
|
469
|
+
MIT
|