evalkit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +312 -0
- package/dist/index.cjs +882 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +402 -0
- package/dist/index.d.ts +402 -0
- package/dist/index.js +817 -0
- package/dist/index.js.map +1 -0
- package/package.json +61 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 agentcheck contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# evalkit
|
|
2
|
+
|
|
3
|
+
Lightweight deterministic evaluators for AI agents. Binary pass/fail checks, zero dependencies, no LLM cost.
|
|
4
|
+
|
|
5
|
+
## Why
|
|
6
|
+
|
|
7
|
+
Before you reach for LLM-as-judge or complex scoring rubrics, you should have 10-20 core test cases with deterministic, binary checks that run on every commit. These checks have zero API cost, zero ambiguity, and produce the same result every time.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install evalkit
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```typescript
|
|
18
|
+
import { runSuite, printSuiteResult } from 'evalkit';
|
|
19
|
+
|
|
20
|
+
const result = await runSuite({
|
|
21
|
+
cases: 'golden-set.yaml',
|
|
22
|
+
agent: async (query) => {
|
|
23
|
+
const res = await myAgent.invoke(query);
|
|
24
|
+
return {
|
|
25
|
+
responseText: res.text,
|
|
26
|
+
actualTools: res.toolsUsed,
|
|
27
|
+
latencyMs: res.duration,
|
|
28
|
+
};
|
|
29
|
+
},
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
printSuiteResult(result);
|
|
33
|
+
// eval-001 What is my portfolio allocation? PASS 1.2s
|
|
34
|
+
// eval-002 Show me my current holdings FAIL 3.4s
|
|
35
|
+
// content_match: Missing: $
|
|
36
|
+
//
|
|
37
|
+
// 1/2 passed (4.6s)
|
|
38
|
+
|
|
39
|
+
if (result.failed > 0) process.exit(1);
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Runner
|
|
43
|
+
|
|
44
|
+
Load test cases from JSON or YAML and run them against your agent.
|
|
45
|
+
|
|
46
|
+
### Test case format
|
|
47
|
+
|
|
48
|
+
#### JSON
|
|
49
|
+
|
|
50
|
+
```json
|
|
51
|
+
{
|
|
52
|
+
"test_cases": [
|
|
53
|
+
{
|
|
54
|
+
"id": "eval-001",
|
|
55
|
+
"query": "What is my portfolio allocation?",
|
|
56
|
+
"checks": {
|
|
57
|
+
"expectedTools": ["portfolio_holdings"],
|
|
58
|
+
"mustContain": ["%", "AAPL"],
|
|
59
|
+
"mustNotContain": ["I don't know"],
|
|
60
|
+
"thresholdMs": 20000
|
|
61
|
+
},
|
|
62
|
+
"metadata": { "category": "portfolio" }
|
|
63
|
+
}
|
|
64
|
+
]
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
#### YAML (built-in parser, no dependencies)
|
|
69
|
+
|
|
70
|
+
```yaml
|
|
71
|
+
# golden-set.yaml
|
|
72
|
+
test_cases:
|
|
73
|
+
- id: eval-001
|
|
74
|
+
query: "What is my portfolio allocation?"
|
|
75
|
+
checks:
|
|
76
|
+
expectedTools:
|
|
77
|
+
- portfolio_holdings
|
|
78
|
+
mustContain:
|
|
79
|
+
- "%"
|
|
80
|
+
- "AAPL"
|
|
81
|
+
mustNotContain:
|
|
82
|
+
- "I don't know"
|
|
83
|
+
thresholdMs: 20000
|
|
84
|
+
metadata:
|
|
85
|
+
category: portfolio
|
|
86
|
+
difficulty: basic
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Agent callback
|
|
90
|
+
|
|
91
|
+
The runner calls your function with each test case's `query` and expects an `AgentResult` back:
|
|
92
|
+
|
|
93
|
+
```typescript
|
|
94
|
+
interface AgentResult {
|
|
95
|
+
responseText: string; // The agent's text response (required)
|
|
96
|
+
actualTools?: string[]; // Tools the agent called
|
|
97
|
+
latencyMs?: number; // How long the agent took
|
|
98
|
+
toolCallCount?: number; // Number of tool calls (derived from actualTools.length if omitted)
|
|
99
|
+
cost?: number; // Token count or dollar cost
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
You provide the adapter — evalkit never touches your SDK, keys, or auth.
|
|
104
|
+
|
|
105
|
+
### `runSuite()` options
|
|
106
|
+
|
|
107
|
+
```typescript
|
|
108
|
+
const result = await runSuite({
|
|
109
|
+
cases: 'golden-set.yaml', // File path (.json, .yaml, .yml) or inline SuiteConfig object
|
|
110
|
+
agent: myAgentFn, // Your (query: string) => Promise<AgentResult> callback
|
|
111
|
+
name: 'Portfolio Suite', // Optional suite name (overrides name from file)
|
|
112
|
+
concurrency: 3, // Run cases in parallel (default: 1, sequential)
|
|
113
|
+
onCaseComplete: (caseResult) => {
|
|
114
|
+
console.log(`${caseResult.id}: ${caseResult.passed ? 'PASS' : 'FAIL'}`);
|
|
115
|
+
}, // Optional progress callback, fired after each case
|
|
116
|
+
});
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Or pass cases inline — no file needed:
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
const result = await runSuite({
|
|
123
|
+
cases: {
|
|
124
|
+
test_cases: [
|
|
125
|
+
{ id: 'test-1', query: 'Hello', checks: { mustContain: ['hi'] } },
|
|
126
|
+
],
|
|
127
|
+
},
|
|
128
|
+
agent: myAgentFn,
|
|
129
|
+
});
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Available checks
|
|
133
|
+
|
|
134
|
+
| Check field | What it validates |
|
|
135
|
+
|---|---|
|
|
136
|
+
| `expectedTools` | Agent called exactly these tools (set equality) |
|
|
137
|
+
| `mustContain` | Response contains all strings (case-insensitive) |
|
|
138
|
+
| `mustNotContain` | Response contains none of these strings |
|
|
139
|
+
| `thresholdMs` | Response time under threshold |
|
|
140
|
+
| `json` | Response is valid JSON (`{ requireObject: true }` for objects only) |
|
|
141
|
+
| `schema` | Parsed JSON has required keys with correct types |
|
|
142
|
+
| `copOutPhrases` | Response isn't empty or a cop-out ("I don't know", etc.) |
|
|
143
|
+
| `lengthMin` / `lengthMax` | Response character count within bounds |
|
|
144
|
+
| `regexPatterns` | Response matches regex patterns (`regexMode: 'all' \| 'any'`) |
|
|
145
|
+
| `toolCallMin` / `toolCallMax` | Number of tool calls within bounds |
|
|
146
|
+
| `costBudget` | Cost under budget |
|
|
147
|
+
|
|
148
|
+
A case with no checks always passes — useful for smoke tests that just verify the agent doesn't crash.
|
|
149
|
+
|
|
150
|
+
## Individual evaluators
|
|
151
|
+
|
|
152
|
+
Every evaluator returns an `EvalResult` with `passed: boolean` and `details: string`. You can use them standalone without the runner.
|
|
153
|
+
|
|
154
|
+
### Core checks
|
|
155
|
+
|
|
156
|
+
```typescript
|
|
157
|
+
import { toolSelection, contentMatch, negativeMatch, latency } from 'evalkit';
|
|
158
|
+
|
|
159
|
+
toolSelection({
|
|
160
|
+
expected: ['search', 'summarize'],
|
|
161
|
+
actual: ['summarize', 'search'],
|
|
162
|
+
});
|
|
163
|
+
// passed: true — order-independent set equality
|
|
164
|
+
|
|
165
|
+
contentMatch({
|
|
166
|
+
responseText: 'The GDP growth rate is 2.5%',
|
|
167
|
+
mustContain: ['GDP', 'growth'],
|
|
168
|
+
});
|
|
169
|
+
// passed: true — case-insensitive substring match
|
|
170
|
+
|
|
171
|
+
negativeMatch({
|
|
172
|
+
responseText: 'Here is your analysis.',
|
|
173
|
+
mustNotContain: ["I don't know", 'error'],
|
|
174
|
+
});
|
|
175
|
+
// passed: true
|
|
176
|
+
|
|
177
|
+
latency({ latencyMs: 1200, thresholdMs: 5000 });
|
|
178
|
+
// passed: true — thresholdMs defaults to 20,000ms when omitted
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Format checks
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
import { jsonValid, schemaMatch, nonEmpty, lengthBounds } from 'evalkit';
|
|
185
|
+
|
|
186
|
+
jsonValid({ text: '{"valid": true}', requireObject: true });
|
|
187
|
+
// passed: true
|
|
188
|
+
|
|
189
|
+
schemaMatch({
|
|
190
|
+
data: { name: 'Alice', age: 30 },
|
|
191
|
+
requiredKeys: ['name', 'age'],
|
|
192
|
+
typeChecks: { name: 'string', age: 'number' },
|
|
193
|
+
});
|
|
194
|
+
// passed: true — zero-dep, no Zod needed
|
|
195
|
+
|
|
196
|
+
nonEmpty({ responseText: "I don't know" });
|
|
197
|
+
// passed: false — "Response is a cop-out phrase"
|
|
198
|
+
|
|
199
|
+
lengthBounds({ responseText: 'Hello world', min: 5, max: 1000 });
|
|
200
|
+
// passed: true
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Pattern & behavioral checks
|
|
204
|
+
|
|
205
|
+
```typescript
|
|
206
|
+
import { regexMatch, toolCallCount, costBudget } from 'evalkit';
|
|
207
|
+
|
|
208
|
+
regexMatch({
|
|
209
|
+
responseText: 'Contact: user@example.com',
|
|
210
|
+
patterns: [/\S+@\S+\.\S+/],
|
|
211
|
+
mode: 'all', // 'all' (default) or 'any'
|
|
212
|
+
});
|
|
213
|
+
// passed: true
|
|
214
|
+
|
|
215
|
+
toolCallCount({ count: 3, min: 1, max: 5 });
|
|
216
|
+
// passed: true
|
|
217
|
+
|
|
218
|
+
costBudget({ actual: 5000, budget: 10000 });
|
|
219
|
+
// passed: true — works with token counts or dollar amounts
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## `runChecks()`
|
|
223
|
+
|
|
224
|
+
Run any combination of checks at once. It only runs the checks whose inputs are provided.
|
|
225
|
+
|
|
226
|
+
```typescript
|
|
227
|
+
import { runChecks } from 'evalkit';
|
|
228
|
+
|
|
229
|
+
const result = runChecks({
|
|
230
|
+
responseText: 'Your portfolio has 15 holdings',
|
|
231
|
+
expectedTools: ['portfolio_holdings'],
|
|
232
|
+
actualTools: ['portfolio_holdings'],
|
|
233
|
+
mustContain: ['portfolio', 'holdings'],
|
|
234
|
+
mustNotContain: ["I don't know"],
|
|
235
|
+
latencyMs: 1500,
|
|
236
|
+
thresholdMs: 5000,
|
|
237
|
+
});
|
|
238
|
+
// { passed: true, results: [...], summary: '4/4 checks passed' }
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## Factory pattern
|
|
242
|
+
|
|
243
|
+
Each evaluator has a `create*Evaluator` factory for reuse across test cases:
|
|
244
|
+
|
|
245
|
+
```typescript
|
|
246
|
+
import { createContentMatchEvaluator } from 'evalkit';
|
|
247
|
+
|
|
248
|
+
const checkContent = createContentMatchEvaluator({
|
|
249
|
+
mustContain: ['portfolio', 'holdings', 'allocation'],
|
|
250
|
+
});
|
|
251
|
+
|
|
252
|
+
// Reuse across test cases
|
|
253
|
+
const r1 = checkContent({ responseText: response1 });
|
|
254
|
+
const r2 = checkContent({ responseText: response2 });
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Integrating with your agent
|
|
258
|
+
|
|
259
|
+
evalkit is SDK-agnostic. You write a thin adapter function that calls your agent and returns an `AgentResult`. Here are patterns for common setups:
|
|
260
|
+
|
|
261
|
+
### Any agent (generic pattern)
|
|
262
|
+
|
|
263
|
+
```typescript
|
|
264
|
+
import { runSuite, printSuiteResult, AgentFn } from 'evalkit';
|
|
265
|
+
|
|
266
|
+
const agent: AgentFn = async (query) => {
|
|
267
|
+
const start = Date.now();
|
|
268
|
+
const response = await callMyAgent(query);
|
|
269
|
+
return {
|
|
270
|
+
responseText: response.text,
|
|
271
|
+
actualTools: response.toolCalls?.map((t) => t.name),
|
|
272
|
+
latencyMs: Date.now() - start,
|
|
273
|
+
toolCallCount: response.toolCalls?.length,
|
|
274
|
+
cost: response.usage?.totalTokens,
|
|
275
|
+
};
|
|
276
|
+
};
|
|
277
|
+
|
|
278
|
+
const result = await runSuite({ cases: 'golden-set.yaml', agent });
|
|
279
|
+
printSuiteResult(result);
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### CI / GitHub Actions
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
// eval.ts — run with: npx tsx eval.ts
|
|
286
|
+
import { runSuite, printSuiteResult } from 'evalkit';
|
|
287
|
+
|
|
288
|
+
const result = await runSuite({
|
|
289
|
+
cases: 'golden-set.yaml',
|
|
290
|
+
agent: myAgentFn,
|
|
291
|
+
});
|
|
292
|
+
|
|
293
|
+
printSuiteResult(result);
|
|
294
|
+
process.exit(result.failed > 0 ? 1 : 0);
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
```yaml
|
|
298
|
+
# .github/workflows/evals.yml
|
|
299
|
+
- run: npx tsx eval.ts
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
## Design principles
|
|
303
|
+
|
|
304
|
+
- **Zero runtime dependencies** — no Zod, no Ajv, no LLM calls
|
|
305
|
+
- **Binary results** — every check returns `passed: boolean`
|
|
306
|
+
- **Deterministic** — same input always produces same output
|
|
307
|
+
- **CI-friendly** — fast enough to run on every commit
|
|
308
|
+
- **SDK-agnostic** — you provide the agent callback, evalkit runs the checks
|
|
309
|
+
|
|
310
|
+
## License
|
|
311
|
+
|
|
312
|
+
MIT
|