evalkit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +312 -0
- package/dist/index.cjs +882 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +402 -0
- package/dist/index.d.ts +402 -0
- package/dist/index.js +817 -0
- package/dist/index.js.map +1 -0
- package/package.json +61 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
/**
 * Standard result returned by every evaluator.
 * The check-specific result interfaces declared in this file extend it and
 * narrow `key` to a string literal.
 */
|
|
2
|
+
interface EvalResult {
|
|
3
|
+
/** Evaluator identifier (e.g. 'tool_selection', 'latency'). */
|
|
4
|
+
key: string;
|
|
5
|
+
/** Binary pass/fail outcome of the check. */
|
|
6
|
+
passed: boolean;
|
|
7
|
+
/** Human-readable explanation of the outcome. */
|
|
8
|
+
details: string;
|
|
9
|
+
}
|
|
10
|
+
/**
 * Result for tool selection check.
 * Return type of `toolSelection` and of evaluators built by `createToolSelectionEvaluator`.
 */
|
|
11
|
+
interface ToolSelectionResult extends EvalResult {
|
|
12
|
+
key: 'tool_selection';
|
|
13
|
+
/** Expected tool names. */
expected: string[];
|
|
14
|
+
/** Actual tool names. */
actual: string[];
|
|
15
|
+
}
|
|
16
|
+
/**
 * Result for content match check.
 * Return type of `contentMatch` and of evaluators built by `createContentMatchEvaluator`.
 */
|
|
17
|
+
interface ContentMatchResult extends EvalResult {
|
|
18
|
+
key: 'content_match';
|
|
19
|
+
/** NOTE(review): presumably the `mustContain` entries not found in the response text — confirm against the implementation. */
missing: string[];
|
|
20
|
+
}
|
|
21
|
+
/**
 * Result for negative match check.
 * Return type of `negativeMatch` and of evaluators built by `createNegativeMatchEvaluator`.
 */
|
|
22
|
+
interface NegativeMatchResult extends EvalResult {
|
|
23
|
+
key: 'negative_match';
|
|
24
|
+
/** NOTE(review): presumably the `mustNotContain` entries that did appear in the response text — confirm against the implementation. */
found: string[];
|
|
25
|
+
}
|
|
26
|
+
/**
 * Result for latency check.
 * Return type of `latency` and of evaluators built by `createLatencyEvaluator`.
 */
|
|
27
|
+
interface LatencyResult extends EvalResult {
|
|
28
|
+
key: 'latency';
|
|
29
|
+
/** NOTE(review): presumably the measured latency in milliseconds (mirrors `LatencyInput.latencyMs`) — confirm. */
ms: number;
|
|
30
|
+
/** NOTE(review): presumably the threshold that was applied, in milliseconds — confirm. */
threshold: number;
|
|
31
|
+
}
|
|
32
|
+
/**
 * Result for JSON validity check.
 * Adds no fields beyond `EvalResult`; it only narrows `key` to 'json_valid'.
 */
|
|
33
|
+
interface JsonValidResult extends EvalResult {
|
|
34
|
+
key: 'json_valid';
|
|
35
|
+
}
|
|
36
|
+
/**
 * Result for schema match check.
 * Return type of `schemaMatch` and of evaluators built by `createSchemaMatchEvaluator`.
 */
|
|
37
|
+
interface SchemaMatchResult extends EvalResult {
|
|
38
|
+
key: 'schema_match';
|
|
39
|
+
/** NOTE(review): presumably the `requiredKeys` entries absent from the validated object — confirm. */
missingKeys: string[];
|
|
40
|
+
/** NOTE(review): presumably one entry per failed `typeChecks` constraint; the entry format is not visible here — confirm. */
typeErrors: string[];
|
|
41
|
+
}
|
|
42
|
+
/**
 * Result for non-empty check.
 * Adds no fields beyond `EvalResult`; it only narrows `key` to 'non_empty'.
 */
|
|
43
|
+
interface NonEmptyResult extends EvalResult {
|
|
44
|
+
key: 'non_empty';
|
|
45
|
+
}
|
|
46
|
+
/**
 * Result for length bounds check.
 * Return type of `lengthBounds` and of evaluators built by `createLengthBoundsEvaluator`.
 */
|
|
47
|
+
interface LengthBoundsResult extends EvalResult {
|
|
48
|
+
key: 'length_bounds';
|
|
49
|
+
/** NOTE(review): presumably the response's character count (see `lengthBounds`) — confirm. */
length: number;
|
|
50
|
+
/** Optional lower bound. */
min?: number;
|
|
51
|
+
/** Optional upper bound. */
max?: number;
|
|
52
|
+
}
|
|
53
|
+
/**
 * Result for regex match check.
 * Return type of `regexMatch` and of evaluators built by `createRegexMatchEvaluator`.
 */
|
|
54
|
+
interface RegexMatchResult extends EvalResult {
|
|
55
|
+
key: 'regex_match';
|
|
56
|
+
/** NOTE(review): presumably string forms of the patterns that did not match — confirm against the implementation. */
failedPatterns: string[];
|
|
57
|
+
}
|
|
58
|
+
/**
 * Result for tool call count check.
 * Return type of `toolCallCount` and of evaluators built by `createToolCallCountEvaluator`.
 */
|
|
59
|
+
interface ToolCallCountResult extends EvalResult {
|
|
60
|
+
key: 'tool_call_count';
|
|
61
|
+
/** Number of tool calls (same field name as `ToolCallCountInput.count`). */
count: number;
|
|
62
|
+
/** Optional lower bound. */
min?: number;
|
|
63
|
+
/** Optional upper bound. */
max?: number;
|
|
64
|
+
}
|
|
65
|
+
/**
 * Result for cost budget check.
 * Return type of `costBudget` and of evaluators built by `createCostBudgetEvaluator`.
 */
|
|
66
|
+
interface CostBudgetResult extends EvalResult {
|
|
67
|
+
key: 'cost_budget';
|
|
68
|
+
/** Actual cost (same field name as `CostBudgetInput.actual`). */
actual: number;
|
|
69
|
+
/** Budget threshold (same field name as `CostBudgetInput.budget`). */
budget: number;
|
|
70
|
+
}
|
|
71
|
+
/** Summary result from runChecks(): an overall verdict plus the individual evaluator results. */
|
|
72
|
+
interface CheckSuiteResult {
|
|
73
|
+
/** NOTE(review): presumably true only when every entry in `results` passed — confirm against the implementation. */
passed: boolean;
|
|
74
|
+
/** Individual evaluator results. */
results: EvalResult[];
|
|
75
|
+
/** Summary text; its format is not visible in this declaration file. */
summary: string;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Input for `toolSelection`: the two tool-name lists to compare. */
interface ToolSelectionInput {
|
|
79
|
+
/** Expected tool names. */
expected: string[];
|
|
80
|
+
/** Actual tool names. */
actual: string[];
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Strict set equality between expected and actual tool names.
|
|
84
|
+
* Order-independent, deduplicates. Reports missing and unexpected tools.
*
* @param input - Expected and actual tool-name lists.
* @returns A result whose `key` is 'tool_selection'.
|
|
85
|
+
*/
|
|
86
|
+
declare function toolSelection(input: ToolSelectionInput): ToolSelectionResult;
|
|
87
|
+
/**
|
|
88
|
+
* Factory: creates a reusable toolSelection evaluator with fixed expected tools.
*
* @param config - Holds the fixed `expected` tool names.
* @returns A function that takes `{ actual }` and returns a ToolSelectionResult.
|
|
89
|
+
*/
|
|
90
|
+
declare function createToolSelectionEvaluator(config: {
|
|
91
|
+
expected: string[];
|
|
92
|
+
}): (input: {
|
|
93
|
+
actual: string[];
|
|
94
|
+
}) => ToolSelectionResult;
|
|
95
|
+
|
|
96
|
+
/** Input for `contentMatch`. */
interface ContentMatchInput {
|
|
97
|
+
/** Text to search. */
responseText: string;
|
|
98
|
+
/** Strings that must all appear in `responseText` (see `contentMatch`). */
mustContain: string[];
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Case-insensitive substring matching: all mustContain strings must appear in responseText.
*
* @param input - Response text plus the required substrings.
* @returns A result whose `key` is 'content_match'.
|
|
102
|
+
*/
|
|
103
|
+
declare function contentMatch(input: ContentMatchInput): ContentMatchResult;
|
|
104
|
+
/**
|
|
105
|
+
* Factory: creates a reusable contentMatch evaluator with fixed mustContain list.
*
* @param config - Holds the fixed `mustContain` list.
* @returns A function that takes `{ responseText }` and returns a ContentMatchResult.
|
|
106
|
+
*/
|
|
107
|
+
declare function createContentMatchEvaluator(config: {
|
|
108
|
+
mustContain: string[];
|
|
109
|
+
}): (input: {
|
|
110
|
+
responseText: string;
|
|
111
|
+
}) => ContentMatchResult;
|
|
112
|
+
|
|
113
|
+
/** Input for `negativeMatch`. */
interface NegativeMatchInput {
|
|
114
|
+
/** Text to search. */
responseText: string;
|
|
115
|
+
/** Strings that must not appear in `responseText` (see `negativeMatch`). */
mustNotContain: string[];
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Case-insensitive substring matching: no mustNotContain strings may appear in responseText.
*
* @param input - Response text plus the forbidden substrings.
* @returns A result whose `key` is 'negative_match'.
|
|
119
|
+
*/
|
|
120
|
+
declare function negativeMatch(input: NegativeMatchInput): NegativeMatchResult;
|
|
121
|
+
/**
|
|
122
|
+
* Factory: creates a reusable negativeMatch evaluator with fixed mustNotContain list.
*
* @param config - Holds the fixed `mustNotContain` list.
* @returns A function that takes `{ responseText }` and returns a NegativeMatchResult.
|
|
123
|
+
*/
|
|
124
|
+
declare function createNegativeMatchEvaluator(config: {
|
|
125
|
+
mustNotContain: string[];
|
|
126
|
+
}): (input: {
|
|
127
|
+
responseText: string;
|
|
128
|
+
}) => NegativeMatchResult;
|
|
129
|
+
|
|
130
|
+
/** Input for `latency`. */
interface LatencyInput {
|
|
131
|
+
/** Latency to check, in milliseconds. */
latencyMs: number;
|
|
132
|
+
/** Optional threshold in milliseconds; `latency` documents a default of 20,000ms. */
thresholdMs?: number;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Simple threshold check: latencyMs <= thresholdMs.
|
|
136
|
+
* Default threshold: 20,000ms.
*
* @param input - Latency and optional threshold, both in milliseconds.
* @returns A result whose `key` is 'latency'.
|
|
137
|
+
*/
|
|
138
|
+
declare function latency(input: LatencyInput): LatencyResult;
|
|
139
|
+
/**
|
|
140
|
+
* Factory: creates a reusable latency evaluator with a fixed threshold.
* Unlike `LatencyInput.thresholdMs`, `config.thresholdMs` is required here.
*
* @param config - Holds the fixed `thresholdMs`.
* @returns A function that takes `{ latencyMs }` and returns a LatencyResult.
|
|
141
|
+
*/
|
|
142
|
+
declare function createLatencyEvaluator(config: {
|
|
143
|
+
thresholdMs: number;
|
|
144
|
+
}): (input: {
|
|
145
|
+
latencyMs: number;
|
|
146
|
+
}) => LatencyResult;
|
|
147
|
+
|
|
148
|
+
/** Input for `jsonValid`. */
interface JsonValidInput {
|
|
149
|
+
/** The string to validate as JSON. */
text: string;
|
|
150
|
+
/** If true, the parsed result must be an object or array (not a bare primitive). */
|
|
151
|
+
requireObject?: boolean;
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Validates that a string is parseable JSON.
|
|
155
|
+
* Optionally checks that the parsed result is an object or array.
*
* @param input - The text to validate and the optional `requireObject` flag.
* @returns A result whose `key` is 'json_valid'.
|
|
156
|
+
*/
|
|
157
|
+
declare function jsonValid(input: JsonValidInput): JsonValidResult;
|
|
158
|
+
/**
|
|
159
|
+
* Factory: creates a reusable jsonValid evaluator with fixed options.
*
* @param config - Optional; may carry a fixed `requireObject` flag.
* @returns A function that takes `{ text }` and returns a JsonValidResult.
|
|
160
|
+
*/
|
|
161
|
+
declare function createJsonValidEvaluator(config?: {
|
|
162
|
+
requireObject?: boolean;
|
|
163
|
+
}): (input: {
|
|
164
|
+
text: string;
|
|
165
|
+
}) => JsonValidResult;
|
|
166
|
+
|
|
167
|
+
/** Input for `schemaMatch`. */
interface SchemaMatchInput {
|
|
168
|
+
/** The object to validate. */
|
|
169
|
+
data: Record<string, unknown>;
|
|
170
|
+
/** Keys that must be present. */
|
|
171
|
+
requiredKeys: string[];
|
|
172
|
+
/** Optional type constraints: key -> expected typeof result (e.g. 'string', 'number'). */
|
|
173
|
+
typeChecks?: Record<string, string>;
|
|
174
|
+
}
|
|
175
|
+
/**
|
|
176
|
+
* Validates that a parsed object contains required keys and optionally checks value types.
|
|
177
|
+
* Zero-dep — just checks key presence and typeof.
*
* @param input - The object, its required keys and optional typeof constraints.
* @returns A result whose `key` is 'schema_match'.
|
|
178
|
+
*/
|
|
179
|
+
declare function schemaMatch(input: SchemaMatchInput): SchemaMatchResult;
|
|
180
|
+
/**
|
|
181
|
+
* Factory: creates a reusable schemaMatch evaluator with fixed schema expectations.
*
* @param config - Holds the fixed `requiredKeys` and optional `typeChecks`.
* @returns A function that takes `{ data }` and returns a SchemaMatchResult.
|
|
182
|
+
*/
|
|
183
|
+
declare function createSchemaMatchEvaluator(config: {
|
|
184
|
+
requiredKeys: string[];
|
|
185
|
+
typeChecks?: Record<string, string>;
|
|
186
|
+
}): (input: {
|
|
187
|
+
data: Record<string, unknown>;
|
|
188
|
+
}) => SchemaMatchResult;
|
|
189
|
+
|
|
190
|
+
/** Input for `nonEmpty`. */
interface NonEmptyInput {
|
|
191
|
+
/** Response text to check. */
responseText: string;
|
|
192
|
+
/** Custom cop-out phrases to check against (case-insensitive exact match after trim). */
|
|
193
|
+
copOutPhrases?: string[];
|
|
194
|
+
}
|
|
195
|
+
/**
|
|
196
|
+
* Checks that the response is not empty, not just whitespace, and not a cop-out phrase.
*
* @param input - Response text and optional custom cop-out phrases.
* @returns A result whose `key` is 'non_empty'.
|
|
197
|
+
*/
|
|
198
|
+
declare function nonEmpty(input: NonEmptyInput): NonEmptyResult;
|
|
199
|
+
/**
|
|
200
|
+
* Factory: creates a reusable nonEmpty evaluator with fixed cop-out phrases.
*
* @param config - Optional; may carry a fixed `copOutPhrases` list.
* @returns A function that takes `{ responseText }` and returns a NonEmptyResult.
|
|
201
|
+
*/
|
|
202
|
+
declare function createNonEmptyEvaluator(config?: {
|
|
203
|
+
copOutPhrases?: string[];
|
|
204
|
+
}): (input: {
|
|
205
|
+
responseText: string;
|
|
206
|
+
}) => NonEmptyResult;
|
|
207
|
+
|
|
208
|
+
/** Input for `lengthBounds`; both bounds are optional. */
interface LengthBoundsInput {
|
|
209
|
+
/** Response text whose length is checked. */
responseText: string;
|
|
210
|
+
/** Optional lower bound on the character count. */
min?: number;
|
|
211
|
+
/** Optional upper bound on the character count. */
max?: number;
|
|
212
|
+
}
|
|
213
|
+
/**
|
|
214
|
+
* Checks that the response length (character count) falls within min and/or max bounds.
*
* @param input - Response text and the optional `min` / `max` bounds.
* @returns A result whose `key` is 'length_bounds'.
|
|
215
|
+
*/
|
|
216
|
+
declare function lengthBounds(input: LengthBoundsInput): LengthBoundsResult;
|
|
217
|
+
/**
|
|
218
|
+
* Factory: creates a reusable lengthBounds evaluator with fixed bounds.
*
* @param config - Holds the fixed, individually optional `min` and `max`.
* @returns A function that takes `{ responseText }` and returns a LengthBoundsResult.
|
|
219
|
+
*/
|
|
220
|
+
declare function createLengthBoundsEvaluator(config: {
|
|
221
|
+
min?: number;
|
|
222
|
+
max?: number;
|
|
223
|
+
}): (input: {
|
|
224
|
+
responseText: string;
|
|
225
|
+
}) => LengthBoundsResult;
|
|
226
|
+
|
|
227
|
+
/** Input for `regexMatch`. */
interface RegexMatchInput {
|
|
228
|
+
/** Response text to test. */
responseText: string;
|
|
229
|
+
/** Patterns to test, each given as a string or a RegExp. */
patterns: (string | RegExp)[];
|
|
230
|
+
/** If 'all' (default), every pattern must match. If 'any', at least one must match. */
|
|
231
|
+
mode?: 'all' | 'any';
|
|
232
|
+
}
|
|
233
|
+
/**
|
|
234
|
+
* Tests the response against one or more regex patterns.
*
* @param input - Response text, patterns and optional match mode.
* @returns A result whose `key` is 'regex_match'.
|
|
235
|
+
*/
|
|
236
|
+
declare function regexMatch(input: RegexMatchInput): RegexMatchResult;
|
|
237
|
+
/**
|
|
238
|
+
* Factory: creates a reusable regexMatch evaluator with fixed patterns and mode.
*
* @param config - Holds the fixed `patterns` and optional `mode`.
* @returns A function that takes `{ responseText }` and returns a RegexMatchResult.
|
|
239
|
+
*/
|
|
240
|
+
declare function createRegexMatchEvaluator(config: {
|
|
241
|
+
patterns: (string | RegExp)[];
|
|
242
|
+
mode?: 'all' | 'any';
|
|
243
|
+
}): (input: {
|
|
244
|
+
responseText: string;
|
|
245
|
+
}) => RegexMatchResult;
|
|
246
|
+
|
|
247
|
+
/** Input for `toolCallCount`; both bounds are optional. */
interface ToolCallCountInput {
|
|
248
|
+
/** Number of tool calls. */
count: number;
|
|
249
|
+
/** Optional lower bound. */
min?: number;
|
|
250
|
+
/** Optional upper bound. */
max?: number;
|
|
251
|
+
}
|
|
252
|
+
/**
|
|
253
|
+
* Checks that the number of tool calls falls within expected min/max bounds.
*
* @param input - The call count and the optional `min` / `max` bounds.
* @returns A result whose `key` is 'tool_call_count'.
|
|
254
|
+
*/
|
|
255
|
+
declare function toolCallCount(input: ToolCallCountInput): ToolCallCountResult;
|
|
256
|
+
/**
|
|
257
|
+
* Factory: creates a reusable toolCallCount evaluator with fixed bounds.
*
* @param config - Holds the fixed, individually optional `min` and `max`.
* @returns A function that takes `{ count }` and returns a ToolCallCountResult.
|
|
258
|
+
*/
|
|
259
|
+
declare function createToolCallCountEvaluator(config: {
|
|
260
|
+
min?: number;
|
|
261
|
+
max?: number;
|
|
262
|
+
}): (input: {
|
|
263
|
+
count: number;
|
|
264
|
+
}) => ToolCallCountResult;
|
|
265
|
+
|
|
266
|
+
/** Input for `costBudget`; `actual` and `budget` must use the same unit. */
interface CostBudgetInput {
|
|
267
|
+
/** Actual cost — can be token count or dollar amount. */
|
|
268
|
+
actual: number;
|
|
269
|
+
/** Budget threshold — same unit as actual. */
|
|
270
|
+
budget: number;
|
|
271
|
+
}
|
|
272
|
+
/**
|
|
273
|
+
* Checks that token count or dollar cost stays under a threshold.
* NOTE(review): whether `actual === budget` passes is not visible in this declaration file — confirm.
*
* @param input - Actual cost and budget, in the same unit.
* @returns A result whose `key` is 'cost_budget'.
|
|
274
|
+
*/
|
|
275
|
+
declare function costBudget(input: CostBudgetInput): CostBudgetResult;
|
|
276
|
+
/**
|
|
277
|
+
* Factory: creates a reusable costBudget evaluator with a fixed budget.
*
* @param config - Holds the fixed `budget`.
* @returns A function that takes `{ actual }` and returns a CostBudgetResult.
|
|
278
|
+
*/
|
|
279
|
+
declare function createCostBudgetEvaluator(config: {
|
|
280
|
+
budget: number;
|
|
281
|
+
}): (input: {
|
|
282
|
+
actual: number;
|
|
283
|
+
}) => CostBudgetResult;
|
|
284
|
+
|
|
285
|
+
/**
 * Input for `runChecks`. Every field is optional; per the `runChecks` documentation,
 * only checks whose relevant inputs are provided are run.
 * NOTE(review): field names mirror the inputs of the individual evaluators declared in
 * this file, but the exact field-to-check mapping is not visible here — confirm against
 * the implementation.
 */
interface RunChecksInput {
|
|
286
|
+
responseText?: string;
|
|
287
|
+
expectedTools?: string[];
|
|
288
|
+
actualTools?: string[];
|
|
289
|
+
mustContain?: string[];
|
|
290
|
+
mustNotContain?: string[];
|
|
291
|
+
latencyMs?: number;
|
|
292
|
+
thresholdMs?: number;
|
|
293
|
+
/** Same shape as `JsonValidInput`. */
json?: {
|
|
294
|
+
text: string;
|
|
295
|
+
requireObject?: boolean;
|
|
296
|
+
};
|
|
297
|
+
/** Same shape as `SchemaMatchInput`. */
schema?: {
|
|
298
|
+
data: Record<string, unknown>;
|
|
299
|
+
requiredKeys: string[];
|
|
300
|
+
typeChecks?: Record<string, string>;
|
|
301
|
+
};
|
|
302
|
+
copOutPhrases?: string[];
|
|
303
|
+
lengthMin?: number;
|
|
304
|
+
lengthMax?: number;
|
|
305
|
+
regexPatterns?: (string | RegExp)[];
|
|
306
|
+
regexMode?: 'all' | 'any';
|
|
307
|
+
toolCallCountValue?: number;
|
|
308
|
+
toolCallMin?: number;
|
|
309
|
+
toolCallMax?: number;
|
|
310
|
+
costActual?: number;
|
|
311
|
+
costBudget?: number;
|
|
312
|
+
}
|
|
313
|
+
/**
|
|
314
|
+
* Runs any combination of checks at once and returns a summary.
|
|
315
|
+
* Only runs checks for which the relevant inputs are provided.
*
* @param input - Bag of optional inputs for the individual checks.
* @returns Overall verdict, the individual results and a summary string.
|
|
316
|
+
*/
|
|
317
|
+
declare function runChecks(input: RunChecksInput): CheckSuiteResult;
|
|
318
|
+
|
|
319
|
+
/**
 * TypeScript types for the YAML/JSON test case format.
 * A TestCase is one entry of `SuiteConfig.test_cases`.
 */
|
|
320
|
+
interface TestCase {
|
|
321
|
+
/** Case identifier. */
id: string;
|
|
322
|
+
/** Query string; `AgentFn` takes a parameter of the same name. */
query: string;
|
|
323
|
+
/** Check configuration for this case. */
checks: TestCaseChecks;
|
|
324
|
+
/** Optional free-form metadata. */
metadata?: Record<string, unknown>;
|
|
325
|
+
}
|
|
326
|
+
/**
 * Check configuration of a single test case; every field is optional.
 * Field names are a subset of `RunChecksInput`: the fields also present on `AgentResult`
 * or carrying the payload itself (`responseText`, `actualTools`, `latencyMs`, `json.text`,
 * `schema.data`, `toolCallCountValue`, `costActual`) are absent here.
 * NOTE(review): presumably those are filled in from the agent's result when a suite
 * runs — confirm against the `runSuite` implementation.
 */
interface TestCaseChecks {
|
|
327
|
+
expectedTools?: string[];
|
|
328
|
+
mustContain?: string[];
|
|
329
|
+
mustNotContain?: string[];
|
|
330
|
+
thresholdMs?: number;
|
|
331
|
+
/** Only the `requireObject` option; unlike `RunChecksInput.json` there is no `text` field. */
json?: {
|
|
332
|
+
requireObject?: boolean;
|
|
333
|
+
};
|
|
334
|
+
/** Schema expectations only; unlike `RunChecksInput.schema` there is no `data` field. */
schema?: {
|
|
335
|
+
requiredKeys: string[];
|
|
336
|
+
typeChecks?: Record<string, string>;
|
|
337
|
+
};
|
|
338
|
+
copOutPhrases?: string[];
|
|
339
|
+
lengthMin?: number;
|
|
340
|
+
lengthMax?: number;
|
|
341
|
+
/** Strings only here, whereas `RunChecksInput.regexPatterns` also accepts RegExp. */
regexPatterns?: string[];
|
|
342
|
+
regexMode?: 'all' | 'any';
|
|
343
|
+
toolCallMin?: number;
|
|
344
|
+
toolCallMax?: number;
|
|
345
|
+
costBudget?: number;
|
|
346
|
+
}
|
|
347
|
+
/** A suite of test cases; the type returned by `parseYaml`, `parseJson`, `loadFile` and `loadCases`. */
interface SuiteConfig {
|
|
348
|
+
/** Optional suite name. */
name?: string;
|
|
349
|
+
/** The test cases. Note the snake_case key, unlike the camelCase used elsewhere in this file. */
test_cases: TestCase[];
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
/** Value an `AgentFn` resolves to. Only `responseText` is required. */
interface AgentResult {
|
|
353
|
+
/** The agent's response text. */
responseText: string;
|
|
354
|
+
/** Optional list of tool names. */
actualTools?: string[];
|
|
355
|
+
/** Optional latency in milliseconds. */
latencyMs?: number;
|
|
356
|
+
/** Optional number of tool calls. */
toolCallCount?: number;
|
|
357
|
+
/** Optional cost; NOTE(review): presumably in the same unit as `TestCaseChecks.costBudget` — confirm. */
cost?: number;
|
|
358
|
+
}
|
|
359
|
+
/** Agent under test: an async function that takes a query string and resolves to an AgentResult. */
type AgentFn = (query: string) => Promise<AgentResult>;
|
|
360
|
+
/** Result for one test case; element type of `SuiteResult.cases` and argument of `RunSuiteOptions.onCaseComplete`. */
interface CaseResult {
|
|
361
|
+
/** NOTE(review): presumably copied from `TestCase.id` — confirm. */
id: string;
|
|
362
|
+
/** NOTE(review): presumably copied from `TestCase.query` — confirm. */
query: string;
|
|
363
|
+
/** Pass/fail verdict for the case. */
passed: boolean;
|
|
364
|
+
/** Check results for the case, in the shape returned by `runChecks`. */
checks: CheckSuiteResult;
|
|
365
|
+
/** Optional free-form metadata. */
metadata?: Record<string, unknown>;
|
|
366
|
+
/** The agent's result for this case. */
agentResult: AgentResult;
|
|
367
|
+
}
|
|
368
|
+
/** Aggregate result that `runSuite` resolves to and `printSuiteResult` prints. */
interface SuiteResult {
|
|
369
|
+
/** Suite name. */
name: string;
|
|
370
|
+
/** NOTE(review): a number here, not a boolean; presumably the count of passing cases — confirm. */
passed: number;
|
|
371
|
+
/** NOTE(review): presumably the count of failing cases — confirm. */
failed: number;
|
|
372
|
+
/** NOTE(review): presumably the total number of cases — confirm. */
total: number;
|
|
373
|
+
/** Per-case results. */
cases: CaseResult[];
|
|
374
|
+
/** NOTE(review): unit is not visible in this declaration file (presumably milliseconds) — confirm. */
duration: number;
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
/**
 * Parse a YAML string into a SuiteConfig.
 *
 * @param content - YAML source text.
 * @returns The parsed suite configuration.
 */
|
|
378
|
+
declare function parseYaml(content: string): SuiteConfig;
|
|
379
|
+
/**
 * Parse a JSON string into a SuiteConfig.
 *
 * @param content - JSON source text.
 * @returns The parsed suite configuration.
 */
|
|
380
|
+
declare function parseJson(content: string): SuiteConfig;
|
|
381
|
+
/**
 * Read a file from disk and parse by extension.
 * NOTE(review): the accepted extensions are not visible in this declaration file
 * (presumably YAML and JSON, given `parseYaml` / `parseJson`) — confirm.
 * The return type is not a Promise.
 *
 * @param filePath - Path of the file to read.
 * @returns The parsed suite configuration.
 */
|
|
382
|
+
declare function loadFile(filePath: string): SuiteConfig;
|
|
383
|
+
/**
|
|
384
|
+
* Main entry point. Accepts a file path (string) or an inline SuiteConfig.
|
|
385
|
+
* If a string is provided, delegates to loadFile.
*
* @param source - File path or inline suite configuration.
* @returns The suite configuration.
|
|
386
|
+
*/
|
|
387
|
+
declare function loadCases(source: string | SuiteConfig): SuiteConfig;
|
|
388
|
+
|
|
389
|
+
/** Options for `runSuite`. */
interface RunSuiteOptions {
|
|
390
|
+
/** File path or inline suite; the same union `loadCases` accepts. */
cases: string | SuiteConfig;
|
|
391
|
+
/** The agent function to run the cases against. */
agent: AgentFn;
|
|
392
|
+
/** Optional name; NOTE(review): precedence relative to `SuiteConfig.name` is not visible here — confirm. */
name?: string;
|
|
393
|
+
/** Optional concurrency setting; NOTE(review): default and exact semantics are not visible here — confirm. */
concurrency?: number;
|
|
394
|
+
/** Optional callback that receives a CaseResult. */
onCaseComplete?: (result: CaseResult) => void;
|
|
395
|
+
}
|
|
396
|
+
/**
 * Run a suite of test cases against an agent function.
 *
 * @param options - Cases, agent function and optional name, concurrency and per-case callback.
 * @returns A promise that resolves to the SuiteResult.
 */
|
|
397
|
+
declare function runSuite(options: RunSuiteOptions): Promise<SuiteResult>;
|
|
398
|
+
|
|
399
|
+
/**
 * Prints a human-readable summary of a suite run to the console.
 *
 * @param result - The suite result to print.
 */
|
|
400
|
+
declare function printSuiteResult(result: SuiteResult): void;
|
|
401
|
+
|
|
402
|
+
/**
 * Public API of the package: result and suite types (type-only exports) followed by the
 * evaluator, factory, loader and runner functions.
 * NOTE(review): the `*Input` interfaces (e.g. `ToolSelectionInput`, `ContentMatchInput`) and
 * `RunChecksInput` are declared in this file but are not exported here, although
 * `RunSuiteOptions` is. Consumers can only reach them via `Parameters<typeof fn>[0]`.
 * Consider exporting them from the source entry point so the generated declarations include them.
 */
export { type AgentFn, type AgentResult, type CaseResult, type CheckSuiteResult, type ContentMatchResult, type CostBudgetResult, type EvalResult, type JsonValidResult, type LatencyResult, type LengthBoundsResult, type NegativeMatchResult, type NonEmptyResult, type RegexMatchResult, type RunSuiteOptions, type SchemaMatchResult, type SuiteConfig, type SuiteResult, type TestCase, type TestCaseChecks, type ToolCallCountResult, type ToolSelectionResult, contentMatch, costBudget, createContentMatchEvaluator, createCostBudgetEvaluator, createJsonValidEvaluator, createLatencyEvaluator, createLengthBoundsEvaluator, createNegativeMatchEvaluator, createNonEmptyEvaluator, createRegexMatchEvaluator, createSchemaMatchEvaluator, createToolCallCountEvaluator, createToolSelectionEvaluator, jsonValid, latency, lengthBounds, loadCases, loadFile, negativeMatch, nonEmpty, parseJson, parseYaml, printSuiteResult, regexMatch, runChecks, runSuite, schemaMatch, toolCallCount, toolSelection };
|