@aliou/pi-evals 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -0
- package/dist/chunk-342JG3E3.js +117 -0
- package/dist/chunk-342JG3E3.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +445 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +344 -0
- package/dist/index.js +455 -0
- package/dist/index.js.map +1 -0
- package/package.json +61 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for pi-eval
|
|
3
|
+
*/
|
|
4
|
+
/**
 * A single message from the agent conversation.
 *
 * Deliberately typed as `unknown`: messages are only passed through to
 * scorers, so a scorer has to narrow the value before reading from it.
 */
type Message = unknown;
|
|
9
|
+
/**
 * Pi configuration for running evals.
 */
interface PiConfig {
  /** Model identifier, e.g. `"claude-sonnet-4-20250514"`. */
  model: string;
  /** Provider name, e.g. `"anthropic"`. */
  provider: string;
  /**
   * Optional list of extensions.
   * NOTE(review): presumably extensions for pi to load; confirm against the runner.
   */
  extensions?: string[];
  /**
   * Optional string key/value pairs.
   * NOTE(review): presumably environment variables for the pi run; confirm against the runner.
   */
  env?: Record<string, string>;
}
|
|
18
|
+
/**
 * Global configuration loaded from `pi-eval.config.ts`.
 *
 * Every field is optional; the defaults quoted on each field are the ones
 * stated by the package's own declaration comments.
 */
interface GlobalConfig {
  /** Default Pi configuration; any subset of the `PiConfig` fields may be given. */
  defaults?: Partial<PiConfig>;
  /** Directory containing eval files (default: `./evals`). */
  evalsDir?: string;
  /** Delay between test cases, in milliseconds (default: 500). */
  delayBetweenTests?: number;
  /** Default timeout per test, in milliseconds (default: 60000). */
  timeout?: number;
  /** Warn if there are more than this many test cases (default: 30). */
  warnTestCount?: number;
}
|
|
33
|
+
/**
 * Setup configuration for the test workspace.
 */
interface TestSetup {
  /**
   * Files to create in the workspace before running.
   * NOTE(review): keys look like file paths and values like file contents; confirm against the runner.
   */
  files?: Record<string, string>;
  /** Shell commands to run before the eval. */
  commands?: string[];
}
|
|
42
|
+
/**
 * Expected outcome used for scoring.
 */
interface Expected {
  /** Expected files and their content; content is compared by substring match. */
  files?: Record<string, string>;
  /** Substring expected to appear in the output. */
  output?: string;
}
|
|
51
|
+
/**
 * A single test case.
 *
 * @typeParam TExpected - Shape of the `expected` payload made available to
 *   scorers; defaults to `Expected`.
 */
interface TestCase<TExpected = Expected> {
  /** Prompt to send to pi. */
  input: string;
  /** Expected outcome for scorers. */
  expected?: TExpected;
  /** Optional workspace setup. */
  setup?: TestSetup;
  /** Run only this test case. */
  only?: boolean;
  /** Skip this test case. */
  skip?: boolean;
  /**
   * Override the timeout for this case.
   * NOTE(review): presumably milliseconds, like the other timeouts in this file; confirm.
   */
  timeout?: number;
}
|
|
68
|
+
/**
 * Token usage statistics.
 */
interface TokenStats {
  /** Number of input tokens. */
  input: number;
  /** Number of output tokens. */
  output: number;
  /**
   * Total number of tokens.
   * NOTE(review): presumably `input + output`; confirm against the producer.
   */
  total: number;
}
|
|
76
|
+
/**
 * Session statistics from pi.
 */
interface SessionStats {
  /** Token usage for the session. */
  tokens: TokenStats;
  /**
   * Cost of the session.
   * NOTE(review): presumably USD, matching `TestResult.cost`; confirm.
   */
  cost: number;
}
|
|
83
|
+
/**
 * A tool call captured from the session.
 */
interface ToolCall {
  /** Name of the tool that was called. */
  name: string;
  /** Arguments of the call, keyed by argument name; values are untyped. */
  args: Record<string, unknown>;
}
|
|
90
|
+
/**
 * Context passed to scorers.
 *
 * @typeParam TExpected - Shape of the `expected` payload; defaults to
 *   `Expected`.
 */
interface ScoreContext<TExpected = Expected> {
  /** Original input prompt. */
  input: string;
  /** The agent's final response text. */
  output: string;
  /** Expected outcome, when one was provided. */
  expected?: TExpected;
  /** Workspace directory. */
  cwd: string;
  /** Full conversation messages (opaque; see `Message`). */
  messages: Message[];
  /** Tool calls made during the session. */
  toolCalls: ToolCall[];
  /** Token and cost stats. */
  stats: SessionStats;
}
|
|
109
|
+
/**
 * Result produced by a scorer.
 */
interface ScoreResult {
  /** Scorer name. */
  name: string;
  /** Score in the range 0-1. */
  score: number;
  /** Optional explanation of the score. */
  reason?: string;
}
|
|
120
|
+
/**
 * A scorer (evaluator).
 *
 * @typeParam TExpected - Shape of the `expected` payload the scorer reads
 *   from its context; defaults to `Expected`.
 */
interface Scorer<TExpected = Expected> {
  /** Display name. */
  name: string;
  /** Scoring function: receives the score context and resolves to a result. */
  score: (ctx: ScoreContext<TExpected>) => Promise<ScoreResult>;
}
|
|
129
|
+
/**
 * Options for defining an eval.
 *
 * @typeParam TExpected - Shape of the `expected` payload shared by the test
 *   cases and the scorers; defaults to `Expected`.
 */
interface EvalOptions<TExpected = Expected> {
  /** Pi configuration. */
  config: PiConfig;
  /** Test cases. */
  data: TestCase<TExpected>[];
  /** Scorers to run. */
  scorers: Scorer<TExpected>[];
  /** Timeout per test case, in milliseconds. */
  timeout?: number;
}
|
|
142
|
+
/**
 * Internal representation of a registered eval.
 *
 * @typeParam TExpected - Shape of the `expected` payload carried by the
 *   eval's options; defaults to `Expected`.
 */
interface EvalDefinition<TExpected = Expected> {
  /** Eval name. */
  name: string;
  /** Eval options. */
  options: EvalOptions<TExpected>;
  /** Source file path. */
  file: string;
}
|
|
153
|
+
/**
 * Result of a single test case.
 */
interface TestResult {
  /** Name of the eval the test case belongs to. */
  evalName: string;
  /** Test input. */
  input: string;
  /** Score results from all scorers. */
  scores: ScoreResult[];
  /** Whether the test passed (all scores >= 0.5). */
  passed: boolean;
  /** Duration in milliseconds. */
  duration: number;
  /** Token usage. */
  tokens: TokenStats;
  /** Cost in USD. */
  cost: number;
  /** Error message, if the test failed. */
  error?: string;
}
|
|
174
|
+
/**
 * Summary of an eval run.
 */
interface EvalRunSummary {
  /** All test results. */
  results: TestResult[];
  /** Total number of tests. */
  total: number;
  /** Number of passed tests. */
  passed: number;
  /** Number of failed tests. */
  failed: number;
  /** Total duration in milliseconds. */
  duration: number;
  /** Total tokens used. */
  totalTokens: number;
  /** Total cost in USD. */
  totalCost: number;
}
|
|
193
|
+
/**
 * CLI options.
 */
interface CliOptions {
  /** Filter evals by name substring. */
  filter?: string;
  /** Output JSON instead of the pretty-printed report. */
  json?: boolean;
  /** Verbose output. */
  verbose?: boolean;
  /** Minimum pass percentage required to exit with code 0. */
  threshold?: number;
  /** Config file path. */
  config?: string;
  /** Override the model. */
  model?: string;
  /** Override the provider. */
  provider?: string;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Helper for defining the global config with type inference.
 *
 * @param config - The global configuration object.
 * @returns A `GlobalConfig`.
 *   NOTE(review): presumably the same object passed in; the implementation
 *   is not visible from this declaration file.
 */
declare function defineConfig(config: GlobalConfig): GlobalConfig;
|
|
217
|
+
|
|
218
|
+
/**
 * Options accepted by the `bash` scorer.
 */
interface BashOptions {
  /** Expected exit code (default: 0). */
  exitCode?: number;
  /** Timeout in milliseconds (default: 30000). */
  timeout?: number;
}
|
|
224
|
+
/**
 * Creates a scorer that runs a bash command and checks the exit code.
 * Useful for running tests, linters, or other validation commands.
 *
 * @param command - The bash command to run.
 * @param options - Optional expected exit code and timeout; see `BashOptions`.
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function bash(command: string, options?: BashOptions): Scorer<Expected>;
|
|
229
|
+
|
|
230
|
+
/**
 * Output-contains scorer.
 *
 * Creates a scorer that checks if the agent's output contains
 * `expected.output` as a substring.
 *
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function outputContains(): Scorer<Expected>;
|
|
238
|
+
|
|
239
|
+
/**
 * Creates a scorer that checks if the expected files exist and contain the
 * expected content. Content is compared by substring match.
 *
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function files(): Scorer<Expected>;
|
|
244
|
+
|
|
245
|
+
/**
 * Options for the LLM judge scorer, which uses an LLM to evaluate the output.
 */
interface LlmJudgeOptions {
  /** Criteria for the LLM to evaluate against. */
  criteria: string;
  /** Model to use (default: `gpt-4o-mini`). */
  model?: string;
  /** Provider (default: `openai`). */
  provider?: string;
}
|
|
257
|
+
/**
 * Creates a scorer that uses an LLM to evaluate the output against criteria.
 * Uses a cheap, fast model by default.
 *
 * Note: Requires `OPENAI_API_KEY` or the appropriate provider API key.
 *
 * @param options - Judging criteria plus optional model/provider overrides;
 *   see `LlmJudgeOptions`.
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function llmJudge(options: LlmJudgeOptions): Scorer<Expected>;
|
|
264
|
+
|
|
265
|
+
/**
 * Regex scorer.
 *
 * Creates a scorer that checks if the agent's output matches a regex pattern.
 *
 * @param pattern - The regular expression the output is tested against.
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function outputMatches(pattern: RegExp): Scorer<Expected>;
|
|
273
|
+
|
|
274
|
+
/**
 * Tool-called scorer.
 *
 * Creates a scorer that checks if a specific tool was called during the
 * session.
 *
 * @param name - The tool name to check for (e.g., "read", "bash", "linkup_web_search")
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function toolCalled(name: string): Scorer<Expected>;
|
|
284
|
+
|
|
285
|
+
/**
 * Creates a scorer that checks if a tool was called with specific arguments.
 *
 * For `path` arguments, both expected and actual values are resolved to
 * absolute paths before comparison. All other arguments use direct equality.
 *
 * @param name - The tool name to check for
 * @param expectedArgs - Key-value pairs the tool call args must contain
 * @returns A scorer for use in an eval's `scorers` list.
 */
declare function toolCalledWith(name: string, expectedArgs: Record<string, unknown>): Scorer<Expected>;
|
|
295
|
+
|
|
296
|
+
/**
 * Built-in scorers for pi-eval.
 *
 * The `ScorersModule_*` aliases below point at the top-level scorer
 * declarations; the `ScorersModule` namespace re-exports them under their
 * short names (`bash`, `files`, ...).
 */
type ScorersModule_BashOptions = BashOptions;
type ScorersModule_LlmJudgeOptions = LlmJudgeOptions;
declare const ScorersModule_bash: typeof bash;
declare const ScorersModule_files: typeof files;
declare const ScorersModule_llmJudge: typeof llmJudge;
declare const ScorersModule_outputContains: typeof outputContains;
declare const ScorersModule_outputMatches: typeof outputMatches;
declare const ScorersModule_toolCalled: typeof toolCalled;
declare const ScorersModule_toolCalledWith: typeof toolCalledWith;
declare namespace ScorersModule {
  export {
    type ScorersModule_BashOptions as BashOptions,
    type ScorersModule_LlmJudgeOptions as LlmJudgeOptions,
    ScorersModule_bash as bash,
    ScorersModule_files as files,
    ScorersModule_llmJudge as llmJudge,
    ScorersModule_outputContains as outputContains,
    ScorersModule_outputMatches as outputMatches,
    ScorersModule_toolCalled as toolCalled,
    ScorersModule_toolCalledWith as toolCalledWith,
  };
}
|
|
312
|
+
|
|
313
|
+
/**
 * pi-eval - Eval framework for the pi coding agent.
 *
 * `Scorers` exposes the built-in scorers as a single object typed after the
 * `ScorersModule` namespace, e.g. `Scorers.files()`.
 */
declare const Scorers: typeof ScorersModule;
|
|
318
|
+
|
|
319
|
+
/**
 * Define and register an eval.
 * This is the main API for creating evals.
 *
 * @typeParam TExpected - Shape of the `expected` payload shared by the test
 *   cases and the scorers; defaults to `Expected`.
 * @param name - Name of the eval.
 * @param options - Pi configuration, test cases, scorers and optional
 *   timeout; see `EvalOptions`.
 *
 * @example
 * ```typescript
 * import { evaluate, Scorers } from "@aliou/pi-evals";
 *
 * evaluate("Create hello file", {
 *   config: {
 *     model: "claude-sonnet-4-20250514",
 *     provider: "anthropic",
 *   },
 *   data: [
 *     {
 *       input: 'Create a file called hello.txt containing "Hello World"',
 *       expected: { files: { "hello.txt": "Hello World" } },
 *     },
 *   ],
 *   scorers: [Scorers.files()],
 * });
 * ```
 */
declare function evaluate<TExpected = Expected>(name: string, options: EvalOptions<TExpected>): void;
|
|
343
|
+
|
|
344
|
+
// Public API of the package. Only `Scorers`, `defineConfig` and `evaluate`
// are exported as values; every other name is a type-only export. `Message`
// is declared in this file but is not part of the export list.
export { type BashOptions, type CliOptions, type EvalDefinition, type EvalOptions, type EvalRunSummary, type Expected, type GlobalConfig, type LlmJudgeOptions, type PiConfig, type ScoreContext, type ScoreResult, type Scorer, Scorers, type SessionStats, type TestCase, type TestResult, type TestSetup, type TokenStats, type ToolCall, defineConfig, evaluate };
|