@aliou/pi-evals 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,344 @@
1
+ /**
2
+ * Core type definitions for pi-eval
3
+ */
4
+ /**
5
+ * A message from the agent conversation, as exposed to scorers through `ScoreContext.messages`.
6
+ * Typed as `unknown` because messages are only passed through to scorers; a scorer must narrow the value before reading it.
7
+ */
8
+ type Message = unknown;
9
+ /**
10
+ * Pi configuration for running evals. `model` and `provider` are required; `extensions` and `env` are optional.
11
+ */
12
+ interface PiConfig {
13
+ model: string; // Model identifier, e.g. "claude-sonnet-4-20250514" in the `evaluate` example.
14
+ provider: string; // Provider name, e.g. "anthropic" in the `evaluate` example.
15
+ extensions?: string[]; // NOTE(review): presumably extensions to load into pi; entry format (path vs. name) is not visible here -- confirm.
16
+ env?: Record<string, string>; // NOTE(review): presumably environment variables for the pi run -- confirm.
17
+ }
18
+ /**
19
+ * Global configuration loaded from pi-eval.config.ts. Every field is optional; `defineConfig` accepts this shape.
20
+ */
21
+ interface GlobalConfig {
22
+ /** Default Pi configuration; any subset of `PiConfig` fields may be supplied */
23
+ defaults?: Partial<PiConfig>;
24
+ /** Directory containing eval files (default: ./evals) */
25
+ evalsDir?: string;
26
+ /** Delay between test cases, in milliseconds (default: 500) */
27
+ delayBetweenTests?: number;
28
+ /** Default timeout per test, in milliseconds (default: 60000) */
29
+ timeout?: number;
30
+ /** Warn if there are more than this many test cases (default: 30) */
31
+ warnTestCount?: number;
32
+ }
33
+ /**
34
+ * Setup configuration for the test workspace (used as `TestCase.setup`). Both fields are optional.
35
+ */
36
+ interface TestSetup {
37
+ /** Files to create in workspace before running. NOTE(review): presumably maps file path to file content, mirroring `Expected.files` -- confirm. */
38
+ files?: Record<string, string>;
39
+ /** Shell commands to run before the eval */
40
+ commands?: string[];
41
+ }
42
+ /**
43
+ * Expected outcome for scoring. This is the default `TExpected` type argument of `TestCase`, `ScoreContext`, `Scorer`, `EvalOptions`, `EvalDefinition` and `evaluate`.
44
+ */
45
+ interface Expected {
46
+ /** Expected files and their content (substring match), e.g. `{ "hello.txt": "Hello World" }` in the `evaluate` example */
47
+ files?: Record<string, string>;
48
+ /** Expected substring in output */
49
+ output?: string;
50
+ }
51
+ /**
52
+ * A single test case. `TExpected` is the shape of `expected` and defaults to `Expected`. Only `input` is required.
53
+ */
54
+ interface TestCase<TExpected = Expected> {
55
+ /** Prompt to send to pi */
56
+ input: string;
57
+ /** Expected outcome for scorers */
58
+ expected?: TExpected;
59
+ /** Optional workspace setup (files to create and commands to run, see `TestSetup`) */
60
+ setup?: TestSetup;
61
+ /** Run only this test case */
62
+ only?: boolean;
63
+ /** Skip this test case */
64
+ skip?: boolean;
65
+ /** Override timeout for this case. NOTE(review): presumably in milliseconds like `EvalOptions.timeout` -- confirm. */
66
+ timeout?: number;
67
+ }
68
+ /**
69
+ * Token usage statistics (used by `SessionStats.tokens` and `TestResult.tokens`).
70
+ */
71
+ interface TokenStats {
72
+ input: number; // NOTE(review): presumably the number of input (prompt) tokens -- confirm.
73
+ output: number; // NOTE(review): presumably the number of output (completion) tokens -- confirm.
74
+ total: number; // NOTE(review): presumably input + output -- confirm against the implementation.
75
+ }
76
+ /**
77
+ * Session statistics from pi (exposed to scorers as `ScoreContext.stats`).
78
+ */
79
+ interface SessionStats {
80
+ tokens: TokenStats; // Token usage for the session.
81
+ cost: number; // NOTE(review): presumably in USD, as `TestResult.cost` is documented -- confirm.
82
+ }
83
+ /**
84
+ * A tool call captured from the session (exposed to scorers as `ScoreContext.toolCalls`).
85
+ */
86
+ interface ToolCall {
87
+ name: string; // Tool name; the `toolCalled` docs give "read", "bash" and "linkup_web_search" as examples.
88
+ args: Record<string, unknown>; // Arguments keyed by name; values are untyped and must be narrowed before use.
89
+ }
90
+ /**
91
+ * Context passed to scorers (the argument of `Scorer.score`). `TExpected` is the shape of `expected` and defaults to `Expected`.
92
+ */
93
+ interface ScoreContext<TExpected = Expected> {
94
+ /** Original input prompt */
95
+ input: string;
96
+ /** Agent's final response text */
97
+ output: string;
98
+ /** Expected outcome; may be absent, so scorers must handle `undefined` */
99
+ expected?: TExpected;
100
+ /** Workspace directory */
101
+ cwd: string;
102
+ /** Full conversation messages (each is `unknown`; narrow before reading) */
103
+ messages: Message[];
104
+ /** Tool calls made during the session */
105
+ toolCalls: ToolCall[];
106
+ /** Token and cost stats */
107
+ stats: SessionStats;
108
+ }
109
+ /**
110
+ * Result from a scorer (the value that `Scorer.score` resolves to).
111
+ */
112
+ interface ScoreResult {
113
+ /** Scorer name */
114
+ name: string;
115
+ /** Score from 0-1 (`TestResult.passed` is documented as requiring all scores >= 0.5) */
116
+ score: number;
117
+ /** Optional explanation of the score */
118
+ reason?: string;
119
+ }
120
+ /**
121
+ * A scorer (evaluator): an object with a display name and an asynchronous `score` function. `TExpected` is the shape of `ctx.expected` and defaults to `Expected`.
122
+ */
123
+ interface Scorer<TExpected = Expected> {
124
+ /** Display name */
125
+ name: string;
126
+ /** Scoring function; receives the `ScoreContext` and resolves to a `ScoreResult` */
127
+ score: (ctx: ScoreContext<TExpected>) => Promise<ScoreResult>;
128
+ }
129
+ /**
130
+ * Options for defining an eval (the second argument of `evaluate`). `TExpected` ties the test cases' `expected` shape to the scorers and defaults to `Expected`.
131
+ */
132
+ interface EvalOptions<TExpected = Expected> {
133
+ /** Pi configuration (required; `model` and `provider` must be given) */
134
+ config: PiConfig;
135
+ /** Test cases */
136
+ data: TestCase<TExpected>[];
137
+ /** Scorers to run */
138
+ scorers: Scorer<TExpected>[];
139
+ /** Timeout per test case in ms (optional) */
140
+ timeout?: number;
141
+ }
142
+ /**
143
+ * Internal representation of a registered eval: its name, its options and the file it came from. `TExpected` defaults to `Expected`.
144
+ */
145
+ interface EvalDefinition<TExpected = Expected> {
146
+ /** Eval name */
147
+ name: string;
148
+ /** Eval options */
149
+ options: EvalOptions<TExpected>;
150
+ /** Source file path */
151
+ file: string;
152
+ }
153
+ /**
154
+ * Result of a single test case (collected in `EvalRunSummary.results`).
155
+ */
156
+ interface TestResult {
157
+ /** Name of the eval this test case belongs to */
158
+ evalName: string;
159
+ /** Test input */
160
+ input: string;
161
+ /** Score results from all scorers */
162
+ scores: ScoreResult[];
163
+ /** Whether the test passed (all scores >= 0.5) */
164
+ passed: boolean;
165
+ /** Duration in ms */
166
+ duration: number;
167
+ /** Token usage */
168
+ tokens: TokenStats;
169
+ /** Cost in USD */
170
+ cost: number;
171
+ /** Error message if failed; absent otherwise */
172
+ error?: string;
173
+ }
174
+ /**
175
+ * Summary of an eval run: the individual test results plus aggregate counts, duration, tokens and cost.
176
+ */
177
+ interface EvalRunSummary {
178
+ /** All test results */
179
+ results: TestResult[];
180
+ /** Total number of tests */
181
+ total: number;
182
+ /** Number of passed tests */
183
+ passed: number;
184
+ /** Number of failed tests */
185
+ failed: number;
186
+ /** Total duration in ms */
187
+ duration: number;
188
+ /** Total tokens used */
189
+ totalTokens: number;
190
+ /** Total cost in USD */
191
+ totalCost: number;
192
+ }
193
+ /**
194
+ * CLI options. Every field is optional.
195
+ */
196
+ interface CliOptions {
197
+ /** Filter evals by name substring */
198
+ filter?: string;
199
+ /** Output JSON instead of pretty print */
200
+ json?: boolean;
201
+ /** Verbose output */
202
+ verbose?: boolean;
203
+ /** Minimum pass percentage to exit 0. NOTE(review): scale (0-100 vs. 0-1) is not visible here -- confirm. */
204
+ threshold?: number;
205
+ /** Config file path */
206
+ config?: string;
207
+ /** Override model */
208
+ model?: string;
209
+ /** Override provider */
210
+ provider?: string;
211
+ }
212
+
213
+ /**
214
+ * Helper for defining config with type inference. Takes a `GlobalConfig` and returns a `GlobalConfig`. NOTE(review): presumably returns the argument unchanged; the implementation is not visible here -- confirm.
215
+ */
216
+ declare function defineConfig(config: GlobalConfig): GlobalConfig;
217
+
218
+ interface BashOptions { // Options accepted by the `bash` scorer factory; both fields are optional.
219
+ /** Expected exit code (default: 0) */
220
+ exitCode?: number;
221
+ /** Timeout in ms (default: 30000) */
222
+ timeout?: number;
223
+ }
224
+ /**
225
+ * Creates a scorer that runs a bash command and checks the exit code.
226
+ * Useful for running tests, linters, or other validation commands. `command` is the command to run; `options` optionally sets the expected exit code and the timeout (see `BashOptions`).
227
+ */
228
+ declare function bash(command: string, options?: BashOptions): Scorer<Expected>;
229
+
230
+ /**
231
+ * Output contains scorer - checks that output contains expected substring
232
+ */
233
+
234
+ /**
235
+ * Creates a scorer that checks if the agent's output contains expected.output (documented on `Expected.output` as a substring match). Takes no arguments.
236
+ */
237
+ declare function outputContains(): Scorer<Expected>;
238
+
239
+ /**
240
+ * Creates a scorer that checks if expected files exist and contain expected content.
241
+ * Uses substring matching for content comparison. Takes no arguments; NOTE(review): the expectations presumably come from `expected.files`, as in the `evaluate` example -- confirm.
242
+ */
243
+ declare function files(): Scorer<Expected>;
244
+
245
+ /**
246
+ * LLM Judge scorer - uses an LLM to evaluate the output
247
+ */
248
+
249
+ interface LlmJudgeOptions { // Options accepted by the `llmJudge` scorer factory; only `criteria` is required.
250
+ /** Criteria for the LLM to evaluate against */
251
+ criteria: string;
252
+ /** Model to use (default: gpt-4o-mini) */
253
+ model?: string;
254
+ /** Provider (default: openai) */
255
+ provider?: string;
256
+ }
257
+ /**
258
+ * Creates a scorer that uses an LLM to evaluate the output against criteria.
259
+ * Uses a cheap, fast model by default (documented defaults in `LlmJudgeOptions`: model gpt-4o-mini, provider openai).
260
+ *
261
+ * Note: Requires OPENAI_API_KEY or appropriate provider API key.
262
+ */
263
+ declare function llmJudge(options: LlmJudgeOptions): Scorer<Expected>;
264
+
265
+ /**
266
+ * Regex scorer - checks that output matches a pattern
267
+ */
268
+
269
+ /**
270
+ * Creates a scorer that checks if the agent's output matches a regex pattern. `pattern` is the regular expression to match against.
271
+ */
272
+ declare function outputMatches(pattern: RegExp): Scorer<Expected>;
273
+
274
+ /**
275
+ * Tool called scorer - checks that a specific tool was called during the session
276
+ */
277
+
278
+ /**
279
+ * Creates a scorer that checks if a specific tool was called during the session.
280
+ *
281
+ * @param name - The tool name to check for (e.g., "read", "bash", "linkup_web_search")
282
+ */
283
+ declare function toolCalled(name: string): Scorer<Expected>;
284
+
285
+ /**
286
+ * Creates a scorer that checks if a tool was called with specific arguments.
287
+ *
288
+ * For `path` arguments, both expected and actual values are resolved to
289
+ * absolute paths before comparison. All other arguments use direct equality.
290
+ *
291
+ * @param name - The tool name to check for
292
+ * @param expectedArgs - Key-value pairs the tool call args must contain (values are untyped)
293
+ */
294
+ declare function toolCalledWith(name: string, expectedArgs: Record<string, unknown>): Scorer<Expected>;
295
+
296
+ /**
297
+ * Built-in scorers for pi-eval
298
+ */
299
+
300
+ type ScorersModule_BashOptions = BashOptions; // Each scorer type and function is aliased under a ScorersModule_ prefix, then re-exported under its original name in the namespace declared after these aliases. NOTE(review): presumably emitted by the declaration bundler -- confirm.
301
+ type ScorersModule_LlmJudgeOptions = LlmJudgeOptions;
302
+ declare const ScorersModule_bash: typeof bash;
303
+ declare const ScorersModule_files: typeof files;
304
+ declare const ScorersModule_llmJudge: typeof llmJudge;
305
+ declare const ScorersModule_outputContains: typeof outputContains;
306
+ declare const ScorersModule_outputMatches: typeof outputMatches;
307
+ declare const ScorersModule_toolCalled: typeof toolCalled;
308
+ declare const ScorersModule_toolCalledWith: typeof toolCalledWith;
309
+ declare namespace ScorersModule { // Groups the built-in scorers: bash, files, llmJudge, outputContains, outputMatches, toolCalled, toolCalledWith, plus the BashOptions and LlmJudgeOptions types.
310
+ export { type ScorersModule_BashOptions as BashOptions, type ScorersModule_LlmJudgeOptions as LlmJudgeOptions, ScorersModule_bash as bash, ScorersModule_files as files, ScorersModule_llmJudge as llmJudge, ScorersModule_outputContains as outputContains, ScorersModule_outputMatches as outputMatches, ScorersModule_toolCalled as toolCalled, ScorersModule_toolCalledWith as toolCalledWith };
311
+ }
312
+
313
+ /**
314
+ * pi-eval - Eval framework for pi coding agent
315
+ */
316
+
317
+ declare const Scorers: typeof ScorersModule; // Exported value typed as the ScorersModule namespace; used as e.g. `Scorers.files()` in the `evaluate` example.
318
+
319
+ /**
320
+ * Define and register an eval.
321
+ * This is the main API for creating evals. `name` is the eval's name and `options` supplies its Pi config, test cases and scorers (see `EvalOptions`); nothing is returned. `TExpected` is the shape of each test case's `expected` and defaults to `Expected`.
322
+ *
323
+ * @example
324
+ * ```typescript
325
+ * import { evaluate, Scorers } from "@aliou/pi-evals";
326
+ *
327
+ * evaluate("Create hello file", {
328
+ * config: {
329
+ * model: "claude-sonnet-4-20250514",
330
+ * provider: "anthropic",
331
+ * },
332
+ * data: [
333
+ * {
334
+ * input: 'Create a file called hello.txt containing "Hello World"',
335
+ * expected: { files: { "hello.txt": "Hello World" } },
336
+ * },
337
+ * ],
338
+ * scorers: [Scorers.files()],
339
+ * });
340
+ * ```
341
+ */
342
+ declare function evaluate<TExpected = Expected>(name: string, options: EvalOptions<TExpected>): void;
343
+
344
+ export { type BashOptions, type CliOptions, type EvalDefinition, type EvalOptions, type EvalRunSummary, type Expected, type GlobalConfig, type LlmJudgeOptions, type PiConfig, type ScoreContext, type ScoreResult, type Scorer, Scorers, type SessionStats, type TestCase, type TestResult, type TestSetup, type TokenStats, type ToolCall, defineConfig, evaluate };