@aliou/pi-evals 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -0
- package/dist/chunk-342JG3E3.js +117 -0
- package/dist/chunk-342JG3E3.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +445 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +344 -0
- package/dist/index.js +455 -0
- package/dist/index.js.map +1 -0
- package/package.json +61 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
import {
|
|
2
|
+
__export,
|
|
3
|
+
defineConfig,
|
|
4
|
+
getCurrentFile,
|
|
5
|
+
registerEval
|
|
6
|
+
} from "./chunk-342JG3E3.js";
|
|
7
|
+
|
|
8
|
+
// src/scorers/index.ts
|
|
9
|
+
// Namespace object that is later re-exported as `Scorers`.
// Each entry is a thunk returning the scorer factory declared further down
// in this file; `__export` (imported from the shared chunk) receives the
// target object and this table of thunks.
const scorers_exports = {};
const scorerThunks = {
  bash: () => bash,
  files: () => files,
  llmJudge: () => llmJudge,
  outputContains: () => outputContains,
  outputMatches: () => outputMatches,
  toolCalled: () => toolCalled,
  toolCalledWith: () => toolCalledWith
};
__export(scorers_exports, scorerThunks);
|
|
19
|
+
|
|
20
|
+
// src/scorers/bash.ts
|
|
21
|
+
import { exec } from "child_process";
|
|
22
|
+
import { promisify } from "util";
|
|
23
|
+
// Promise-returning wrapper around `child_process.exec`.
const execAsync = promisify(exec);

/**
 * Creates a scorer that runs a shell command and grades it by exit code.
 *
 * @param {string} command - Command line passed to `child_process.exec`.
 * @param {{ exitCode?: number, timeout?: number }} [options]
 *   `exitCode` is the exit status that counts as a pass (default 0);
 *   `timeout` is the time limit in milliseconds (default 30000).
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   runs the command in `ctx.cwd` and resolves to `{ name, score, reason }`
 *   with `score` 1 on the expected exit code and 0 otherwise.
 */
function bash(command, options = {}) {
  const { exitCode: expectedCode = 0, timeout = 3e4 } = options;
  const verdict = (score, reason) => ({ name: "bash", score, reason });
  return {
    name: "bash",
    score: async (ctx) => {
      try {
        // No explicit `env`: exec already hands the child process.env by
        // default, so copying it (and re-adding PATH) was redundant.
        await execAsync(command, { cwd: ctx.cwd, timeout });
      } catch (err) {
        // exec sets `killed` when it terminated the child itself; with a
        // timeout configured this is reported as a timeout.
        if (err.killed) {
          return verdict(0, `Command timed out after ${timeout}ms: ${command}`);
        }
        // A missing code (e.g. the child died from a signal) is reported as 1.
        const actualCode = err.code ?? 1;
        if (actualCode === expectedCode) {
          return verdict(
            1,
            `Command exited with expected code ${expectedCode}: ${command}`
          );
        }
        const stderr = err.stderr ? `\n${truncate(err.stderr, 200)}` : "";
        return verdict(
          0,
          `Command failed with code ${actualCode}: ${command}${stderr}`
        );
      }
      // exec resolved, so the command exited with status 0.
      return expectedCode === 0
        ? verdict(1, `Command succeeded: ${command}`)
        : verdict(0, `Expected exit code ${expectedCode}, got 0`);
    }
  };
}
|
|
76
|
+
/**
 * Shortens `str` so the result is never longer than `maxLen` characters.
 *
 * @param {string} str - Text to shorten.
 * @param {number} maxLen - Maximum length of the returned string.
 * @returns {string} `str` unchanged when it already fits; otherwise a prefix
 *   followed by "...", or a bare prefix when `maxLen` leaves no room for the
 *   ellipsis.
 */
function truncate(str, maxLen) {
  if (str.length <= maxLen) return str;
  // Below 3 characters `maxLen - 3` is negative, and a negative slice end
  // counts from the end of the string, which produced output longer than
  // `maxLen`. Fall back to a hard cut instead.
  if (maxLen < 3) return str.slice(0, Math.max(maxLen, 0));
  return `${str.slice(0, maxLen - 3)}...`;
}
|
|
80
|
+
|
|
81
|
+
// src/scorers/contains.ts
|
|
82
|
+
/**
 * Creates a scorer that passes when the agent output includes the
 * `expected.output` substring.
 *
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   resolves to `{ name, score, reason }`; a missing or empty
 *   `ctx.expected.output` counts as a pass.
 */
function outputContains() {
  const scorerName = "outputContains";
  const grade = async (ctx) => {
    const needle = ctx.expected?.output;
    if (!needle) {
      return { name: scorerName, score: 1, reason: "No output expected" };
    }
    if (ctx.output.includes(needle)) {
      return {
        name: scorerName,
        score: 1,
        reason: `Output contains "${truncate2(needle, 50)}"`
      };
    }
    return {
      name: scorerName,
      score: 0,
      reason: `Output missing "${truncate2(needle, 50)}"`
    };
  };
  return { name: scorerName, score: grade };
}
|
|
103
|
+
/**
 * Shortens `str` so the result is never longer than `maxLen` characters.
 *
 * @param {string} str - Text to shorten.
 * @param {number} maxLen - Maximum length of the returned string.
 * @returns {string} `str` unchanged when it already fits; otherwise a prefix
 *   followed by "...", or a bare prefix when `maxLen` is too small to hold
 *   the ellipsis.
 */
function truncate2(str, maxLen) {
  if (str.length <= maxLen) return str;
  // `maxLen - 3` goes negative below 3, and slice() would then count from
  // the end of the string and return more than `maxLen` characters.
  if (maxLen < 3) return str.slice(0, Math.max(maxLen, 0));
  return `${str.slice(0, maxLen - 3)}...`;
}
|
|
107
|
+
|
|
108
|
+
// src/scorers/files.ts
|
|
109
|
+
import * as fs from "fs/promises";
|
|
110
|
+
import * as path from "path";
|
|
111
|
+
/**
 * Creates a scorer that checks the files listed in `ctx.expected.files`.
 *
 * `expected.files` maps a path (joined onto `ctx.cwd`) to a substring the
 * file must contain. The score is the fraction of files that pass, and the
 * reason lists one "+"/"-" line per file in the order they were declared.
 *
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   resolves to `{ name, score, reason }`; no expected files counts as a pass.
 */
function files() {
  // Reads one expected file and reports whether it holds the expected text.
  const checkFile = async (cwd, filePath, expectedContent) => {
    try {
      const content = await fs.readFile(path.join(cwd, filePath), "utf-8");
      if (content.includes(expectedContent)) {
        return {
          file: filePath,
          ok: true,
          reason: "exists with expected content"
        };
      }
      return {
        file: filePath,
        ok: false,
        reason: `exists but missing expected content: "${truncate3(expectedContent, 50)}"`
      };
    } catch (err) {
      const reason =
        err.code === "ENOENT" ? "file not found" : `error reading: ${err.message}`;
      return { file: filePath, ok: false, reason };
    }
  };
  return {
    name: "files",
    score: async (ctx) => {
      const entries = Object.entries(ctx.expected?.files ?? {});
      if (entries.length === 0) {
        return { name: "files", score: 1, reason: "No files expected" };
      }
      // The reads are independent, so run them concurrently; Promise.all
      // keeps the results in declaration order.
      const results = await Promise.all(
        entries.map(([filePath, expectedContent]) =>
          checkFile(ctx.cwd, filePath, expectedContent)
        )
      );
      const passed = results.filter((r) => r.ok).length;
      return {
        name: "files",
        // `results` is non-empty here, so the division is always defined.
        score: passed / results.length,
        reason: results
          .map((r) => `${r.ok ? "+" : "-"} ${r.file}: ${r.reason}`)
          .join("\n")
      };
    }
  };
}
|
|
171
|
+
/**
 * Shortens `str` so the result is never longer than `maxLen` characters.
 *
 * @param {string} str - Text to shorten.
 * @param {number} maxLen - Maximum length of the returned string.
 * @returns {string} `str` unchanged when it already fits; otherwise a prefix
 *   followed by "...", or a bare prefix when there is no room for the
 *   ellipsis.
 */
function truncate3(str, maxLen) {
  if (str.length <= maxLen) return str;
  // Guard the tiny-budget case: a negative slice end is measured from the
  // end of the string and would overshoot `maxLen`.
  if (maxLen < 3) return str.slice(0, Math.max(maxLen, 0));
  return `${str.slice(0, maxLen - 3)}...`;
}
|
|
175
|
+
|
|
176
|
+
// src/scorers/llm-judge.ts
|
|
177
|
+
/**
 * Creates a scorer that asks an LLM to grade the output against criteria.
 *
 * @param {{ criteria: string, model?: string, provider?: string }} options
 *   `model` defaults to "gpt-4o-mini" and `provider` to "openai".
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   builds a prompt from `ctx.input` and `ctx.output`, sends it to the
 *   provider and resolves to the parsed verdict, or to a zero score carrying
 *   the error message when the call fails.
 */
function llmJudge(options) {
  const { criteria, model = "gpt-4o-mini", provider = "openai" } = options;
  const scorerName = "llmJudge";
  const judge = async (ctx) => {
    const prompt = buildJudgePrompt(criteria, ctx.input, ctx.output);
    try {
      return parseJudgeResponse(await callLlm(prompt, model, provider));
    } catch (err) {
      return {
        name: scorerName,
        score: 0,
        reason: `LLM judge error: ${err.message}`
      };
    }
  };
  return { name: scorerName, score: judge };
}
|
|
196
|
+
/**
 * Assembles the grading prompt sent to the judge model.
 *
 * @param {string} criteria - What the response is evaluated against.
 * @param {string} input - The task that was given to the assistant.
 * @param {string} output - The assistant's response.
 * @returns {string} Prompt that asks for a JSON object with `score` and `reason`.
 */
function buildJudgePrompt(criteria, input, output) {
  const promptLines = [
    "You are evaluating an AI coding assistant's response.",
    "",
    "## Task given to the assistant",
    `${input}`,
    "",
    "## Assistant's response",
    `${output}`,
    "",
    "## Evaluation criteria",
    `${criteria}`,
    "",
    "## Instructions",
    "Evaluate the response against the criteria. Respond with a JSON object:",
    "{",
    '"score": <number from 0 to 1>,',
    '"reason": "<brief explanation>"',
    "}",
    "",
    "Score meanings:",
    "- 1.0: Fully meets criteria",
    "- 0.7-0.9: Mostly meets criteria with minor issues",
    "- 0.4-0.6: Partially meets criteria",
    "- 0.1-0.3: Barely meets criteria",
    "- 0.0: Does not meet criteria",
    "",
    "Respond ONLY with the JSON object, no other text."
  ];
  return promptLines.join("\n");
}
|
|
224
|
+
/**
 * Routes a judge prompt to the client for the requested provider.
 *
 * @param {string} prompt - Prompt text to send.
 * @param {string} model - Model identifier passed through to the provider.
 * @param {string} provider - Either "openai" or "anthropic".
 * @returns {Promise<string>} The text returned by the provider client.
 * @throws {Error} When `provider` is not one of the supported names.
 */
async function callLlm(prompt, model, provider) {
  switch (provider) {
    case "openai":
      return callOpenAI(prompt, model);
    case "anthropic":
      return callAnthropic(prompt, model);
    default:
      throw new Error(`Unsupported LLM judge provider: ${provider}`);
  }
}
|
|
233
|
+
/**
 * Sends a single-message chat completion request to the OpenAI API.
 *
 * @param {string} prompt - Sent as the only user message.
 * @param {string} model - Model identifier for the request body.
 * @returns {Promise<string>} Content of the first choice, or "" when the
 *   response carries no usable choice.
 * @throws {Error} When OPENAI_API_KEY is unset or the API answers with a
 *   non-2xx status (the status and response body are included in the message).
 */
async function callOpenAI(prompt, model) {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY not set");
  }
  const response = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`
    },
    body: JSON.stringify({
      model,
      messages: [{ role: "user", content: prompt }],
      temperature: 0
    })
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`OpenAI API error: ${response.status} ${text}`);
  }
  const data = await response.json();
  // A 2xx body without a `choices` array used to raise a TypeError here;
  // treat it like any other missing level and fall back to "".
  return data?.choices?.[0]?.message?.content ?? "";
}
|
|
257
|
+
/**
 * Sends a single-message request to the Anthropic Messages API.
 *
 * @param {string} prompt - Sent as the only user message.
 * @param {string} model - Model identifier for the request body.
 * @returns {Promise<string>} Text of the first "text" content block, or ""
 *   when the response has none.
 * @throws {Error} When ANTHROPIC_API_KEY is unset or the API answers with a
 *   non-2xx status (the status and response body are included in the message).
 */
async function callAnthropic(prompt, model) {
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error("ANTHROPIC_API_KEY not set");
  }
  const response = await fetch("https://api.anthropic.com/v1/messages", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01"
    },
    body: JSON.stringify({
      model,
      max_tokens: 256,
      messages: [{ role: "user", content: prompt }]
    })
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Anthropic API error: ${response.status} ${text}`);
  }
  const data = await response.json();
  // A 2xx body whose `content` is missing or not an array used to raise a
  // TypeError on `.find`; treat it as "no text block" instead.
  const blocks = Array.isArray(data?.content) ? data.content : [];
  const textBlock = blocks.find((c) => c?.type === "text");
  return textBlock?.text ?? "";
}
|
|
283
|
+
/**
 * Turns the judge model's raw reply into a score result.
 *
 * The span from the first "{" to the last "}" in the reply is parsed as JSON
 * and must contain a numeric `score` between 0 and 1.
 *
 * @param {string} response - Raw text returned by the judge model.
 * @returns {{ name: string, score: number, reason?: string }} The parsed
 *   verdict, or a zero score whose reason describes why parsing failed.
 */
function parseJudgeResponse(response) {
  let verdict;
  try {
    const [jsonText] = response.match(/\{[\s\S]*\}/) ?? [];
    if (jsonText === undefined) {
      throw new Error("No JSON found in response");
    }
    const { score, reason } = JSON.parse(jsonText);
    const scoreIsInvalid = typeof score !== "number" || score < 0 || score > 1;
    if (scoreIsInvalid) {
      throw new Error("Invalid score in response");
    }
    verdict = { name: "llmJudge", score, reason };
  } catch (err) {
    verdict = {
      name: "llmJudge",
      score: 0,
      reason: `Failed to parse judge response: ${err.message}`
    };
  }
  return verdict;
}
|
|
306
|
+
|
|
307
|
+
// src/scorers/regex.ts
|
|
308
|
+
/**
 * Creates a scorer that passes when the agent output matches `pattern`.
 *
 * @param {RegExp} pattern - Regular expression tested against `ctx.output`.
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   resolves to `{ name, score, reason }` with `score` 1 on a match, else 0.
 */
function outputMatches(pattern) {
  return {
    name: "outputMatches",
    score: async (ctx) => {
      // A global or sticky regex resumes from `lastIndex`, so a scorer that
      // is reused across evals would alternate between match and no match.
      // Always start the search from the beginning of the output.
      pattern.lastIndex = 0;
      const matches = pattern.test(ctx.output);
      return {
        name: "outputMatches",
        score: matches ? 1 : 0,
        reason: matches
          ? `Output matches ${pattern}`
          : `Output does not match ${pattern}`
      };
    }
  };
}
|
|
321
|
+
|
|
322
|
+
// src/scorers/tool-called.ts
|
|
323
|
+
/**
 * Creates a scorer that passes when a tool with the given name was called.
 *
 * @param {string} name - Tool name to look for in `ctx.toolCalls`.
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   resolves to `{ name, score, reason }`; on a miss the reason lists the
 *   tools that were called instead.
 */
function toolCalled(name) {
  const label = `toolCalled(${name})`;
  const hasName = (tc) => tc.name === name;
  return {
    name: label,
    score: async (ctx) => {
      if (ctx.toolCalls.some(hasName)) {
        return { name: label, score: 1, reason: `Tool "${name}" was called` };
      }
      return {
        name: label,
        score: 0,
        reason: `Tool "${name}" was not called. Called: ${formatToolNames(ctx)}`
      };
    }
  };
}
|
|
336
|
+
/**
 * Lists the distinct tool names in `ctx.toolCalls` for use in messages.
 *
 * @param {{ toolCalls: Array<{ name: string }> }} ctx - Score context.
 * @returns {string} Names in first-seen order joined by ", ", or "(none)"
 *   when no tool was called.
 */
function formatToolNames(ctx) {
  const { toolCalls } = ctx;
  if (toolCalls.length === 0) {
    return "(none)";
  }
  const seen = new Set();
  for (const call of toolCalls) {
    seen.add(call.name);
  }
  return Array.from(seen).join(", ");
}
|
|
341
|
+
|
|
342
|
+
// src/scorers/tool-called-with.ts
|
|
343
|
+
import * as path2 from "path";
|
|
344
|
+
/**
 * Creates a scorer that passes when a tool was called with specific args.
 *
 * @param {string} name - Tool name to look for in `ctx.toolCalls`.
 * @param {Object} expectedArgs - Key/value pairs that one call's args must match.
 * @returns {{ name: string, score: Function }} Scorer whose async `score(ctx)`
 *   resolves to `{ name, score, reason }`. It passes when any call to the
 *   tool matches; otherwise the reason describes the mismatches of the first
 *   call to that tool.
 */
function toolCalledWith(name, expectedArgs) {
  const label = `toolCalledWith(${name})`;
  const verdict = (score, reason) => ({ name: label, score, reason });
  return {
    name: label,
    score: async (ctx) => {
      const calls = ctx.toolCalls.filter((tc) => tc.name === name);
      if (calls.length === 0) {
        return verdict(
          0,
          `Tool "${name}" was not called. Called: ${formatToolNames2(ctx)}`
        );
      }
      const anyMatch = calls.some((tc) =>
        argsMatch(tc.args, expectedArgs, ctx.cwd)
      );
      if (anyMatch) {
        return verdict(1, `Tool "${name}" called with matching args`);
      }
      // Only the first call to the tool is used for the mismatch report.
      const [firstCall] = calls;
      const mismatches = getArgMismatches(firstCall.args, expectedArgs, ctx.cwd);
      return verdict(
        0,
        `Tool "${name}" called ${calls.length} time(s) but args did not match. ${mismatches}`
      );
    }
  };
}
|
|
380
|
+
/**
 * Checks that every expected argument is present with a matching value.
 *
 * Keys recognised by `isPathArg` are compared after both sides go through
 * `resolvePath`; every other key is compared with `deepEqual`. Keys that
 * exist only on `actual` are ignored.
 *
 * @param {Object} actual - Arguments of the recorded tool call.
 * @param {Object} expected - Arguments the call must contain.
 * @param {string} cwd - Base directory for resolving path arguments.
 * @returns {boolean} True when all expected entries match.
 */
function argsMatch(actual, expected, cwd) {
  return Object.entries(expected).every(([key, wanted]) => {
    const got = actual[key];
    return isPathArg(key)
      ? resolvePath(got, cwd) === resolvePath(wanted, cwd)
      : deepEqual(got, wanted);
  });
}
|
|
393
|
+
/**
 * Describes which expected arguments a tool call failed to match.
 *
 * Uses the same comparison rules as `argsMatch`: path-like keys are compared
 * after resolution, everything else with `deepEqual`.
 *
 * @param {Object} actual - Arguments of the recorded tool call.
 * @param {Object} expected - Arguments the call was expected to contain.
 * @param {string} cwd - Base directory for resolving path arguments.
 * @returns {string} One "key: expected …, got …" entry per mismatch joined
 *   by "; ", or "" when everything matches.
 */
function getArgMismatches(actual, expected, cwd) {
  const describeEntry = ([key, wanted]) => {
    const got = actual[key];
    if (isPathArg(key)) {
      const gotPath = resolvePath(got, cwd);
      const wantedPath = resolvePath(wanted, cwd);
      return gotPath === wantedPath
        ? []
        : [`${key}: expected "${wantedPath}", got "${gotPath}"`];
    }
    return deepEqual(got, wanted)
      ? []
      : [`${key}: expected ${JSON.stringify(wanted)}, got ${JSON.stringify(got)}`];
  };
  return Object.entries(expected).flatMap(describeEntry).join("; ");
}
|
|
413
|
+
/**
 * Tells whether an argument key is compared as a filesystem path.
 *
 * @param {string} key - Argument name.
 * @returns {boolean} True for "path", "file", or any key ending in "Path".
 */
function isPathArg(key) {
  switch (key) {
    case "path":
    case "file":
      return true;
    default:
      return key.endsWith("Path");
  }
}
|
|
416
|
+
/**
 * Normalises a path-like argument so two values can be compared as strings.
 *
 * @param {*} value - Argument value; only strings are treated as paths.
 * @param {string} cwd - Base directory that relative paths are resolved against.
 * @returns {string} The absolute path for a string value, otherwise the
 *   value converted with `String`.
 */
function resolvePath(value, cwd) {
  return typeof value === "string" ? path2.resolve(cwd, value) : String(value);
}
|
|
420
|
+
/**
 * Structural equality for plain values, arrays and objects.
 *
 * Primitives are compared with `===`. Arrays must have the same length and
 * equal elements in order. Objects must have the same own enumerable keys
 * with equal values. An array never equals a non-array object.
 *
 * @param {*} a - First value.
 * @param {*} b - Second value.
 * @returns {boolean} True when the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) return true;
  if (a === null || b === null) return false;
  if (typeof a !== "object" || typeof b !== "object") return false;
  // Previously an array and a plain object with the same index keys fell
  // through to the key comparison and were reported equal.
  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) return false;
  if (aIsArray) {
    return a.length === b.length && a.every((val, i) => deepEqual(val, b[i]));
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false;
  // Require `b` to own each key: otherwise a key missing from `b` reads as
  // undefined and matches an explicit undefined in `a`.
  return aKeys.every(
    (key) => Object.hasOwn(b, key) && deepEqual(a[key], b[key])
  );
}
|
|
438
|
+
/**
 * Lists the distinct tool names in `ctx.toolCalls` for use in messages.
 *
 * @param {{ toolCalls: Array<{ name: string }> }} ctx - Score context.
 * @returns {string} Names in first-seen order joined by ", ", or "(none)"
 *   when no tool was called.
 */
function formatToolNames2(ctx) {
  const calls = ctx.toolCalls;
  if (calls.length === 0) {
    return "(none)";
  }
  const names = calls.map((tc) => tc.name);
  const distinct = new Set(names);
  return Array.from(distinct).join(", ");
}
|
|
443
|
+
|
|
444
|
+
// src/index.ts
|
|
445
|
+
// Public alias for the scorers namespace object built at the top of the file.
const Scorers = scorers_exports;

/**
 * Registers an eval under the file that is currently being processed.
 *
 * @param {string} name - Name of the eval.
 * @param {Object} options - Eval definition, passed to `registerEval` unchanged.
 */
function evaluate(name, options) {
  // Fall back to "unknown" when no current file is reported.
  const sourceFile = getCurrentFile() || "unknown";
  registerEval(name, options, sourceFile);
}
|
|
450
|
+
export {
|
|
451
|
+
Scorers,
|
|
452
|
+
defineConfig,
|
|
453
|
+
evaluate
|
|
454
|
+
};
|
|
455
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/scorers/index.ts","../src/scorers/bash.ts","../src/scorers/contains.ts","../src/scorers/files.ts","../src/scorers/llm-judge.ts","../src/scorers/regex.ts","../src/scorers/tool-called.ts","../src/scorers/tool-called-with.ts","../src/index.ts"],"sourcesContent":["/**\n * Built-in scorers for pi-eval\n */\n\nexport type { BashOptions } from \"./bash\";\nexport { bash } from \"./bash\";\nexport { outputContains } from \"./contains\";\nexport { files } from \"./files\";\nexport type { LlmJudgeOptions } from \"./llm-judge\";\nexport { llmJudge } from \"./llm-judge\";\nexport { outputMatches } from \"./regex\";\nexport { toolCalled } from \"./tool-called\";\nexport { toolCalledWith } from \"./tool-called-with\";\n","/**\n * Bash scorer - runs a command and checks exit code\n */\nimport { exec } from \"node:child_process\";\nimport { promisify } from \"node:util\";\nimport type { Expected, Scorer } from \"../types\";\n\nconst execAsync = promisify(exec);\n\nexport interface BashOptions {\n /** Expected exit code (default: 0) */\n exitCode?: number;\n /** Timeout in ms (default: 30000) */\n timeout?: number;\n}\n\n/**\n * Creates a scorer that runs a bash command and checks the exit code.\n * Useful for running tests, linters, or other validation commands.\n */\nexport function bash(\n command: string,\n options: BashOptions = {},\n): Scorer<Expected> {\n const { exitCode: expectedCode = 0, timeout = 30000 } = options;\n\n return {\n name: \"bash\",\n score: async (ctx) => {\n try {\n const _result = await execAsync(command, {\n cwd: ctx.cwd,\n timeout,\n env: { ...process.env, PATH: process.env.PATH },\n });\n\n // Command succeeded (exit code 0)\n if (expectedCode === 0) {\n return {\n name: \"bash\",\n score: 1,\n reason: `Command succeeded: ${command}`,\n };\n } else {\n return {\n name: \"bash\",\n score: 0,\n reason: `Expected exit code ${expectedCode}, got 0`,\n };\n }\n } catch (err) {\n const error = err as Error & {\n code?: 
number;\n killed?: boolean;\n stdout?: string;\n stderr?: string;\n };\n\n if (error.killed) {\n return {\n name: \"bash\",\n score: 0,\n reason: `Command timed out after ${timeout}ms: ${command}`,\n };\n }\n\n const actualCode = error.code ?? 1;\n\n if (actualCode === expectedCode) {\n return {\n name: \"bash\",\n score: 1,\n reason: `Command exited with expected code ${expectedCode}: ${command}`,\n };\n }\n\n const stderr = error.stderr ? `\\n${truncate(error.stderr, 200)}` : \"\";\n\n return {\n name: \"bash\",\n score: 0,\n reason: `Command failed with code ${actualCode}: ${command}${stderr}`,\n };\n }\n },\n };\n}\n\nfunction truncate(str: string, maxLen: number): string {\n if (str.length <= maxLen) return str;\n return `${str.slice(0, maxLen - 3)}...`;\n}\n","/**\n * Output contains scorer - checks that output contains expected substring\n */\nimport type { Expected, ScoreContext, Scorer } from \"../types\";\n\n/**\n * Creates a scorer that checks if the agent's output contains expected.output\n */\nexport function outputContains(): Scorer<Expected> {\n return {\n name: \"outputContains\",\n score: async (ctx: ScoreContext<Expected>) => {\n const expectedOutput = ctx.expected?.output;\n\n if (!expectedOutput) {\n return {\n name: \"outputContains\",\n score: 1,\n reason: \"No output expected\",\n };\n }\n\n const contains = ctx.output.includes(expectedOutput);\n\n return {\n name: \"outputContains\",\n score: contains ? 1 : 0,\n reason: contains\n ? 
`Output contains \"${truncate(expectedOutput, 50)}\"`\n : `Output missing \"${truncate(expectedOutput, 50)}\"`,\n };\n },\n };\n}\n\nfunction truncate(str: string, maxLen: number): string {\n if (str.length <= maxLen) return str;\n return `${str.slice(0, maxLen - 3)}...`;\n}\n","/**\n * Files scorer - checks that expected files exist with expected content\n */\nimport * as fs from \"node:fs/promises\";\nimport * as path from \"node:path\";\nimport type { Expected, ScoreContext, Scorer } from \"../types\";\n\n/**\n * Creates a scorer that checks if expected files exist and contain expected content.\n * Uses substring matching for content comparison.\n */\nexport function files(): Scorer<Expected> {\n return {\n name: \"files\",\n score: async (ctx: ScoreContext<Expected>) => {\n const expectedFiles = ctx.expected?.files;\n\n if (!expectedFiles || Object.keys(expectedFiles).length === 0) {\n return {\n name: \"files\",\n score: 1,\n reason: \"No files expected\",\n };\n }\n\n const results: { file: string; ok: boolean; reason: string }[] = [];\n\n for (const [filePath, expectedContent] of Object.entries(expectedFiles)) {\n const fullPath = path.join(ctx.cwd, filePath);\n\n try {\n const content = await fs.readFile(fullPath, \"utf-8\");\n\n if (content.includes(expectedContent)) {\n results.push({\n file: filePath,\n ok: true,\n reason: \"exists with expected content\",\n });\n } else {\n results.push({\n file: filePath,\n ok: false,\n reason: `exists but missing expected content: \"${truncate(expectedContent, 50)}\"`,\n });\n }\n } catch (err) {\n if ((err as NodeJS.ErrnoException).code === \"ENOENT\") {\n results.push({\n file: filePath,\n ok: false,\n reason: \"file not found\",\n });\n } else {\n results.push({\n file: filePath,\n ok: false,\n reason: `error reading: ${(err as Error).message}`,\n });\n }\n }\n }\n\n const passed = results.filter((r) => r.ok).length;\n const total = results.length;\n const score = total > 0 ? 
passed / total : 1;\n\n const reasons = results.map(\n (r) => `${r.ok ? \"+\" : \"-\"} ${r.file}: ${r.reason}`,\n );\n\n return {\n name: \"files\",\n score,\n reason: reasons.join(\"\\n\"),\n };\n },\n };\n}\n\nfunction truncate(str: string, maxLen: number): string {\n if (str.length <= maxLen) return str;\n return `${str.slice(0, maxLen - 3)}...`;\n}\n","/**\n * LLM Judge scorer - uses an LLM to evaluate the output\n */\nimport type { Expected, Scorer } from \"../types\";\n\nexport interface LlmJudgeOptions {\n /** Criteria for the LLM to evaluate against */\n criteria: string;\n /** Model to use (default: gpt-4o-mini) */\n model?: string;\n /** Provider (default: openai) */\n provider?: string;\n}\n\n/**\n * Creates a scorer that uses an LLM to evaluate the output against criteria.\n * Uses a cheap, fast model by default.\n *\n * Note: Requires OPENAI_API_KEY or appropriate provider API key.\n */\nexport function llmJudge(options: LlmJudgeOptions): Scorer<Expected> {\n const { criteria, model = \"gpt-4o-mini\", provider = \"openai\" } = options;\n\n return {\n name: \"llmJudge\",\n score: async (ctx) => {\n const prompt = buildJudgePrompt(criteria, ctx.input, ctx.output);\n\n try {\n const result = await callLlm(prompt, model, provider);\n return parseJudgeResponse(result);\n } catch (err) {\n return {\n name: \"llmJudge\",\n score: 0,\n reason: `LLM judge error: ${(err as Error).message}`,\n };\n }\n },\n };\n}\n\nfunction buildJudgePrompt(\n criteria: string,\n input: string,\n output: string,\n): string {\n return `You are evaluating an AI coding assistant's response.\n\n## Task given to the assistant\n${input}\n\n## Assistant's response\n${output}\n\n## Evaluation criteria\n${criteria}\n\n## Instructions\nEvaluate the response against the criteria. 
Respond with a JSON object:\n{\n \"score\": <number from 0 to 1>,\n \"reason\": \"<brief explanation>\"\n}\n\nScore meanings:\n- 1.0: Fully meets criteria\n- 0.7-0.9: Mostly meets criteria with minor issues\n- 0.4-0.6: Partially meets criteria\n- 0.1-0.3: Barely meets criteria\n- 0.0: Does not meet criteria\n\nRespond ONLY with the JSON object, no other text.`;\n}\n\nasync function callLlm(\n prompt: string,\n model: string,\n provider: string,\n): Promise<string> {\n if (provider === \"openai\") {\n return callOpenAI(prompt, model);\n } else if (provider === \"anthropic\") {\n return callAnthropic(prompt, model);\n } else {\n throw new Error(`Unsupported LLM judge provider: ${provider}`);\n }\n}\n\nasync function callOpenAI(prompt: string, model: string): Promise<string> {\n const apiKey = process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error(\"OPENAI_API_KEY not set\");\n }\n\n const response = await fetch(\"https://api.openai.com/v1/chat/completions\", {\n method: \"POST\",\n headers: {\n \"Content-Type\": \"application/json\",\n Authorization: `Bearer ${apiKey}`,\n },\n body: JSON.stringify({\n model,\n messages: [{ role: \"user\", content: prompt }],\n temperature: 0,\n }),\n });\n\n if (!response.ok) {\n const text = await response.text();\n throw new Error(`OpenAI API error: ${response.status} ${text}`);\n }\n\n const data = (await response.json()) as {\n choices: { message: { content: string } }[];\n };\n return data.choices[0]?.message?.content ?? 
\"\";\n}\n\nasync function callAnthropic(prompt: string, model: string): Promise<string> {\n const apiKey = process.env.ANTHROPIC_API_KEY;\n if (!apiKey) {\n throw new Error(\"ANTHROPIC_API_KEY not set\");\n }\n\n const response = await fetch(\"https://api.anthropic.com/v1/messages\", {\n method: \"POST\",\n headers: {\n \"Content-Type\": \"application/json\",\n \"x-api-key\": apiKey,\n \"anthropic-version\": \"2023-06-01\",\n },\n body: JSON.stringify({\n model,\n max_tokens: 256,\n messages: [{ role: \"user\", content: prompt }],\n }),\n });\n\n if (!response.ok) {\n const text = await response.text();\n throw new Error(`Anthropic API error: ${response.status} ${text}`);\n }\n\n const data = (await response.json()) as {\n content: { type: string; text: string }[];\n };\n const textBlock = data.content.find((c) => c.type === \"text\");\n return textBlock?.text ?? \"\";\n}\n\nfunction parseJudgeResponse(response: string): {\n name: string;\n score: number;\n reason?: string;\n} {\n try {\n // Try to extract JSON from the response\n const jsonMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (!jsonMatch) {\n throw new Error(\"No JSON found in response\");\n }\n\n const parsed = JSON.parse(jsonMatch[0]) as {\n score: number;\n reason?: string;\n };\n\n if (\n typeof parsed.score !== \"number\" ||\n parsed.score < 0 ||\n parsed.score > 1\n ) {\n throw new Error(\"Invalid score in response\");\n }\n\n return {\n name: \"llmJudge\",\n score: parsed.score,\n reason: parsed.reason,\n };\n } catch (err) {\n return {\n name: \"llmJudge\",\n score: 0,\n reason: `Failed to parse judge response: ${(err as Error).message}`,\n };\n }\n}\n","/**\n * Regex scorer - checks that output matches a pattern\n */\nimport type { Expected, Scorer } from \"../types\";\n\n/**\n * Creates a scorer that checks if the agent's output matches a regex pattern\n */\nexport function outputMatches(pattern: RegExp): Scorer<Expected> {\n return {\n name: \"outputMatches\",\n score: async (ctx) => {\n 
const matches = pattern.test(ctx.output);\n\n return {\n name: \"outputMatches\",\n score: matches ? 1 : 0,\n reason: matches\n ? `Output matches ${pattern}`\n : `Output does not match ${pattern}`,\n };\n },\n };\n}\n","/**\n * Tool called scorer - checks that a specific tool was called during the session\n */\nimport type { Expected, ScoreContext, Scorer } from \"../types\";\n\n/**\n * Creates a scorer that checks if a specific tool was called.\n *\n * @param name - The tool name to check for (e.g., \"read\", \"bash\", \"linkup_web_search\")\n */\nexport function toolCalled(name: string): Scorer<Expected> {\n return {\n name: `toolCalled(${name})`,\n score: async (ctx: ScoreContext<Expected>) => {\n const called = ctx.toolCalls.some((tc) => tc.name === name);\n\n return {\n name: `toolCalled(${name})`,\n score: called ? 1 : 0,\n reason: called\n ? `Tool \"${name}\" was called`\n : `Tool \"${name}\" was not called. Called: ${formatToolNames(ctx)}`,\n };\n },\n };\n}\n\nfunction formatToolNames(ctx: ScoreContext<Expected>): string {\n if (ctx.toolCalls.length === 0) return \"(none)\";\n const unique = [...new Set(ctx.toolCalls.map((tc) => tc.name))];\n return unique.join(\", \");\n}\n","/**\n * Tool called with args scorer - checks that a tool was called with specific arguments\n */\nimport * as path from \"node:path\";\nimport type { Expected, ScoreContext, Scorer } from \"../types\";\n\n/**\n * Creates a scorer that checks if a tool was called with specific arguments.\n *\n * For `path` arguments, both expected and actual values are resolved to\n * absolute paths before comparison. 
All other arguments use direct equality.\n *\n * @param name - The tool name to check for\n * @param expectedArgs - Key-value pairs the tool call args must contain\n */\nexport function toolCalledWith(\n name: string,\n expectedArgs: Record<string, unknown>,\n): Scorer<Expected> {\n const label = `toolCalledWith(${name})`;\n\n return {\n name: label,\n score: async (ctx: ScoreContext<Expected>) => {\n const matching = ctx.toolCalls.filter((tc) => tc.name === name);\n\n if (matching.length === 0) {\n return {\n name: label,\n score: 0,\n reason: `Tool \"${name}\" was not called. Called: ${formatToolNames(ctx)}`,\n };\n }\n\n // Check if any call matches all expected args\n for (const tc of matching) {\n if (argsMatch(tc.args, expectedArgs, ctx.cwd)) {\n return {\n name: label,\n score: 1,\n reason: `Tool \"${name}\" called with matching args`,\n };\n }\n }\n\n // Show the closest call for debugging\n const firstCall = matching[0];\n const mismatches = getArgMismatches(\n firstCall.args,\n expectedArgs,\n ctx.cwd,\n );\n\n return {\n name: label,\n score: 0,\n reason: `Tool \"${name}\" called ${matching.length} time(s) but args did not match. 
${mismatches}`,\n };\n },\n };\n}\n\n/**\n * Check if actual args contain all expected key-value pairs.\n * Path args are resolved to absolute paths before comparison.\n */\nfunction argsMatch(\n actual: Record<string, unknown>,\n expected: Record<string, unknown>,\n cwd: string,\n): boolean {\n for (const [key, expectedValue] of Object.entries(expected)) {\n const actualValue = actual[key];\n\n if (isPathArg(key)) {\n const resolvedActual = resolvePath(actualValue, cwd);\n const resolvedExpected = resolvePath(expectedValue, cwd);\n if (resolvedActual !== resolvedExpected) return false;\n } else {\n if (!deepEqual(actualValue, expectedValue)) return false;\n }\n }\n\n return true;\n}\n\n/**\n * Get human-readable mismatch descriptions for debugging.\n */\nfunction getArgMismatches(\n actual: Record<string, unknown>,\n expected: Record<string, unknown>,\n cwd: string,\n): string {\n const parts: string[] = [];\n\n for (const [key, expectedValue] of Object.entries(expected)) {\n const actualValue = actual[key];\n\n if (isPathArg(key)) {\n const resolvedActual = resolvePath(actualValue, cwd);\n const resolvedExpected = resolvePath(expectedValue, cwd);\n if (resolvedActual !== resolvedExpected) {\n parts.push(\n `${key}: expected \"${resolvedExpected}\", got \"${resolvedActual}\"`,\n );\n }\n } else if (!deepEqual(actualValue, expectedValue)) {\n parts.push(\n `${key}: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(actualValue)}`,\n );\n }\n }\n\n return parts.join(\"; \");\n}\n\n/**\n * Check if an argument key represents a file path.\n */\nfunction isPathArg(key: string): boolean {\n return key === \"path\" || key === \"file\" || key.endsWith(\"Path\");\n}\n\n/**\n * Resolve a value as an absolute path relative to cwd.\n */\nfunction resolvePath(value: unknown, cwd: string): string {\n if (typeof value !== \"string\") return String(value);\n return path.resolve(cwd, value);\n}\n\n/**\n * Simple deep equality for JSON-compatible values.\n */\nfunction 
deepEqual(a: unknown, b: unknown): boolean {\n if (a === b) return true;\n if (a === null || b === null) return false;\n if (typeof a !== typeof b) return false;\n\n if (Array.isArray(a) && Array.isArray(b)) {\n if (a.length !== b.length) return false;\n return a.every((val, i) => deepEqual(val, b[i]));\n }\n\n if (typeof a === \"object\" && typeof b === \"object\") {\n const aObj = a as Record<string, unknown>;\n const bObj = b as Record<string, unknown>;\n const aKeys = Object.keys(aObj);\n const bKeys = Object.keys(bObj);\n if (aKeys.length !== bKeys.length) return false;\n return aKeys.every((key) => deepEqual(aObj[key], bObj[key]));\n }\n\n return false;\n}\n\nfunction formatToolNames(ctx: ScoreContext<Expected>): string {\n if (ctx.toolCalls.length === 0) return \"(none)\";\n const unique = [...new Set(ctx.toolCalls.map((tc) => tc.name))];\n return unique.join(\", \");\n}\n","/**\n * pi-eval - Eval framework for pi coding agent\n */\n\n// Main API\nexport { defineConfig } from \"./config\";\n\n// Scorers namespace\nimport * as ScorersModule from \"./scorers/index\";\nexport const Scorers = ScorersModule;\n\n// Re-export scorer option types\nexport type { BashOptions, LlmJudgeOptions } from \"./scorers/index\";\n// Types\nexport type {\n CliOptions,\n EvalDefinition,\n EvalOptions,\n EvalRunSummary,\n Expected,\n GlobalConfig,\n PiConfig,\n ScoreContext,\n ScoreResult,\n Scorer,\n SessionStats,\n TestCase,\n TestResult,\n TestSetup,\n TokenStats,\n ToolCall,\n} from \"./types\";\n\nimport { getCurrentFile, registerEval } from \"./discovery\";\nimport type { EvalOptions, Expected } from \"./types\";\n\n/**\n * Define and register an eval.\n * This is the main API for creating evals.\n *\n * @example\n * ```typescript\n * import { evaluate, Scorers } from \"@aliou/pi-evals\";\n *\n * evaluate(\"Create hello file\", {\n * config: {\n * model: \"claude-sonnet-4-20250514\",\n * provider: \"anthropic\",\n * },\n * data: [\n * {\n * input: 'Create a file called 
hello.txt containing \"Hello World\"',\n * expected: { files: { \"hello.txt\": \"Hello World\" } },\n * },\n * ],\n * scorers: [Scorers.files()],\n * });\n * ```\n */\nexport function evaluate<TExpected = Expected>(\n name: string,\n options: EvalOptions<TExpected>,\n): void {\n const file = getCurrentFile() || \"unknown\";\n registerEval(name, options, file);\n}\n"],"mappings":";;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACGA,SAAS,YAAY;AACrB,SAAS,iBAAiB;AAG1B,IAAM,YAAY,UAAU,IAAI;AAazB,SAAS,KACd,SACA,UAAuB,CAAC,GACN;AAClB,QAAM,EAAE,UAAU,eAAe,GAAG,UAAU,IAAM,IAAI;AAExD,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAQ;AACpB,UAAI;AACF,cAAM,UAAU,MAAM,UAAU,SAAS;AAAA,UACvC,KAAK,IAAI;AAAA,UACT;AAAA,UACA,KAAK,EAAE,GAAG,QAAQ,KAAK,MAAM,QAAQ,IAAI,KAAK;AAAA,QAChD,CAAC;AAGD,YAAI,iBAAiB,GAAG;AACtB,iBAAO;AAAA,YACL,MAAM;AAAA,YACN,OAAO;AAAA,YACP,QAAQ,sBAAsB,OAAO;AAAA,UACvC;AAAA,QACF,OAAO;AACL,iBAAO;AAAA,YACL,MAAM;AAAA,YACN,OAAO;AAAA,YACP,QAAQ,sBAAsB,YAAY;AAAA,UAC5C;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,cAAM,QAAQ;AAOd,YAAI,MAAM,QAAQ;AAChB,iBAAO;AAAA,YACL,MAAM;AAAA,YACN,OAAO;AAAA,YACP,QAAQ,2BAA2B,OAAO,OAAO,OAAO;AAAA,UAC1D;AAAA,QACF;AAEA,cAAM,aAAa,MAAM,QAAQ;AAEjC,YAAI,eAAe,cAAc;AAC/B,iBAAO;AAAA,YACL,MAAM;AAAA,YACN,OAAO;AAAA,YACP,QAAQ,qCAAqC,YAAY,KAAK,OAAO;AAAA,UACvE;AAAA,QACF;AAEA,cAAM,SAAS,MAAM,SAAS;AAAA,EAAK,SAAS,MAAM,QAAQ,GAAG,CAAC,KAAK;AAEnE,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,QAAQ,4BAA4B,UAAU,KAAK,OAAO,GAAG,MAAM;AAAA,QACrE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,SAAS,KAAa,QAAwB;AACrD,MAAI,IAAI,UAAU,OAAQ,QAAO;AACjC,SAAO,GAAG,IAAI,MAAM,GAAG,SAAS,CAAC,CAAC;AACpC;;;ACnFO,SAAS,iBAAmC;AACjD,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAgC;AAC5C,YAAM,iBAAiB,IAAI,UAAU;AAErC,UAAI,CAAC,gBAAgB;AACnB,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,QAAQ;AAAA,QACV;AAAA,MACF;AAEA,YAAM,WAAW,IAAI,OAAO,SAAS,cAAc;AAEnD,aAAO;AAAA,QACL,MAAM;AAAA,QACN,OAAO,WAAW,IAAI;AAAA,QACtB,QAAQ,WACJ,oBAAoBA,UAAS,gBAAgB,EAAE,CAAC,MAChD,mBAAmBA,UAAS,gBAAgB,EAAE,CAAC;AAAA,MACrD;AAAA,IACF;AAAA,EACF;AACF
;AAEA,SAASA,UAAS,KAAa,QAAwB;AACrD,MAAI,IAAI,UAAU,OAAQ,QAAO;AACjC,SAAO,GAAG,IAAI,MAAM,GAAG,SAAS,CAAC,CAAC;AACpC;;;ACnCA,YAAY,QAAQ;AACpB,YAAY,UAAU;AAOf,SAAS,QAA0B;AACxC,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAgC;AAC5C,YAAM,gBAAgB,IAAI,UAAU;AAEpC,UAAI,CAAC,iBAAiB,OAAO,KAAK,aAAa,EAAE,WAAW,GAAG;AAC7D,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,QAAQ;AAAA,QACV;AAAA,MACF;AAEA,YAAM,UAA2D,CAAC;AAElE,iBAAW,CAAC,UAAU,eAAe,KAAK,OAAO,QAAQ,aAAa,GAAG;AACvE,cAAM,WAAgB,UAAK,IAAI,KAAK,QAAQ;AAE5C,YAAI;AACF,gBAAM,UAAU,MAAS,YAAS,UAAU,OAAO;AAEnD,cAAI,QAAQ,SAAS,eAAe,GAAG;AACrC,oBAAQ,KAAK;AAAA,cACX,MAAM;AAAA,cACN,IAAI;AAAA,cACJ,QAAQ;AAAA,YACV,CAAC;AAAA,UACH,OAAO;AACL,oBAAQ,KAAK;AAAA,cACX,MAAM;AAAA,cACN,IAAI;AAAA,cACJ,QAAQ,yCAAyCC,UAAS,iBAAiB,EAAE,CAAC;AAAA,YAChF,CAAC;AAAA,UACH;AAAA,QACF,SAAS,KAAK;AACZ,cAAK,IAA8B,SAAS,UAAU;AACpD,oBAAQ,KAAK;AAAA,cACX,MAAM;AAAA,cACN,IAAI;AAAA,cACJ,QAAQ;AAAA,YACV,CAAC;AAAA,UACH,OAAO;AACL,oBAAQ,KAAK;AAAA,cACX,MAAM;AAAA,cACN,IAAI;AAAA,cACJ,QAAQ,kBAAmB,IAAc,OAAO;AAAA,YAClD,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF;AAEA,YAAM,SAAS,QAAQ,OAAO,CAAC,MAAM,EAAE,EAAE,EAAE;AAC3C,YAAM,QAAQ,QAAQ;AACtB,YAAM,QAAQ,QAAQ,IAAI,SAAS,QAAQ;AAE3C,YAAM,UAAU,QAAQ;AAAA,QACtB,CAAC,MAAM,GAAG,EAAE,KAAK,MAAM,GAAG,IAAI,EAAE,IAAI,KAAK,EAAE,MAAM;AAAA,MACnD;AAEA,aAAO;AAAA,QACL,MAAM;AAAA,QACN;AAAA,QACA,QAAQ,QAAQ,KAAK,IAAI;AAAA,MAC3B;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAASA,UAAS,KAAa,QAAwB;AACrD,MAAI,IAAI,UAAU,OAAQ,QAAO;AACjC,SAAO,GAAG,IAAI,MAAM,GAAG,SAAS,CAAC,CAAC;AACpC;;;AC/DO,SAAS,SAAS,SAA4C;AACnE,QAAM,EAAE,UAAU,QAAQ,eAAe,WAAW,SAAS,IAAI;AAEjE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAQ;AACpB,YAAM,SAAS,iBAAiB,UAAU,IAAI,OAAO,IAAI,MAAM;AAE/D,UAAI;AACF,cAAM,SAAS,MAAM,QAAQ,QAAQ,OAAO,QAAQ;AACpD,eAAO,mBAAmB,MAAM;AAAA,MAClC,SAAS,KAAK;AACZ,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,QAAQ,oBAAqB,IAAc,OAAO;AAAA,QACpD;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,iBACP,UACA,OACA,QACQ;AACR,SAAO;AAAA;AAAA;AAAA,EAGP,KAAK;AAAA;AAAA;AAAA,EAGL,MAAM;AAAA;AAAA;AAAA,EAGN,QAAQ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AA
AA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAiBV;AAEA,eAAe,QACb,QACA,OACA,UACiB;AACjB,MAAI,aAAa,UAAU;AACzB,WAAO,WAAW,QAAQ,KAAK;AAAA,EACjC,WAAW,aAAa,aAAa;AACnC,WAAO,cAAc,QAAQ,KAAK;AAAA,EACpC,OAAO;AACL,UAAM,IAAI,MAAM,mCAAmC,QAAQ,EAAE;AAAA,EAC/D;AACF;AAEA,eAAe,WAAW,QAAgB,OAAgC;AACxE,QAAM,SAAS,QAAQ,IAAI;AAC3B,MAAI,CAAC,QAAQ;AACX,UAAM,IAAI,MAAM,wBAAwB;AAAA,EAC1C;AAEA,QAAM,WAAW,MAAM,MAAM,8CAA8C;AAAA,IACzE,QAAQ;AAAA,IACR,SAAS;AAAA,MACP,gBAAgB;AAAA,MAChB,eAAe,UAAU,MAAM;AAAA,IACjC;AAAA,IACA,MAAM,KAAK,UAAU;AAAA,MACnB;AAAA,MACA,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAAA,MAC5C,aAAa;AAAA,IACf,CAAC;AAAA,EACH,CAAC;AAED,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,OAAO,MAAM,SAAS,KAAK;AACjC,UAAM,IAAI,MAAM,qBAAqB,SAAS,MAAM,IAAI,IAAI,EAAE;AAAA,EAChE;AAEA,QAAM,OAAQ,MAAM,SAAS,KAAK;AAGlC,SAAO,KAAK,QAAQ,CAAC,GAAG,SAAS,WAAW;AAC9C;AAEA,eAAe,cAAc,QAAgB,OAAgC;AAC3E,QAAM,SAAS,QAAQ,IAAI;AAC3B,MAAI,CAAC,QAAQ;AACX,UAAM,IAAI,MAAM,2BAA2B;AAAA,EAC7C;AAEA,QAAM,WAAW,MAAM,MAAM,yCAAyC;AAAA,IACpE,QAAQ;AAAA,IACR,SAAS;AAAA,MACP,gBAAgB;AAAA,MAChB,aAAa;AAAA,MACb,qBAAqB;AAAA,IACvB;AAAA,IACA,MAAM,KAAK,UAAU;AAAA,MACnB;AAAA,MACA,YAAY;AAAA,MACZ,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAAA,IAC9C,CAAC;AAAA,EACH,CAAC;AAED,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,OAAO,MAAM,SAAS,KAAK;AACjC,UAAM,IAAI,MAAM,wBAAwB,SAAS,MAAM,IAAI,IAAI,EAAE;AAAA,EACnE;AAEA,QAAM,OAAQ,MAAM,SAAS,KAAK;AAGlC,QAAM,YAAY,KAAK,QAAQ,KAAK,CAAC,MAAM,EAAE,SAAS,MAAM;AAC5D,SAAO,WAAW,QAAQ;AAC5B;AAEA,SAAS,mBAAmB,UAI1B;AACA,MAAI;AAEF,UAAM,YAAY,SAAS,MAAM,aAAa;AAC9C,QAAI,CAAC,WAAW;AACd,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,UAAM,SAAS,KAAK,MAAM,UAAU,CAAC,CAAC;AAKtC,QACE,OAAO,OAAO,UAAU,YACxB,OAAO,QAAQ,KACf,OAAO,QAAQ,GACf;AACA,YAAM,IAAI,MAAM,2BAA2B;AAAA,IAC7C;AAEA,WAAO;AAAA,MACL,MAAM;AAAA,MACN,OAAO,OAAO;AAAA,MACd,QAAQ,OAAO;AAAA,IACjB;AAAA,EACF,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,MAAM;AAAA,MACN,OAAO;AAAA,MACP,QAAQ,mCAAoC,IAAc,OAAO;AAAA,IACnE;AAAA,EACF;AACF;;;ACpLO,SAAS,cAAc,SAAmC;AAC/D,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAQ;AACpB,YAAM,UAAU,QAAQ,KAAK,IAAI,MAAM;AAEvC,aAAO;AAAA,QACL,MAAM;AAAA,QACN,
OAAO,UAAU,IAAI;AAAA,QACrB,QAAQ,UACJ,kBAAkB,OAAO,KACzB,yBAAyB,OAAO;AAAA,MACtC;AAAA,IACF;AAAA,EACF;AACF;;;ACbO,SAAS,WAAW,MAAgC;AACzD,SAAO;AAAA,IACL,MAAM,cAAc,IAAI;AAAA,IACxB,OAAO,OAAO,QAAgC;AAC5C,YAAM,SAAS,IAAI,UAAU,KAAK,CAAC,OAAO,GAAG,SAAS,IAAI;AAE1D,aAAO;AAAA,QACL,MAAM,cAAc,IAAI;AAAA,QACxB,OAAO,SAAS,IAAI;AAAA,QACpB,QAAQ,SACJ,SAAS,IAAI,iBACb,SAAS,IAAI,6BAA6B,gBAAgB,GAAG,CAAC;AAAA,MACpE;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,gBAAgB,KAAqC;AAC5D,MAAI,IAAI,UAAU,WAAW,EAAG,QAAO;AACvC,QAAM,SAAS,CAAC,GAAG,IAAI,IAAI,IAAI,UAAU,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC;AAC9D,SAAO,OAAO,KAAK,IAAI;AACzB;;;AC5BA,YAAYC,WAAU;AAYf,SAAS,eACd,MACA,cACkB;AAClB,QAAM,QAAQ,kBAAkB,IAAI;AAEpC,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,OAAO,QAAgC;AAC5C,YAAM,WAAW,IAAI,UAAU,OAAO,CAAC,OAAO,GAAG,SAAS,IAAI;AAE9D,UAAI,SAAS,WAAW,GAAG;AACzB,eAAO;AAAA,UACL,MAAM;AAAA,UACN,OAAO;AAAA,UACP,QAAQ,SAAS,IAAI,6BAA6BC,iBAAgB,GAAG,CAAC;AAAA,QACxE;AAAA,MACF;AAGA,iBAAW,MAAM,UAAU;AACzB,YAAI,UAAU,GAAG,MAAM,cAAc,IAAI,GAAG,GAAG;AAC7C,iBAAO;AAAA,YACL,MAAM;AAAA,YACN,OAAO;AAAA,YACP,QAAQ,SAAS,IAAI;AAAA,UACvB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,YAAY,SAAS,CAAC;AAC5B,YAAM,aAAa;AAAA,QACjB,UAAU;AAAA,QACV;AAAA,QACA,IAAI;AAAA,MACN;AAEA,aAAO;AAAA,QACL,MAAM;AAAA,QACN,OAAO;AAAA,QACP,QAAQ,SAAS,IAAI,YAAY,SAAS,MAAM,oCAAoC,UAAU;AAAA,MAChG;AAAA,IACF;AAAA,EACF;AACF;AAMA,SAAS,UACP,QACA,UACA,KACS;AACT,aAAW,CAAC,KAAK,aAAa,KAAK,OAAO,QAAQ,QAAQ,GAAG;AAC3D,UAAM,cAAc,OAAO,GAAG;AAE9B,QAAI,UAAU,GAAG,GAAG;AAClB,YAAM,iBAAiB,YAAY,aAAa,GAAG;AACnD,YAAM,mBAAmB,YAAY,eAAe,GAAG;AACvD,UAAI,mBAAmB,iBAAkB,QAAO;AAAA,IAClD,OAAO;AACL,UAAI,CAAC,UAAU,aAAa,aAAa,EAAG,QAAO;AAAA,IACrD;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,iBACP,QACA,UACA,KACQ;AACR,QAAM,QAAkB,CAAC;AAEzB,aAAW,CAAC,KAAK,aAAa,KAAK,OAAO,QAAQ,QAAQ,GAAG;AAC3D,UAAM,cAAc,OAAO,GAAG;AAE9B,QAAI,UAAU,GAAG,GAAG;AAClB,YAAM,iBAAiB,YAAY,aAAa,GAAG;AACnD,YAAM,mBAAmB,YAAY,eAAe,GAAG;AACvD,UAAI,mBAAmB,kBAAkB;AACvC,cAAM;AAAA,UACJ,GAAG,GAAG,eAAe,gBAAgB,WAAW,cAAc;AAAA,QAChE;AAAA,MACF;AAAA,IACF,WAAW,CAAC,UAAU,aAAa,aAAa,GAAG;AACjD,YAAM;AAAA,QACJ,GAAG,GAAG,
cAAc,KAAK,UAAU,aAAa,CAAC,SAAS,KAAK,UAAU,WAAW,CAAC;AAAA,MACvF;AAAA,IACF;AAAA,EACF;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;AAKA,SAAS,UAAU,KAAsB;AACvC,SAAO,QAAQ,UAAU,QAAQ,UAAU,IAAI,SAAS,MAAM;AAChE;AAKA,SAAS,YAAY,OAAgB,KAAqB;AACxD,MAAI,OAAO,UAAU,SAAU,QAAO,OAAO,KAAK;AAClD,SAAY,cAAQ,KAAK,KAAK;AAChC;AAKA,SAAS,UAAU,GAAY,GAAqB;AAClD,MAAI,MAAM,EAAG,QAAO;AACpB,MAAI,MAAM,QAAQ,MAAM,KAAM,QAAO;AACrC,MAAI,OAAO,MAAM,OAAO,EAAG,QAAO;AAElC,MAAI,MAAM,QAAQ,CAAC,KAAK,MAAM,QAAQ,CAAC,GAAG;AACxC,QAAI,EAAE,WAAW,EAAE,OAAQ,QAAO;AAClC,WAAO,EAAE,MAAM,CAAC,KAAK,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC,CAAC;AAAA,EACjD;AAEA,MAAI,OAAO,MAAM,YAAY,OAAO,MAAM,UAAU;AAClD,UAAM,OAAO;AACb,UAAM,OAAO;AACb,UAAM,QAAQ,OAAO,KAAK,IAAI;AAC9B,UAAM,QAAQ,OAAO,KAAK,IAAI;AAC9B,QAAI,MAAM,WAAW,MAAM,OAAQ,QAAO;AAC1C,WAAO,MAAM,MAAM,CAAC,QAAQ,UAAU,KAAK,GAAG,GAAG,KAAK,GAAG,CAAC,CAAC;AAAA,EAC7D;AAEA,SAAO;AACT;AAEA,SAASA,iBAAgB,KAAqC;AAC5D,MAAI,IAAI,UAAU,WAAW,EAAG,QAAO;AACvC,QAAM,SAAS,CAAC,GAAG,IAAI,IAAI,IAAI,UAAU,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC;AAC9D,SAAO,OAAO,KAAK,IAAI;AACzB;;;ACxJO,IAAM,UAAU;AAkDhB,SAAS,SACd,MACA,SACM;AACN,QAAM,OAAO,eAAe,KAAK;AACjC,eAAa,MAAM,SAAS,IAAI;AAClC;","names":["truncate","truncate","path","formatToolNames"]}
|
package/package.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aliou/pi-evals",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "Eval framework for pi coding agent",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"bin": {
|
|
9
|
+
"pi-evals": "dist/cli.js"
|
|
10
|
+
},
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"import": "./dist/index.js"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"dependencies": {
|
|
18
|
+
"@mariozechner/pi-ai": "^0.51.6",
|
|
19
|
+
"@mariozechner/pi-coding-agent": "^0.51.6",
|
|
20
|
+
"glob": "^11.0.0"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {
|
|
23
|
+
"@biomejs/biome": "^2.3.14",
|
|
24
|
+
"@changesets/cli": "^2.29.8",
|
|
25
|
+
"@types/node": "^22.0.0",
|
|
26
|
+
"husky": "^9.1.7",
|
|
27
|
+
"tsup": "^8.5.1",
|
|
28
|
+
"tsx": "^4.21.0",
|
|
29
|
+
"typescript": "^5.7.0"
|
|
30
|
+
},
|
|
31
|
+
"engines": {
|
|
32
|
+
"node": ">=20"
|
|
33
|
+
},
|
|
34
|
+
"files": [
|
|
35
|
+
"dist"
|
|
36
|
+
],
|
|
37
|
+
"keywords": [
|
|
38
|
+
"pi",
|
|
39
|
+
"eval",
|
|
40
|
+
"llm",
|
|
41
|
+
"testing"
|
|
42
|
+
],
|
|
43
|
+
"license": "MIT",
|
|
44
|
+
"repository": {
|
|
45
|
+
"type": "git",
|
|
46
|
+
"url": "https://github.com/aliou/pi-evals"
|
|
47
|
+
},
|
|
48
|
+
"publishConfig": {
|
|
49
|
+
"access": "public"
|
|
50
|
+
},
|
|
51
|
+
"scripts": {
|
|
52
|
+
"build": "tsup",
|
|
53
|
+
"dev": "tsup --watch",
|
|
54
|
+
"typecheck": "tsc --noEmit",
|
|
55
|
+
"lint": "biome check",
|
|
56
|
+
"format": "biome check --write",
|
|
57
|
+
"eval": "tsx src/cli.ts",
|
|
58
|
+
"eval:build": "node dist/cli.js",
|
|
59
|
+
"changeset": "changeset"
|
|
60
|
+
}
|
|
61
|
+
}
|