@aliou/pi-evals 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -0
- package/dist/chunk-342JG3E3.js +117 -0
- package/dist/chunk-342JG3E3.js.map +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +445 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +344 -0
- package/dist/index.js +455 -0
- package/dist/index.js.map +1 -0
- package/package.json +61 -0
package/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# @aliou/pi-evals
|
|
2
|
+
|
|
3
|
+
Eval framework for the pi coding agent.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pnpm add @aliou/pi-evals
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
Create an eval file in `evals/`:
|
|
14
|
+
|
|
15
|
+
```typescript
|
|
16
|
+
// evals/hello.eval.ts
|
|
17
|
+
import { evaluate, Scorers } from "@aliou/pi-evals";
|
|
18
|
+
|
|
19
|
+
evaluate("Create hello file", {
|
|
20
|
+
config: {
|
|
21
|
+
model: "claude-sonnet-4-20250514",
|
|
22
|
+
provider: "anthropic",
|
|
23
|
+
},
|
|
24
|
+
data: [
|
|
25
|
+
{
|
|
26
|
+
input: 'Create a file called hello.txt containing "Hello World"',
|
|
27
|
+
expected: { files: { "hello.txt": "Hello World" } },
|
|
28
|
+
},
|
|
29
|
+
],
|
|
30
|
+
scorers: [Scorers.files()],
|
|
31
|
+
});
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Run evals:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npx pi-evals
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration
|
|
41
|
+
|
|
42
|
+
Create `pi-evals.config.ts`:
|
|
43
|
+
|
|
44
|
+
```typescript
|
|
45
|
+
import { defineConfig } from "@aliou/pi-evals";
|
|
46
|
+
|
|
47
|
+
export default defineConfig({
|
|
48
|
+
defaults: {
|
|
49
|
+
model: "claude-sonnet-4-20250514",
|
|
50
|
+
provider: "anthropic",
|
|
51
|
+
},
|
|
52
|
+
evalsDir: "./evals",
|
|
53
|
+
delayBetweenTests: 500,
|
|
54
|
+
timeout: 60_000,
|
|
55
|
+
warnTestCount: 30,
|
|
56
|
+
});
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## CLI Options
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
pi-evals [options]
|
|
63
|
+
|
|
64
|
+
Options:
|
|
65
|
+
-h, --help Show help
|
|
66
|
+
-f, --filter <pattern> Filter evals by name
|
|
67
|
+
-t, --threshold <pct> Minimum pass percentage to exit 0
|
|
68
|
+
-c, --config <path> Config file path
|
|
69
|
+
-m, --model <model> Override model
|
|
70
|
+
-p, --provider <name> Override provider
|
|
71
|
+
-v, --verbose Verbose output
|
|
72
|
+
--json Output results as JSON
|
|
73
|
+
|
|
74
|
+
Environment Variables:
|
|
75
|
+
PI_EVAL_MODEL Override model (lower priority than -m)
|
|
76
|
+
PI_EVAL_PROVIDER Override provider (lower priority than -p)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Examples:
|
|
80
|
+
```bash
|
|
81
|
+
pi-evals # Run all evals
|
|
82
|
+
pi-evals -p github -m gpt-4o # Use GitHub Models
|
|
83
|
+
PI_EVAL_PROVIDER=github pi-evals # Via env var
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Built-in Scorers
|
|
87
|
+
|
|
88
|
+
### `Scorers.files()`
|
|
89
|
+
|
|
90
|
+
Checks that the expected files exist with the expected content.
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
{
|
|
94
|
+
expected: { files: { "hello.txt": "Hello World" } },
|
|
95
|
+
scorers: [Scorers.files()],
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `Scorers.outputContains()`
|
|
100
|
+
|
|
101
|
+
Checks that the agent's output contains the expected substring.
|
|
102
|
+
|
|
103
|
+
```typescript
|
|
104
|
+
{
|
|
105
|
+
expected: { output: "created file" },
|
|
106
|
+
scorers: [Scorers.outputContains()],
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### `Scorers.outputMatches(pattern)`
|
|
111
|
+
|
|
112
|
+
Checks that the agent's output matches a regex.
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
{
|
|
116
|
+
scorers: [Scorers.outputMatches(/function \w+\(/)],
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### `Scorers.bash(command, options?)`
|
|
121
|
+
|
|
122
|
+
Runs a command and checks the exit code.
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
{
|
|
126
|
+
scorers: [Scorers.bash("npm test")],
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Options:
|
|
131
|
+
- `exitCode`: Expected exit code (default: 0)
|
|
132
|
+
- `timeout`: Command timeout in ms (default: 30000)
|
|
133
|
+
|
|
134
|
+
### `Scorers.llmJudge(options)`
|
|
135
|
+
|
|
136
|
+
Uses an LLM to evaluate the output.
|
|
137
|
+
|
|
138
|
+
```typescript
|
|
139
|
+
{
|
|
140
|
+
scorers: [
|
|
141
|
+
Scorers.llmJudge({
|
|
142
|
+
criteria: "The response correctly explains the solution",
|
|
143
|
+
model: "gpt-4o-mini", // optional
|
|
144
|
+
provider: "openai", // optional
|
|
145
|
+
}),
|
|
146
|
+
],
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Test Case Options
|
|
151
|
+
|
|
152
|
+
```typescript
|
|
153
|
+
{
|
|
154
|
+
input: "Create a file",
|
|
155
|
+
expected: { files: { "file.txt": "content" } },
|
|
156
|
+
setup: {
|
|
157
|
+
files: { "existing.txt": "existing content" },
|
|
158
|
+
commands: ["npm init -y"],
|
|
159
|
+
},
|
|
160
|
+
timeout: 30_000,
|
|
161
|
+
only: false, // Run only this test
|
|
162
|
+
skip: false, // Skip this test
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Custom Scorers
|
|
167
|
+
|
|
168
|
+
```typescript
|
|
169
|
+
const customScorer: Scorer = {
|
|
170
|
+
name: "custom",
|
|
171
|
+
score: async (ctx) => {
|
|
172
|
+
const fileExists = await fs.access(path.join(ctx.cwd, "output.txt"))
|
|
173
|
+
.then(() => true)
|
|
174
|
+
.catch(() => false);
|
|
175
|
+
|
|
176
|
+
return {
|
|
177
|
+
name: "custom",
|
|
178
|
+
score: fileExists ? 1 : 0,
|
|
179
|
+
reason: fileExists ? "File exists" : "File not found",
|
|
180
|
+
};
|
|
181
|
+
},
|
|
182
|
+
};
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## CI Integration
|
|
186
|
+
|
|
187
|
+
```yaml
|
|
188
|
+
- name: Run evals
|
|
189
|
+
env:
|
|
190
|
+
PI_EVAL_PROVIDER: github
|
|
191
|
+
PI_EVAL_MODEL: gpt-4o
|
|
192
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
193
|
+
run: npx pi-evals --json > results.json
|
|
194
|
+
|
|
195
|
+
- name: Check results
|
|
196
|
+
run: |
|
|
197
|
+
PASSED=$(jq '.passed' results.json)
|
|
198
|
+
TOTAL=$(jq '.total' results.json)
|
|
199
|
+
echo "Passed: $PASSED/$TOTAL"
|
|
200
|
+
if [ "$PASSED" -lt "$TOTAL" ]; then exit 1; fi
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
var __defProp = Object.defineProperty;
|
|
2
|
+
var __export = (target, all) => {
|
|
3
|
+
for (var name in all)
|
|
4
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
5
|
+
};
|
|
6
|
+
|
|
7
|
+
// src/config.ts
|
|
8
|
+
import * as fs from "fs/promises";
|
|
9
|
+
import * as path from "path";
|
|
10
|
+
import { pathToFileURL } from "url";
|
|
11
|
+
// Config file names probed in the working directory; earlier entries win.
var DEFAULT_CONFIG_FILES = ["pi-evals.config.ts", "pi-evals.config.js", "pi-evals.config.mjs"];

// Baseline settings: returned as-is when no config file is found, and used as
// the base that a loaded config file is merged over.
var DEFAULT_CONFIG = {
  defaults: {},
  evalsDir: "./evals",
  delayBetweenTests: 500,
  timeout: 60000,
  warnTestCount: 30
};
|
|
23
|
+
/**
 * Identity helper for authoring a config file: returns the exact object it was
 * given, without copying or validating it.
 *
 * @param config - The config object to pass through.
 * @returns The same `config` reference.
 */
function defineConfig(config) {
  return config;
}
|
|
26
|
+
/**
 * Resolves the effective config.
 *
 * With an explicit `configPath`, that file (resolved against the current
 * working directory) is loaded. Otherwise each name in DEFAULT_CONFIG_FILES is
 * probed in the working directory, in order, and the first accessible one is
 * loaded. When none is accessible, a copy of the built-in defaults is returned.
 *
 * @param configPath - Optional path to a config file.
 * @returns The resolved config object.
 */
async function loadConfig(configPath) {
  const cwd = process.cwd();
  if (configPath) {
    return loadConfigFile(path.resolve(cwd, configPath));
  }
  for (const filename of DEFAULT_CONFIG_FILES) {
    const candidate = path.join(cwd, filename);
    try {
      await fs.access(candidate);
    } catch {
      // Not present (or not accessible): move on to the next candidate name.
      continue;
    }
    return loadConfigFile(candidate);
  }
  // Return a fresh copy so a caller that mutates the result cannot corrupt the
  // shared module-level defaults for later calls.
  return { ...DEFAULT_CONFIG, defaults: { ...DEFAULT_CONFIG.defaults } };
}
|
|
42
|
+
/**
 * Imports a config module and merges its default export over DEFAULT_CONFIG.
 *
 * Top-level keys from the user config override the defaults, and `defaults` is
 * merged one level deep. Keys explicitly set to `undefined` are ignored so they
 * cannot erase a default value. If the import or merge throws, the error is
 * logged and a copy of the built-in defaults is returned.
 *
 * @param filePath - Path of the config file to import.
 * @returns The merged config, or a copy of the defaults when loading fails.
 */
async function loadConfigFile(filePath) {
  // Drops keys whose value is undefined so a spread cannot clobber a default.
  const withoutUndefined = (obj) =>
    Object.fromEntries(Object.entries(obj ?? {}).filter(([, value]) => value !== undefined));
  try {
    const fileUrl = pathToFileURL(filePath).href;
    const loaded = await import(fileUrl);
    const userConfig = withoutUndefined(loaded.default);
    return {
      ...DEFAULT_CONFIG,
      ...userConfig,
      defaults: {
        ...DEFAULT_CONFIG.defaults,
        ...withoutUndefined(userConfig.defaults)
      }
    };
  } catch (err) {
    console.error(`Failed to load config from ${filePath}:`, err);
    // Fresh copy: never hand out the shared module-level defaults object.
    return { ...DEFAULT_CONFIG, defaults: { ...DEFAULT_CONFIG.defaults } };
  }
}
|
|
60
|
+
|
|
61
|
+
// src/discovery.ts
|
|
62
|
+
import * as path2 from "path";
|
|
63
|
+
import { pathToFileURL as pathToFileURL2 } from "url";
|
|
64
|
+
import { glob } from "glob";
|
|
65
|
+
var evalRegistry = [];
|
|
66
|
+
function registerEval(name, options, file) {
|
|
67
|
+
evalRegistry.push({
|
|
68
|
+
name,
|
|
69
|
+
// Cast to unknown first to avoid type overlap issues
|
|
70
|
+
options,
|
|
71
|
+
file
|
|
72
|
+
});
|
|
73
|
+
}
|
|
74
|
+
/**
 * Empties the eval registry in place, keeping the same array instance.
 */
function clearRegistry() {
  evalRegistry.splice(0, evalRegistry.length);
}
|
|
77
|
+
/**
 * Returns a shallow copy of the registry, so callers can reorder or truncate
 * the result without affecting the registry itself.
 *
 * @returns A new array holding the currently registered eval definitions.
 */
function getRegisteredEvals() {
  return Array.from(evalRegistry);
}
|
|
80
|
+
/**
 * Finds every `*.eval.ts`, `*.eval.js` and `*.eval.mjs` file under `evalsDir`
 * (recursively) and imports each one, then returns whatever ended up in the
 * eval registry.
 *
 * Before each import the file path is recorded via setCurrentFile(). A file
 * that fails to import is logged and skipped; it does not abort discovery.
 *
 * @param evalsDir - Directory to search, resolved against the current working directory.
 * @returns The eval definitions registered while the files were imported, or
 *   an empty array when no eval file is found.
 */
async function discoverEvals(evalsDir) {
  const fullDir = path2.resolve(process.cwd(), evalsDir);
  // Keep the pattern relative and pass the directory through `cwd`. Glob
  // patterns must use forward slashes, so joining the pattern onto an OS path
  // breaks on Windows, and glob-magic characters in the directory name would
  // otherwise be read as part of the pattern.
  const files = await glob("**/*.eval.{ts,js,mjs}", { cwd: fullDir, absolute: true });
  if (files.length === 0) {
    console.warn(`No eval files found in ${evalsDir}`);
    return [];
  }
  // Sort so the load order (and therefore registration order) is stable
  // across runs and platforms.
  files.sort();
  clearRegistry();
  for (const file of files) {
    try {
      // Record which file is loading so registrations can be attributed to it.
      setCurrentFile(file);
      await import(pathToFileURL2(file).href);
    } catch (err) {
      console.error(`Failed to load eval file ${file}:`, err);
    }
  }
  return getRegisteredEvals();
}
|
|
101
|
+
// Path of the eval file currently being imported; empty string until first set.
var currentFile = "";

/**
 * Records the file that is about to be loaded.
 *
 * @param file - Path to remember.
 */
function setCurrentFile(file) {
  currentFile = file;
}

/**
 * Returns the most recently recorded file path.
 *
 * @returns The last value passed to setCurrentFile(), or "" if never set.
 */
function getCurrentFile() {
  return currentFile;
}
|
|
108
|
+
|
|
109
|
+
export {
|
|
110
|
+
__export,
|
|
111
|
+
defineConfig,
|
|
112
|
+
loadConfig,
|
|
113
|
+
registerEval,
|
|
114
|
+
discoverEvals,
|
|
115
|
+
getCurrentFile
|
|
116
|
+
};
|
|
117
|
+
//# sourceMappingURL=chunk-342JG3E3.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/config.ts","../src/discovery.ts"],"sourcesContent":["/**\n * Config loading for pi-eval\n */\nimport * as fs from \"node:fs/promises\";\nimport * as path from \"node:path\";\nimport { pathToFileURL } from \"node:url\";\nimport type { GlobalConfig } from \"./types\";\n\nconst DEFAULT_CONFIG_FILES = [\n \"pi-evals.config.ts\",\n \"pi-evals.config.js\",\n \"pi-evals.config.mjs\",\n];\n\nconst DEFAULT_CONFIG: Required<GlobalConfig> = {\n defaults: {},\n evalsDir: \"./evals\",\n delayBetweenTests: 500,\n timeout: 60_000,\n warnTestCount: 30,\n};\n\n/**\n * Helper for defining config with type inference\n */\nexport function defineConfig(config: GlobalConfig): GlobalConfig {\n return config;\n}\n\n/**\n * Load config from file or return defaults\n */\nexport async function loadConfig(\n configPath?: string,\n): Promise<Required<GlobalConfig>> {\n const cwd = process.cwd();\n\n // If explicit path provided, use it\n if (configPath) {\n const fullPath = path.resolve(cwd, configPath);\n return loadConfigFile(fullPath);\n }\n\n // Try default config file names\n for (const filename of DEFAULT_CONFIG_FILES) {\n const fullPath = path.join(cwd, filename);\n try {\n await fs.access(fullPath);\n return loadConfigFile(fullPath);\n } catch {\n // File doesn't exist, try next\n }\n }\n\n // No config file found, return defaults\n return DEFAULT_CONFIG;\n}\n\nasync function loadConfigFile(\n filePath: string,\n): Promise<Required<GlobalConfig>> {\n try {\n // For TypeScript files, we need to compile or use a loader\n // For now, assume the file is pre-compiled or use tsx/ts-node\n const fileUrl = pathToFileURL(filePath).href;\n const module = (await import(fileUrl)) as { default?: GlobalConfig };\n\n const userConfig = module.default ?? 
{};\n\n return {\n ...DEFAULT_CONFIG,\n ...userConfig,\n defaults: {\n ...DEFAULT_CONFIG.defaults,\n ...userConfig.defaults,\n },\n };\n } catch (err) {\n console.error(`Failed to load config from ${filePath}:`, err);\n return DEFAULT_CONFIG;\n }\n}\n","/**\n * Eval file discovery\n */\n\nimport * as path from \"node:path\";\nimport { pathToFileURL } from \"node:url\";\nimport { glob } from \"glob\";\nimport type { EvalDefinition, EvalOptions } from \"./types\";\n\n// Global registry for evals (populated when eval files are imported)\nconst evalRegistry: EvalDefinition[] = [];\n\n/**\n * Register an eval definition (called by evaluate())\n */\nexport function registerEval<TExpected>(\n name: string,\n options: EvalOptions<TExpected>,\n file: string,\n): void {\n evalRegistry.push({\n name,\n // Cast to unknown first to avoid type overlap issues\n options: options as unknown as EvalOptions,\n file,\n });\n}\n\n/**\n * Clear the eval registry (for testing)\n */\nexport function clearRegistry(): void {\n evalRegistry.length = 0;\n}\n\n/**\n * Get all registered evals\n */\nexport function getRegisteredEvals(): EvalDefinition[] {\n return [...evalRegistry];\n}\n\n/**\n * Discover and load all eval files from a directory\n */\nexport async function discoverEvals(\n evalsDir: string,\n): Promise<EvalDefinition[]> {\n const cwd = process.cwd();\n const fullDir = path.resolve(cwd, evalsDir);\n\n // Find all *.eval.ts and *.eval.js files\n const pattern = path.join(fullDir, \"**/*.eval.{ts,js,mjs}\");\n const files = await glob(pattern, { absolute: true });\n\n if (files.length === 0) {\n console.warn(`No eval files found in ${evalsDir}`);\n return [];\n }\n\n // Clear registry before loading\n clearRegistry();\n\n // Import each file (this triggers evaluate() calls which register evals)\n for (const file of files) {\n try {\n // Set current file context for registration\n setCurrentFile(file);\n const fileUrl = pathToFileURL(file).href;\n await import(fileUrl);\n } catch 
(err) {\n console.error(`Failed to load eval file ${file}:`, err);\n }\n }\n\n return getRegisteredEvals();\n}\n\n// Track current file being loaded (for registration)\nlet currentFile = \"\";\n\nexport function setCurrentFile(file: string): void {\n currentFile = file;\n}\n\nexport function getCurrentFile(): string {\n return currentFile;\n}\n"],"mappings":";;;;;;;AAGA,YAAY,QAAQ;AACpB,YAAY,UAAU;AACtB,SAAS,qBAAqB;AAG9B,IAAM,uBAAuB;AAAA,EAC3B;AAAA,EACA;AAAA,EACA;AACF;AAEA,IAAM,iBAAyC;AAAA,EAC7C,UAAU,CAAC;AAAA,EACX,UAAU;AAAA,EACV,mBAAmB;AAAA,EACnB,SAAS;AAAA,EACT,eAAe;AACjB;AAKO,SAAS,aAAa,QAAoC;AAC/D,SAAO;AACT;AAKA,eAAsB,WACpB,YACiC;AACjC,QAAM,MAAM,QAAQ,IAAI;AAGxB,MAAI,YAAY;AACd,UAAM,WAAgB,aAAQ,KAAK,UAAU;AAC7C,WAAO,eAAe,QAAQ;AAAA,EAChC;AAGA,aAAW,YAAY,sBAAsB;AAC3C,UAAM,WAAgB,UAAK,KAAK,QAAQ;AACxC,QAAI;AACF,YAAS,UAAO,QAAQ;AACxB,aAAO,eAAe,QAAQ;AAAA,IAChC,QAAQ;AAAA,IAER;AAAA,EACF;AAGA,SAAO;AACT;AAEA,eAAe,eACb,UACiC;AACjC,MAAI;AAGF,UAAM,UAAU,cAAc,QAAQ,EAAE;AACxC,UAAM,SAAU,MAAM,OAAO;AAE7B,UAAM,aAAa,OAAO,WAAW,CAAC;AAEtC,WAAO;AAAA,MACL,GAAG;AAAA,MACH,GAAG;AAAA,MACH,UAAU;AAAA,QACR,GAAG,eAAe;AAAA,QAClB,GAAG,WAAW;AAAA,MAChB;AAAA,IACF;AAAA,EACF,SAAS,KAAK;AACZ,YAAQ,MAAM,8BAA8B,QAAQ,KAAK,GAAG;AAC5D,WAAO;AAAA,EACT;AACF;;;AC7EA,YAAYA,WAAU;AACtB,SAAS,iBAAAC,sBAAqB;AAC9B,SAAS,YAAY;AAIrB,IAAM,eAAiC,CAAC;AAKjC,SAAS,aACd,MACA,SACA,MACM;AACN,eAAa,KAAK;AAAA,IAChB;AAAA;AAAA,IAEA;AAAA,IACA;AAAA,EACF,CAAC;AACH;AAKO,SAAS,gBAAsB;AACpC,eAAa,SAAS;AACxB;AAKO,SAAS,qBAAuC;AACrD,SAAO,CAAC,GAAG,YAAY;AACzB;AAKA,eAAsB,cACpB,UAC2B;AAC3B,QAAM,MAAM,QAAQ,IAAI;AACxB,QAAM,UAAe,cAAQ,KAAK,QAAQ;AAG1C,QAAM,UAAe,WAAK,SAAS,uBAAuB;AAC1D,QAAM,QAAQ,MAAM,KAAK,SAAS,EAAE,UAAU,KAAK,CAAC;AAEpD,MAAI,MAAM,WAAW,GAAG;AACtB,YAAQ,KAAK,0BAA0B,QAAQ,EAAE;AACjD,WAAO,CAAC;AAAA,EACV;AAGA,gBAAc;AAGd,aAAW,QAAQ,OAAO;AACxB,QAAI;AAEF,qBAAe,IAAI;AACnB,YAAM,UAAUA,eAAc,IAAI,EAAE;AACpC,YAAM,OAAO;AAAA,IACf,SAAS,KAAK;AACZ,cAAQ,MAAM,4BAA4B,IAAI,KAAK,GAAG;AAAA,IACxD;AAAA,EACF;AAEA,SAAO,mBAAmB;AAC5B;AAGA,IAAI,cAAc;AAEX,SAAS,eAAe,MAAoB;AACjD,gBAAc;AACh
B;AAEO,SAAS,iBAAyB;AACvC,SAAO;AACT;","names":["path","pathToFileURL"]}
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#!/usr/bin/env node
|