@aliou/pi-evals 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,205 @@
1
+ # @aliou/pi-evals
2
+
3
+ Eval framework for the pi coding agent.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pnpm add @aliou/pi-evals
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ Create an eval file in `evals/`:
14
+
15
+ ```typescript
16
+ // evals/hello.eval.ts
17
+ import { evaluate, Scorers } from "@aliou/pi-evals";
18
+
19
+ evaluate("Create hello file", {
20
+ config: {
21
+ model: "claude-sonnet-4-20250514",
22
+ provider: "anthropic",
23
+ },
24
+ data: [
25
+ {
26
+ input: 'Create a file called hello.txt containing "Hello World"',
27
+ expected: { files: { "hello.txt": "Hello World" } },
28
+ },
29
+ ],
30
+ scorers: [Scorers.files()],
31
+ });
32
+ ```
33
+
34
+ Run evals:
35
+
36
+ ```bash
37
+ npx pi-evals
38
+ ```
39
+
40
+ ## Configuration
41
+
42
+ Create `pi-evals.config.ts`:
43
+
44
+ ```typescript
45
+ import { defineConfig } from "@aliou/pi-evals";
46
+
47
+ export default defineConfig({
48
+ defaults: {
49
+ model: "claude-sonnet-4-20250514",
50
+ provider: "anthropic",
51
+ },
52
+ evalsDir: "./evals",
53
+ delayBetweenTests: 500,
54
+ timeout: 60_000,
55
+ warnTestCount: 30,
56
+ });
57
+ ```
58
+
59
+ ## CLI Options
60
+
61
+ ```
62
+ pi-evals [options]
63
+
64
+ Options:
65
+ -h, --help Show help
66
+ -f, --filter <pattern> Filter evals by name
67
+ -t, --threshold <pct> Minimum pass percentage to exit 0
68
+ -c, --config <path> Config file path
69
+ -m, --model <model> Override model
70
+ -p, --provider <name> Override provider
71
+ -v, --verbose Verbose output
72
+ --json Output results as JSON
73
+
74
+ Environment Variables:
75
+ PI_EVAL_MODEL Override model (lower priority than -m)
76
+ PI_EVAL_PROVIDER Override provider (lower priority than -p)
77
+ ```
78
+
79
+ Examples:
80
+ ```bash
81
+ pi-evals # Run all evals
82
+ pi-evals -p github -m gpt-4o # Use GitHub Models
83
+ PI_EVAL_PROVIDER=github pi-evals # Via env var
84
+ ```
85
+
86
+ ## Built-in Scorers
87
+
88
+ ### `Scorers.files()`
89
+
90
+ Checks that the expected files exist with the expected content.
91
+
92
+ ```typescript
93
+ {
94
+ expected: { files: { "hello.txt": "Hello World" } },
95
+ scorers: [Scorers.files()],
96
+ }
97
+ ```
98
+
99
+ ### `Scorers.outputContains()`
100
+
101
+ Checks that the agent's output contains the expected substring.
102
+
103
+ ```typescript
104
+ {
105
+ expected: { output: "created file" },
106
+ scorers: [Scorers.outputContains()],
107
+ }
108
+ ```
109
+
110
+ ### `Scorers.outputMatches(pattern)`
111
+
112
+ Checks that the agent's output matches a regex.
113
+
114
+ ```typescript
115
+ {
116
+ scorers: [Scorers.outputMatches(/function \w+\(/)],
117
+ }
118
+ ```
119
+
120
+ ### `Scorers.bash(command, options?)`
121
+
122
+ Runs a command and checks the exit code.
123
+
124
+ ```typescript
125
+ {
126
+ scorers: [Scorers.bash("npm test")],
127
+ }
128
+ ```
129
+
130
+ Options:
131
+ - `exitCode`: Expected exit code (default: 0)
132
+ - `timeout`: Command timeout in ms (default: 30000)
133
+
134
+ ### `Scorers.llmJudge(options)`
135
+
136
+ Uses an LLM to evaluate the output.
137
+
138
+ ```typescript
139
+ {
140
+ scorers: [
141
+ Scorers.llmJudge({
142
+ criteria: "The response correctly explains the solution",
143
+ model: "gpt-4o-mini", // optional
144
+ provider: "openai", // optional
145
+ }),
146
+ ],
147
+ }
148
+ ```
149
+
150
+ ## Test Case Options
151
+
152
+ ```typescript
153
+ {
154
+ input: "Create a file",
155
+ expected: { files: { "file.txt": "content" } },
156
+ setup: {
157
+ files: { "existing.txt": "existing content" },
158
+ commands: ["npm init -y"],
159
+ },
160
+ timeout: 30_000,
161
+ only: false, // Run only this test
162
+ skip: false, // Skip this test
163
+ }
164
+ ```
165
+
166
+ ## Custom Scorers
167
+
168
+ ```typescript
169
+ const customScorer: Scorer = {
170
+ name: "custom",
171
+ score: async (ctx) => {
172
+ const fileExists = await fs.access(path.join(ctx.cwd, "output.txt"))
173
+ .then(() => true)
174
+ .catch(() => false);
175
+
176
+ return {
177
+ name: "custom",
178
+ score: fileExists ? 1 : 0,
179
+ reason: fileExists ? "File exists" : "File not found",
180
+ };
181
+ },
182
+ };
183
+ ```
184
+
185
+ ## CI Integration
186
+
187
+ ```yaml
188
+ - name: Run evals
189
+ env:
190
+ PI_EVAL_PROVIDER: github
191
+ PI_EVAL_MODEL: gpt-4o
192
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
193
+ run: npx pi-evals --json > results.json
194
+
195
+ - name: Check results
196
+ run: |
197
+ PASSED=$(jq '.passed' results.json)
198
+ TOTAL=$(jq '.total' results.json)
199
+ echo "Passed: $PASSED/$TOTAL"
200
+ if [ "$PASSED" -lt "$TOTAL" ]; then exit 1; fi
201
+ ```
202
+
203
+ ## License
204
+
205
+ MIT
@@ -0,0 +1,117 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __export = (target, all) => {
3
+ for (var name in all)
4
+ __defProp(target, name, { get: all[name], enumerable: true });
5
+ };
6
+
7
+ // src/config.ts
8
+ import * as fs from "fs/promises";
9
+ import * as path from "path";
10
+ import { pathToFileURL } from "url";
11
// Config file names probed in the working directory; loadConfig walks this
// list in order and uses the first one that exists.
var DEFAULT_CONFIG_FILES = ["pi-evals.config.ts", "pi-evals.config.js", "pi-evals.config.mjs"];

// Built-in configuration used when no config file is found, and as the base
// that a user config is merged over.
var DEFAULT_CONFIG = {
  // Per-eval defaults (the README shows `model` and `provider` here).
  defaults: {},
  // Directory searched for eval files.
  evalsDir: "./evals",
  // NOTE(review): presumably milliseconds - confirm against the runner.
  delayBetweenTests: 500,
  // NOTE(review): presumably milliseconds (60 seconds) - confirm against the runner.
  timeout: 60000,
  warnTestCount: 30
};
23
/**
 * Identity helper for authoring a config file.
 *
 * Performs no validation or merging at runtime: the object passed in is
 * handed back untouched (same reference). Defaults are applied later, when
 * the config file is loaded.
 *
 * @param config The user-supplied configuration object.
 * @returns The same `config` object.
 */
function defineConfig(config) {
  return config;
}
26
/**
 * Resolve the effective configuration for this run.
 *
 * Resolution order:
 *   1. If `configPath` is given, only that file is loaded (resolved against
 *      the current working directory).
 *   2. Otherwise each name in DEFAULT_CONFIG_FILES is probed in the current
 *      working directory and the first one that is accessible is loaded.
 *   3. If none exists, the built-in defaults are returned.
 *
 * @param configPath Optional path to a config file.
 * @returns A promise for the configuration object. When the defaults are
 *   returned, it is a fresh copy, so callers may mutate the result without
 *   affecting later calls.
 */
async function loadConfig(configPath) {
  const cwd = process.cwd();

  if (configPath) {
    return loadConfigFile(path.resolve(cwd, configPath));
  }

  for (const filename of DEFAULT_CONFIG_FILES) {
    const candidate = path.join(cwd, filename);
    try {
      await fs.access(candidate);
    } catch {
      // Not present (or not accessible): try the next candidate name.
      continue;
    }
    return loadConfigFile(candidate);
  }

  // Hand out a copy rather than the module-level object: the defaults are
  // shared state, and a caller mutating the returned config (for example to
  // apply overrides) must not change what the next call sees.
  return { ...DEFAULT_CONFIG, defaults: { ...DEFAULT_CONFIG.defaults } };
}
42
/**
 * Import a config module and merge its default export over DEFAULT_CONFIG.
 *
 * Top-level keys from the user config replace the defaults; the nested
 * `defaults` object is merged key by key. Keys whose value is `undefined`
 * are ignored, so an entry such as `timeout: undefined` falls back to the
 * built-in value instead of erasing it.
 *
 * Loading is best-effort: if the import fails, the error is logged and a
 * copy of the built-in defaults is returned.
 *
 * NOTE(review): the file is loaded with a plain dynamic `import()`, so a
 * `.ts` config presumably needs a runtime that can import TypeScript
 * (e.g. tsx/ts-node) - confirm.
 *
 * @param filePath Absolute path of the config file to import.
 * @returns A promise for the merged configuration object.
 */
async function loadConfigFile(filePath) {
  // Copy of `obj` without the keys explicitly set to `undefined`; spreading
  // such keys would otherwise overwrite a default with `undefined`.
  const withoutUndefined = (obj) =>
    Object.fromEntries(
      Object.entries(obj ?? {}).filter(([, value]) => value !== undefined)
    );

  try {
    // Dynamic import() needs a file:// URL for absolute paths to work on
    // every platform.
    const fileUrl = pathToFileURL(filePath).href;
    const loaded = await import(fileUrl);
    const userConfig = withoutUndefined(loaded.default);
    return {
      ...DEFAULT_CONFIG,
      ...userConfig,
      defaults: {
        ...DEFAULT_CONFIG.defaults,
        ...withoutUndefined(userConfig.defaults)
      }
    };
  } catch (err) {
    console.error(`Failed to load config from ${filePath}:`, err);
    // Return a copy so callers cannot mutate the shared module-level defaults.
    return { ...DEFAULT_CONFIG, defaults: { ...DEFAULT_CONFIG.defaults } };
  }
}
60
+
61
+ // src/discovery.ts
62
+ import * as path2 from "path";
63
+ import { pathToFileURL as pathToFileURL2 } from "url";
64
+ import { glob } from "glob";
65
// Module-level list of eval definitions; it fills up as eval files are
// imported and register themselves.
var evalRegistry = [];

/**
 * Append one eval definition to the registry.
 *
 * @param name Display name of the eval.
 * @param options Options object supplied for the eval; stored as-is.
 * @param file Path of the file the eval was registered from.
 */
function registerEval(name, options, file) {
  const definition = { name, options, file };
  evalRegistry.push(definition);
}
74
/**
 * Empty the eval registry in place (the array object itself is kept, so
 * existing references to it stay valid).
 */
function clearRegistry() {
  evalRegistry.splice(0, evalRegistry.length);
}
77
/**
 * Snapshot of the registry.
 *
 * @returns A new array holding the currently registered eval definitions;
 *   adding to or removing from the returned array does not affect the
 *   registry (the definition objects themselves are shared).
 */
function getRegisteredEvals() {
  return Array.from(evalRegistry);
}
80
/**
 * Find and import every eval file under `evalsDir`.
 *
 * Files matching `**\/*.eval.{ts,js,mjs}` are imported one at a time.
 * Importing a file is expected to trigger its registration calls, which
 * populate the registry; the "current file" is set before each import so
 * registrations can be attributed to the file being loaded. A file that
 * fails to import is logged and skipped - it does not abort discovery.
 *
 * @param evalsDir Directory to search, resolved against the current working
 *   directory.
 * @returns A promise for the eval definitions registered while loading, or
 *   an empty array (after a warning) when no eval files are found.
 */
async function discoverEvals(evalsDir) {
  const fullDir = path2.resolve(process.cwd(), evalsDir);

  // Pass the directory via `cwd` instead of joining it into the pattern.
  // Glob patterns must use forward slashes (a backslash is an escape
  // character), so a path.join()-built pattern breaks on Windows, and glob
  // syntax characters in the directory path would be read as pattern syntax.
  const files = await glob("**/*.eval.{ts,js,mjs}", {
    cwd: fullDir,
    absolute: true
  });

  if (files.length === 0) {
    console.warn(`No eval files found in ${evalsDir}`);
    return [];
  }

  // Load in a stable order so results do not depend on directory traversal order.
  files.sort();

  // Start from an empty registry so the result reflects only this discovery run.
  clearRegistry();

  for (const file of files) {
    try {
      setCurrentFile(file);
      const fileUrl = pathToFileURL2(file).href;
      await import(fileUrl);
    } catch (err) {
      console.error(`Failed to load eval file ${file}:`, err);
    }
  }

  return getRegisteredEvals();
}
101
// Path of the eval file currently being imported; empty string until a file
// has been set.
var currentFile = "";

/**
 * Record which eval file is being loaded.
 *
 * @param file Path of the file about to be imported.
 */
function setCurrentFile(file) {
  currentFile = file;
}

/**
 * @returns The path most recently passed to setCurrentFile, or "" if it has
 *   never been called.
 */
function getCurrentFile() {
  return currentFile;
}
108
+
109
+ export {
110
+ __export,
111
+ defineConfig,
112
+ loadConfig,
113
+ registerEval,
114
+ discoverEvals,
115
+ getCurrentFile
116
+ };
117
+ //# sourceMappingURL=chunk-342JG3E3.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/config.ts","../src/discovery.ts"],"sourcesContent":["/**\n * Config loading for pi-eval\n */\nimport * as fs from \"node:fs/promises\";\nimport * as path from \"node:path\";\nimport { pathToFileURL } from \"node:url\";\nimport type { GlobalConfig } from \"./types\";\n\nconst DEFAULT_CONFIG_FILES = [\n \"pi-evals.config.ts\",\n \"pi-evals.config.js\",\n \"pi-evals.config.mjs\",\n];\n\nconst DEFAULT_CONFIG: Required<GlobalConfig> = {\n defaults: {},\n evalsDir: \"./evals\",\n delayBetweenTests: 500,\n timeout: 60_000,\n warnTestCount: 30,\n};\n\n/**\n * Helper for defining config with type inference\n */\nexport function defineConfig(config: GlobalConfig): GlobalConfig {\n return config;\n}\n\n/**\n * Load config from file or return defaults\n */\nexport async function loadConfig(\n configPath?: string,\n): Promise<Required<GlobalConfig>> {\n const cwd = process.cwd();\n\n // If explicit path provided, use it\n if (configPath) {\n const fullPath = path.resolve(cwd, configPath);\n return loadConfigFile(fullPath);\n }\n\n // Try default config file names\n for (const filename of DEFAULT_CONFIG_FILES) {\n const fullPath = path.join(cwd, filename);\n try {\n await fs.access(fullPath);\n return loadConfigFile(fullPath);\n } catch {\n // File doesn't exist, try next\n }\n }\n\n // No config file found, return defaults\n return DEFAULT_CONFIG;\n}\n\nasync function loadConfigFile(\n filePath: string,\n): Promise<Required<GlobalConfig>> {\n try {\n // For TypeScript files, we need to compile or use a loader\n // For now, assume the file is pre-compiled or use tsx/ts-node\n const fileUrl = pathToFileURL(filePath).href;\n const module = (await import(fileUrl)) as { default?: GlobalConfig };\n\n const userConfig = module.default ?? 
{};\n\n return {\n ...DEFAULT_CONFIG,\n ...userConfig,\n defaults: {\n ...DEFAULT_CONFIG.defaults,\n ...userConfig.defaults,\n },\n };\n } catch (err) {\n console.error(`Failed to load config from ${filePath}:`, err);\n return DEFAULT_CONFIG;\n }\n}\n","/**\n * Eval file discovery\n */\n\nimport * as path from \"node:path\";\nimport { pathToFileURL } from \"node:url\";\nimport { glob } from \"glob\";\nimport type { EvalDefinition, EvalOptions } from \"./types\";\n\n// Global registry for evals (populated when eval files are imported)\nconst evalRegistry: EvalDefinition[] = [];\n\n/**\n * Register an eval definition (called by evaluate())\n */\nexport function registerEval<TExpected>(\n name: string,\n options: EvalOptions<TExpected>,\n file: string,\n): void {\n evalRegistry.push({\n name,\n // Cast to unknown first to avoid type overlap issues\n options: options as unknown as EvalOptions,\n file,\n });\n}\n\n/**\n * Clear the eval registry (for testing)\n */\nexport function clearRegistry(): void {\n evalRegistry.length = 0;\n}\n\n/**\n * Get all registered evals\n */\nexport function getRegisteredEvals(): EvalDefinition[] {\n return [...evalRegistry];\n}\n\n/**\n * Discover and load all eval files from a directory\n */\nexport async function discoverEvals(\n evalsDir: string,\n): Promise<EvalDefinition[]> {\n const cwd = process.cwd();\n const fullDir = path.resolve(cwd, evalsDir);\n\n // Find all *.eval.ts and *.eval.js files\n const pattern = path.join(fullDir, \"**/*.eval.{ts,js,mjs}\");\n const files = await glob(pattern, { absolute: true });\n\n if (files.length === 0) {\n console.warn(`No eval files found in ${evalsDir}`);\n return [];\n }\n\n // Clear registry before loading\n clearRegistry();\n\n // Import each file (this triggers evaluate() calls which register evals)\n for (const file of files) {\n try {\n // Set current file context for registration\n setCurrentFile(file);\n const fileUrl = pathToFileURL(file).href;\n await import(fileUrl);\n } catch 
(err) {\n console.error(`Failed to load eval file ${file}:`, err);\n }\n }\n\n return getRegisteredEvals();\n}\n\n// Track current file being loaded (for registration)\nlet currentFile = \"\";\n\nexport function setCurrentFile(file: string): void {\n currentFile = file;\n}\n\nexport function getCurrentFile(): string {\n return currentFile;\n}\n"],"mappings":";;;;;;;AAGA,YAAY,QAAQ;AACpB,YAAY,UAAU;AACtB,SAAS,qBAAqB;AAG9B,IAAM,uBAAuB;AAAA,EAC3B;AAAA,EACA;AAAA,EACA;AACF;AAEA,IAAM,iBAAyC;AAAA,EAC7C,UAAU,CAAC;AAAA,EACX,UAAU;AAAA,EACV,mBAAmB;AAAA,EACnB,SAAS;AAAA,EACT,eAAe;AACjB;AAKO,SAAS,aAAa,QAAoC;AAC/D,SAAO;AACT;AAKA,eAAsB,WACpB,YACiC;AACjC,QAAM,MAAM,QAAQ,IAAI;AAGxB,MAAI,YAAY;AACd,UAAM,WAAgB,aAAQ,KAAK,UAAU;AAC7C,WAAO,eAAe,QAAQ;AAAA,EAChC;AAGA,aAAW,YAAY,sBAAsB;AAC3C,UAAM,WAAgB,UAAK,KAAK,QAAQ;AACxC,QAAI;AACF,YAAS,UAAO,QAAQ;AACxB,aAAO,eAAe,QAAQ;AAAA,IAChC,QAAQ;AAAA,IAER;AAAA,EACF;AAGA,SAAO;AACT;AAEA,eAAe,eACb,UACiC;AACjC,MAAI;AAGF,UAAM,UAAU,cAAc,QAAQ,EAAE;AACxC,UAAM,SAAU,MAAM,OAAO;AAE7B,UAAM,aAAa,OAAO,WAAW,CAAC;AAEtC,WAAO;AAAA,MACL,GAAG;AAAA,MACH,GAAG;AAAA,MACH,UAAU;AAAA,QACR,GAAG,eAAe;AAAA,QAClB,GAAG,WAAW;AAAA,MAChB;AAAA,IACF;AAAA,EACF,SAAS,KAAK;AACZ,YAAQ,MAAM,8BAA8B,QAAQ,KAAK,GAAG;AAC5D,WAAO;AAAA,EACT;AACF;;;AC7EA,YAAYA,WAAU;AACtB,SAAS,iBAAAC,sBAAqB;AAC9B,SAAS,YAAY;AAIrB,IAAM,eAAiC,CAAC;AAKjC,SAAS,aACd,MACA,SACA,MACM;AACN,eAAa,KAAK;AAAA,IAChB;AAAA;AAAA,IAEA;AAAA,IACA;AAAA,EACF,CAAC;AACH;AAKO,SAAS,gBAAsB;AACpC,eAAa,SAAS;AACxB;AAKO,SAAS,qBAAuC;AACrD,SAAO,CAAC,GAAG,YAAY;AACzB;AAKA,eAAsB,cACpB,UAC2B;AAC3B,QAAM,MAAM,QAAQ,IAAI;AACxB,QAAM,UAAe,cAAQ,KAAK,QAAQ;AAG1C,QAAM,UAAe,WAAK,SAAS,uBAAuB;AAC1D,QAAM,QAAQ,MAAM,KAAK,SAAS,EAAE,UAAU,KAAK,CAAC;AAEpD,MAAI,MAAM,WAAW,GAAG;AACtB,YAAQ,KAAK,0BAA0B,QAAQ,EAAE;AACjD,WAAO,CAAC;AAAA,EACV;AAGA,gBAAc;AAGd,aAAW,QAAQ,OAAO;AACxB,QAAI;AAEF,qBAAe,IAAI;AACnB,YAAM,UAAUA,eAAc,IAAI,EAAE;AACpC,YAAM,OAAO;AAAA,IACf,SAAS,KAAK;AACZ,cAAQ,MAAM,4BAA4B,IAAI,KAAK,GAAG;AAAA,IACxD;AAAA,EACF;AAEA,SAAO,mBAAmB;AAC5B;AAGA,IAAI,cAAc;AAEX,SAAS,eAAe,MAAoB;AACjD,gBAAc;AACh
B;AAEO,SAAS,iBAAyB;AACvC,SAAO;AACT;","names":["path","pathToFileURL"]}
package/dist/cli.d.ts ADDED
@@ -0,0 +1 @@
1
+ #!/usr/bin/env node