@aliou/pi-evals 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +3 -2
  2. package/skill/SKILL.md +160 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aliou/pi-evals",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Eval framework for pi coding agent",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -32,7 +32,8 @@
32
32
  "node": ">=20"
33
33
  },
34
34
  "files": [
35
- "dist"
35
+ "dist",
36
+ "skill"
36
37
  ],
37
38
  "keywords": [
38
39
  "pi",
package/skill/SKILL.md ADDED
@@ -0,0 +1,160 @@
1
+ ---
2
+ name: pi-evals
3
+ description: Write and run evals for pi extensions and agent behavior using @aliou/pi-evals. Use when creating eval files, writing custom scorers, configuring eval runs, or testing that pi extensions work correctly.
4
+ ---
5
+
6
+ # pi-evals
7
+
8
+ Eval framework for testing pi coding agent behavior. Runs prompts against pi via `createAgentSession`, then scores the results.
9
+
10
+ ## Quick Start
11
+
12
+ Install:
13
+ ```bash
14
+ pnpm add -D @aliou/pi-evals
15
+ ```
16
+
17
+ Create `pi-evals.config.ts` at the project root:
18
+ ```typescript
19
+ import { defineConfig } from "@aliou/pi-evals";
20
+
21
+ export default defineConfig({
22
+ defaults: {
23
+ model: "claude-haiku-4-5",
24
+ provider: "anthropic",
25
+ },
26
+ evalsDir: "./evals",
27
+ timeout: 60_000,
28
+ });
29
+ ```
30
+
31
+ Create an eval file in `evals/`:
32
+ ```typescript
33
+ // evals/hello.eval.ts
34
+ import { evaluate, Scorers } from "@aliou/pi-evals";
35
+
36
+ evaluate("Create hello file", {
37
+ config: {
38
+ model: "claude-haiku-4-5",
39
+ provider: "anthropic",
40
+ },
41
+ data: [
42
+ {
43
+ input: 'Create a file called hello.txt containing "Hello World"',
44
+ expected: { files: { "hello.txt": "Hello World" } },
45
+ },
46
+ ],
47
+ scorers: [Scorers.files()],
48
+ timeout: 30_000,
49
+ });
50
+ ```
51
+
52
+ Run:
53
+ ```bash
54
+ pnpm pi-evals # all evals
55
+ pnpm pi-evals --filter "hello" # by name substring
56
+ ```
57
+
58
+ ## Eval File Structure
59
+
60
+ Eval files are `*.eval.ts` files in the configured `evalsDir`. Each calls `evaluate()` to register one eval.
61
+
62
+ ```typescript
63
+ evaluate("Eval name", {
64
+ config: { model, provider, extensions?, env? },
65
+ data: [{ input, expected?, setup?, timeout? }],
66
+ scorers: [...],
67
+ timeout?: number,
68
+ });
69
+ ```
70
+
71
+ ### Test Cases (`data`)
72
+
73
+ Each test case runs in an isolated temp directory.
74
+
75
+ - `input`: prompt sent to the agent
76
+ - `expected`: optional expected outcome (used by scorers)
77
+ - `setup.files`: files to pre-create in the workspace (`{ "path": "content" }`)
78
+ - `setup.commands`: shell commands to run before the eval
79
+ - `timeout`: override timeout for this case
80
+
81
+ ### Config (`config`)
82
+
83
+ - `model`: model name (e.g. `"claude-haiku-4-5"`)
84
+ - `provider`: provider name (e.g. `"anthropic"`, `"github"`)
85
+ - `extensions`: array of extension paths, resolved relative to `process.cwd()`
86
+ - `env`: environment variables to set
87
+
88
+ ## Built-in Scorers
89
+
90
+ All scorers are accessed via `Scorers.*`:
91
+
92
+ | Scorer | Description |
93
+ |--------|-------------|
94
+ | `Scorers.files()` | Checks `expected.files` exist with matching content (substring) |
95
+ | `Scorers.outputContains()` | Checks `expected.output` is a substring of agent output |
96
+ | `Scorers.outputMatches(regex)` | Checks agent output matches a regex |
97
+ | `Scorers.toolCalled(name)` | Checks a tool was called by name |
98
+ | `Scorers.toolCalledWith(name, args)` | Checks a tool was called with specific args |
99
+ | `Scorers.bash(command, opts?)` | Runs a shell command in the workspace, checks exit code |
100
+ | `Scorers.llmJudge({ criteria })` | Uses an LLM to evaluate the output against criteria |
101
+
102
+ ## Custom Scorers
103
+
104
+ A scorer is an object with `name` and `score(ctx) => ScoreResult`:
105
+
106
+ ```typescript
107
+ import type { Scorer } from "@aliou/pi-evals";
108
+
109
+ const myScorer: Scorer = {
110
+ name: "my_scorer",
111
+ async score(ctx) {
112
+ // ctx.input - the prompt
113
+ // ctx.output - agent's final text response
114
+ // ctx.cwd - workspace directory
115
+ // ctx.toolCalls - array of { name, args }
116
+ // ctx.messages - full conversation
117
+ // ctx.expected - the expected object from the test case
118
+ // ctx.stats - { tokens: { input, output, total }, cost }
119
+ return {
120
+ name: "my_scorer",
121
+ score: 1, // 0 to 1, >= 0.5 passes
122
+ reason: "Looks good",
123
+ };
124
+ },
125
+ };
126
+ ```
127
+
128
+ ## Testing Extensions
129
+
130
+ Pass extension paths in `config.extensions`. Paths resolve relative to `process.cwd()` (the project root), not the temp workspace.
131
+
132
+ ```typescript
133
+ evaluate("My extension eval", {
134
+ config: {
135
+ model: "claude-haiku-4-5",
136
+ provider: "anthropic",
137
+ extensions: ["./extensions/my-ext/index.ts"],
138
+ },
139
+ data: [
140
+ { input: "Use the custom tool provided by my extension." },
141
+ ],
142
+ scorers: [Scorers.toolCalled("my_custom_tool")],
143
+ });
144
+ ```
145
+
146
+ ## CLI Options
147
+
148
+ ```
149
+ -f, --filter <pattern> Filter evals by name substring
150
+ -t, --threshold <pct> Minimum pass percentage to exit 0
151
+ -c, --config <path> Config file path (default: pi-evals.config.ts)
152
+ -m, --model <model> Override model (env: PI_EVAL_MODEL)
153
+ -p, --provider <name> Override provider (env: PI_EVAL_PROVIDER)
154
+ -v, --verbose Detailed output
155
+ --json Output results as JSON
156
+ ```
157
+
158
+ ## Session Behavior
159
+
160
+ Each eval test case runs in an isolated temp directory. Sessions use in-memory storage and are not persisted to the user's session directory.