@aliou/pi-evals 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -2
- package/skill/SKILL.md +160 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aliou/pi-evals",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "Eval framework for pi coding agent",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -32,7 +32,8 @@
|
|
|
32
32
|
"node": ">=20"
|
|
33
33
|
},
|
|
34
34
|
"files": [
|
|
35
|
-
"dist"
|
|
35
|
+
"dist",
|
|
36
|
+
"skill"
|
|
36
37
|
],
|
|
37
38
|
"keywords": [
|
|
38
39
|
"pi",
|
package/skill/SKILL.md
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pi-evals
|
|
3
|
+
description: Write and run evals for pi extensions and agent behavior using @aliou/pi-evals. Use when creating eval files, writing custom scorers, configuring eval runs, or testing that pi extensions work correctly.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# pi-evals
|
|
7
|
+
|
|
8
|
+
Eval framework for testing pi coding agent behavior. Runs prompts against pi via `createAgentSession`, then scores the results.
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
|
|
12
|
+
Install:
|
|
13
|
+
```bash
|
|
14
|
+
pnpm add -D @aliou/pi-evals
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Create `pi-evals.config.ts` at the project root:
|
|
18
|
+
```typescript
|
|
19
|
+
import { defineConfig } from "@aliou/pi-evals";
|
|
20
|
+
|
|
21
|
+
export default defineConfig({
|
|
22
|
+
defaults: {
|
|
23
|
+
model: "claude-haiku-4-5",
|
|
24
|
+
provider: "anthropic",
|
|
25
|
+
},
|
|
26
|
+
evalsDir: "./evals",
|
|
27
|
+
timeout: 60_000,
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Create an eval file in `evals/`:
|
|
32
|
+
```typescript
|
|
33
|
+
// evals/hello.eval.ts
|
|
34
|
+
import { evaluate, Scorers } from "@aliou/pi-evals";
|
|
35
|
+
|
|
36
|
+
evaluate("Create hello file", {
|
|
37
|
+
config: {
|
|
38
|
+
model: "claude-haiku-4-5",
|
|
39
|
+
provider: "anthropic",
|
|
40
|
+
},
|
|
41
|
+
data: [
|
|
42
|
+
{
|
|
43
|
+
input: 'Create a file called hello.txt containing "Hello World"',
|
|
44
|
+
expected: { files: { "hello.txt": "Hello World" } },
|
|
45
|
+
},
|
|
46
|
+
],
|
|
47
|
+
scorers: [Scorers.files()],
|
|
48
|
+
timeout: 30_000,
|
|
49
|
+
});
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Run:
|
|
53
|
+
```bash
|
|
54
|
+
pnpm pi-evals # all evals
|
|
55
|
+
pnpm pi-evals --filter "hello" # by name substring
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Eval File Structure
|
|
59
|
+
|
|
60
|
+
Eval files are `*.eval.ts` files in the configured `evalsDir`. Each calls `evaluate()` to register one eval.
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
evaluate("Eval name", {
|
|
64
|
+
config: { model, provider, extensions?, env? },
|
|
65
|
+
data: [{ input, expected?, setup?, timeout? }],
|
|
66
|
+
scorers: [...],
|
|
67
|
+
timeout?: number,
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Test Cases (`data`)
|
|
72
|
+
|
|
73
|
+
Each test case runs in an isolated temp directory.
|
|
74
|
+
|
|
75
|
+
- `input`: prompt sent to the agent
|
|
76
|
+
- `expected`: optional expected outcome (used by scorers)
|
|
77
|
+
- `setup.files`: files to pre-create in the workspace (`{ "path": "content" }`)
|
|
78
|
+
- `setup.commands`: shell commands to run before the eval
|
|
79
|
+
- `timeout`: override timeout for this case
|
|
80
|
+
|
|
81
|
+
### Config (`config`)
|
|
82
|
+
|
|
83
|
+
- `model`: model name (e.g. `"claude-haiku-4-5"`)
|
|
84
|
+
- `provider`: provider name (e.g. `"anthropic"`, `"github"`)
|
|
85
|
+
- `extensions`: array of extension paths, resolved relative to `process.cwd()`
|
|
86
|
+
- `env`: environment variables to set
|
|
87
|
+
|
|
88
|
+
## Built-in Scorers
|
|
89
|
+
|
|
90
|
+
All scorers are accessed via `Scorers.*`:
|
|
91
|
+
|
|
92
|
+
| Scorer | Description |
|
|
93
|
+
|--------|-------------|
|
|
94
|
+
| `Scorers.files()` | Checks `expected.files` exist with matching content (substring) |
|
|
95
|
+
| `Scorers.outputContains()` | Checks `expected.output` is a substring of agent output |
|
|
96
|
+
| `Scorers.outputMatches(regex)` | Checks agent output matches a regex |
|
|
97
|
+
| `Scorers.toolCalled(name)` | Checks a tool was called by name |
|
|
98
|
+
| `Scorers.toolCalledWith(name, args)` | Checks a tool was called with specific args |
|
|
99
|
+
| `Scorers.bash(command, opts?)` | Runs a shell command in the workspace, checks exit code |
|
|
100
|
+
| `Scorers.llmJudge({ criteria })` | Uses an LLM to evaluate the output against criteria |
|
|
101
|
+
|
|
102
|
+
## Custom Scorers
|
|
103
|
+
|
|
104
|
+
A scorer is an object with `name` and `score(ctx) => ScoreResult`:
|
|
105
|
+
|
|
106
|
+
```typescript
|
|
107
|
+
import type { Scorer } from "@aliou/pi-evals";
|
|
108
|
+
|
|
109
|
+
const myScorer: Scorer = {
|
|
110
|
+
name: "my_scorer",
|
|
111
|
+
async score(ctx) {
|
|
112
|
+
// ctx.input - the prompt
|
|
113
|
+
// ctx.output - agent's final text response
|
|
114
|
+
// ctx.cwd - workspace directory
|
|
115
|
+
// ctx.toolCalls - array of { name, args }
|
|
116
|
+
// ctx.messages - full conversation
|
|
117
|
+
// ctx.expected - the expected object from the test case
|
|
118
|
+
// ctx.stats - { tokens: { input, output, total }, cost }
|
|
119
|
+
return {
|
|
120
|
+
name: "my_scorer",
|
|
121
|
+
score: 1, // 0 to 1, >= 0.5 passes
|
|
122
|
+
reason: "Looks good",
|
|
123
|
+
};
|
|
124
|
+
},
|
|
125
|
+
};
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Testing Extensions
|
|
129
|
+
|
|
130
|
+
Pass extension paths in `config.extensions`. Paths resolve relative to `process.cwd()` (the project root), not the temp workspace.
|
|
131
|
+
|
|
132
|
+
```typescript
|
|
133
|
+
evaluate("My extension eval", {
|
|
134
|
+
config: {
|
|
135
|
+
model: "claude-haiku-4-5",
|
|
136
|
+
provider: "anthropic",
|
|
137
|
+
extensions: ["./extensions/my-ext/index.ts"],
|
|
138
|
+
},
|
|
139
|
+
data: [
|
|
140
|
+
{ input: "Use the custom tool provided by my extension." },
|
|
141
|
+
],
|
|
142
|
+
scorers: [Scorers.toolCalled("my_custom_tool")],
|
|
143
|
+
});
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## CLI Options
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
-f, --filter <pattern> Filter evals by name substring
|
|
150
|
+
-t, --threshold <pct> Minimum pass percentage to exit 0
|
|
151
|
+
-c, --config <path> Config file path (default: pi-evals.config.ts)
|
|
152
|
+
-m, --model <model> Override model (env: PI_EVAL_MODEL)
|
|
153
|
+
-p, --provider <name> Override provider (env: PI_EVAL_PROVIDER)
|
|
154
|
+
-v, --verbose Detailed output
|
|
155
|
+
--json Output results as JSON
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Session Behavior
|
|
159
|
+
|
|
160
|
+
Each eval test case runs in an isolated temp directory. Sessions use in-memory storage and are not persisted to the user's session directory.
|