katt 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +174 -0
- package/dist/index.js +566 -0
- package/package.json +46 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 raphaelpor
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# Katt
|
|
2
|
+
|
|
3
|
+
Katt is a lightweight testing framework for running AI Evals, inspired by [Jest](https://github.com/jestjs/jest).
|
|
4
|
+
|
|
5
|
+
<img src="docs/logo.png" alt="Katt logo" width="250" />
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Katt is designed to evaluate and validate the behavior of AI agents like **Claude Code**, **GitHub Copilot**, **OpenAI Codex** and more. It provides a simple, intuitive API for writing tests that interact with AI models and assert their responses.
|
|
10
|
+
|
|
11
|
+
## API Documentation
|
|
12
|
+
|
|
13
|
+
For a complete list of features and usage examples, see [docs/api-documentation.md](docs/api-documentation.md).
|
|
14
|
+
|
|
15
|
+
## Hello World - Example
|
|
16
|
+
|
|
17
|
+
```typescript
|
|
18
|
+
const result = await prompt("If you read this just say 'hello world'");
|
|
19
|
+
expect(result).toContain("hello world");
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
It also supports the familiar `describe` and `it` syntax for organizing tests:
|
|
23
|
+
|
|
24
|
+
```typescript
|
|
25
|
+
describe("Greeting agent", () => {
|
|
26
|
+
it("should say hello world", async () => {
|
|
27
|
+
const result = await prompt("If you read this just say 'hello world'");
|
|
28
|
+
expect(result).toContain("hello world");
|
|
29
|
+
});
|
|
30
|
+
});
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Main Features
|
|
34
|
+
|
|
35
|
+
- **Simple Testing API**: Familiar `describe` and `it` syntax for organizing tests
|
|
36
|
+
- **AI Interaction and Verification**: Built-in `prompt()` and `promptFile()` functions for sending prompts to AI agents, plus a `promptCheck()` matcher for evaluating their responses
|
|
37
|
+
- **Classification Matcher**: Built-in `toBeClassifiedAs()` matcher to grade a response against a target label on a 1-5 scale
|
|
38
|
+
- **Concurrent Execution**: Runs eval files concurrently for faster test execution
|
|
39
|
+
- **Model Selection**: Support for specifying custom AI models
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
### Basic Usage
|
|
44
|
+
|
|
45
|
+
1. Create a file with the `.eval.ts` or `.eval.js` extension and write your tests.
|
|
46
|
+
```typescript
|
|
47
|
+
const result = await prompt("If you read this just say 'hello world'");
|
|
48
|
+
expect(result).toContain("hello world");
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
2. Run Katt from your project directory:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
npx katt
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Using promptFile
|
|
58
|
+
|
|
59
|
+
Load prompts from external files:
|
|
60
|
+
|
|
61
|
+
```javascript
|
|
62
|
+
// test.eval.js
|
|
63
|
+
describe("Working with files", () => {
|
|
64
|
+
it("should load the file and respond", async () => {
|
|
65
|
+
const result = await promptFile("./myPrompt.md");
|
|
66
|
+
expect(result).toContain("expected response");
|
|
67
|
+
});
|
|
68
|
+
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Specifying AI Models
|
|
72
|
+
|
|
73
|
+
You can specify a custom model for your prompts:
|
|
74
|
+
|
|
75
|
+
```javascript
|
|
76
|
+
describe("Model selection", () => {
|
|
77
|
+
it("should use a specific model", async () => {
|
|
78
|
+
const promptString = "You are a helpful agent. Say hi and ask what you could help the user with.";
|
|
79
|
+
const result = await prompt(promptString, { model: "gpt-5.2" });
|
|
80
|
+
|
|
81
|
+
expect(result).promptCheck("It should be friendly and helpful");
|
|
82
|
+
});
|
|
83
|
+
});
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
You can also set a default model for the project by adding a `katt.json` file in the project root:
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"copilot": {
|
|
91
|
+
"model": "gpt-5-mini"
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
When this file exists:
|
|
97
|
+
|
|
98
|
+
- `prompt("...")` and `promptFile("...")` use `copilot.model` by default
|
|
99
|
+
- `prompt("...", { model: "..." })` still overrides the config value
|
|
100
|
+
|
|
101
|
+
## Development
|
|
102
|
+
|
|
103
|
+
### Setup
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
npm install
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Available Scripts
|
|
110
|
+
|
|
111
|
+
- `npm run dev` - Run the CLI in development mode
|
|
112
|
+
- `npm run build` - Build the project
|
|
113
|
+
- `npm run test` - Run tests
|
|
114
|
+
- `npm run typecheck` - Run TypeScript type checking
|
|
115
|
+
- `npm run format` - Format code using Biome
|
|
116
|
+
- `npm run lint` - Lint code using Biome
|
|
117
|
+
- `npm run test:build` - Test the built CLI
|
|
118
|
+
|
|
119
|
+
### Verification Process
|
|
120
|
+
|
|
121
|
+
After making changes, run the following sequence:
|
|
122
|
+
|
|
123
|
+
1. `npm run format`
|
|
124
|
+
2. `npm run typecheck`
|
|
125
|
+
3. `npm run test`
|
|
126
|
+
4. `npm run build`
|
|
127
|
+
5. `npm run test:build`
|
|
128
|
+
|
|
129
|
+
## Project Structure
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
katt/
|
|
133
|
+
├── src/ # Source code
|
|
134
|
+
│ ├── cli/ # CLI implementation
|
|
135
|
+
│ ├── lib/ # Core libraries (describe, it, expect, prompt)
|
|
136
|
+
│ └── types/ # TypeScript type definitions
|
|
137
|
+
├── examples/ # Example eval files
|
|
138
|
+
├── specs/ # Markdown specifications
|
|
139
|
+
├── package.json # Package configuration
|
|
140
|
+
└── tsconfig.json # TypeScript configuration
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## How It Works
|
|
144
|
+
|
|
145
|
+
1. Katt searches the current directory recursively for `*.eval.js` and `*.eval.ts` files
|
|
146
|
+
2. It skips `.git` and `node_modules` directories
|
|
147
|
+
3. Found eval files are imported and executed concurrently
|
|
148
|
+
4. Tests registered with `describe()` and `it()` are collected and run
|
|
149
|
+
5. The duration of each test is printed after execution
|
|
150
|
+
6. A summary is displayed showing passed/failed tests and total duration
|
|
151
|
+
7. Katt exits with code `0` on success or `1` on failure
|
|
152
|
+
|
|
153
|
+
## Requirements
|
|
154
|
+
|
|
155
|
+
- Node.js
|
|
156
|
+
- GitHub Copilot CLI installed (see [GitHub Copilot CLI installation docs](https://docs.github.com/en/copilot/how-tos/copilot-cli/install-copilot-cli))
|
|
157
|
+
- Access to AI models (e.g., OpenAI API key for Codex)
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
MIT
|
|
162
|
+
|
|
163
|
+
## Contributing
|
|
164
|
+
|
|
165
|
+
We welcome contributions from the community! Please see our [CONTRIBUTING.md](CONTRIBUTING.md) guide for detailed information on how to contribute to Katt.
|
|
166
|
+
|
|
167
|
+
Quick start:
|
|
168
|
+
1. Fork the repository
|
|
169
|
+
2. Create a feature branch
|
|
170
|
+
3. Make your changes
|
|
171
|
+
4. Run the verification process
|
|
172
|
+
5. Submit a pull request
|
|
173
|
+
|
|
174
|
+
For detailed guidelines, development setup, coding standards, and more, check out our [contribution guide](CONTRIBUTING.md).
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { AsyncLocalStorage as E } from "node:async_hooks";
|
|
3
|
+
import { CopilotClient as et } from "@github/copilot-sdk";
|
|
4
|
+
import { readFile as M, readdir as nt } from "node:fs/promises";
|
|
5
|
+
import { resolve as x, dirname as T, isAbsolute as ot, basename as it, join as st } from "node:path";
|
|
6
|
+
import { readFileSync as Y, writeFileSync as Z, mkdirSync as rt } from "node:fs";
|
|
7
|
+
import { fileURLToPath as ct, pathToFileURL as at } from "node:url";
|
|
8
|
+
// Async-local storage whose store is the per-scope context object read by u().
const D = new E();

// Fallback context handed out by u() when no async-local store is active.
const lt = {
  describeStack: [],
  itStack: [],
  tokenUsageStack: [],
  modelStack: [],
};

// Running counters behind the ids produced by ut() ("d<n>") and dt() ("i<n>").
let U = 0;
let W = 0;

// w: promises queued through $() and drained by yt().
const w = [];
// A: failure records appended by At().
const A = [];

// Counter incremented by mt(), read by St() and reset by wt().
let J = 0;
|
|
17
|
+
/**
 * Returns the context stored for the current async scope, or the
 * module-level fallback context when no store is active.
 */
function u() {
  const active = D.getStore();
  return active ?? lt;
}
|
|
20
|
+
/**
 * Creates a copy of a context object in which each of the four stacks is a
 * new array (the stack entries themselves are shared, not cloned).
 *
 * @param {object} t - Context with describeStack, itStack, tokenUsageStack and modelStack arrays.
 * @returns {object} A new context with copied stacks.
 */
function H(t) {
  const { describeStack, itStack, tokenUsageStack, modelStack } = t;
  return {
    describeStack: Array.from(describeStack),
    itStack: Array.from(itStack),
    tokenUsageStack: Array.from(tokenUsageStack),
    modelStack: Array.from(modelStack),
  };
}
|
|
28
|
+
/** Advances the describe counter and returns the next id in the form "d<n>". */
function ut() {
  U += 1;
  return `d${U}`;
}

/** Advances the test counter and returns the next id in the form "i<n>". */
function dt() {
  W += 1;
  return `i${W}`;
}
|
|
34
|
+
/**
 * Runs a callback with the given context installed as the async-local store.
 *
 * @param {Function} t - Callback to execute.
 * @param {object} [e] - Context to install; when null/undefined a copy of the current context is used.
 * @returns {*} Whatever the callback returns.
 */
function X(t, e) {
  let context = e;
  if (context === undefined || context === null) {
    context = H(u());
  }
  return D.run(context, t);
}
|
|
38
|
+
/** Returns a copy (via H) of the context that is current at call time. */
function z() {
  const current = u();
  return H(current);
}
|
|
41
|
+
/**
 * Pushes a describe entry with a fresh id onto the current context.
 * @param {string} t - Suite description.
 */
function gt(t) {
  const { describeStack } = u();
  const id = ut();
  describeStack.push({ id, description: t });
}

/** Removes the most recently pushed describe entry from the current context. */
function b() {
  const { describeStack } = u();
  describeStack.pop();
}

/** Returns the descriptions of the active describe entries joined with " > ". */
function P() {
  const labels = [];
  for (const entry of u().describeStack) {
    labels.push(entry.description);
  }
  return labels.join(" > ");
}
|
|
50
|
+
/**
 * Opens a test scope on the current context: pushes the test entry plus a
 * zeroed token counter and an empty model slot.
 * @param {string} t - Test description.
 */
function ft(t) {
  const context = u();
  context.itStack.push({ id: dt(), description: t });
  context.tokenUsageStack.push(0);
  context.modelStack.push(undefined);
}

/** Closes the innermost test scope by popping all three per-test stacks. */
function v() {
  const context = u();
  context.itStack.pop();
  context.tokenUsageStack.pop();
  context.modelStack.pop();
}

/** Returns the descriptions of the active test entries joined with " > ". */
function V() {
  const labels = [];
  for (const entry of u().itStack) {
    labels.push(entry.description);
  }
  return labels.join(" > ");
}
|
|
59
|
+
/**
 * Adds a token count to the innermost test's counter. Non-finite or
 * non-positive amounts are ignored, as is any call made outside a test scope.
 * @param {number} t - Tokens to add.
 */
function pt(t) {
  const isUsable = Number.isFinite(t) && t > 0;
  if (!isUsable) {
    return;
  }
  const { tokenUsageStack } = u();
  const top = tokenUsageStack.length - 1;
  if (top >= 0) {
    tokenUsageStack[top] += t;
  }
}

/** Returns the token count of the innermost test, or 0 when no test scope is open. */
function ht() {
  const { tokenUsageStack } = u();
  const top = tokenUsageStack.length - 1;
  if (top < 0) {
    return 0;
  }
  return tokenUsageStack[top] ?? 0;
}
|
|
69
|
+
/**
 * Stores the model name for the innermost test. Empty names are ignored, as
 * is any call made outside a test scope.
 * @param {string} t - Model name.
 */
function It(t) {
  if (t.length === 0) {
    return;
  }
  const { modelStack } = u();
  const top = modelStack.length - 1;
  if (top >= 0) {
    modelStack[top] = t;
  }
}

/** Returns the model stored for the innermost test, or undefined when no test scope is open. */
function Ct() {
  const { modelStack } = u();
  const top = modelStack.length - 1;
  return top < 0 ? undefined : modelStack[top];
}
|
|
80
|
+
/**
 * Queues a promise so that yt() waits for it before the run is summarised.
 * @param {Promise<unknown>} t - Pending work to track.
 */
function $(t) {
  const pending = w;
  pending.push(t);
}
|
|
83
|
+
/** Increments the counter J by one. */
function mt() {
  J = J + 1;
}

/** Returns the current value of the counter J. */
function St() {
  const total = J;
  return total;
}

/** Resets the counter J to zero. */
function wt() {
  J = 0;
}
|
|
92
|
+
/**
 * Appends a failure record to the failure list.
 * @param {object} t - Failure record.
 */
function At(t) {
  const failures = A;
  failures.push(t);
}

/** Returns a shallow copy of the failure list. */
function $t() {
  return Array.from(A);
}

/** Returns how many failures have been recorded so far. */
function B() {
  const { length } = A;
  return length;
}

/** Empties the failure list in place. */
function kt() {
  A.splice(0, A.length);
}
|
|
104
|
+
/**
 * Waits for every queued promise to settle, repeating until the queue stays
 * empty so that work queued while waiting is included as well.
 *
 * @returns {Promise<PromiseSettledResult<unknown>[]>} Settled results in drain order.
 */
async function yt() {
  const outcomes = [];
  while (w.length > 0) {
    // Take the whole queue at once; new entries may arrive during the await.
    const batch = w.splice(0, w.length);
    const settled = await Promise.allSettled(batch);
    outcomes.push(...settled);
  }
  return outcomes;
}
|
|
112
|
+
/**
 * Registers a suite: runs the callback inside a copied context with the
 * description pushed on the describe stack.
 *
 * When the callback returns a thenable, the stack entry is popped once it
 * settles and the resulting promise is queued through $(). Otherwise the
 * entry is popped right away; a synchronous throw is re-thrown after popping.
 *
 * @param {string} t - Suite description.
 * @param {Function} e - Suite body.
 */
function bt(t, e) {
  const runSuite = () => {
    gt(t);
    try {
      const outcome = e();
      if (outcome && typeof outcome.then == "function") {
        const settled = outcome.finally(() => {
          b();
        });
        $(settled);
        return;
      }
    } catch (error) {
      b();
      throw error;
    }
    b();
  };
  X(runSuite, z());
}
|
|
131
|
+
// ANSI escape sequences used to colour terminal output; k resets styling.
const vt = "\x1B[1;36m";
const Lt = "\x1B[33m";
const jt = "\x1B[38;5;208m";
const Ft = "\x1B[1;38;5;208m";
const k = "\x1B[0m";

/** Wraps text in the given escape sequence followed by the reset sequence. */
const paint = (code, text) => `${code}${text}${k}`;

/** Colours text with the vt sequence. */
function f(t) {
  return paint(vt, t);
}

/** Colours text with the Lt sequence. */
function m(t) {
  return paint(Lt, t);
}

/** Colours text with the jt sequence. */
function G(t) {
  return paint(jt, t);
}

/** Colours text with the Ft sequence. */
function xt(t) {
  return paint(Ft, t);
}
|
|
144
|
+
// Label of the suite whose header Jt() printed most recently.
let F = "";

/** Clears the remembered suite label so the next report prints its suite header again. */
function Tt() {
  const blank = "";
  F = blank;
}
|
|
148
|
+
/**
 * Prints the report for one finished test. The suite header is printed only
 * when the suite differs from the one printed last.
 *
 * @param {object} report
 * @param {string} report.suitePath - Suite label; empty means "(root)".
 * @param {string} report.casePath - Test label; empty means "(root)".
 * @param {boolean} report.didPass - Whether the test passed.
 * @param {number} report.durationMs - Elapsed time in milliseconds.
 * @param {string} [report.model] - Model name; the line is omitted when falsy.
 * @param {number} [report.tokenUsage] - Token count; the line is omitted unless positive.
 */
function Jt({ suitePath, casePath, didPass, durationMs, model, tokenUsage }) {
  const suiteLabel = suitePath.length > 0 ? suitePath : "(root)";
  const caseLabel = casePath.length > 0 ? casePath : "(root)";
  if (F !== suiteLabel) {
    console.log(`Suite "${f(suiteLabel)}"`);
    F = suiteLabel;
  }
  const verdict = didPass ? "✅ Passed in" : "❌ Failed in";
  const lines = [
    `Test "${f(caseLabel)}"`,
    `- ${verdict} ${f(`${durationMs}ms`)}`,
  ];
  if (model) {
    lines.push(`- Model ${f(model)}`);
  }
  if ((tokenUsage ?? 0) > 0) {
    lines.push(`- Tokens used ${f(String(tokenUsage))}`);
  }
  lines.push("---");
  console.log(lines.join("\n"));
}
|
|
165
|
+
/**
 * Reports the outcome of the current test using the paths, model and token
 * usage taken from the current context.
 *
 * @param {boolean} t - Whether the test passed.
 * @param {number} e - Elapsed time in milliseconds.
 * @param {string} [n="(root)"] - Label used when no test scope is open.
 */
function I(t, e, n = "(root)") {
  const activeCase = V();
  const casePath = activeCase.length > 0 ? activeCase : n;
  const suitePath = P();
  Jt({
    suitePath,
    casePath,
    didPass: t,
    durationMs: e,
    model: Ct(),
    tokenUsage: ht(),
  });
}
|
|
176
|
+
// Async-local storage whose store carries `evalFile`, the path of the eval file being imported (set in oe(), read by Bt() and Pt()).
const N = new E();
|
|
177
|
+
/**
 * Tells whether a thrown value is an object whose `code` property equals the
 * expected code.
 *
 * @param {unknown} t - Caught value.
 * @param {string} e - Expected code, e.g. "ENOENT".
 * @returns {boolean}
 */
function Nt(t, e) {
  if (t === null || typeof t != "object") {
    return false;
  }
  if (!("code" in t)) {
    return false;
  }
  return t.code === e;
}
|
|
180
|
+
/**
 * Parses the text of katt.json.
 *
 * @param {string} t - Raw file content.
 * @returns {object|undefined} The parsed value when it is a non-null object
 *   (arrays included); undefined otherwise. Invalid JSON is reported with
 *   console.warn and also yields undefined.
 */
function Rt(t) {
  let parsed;
  try {
    parsed = JSON.parse(t);
  } catch (error) {
    console.warn(`Failed to parse katt.json: ${String(error)}`);
    return undefined;
  }
  if (parsed === null || typeof parsed != "object") {
    return undefined;
  }
  return parsed;
}
|
|
189
|
+
/**
 * Extracts the `copilot` section from a parsed config.
 *
 * @param {object|undefined} t - Parsed katt.json content.
 * @returns {object|undefined} A copy of the section with `model` removed
 *   unless it is a non-empty string; undefined when the section is missing,
 *   is not a plain (non-array) object, or ends up with no keys.
 */
function Zt(t) {
  const section = t?.copilot;
  if (section === null || typeof section != "object" || Array.isArray(section)) {
    return undefined;
  }
  const settings = { ...section };
  const hasUsableModel = typeof settings.model == "string" && settings.model.length > 0;
  if (!hasUsableModel) {
    delete settings.model;
  }
  if (Object.keys(settings).length === 0) {
    return undefined;
  }
  return settings;
}
|
|
198
|
+
/**
 * Loads the `copilot` settings from katt.json in the current working
 * directory.
 *
 * @returns {Promise<object|undefined>} The settings, or undefined when the
 *   file is absent (ENOENT, silently) or cannot be read (reported with
 *   console.warn).
 */
async function Ut() {
  const configPath = x(process.cwd(), "katt.json");
  try {
    const raw = await M(configPath, "utf8");
    const parsed = Rt(raw);
    return Zt(parsed);
  } catch (error) {
    const isMissing = Nt(error, "ENOENT");
    if (!isMissing) {
      console.warn(`Failed to read katt.json: ${String(error)}`);
    }
    return undefined;
  }
}
|
|
210
|
+
/**
 * Returns the value when it is a non-empty string, otherwise undefined.
 * @param {unknown} t - Candidate model name.
 * @returns {string|undefined}
 */
function K(t) {
  const isNonEmptyString = typeof t == "string" && t.length > 0;
  return isNonEmptyString ? t : undefined;
}

/**
 * Normalises an options object without mutating it: a `model` that is not a
 * non-empty string is dropped from the copy.
 *
 * @param {object|undefined} t - Options to normalise.
 * @returns {object|undefined} The cleaned copy, or undefined when the input
 *   is falsy or the copy has no keys left.
 */
function L(t) {
  if (!t) {
    return undefined;
  }
  const options = { ...t };
  if (options.model !== undefined) {
    const model = K(options.model);
    if (model) {
      options.model = model;
    } else {
      delete options.model;
    }
  }
  if (Object.keys(options).length === 0) {
    return undefined;
  }
  return options;
}
|
|
223
|
+
/**
 * Converts a token count to a non-negative integer: values that are not
 * finite numbers, or are zero or negative, become 0; others are floored.
 * @param {unknown} t - Raw token count.
 * @returns {number}
 */
function S(t) {
  if (!Number.isFinite(t) || t <= 0) {
    return 0;
  }
  return Math.floor(t);
}

/**
 * Sums the input, output, cache-read and cache-write token counts of a
 * usage record after normalising each with S().
 * @param {object} t - Usage record.
 * @returns {number}
 */
function Wt(t) {
  const counts = [t.inputTokens, t.outputTokens, t.cacheReadTokens, t.cacheWriteTokens];
  return counts.reduce((total, count) => total + S(count), 0);
}
|
|
229
|
+
/**
 * Sends a prompt to Copilot and returns the text of the reply.
 *
 * Session options are the katt.json `copilot` settings overridden by the
 * per-call options. The token usage reported by "assistant.usage" events is
 * added to the current test's counter, and the model (when one is set) is
 * recorded for the current test after a successful reply.
 *
 * Cleanup (unsubscribe, session destroy, client stop) always runs. Cleanup
 * problems never replace the prompt's own result or error; they are written
 * to console.error, one line per error, so they can be diagnosed.
 *
 * @param {string} t - Prompt text.
 * @param {object} [e={}] - Session options, e.g. `{ model: "..." }`.
 * @returns {Promise<string>} The reply content.
 * @throws {Error} When Copilot returns no content, or when the client or session fails.
 */
async function y(t, e = {}) {
  const configDefaults = L(await Ut());
  const callOptions = L(e);
  const sessionOptions = L({
    ...(configDefaults ?? {}),
    ...(callOptions ?? {}),
  });
  const model = K(sessionOptions?.model);
  const client = new et({ useLoggedInUser: true });
  let session;
  let unsubscribeUsage;
  let tokensUsed = 0;
  try {
    await client.start();
    session = await client.createSession(sessionOptions);
    unsubscribeUsage = session.on("assistant.usage", (event) => {
      tokensUsed += Wt(event.data);
    });
    const reply = await session.sendAndWait({ prompt: t });
    if (!reply?.data?.content) {
      throw new Error("Copilot did not return a response.");
    }
    if (model) {
      It(model);
    }
    return reply.data.content;
  } finally {
    const cleanupErrors = [];
    unsubscribeUsage?.();
    if (tokensUsed > 0) {
      pt(tokensUsed);
    }
    if (session) {
      try {
        await session.destroy();
      } catch (error) {
        cleanupErrors.push(error);
      }
    }
    try {
      const stopErrors = await client.stop();
      // Only spread an actual list; any other result is not an error report.
      if (Array.isArray(stopErrors)) {
        cleanupErrors.push(...stopErrors);
      }
    } catch (error) {
      cleanupErrors.push(error);
    }
    if (cleanupErrors.length > 0) {
      console.error(
        `Copilot cleanup encountered ${cleanupErrors.length} error(s).`
      );
      for (const error of cleanupErrors) {
        console.error(`- ${String(error)}`);
      }
    }
  }
}
|
|
262
|
+
/**
 * Reads a prompt from a file and sends it through y().
 *
 * A relative path is resolved against the directory of the eval file that is
 * currently running, or against the working directory when none is.
 *
 * @param {string} t - Path of the prompt file.
 * @param {object} [e={}] - Options forwarded to y().
 * @returns {Promise<string>} The reply content.
 */
async function Bt(t, e = {}) {
  const evalFile = N.getStore()?.evalFile;
  const baseDir = evalFile ? T(evalFile) : process.cwd();
  const promptPath = ot(t) ? t : x(baseDir, t);
  const promptText = await M(promptPath, "utf8");
  return y(promptText, e);
}
|
|
266
|
+
/**
 * Records a failure for the current suite/test paths.
 * @param {string} t - Failure message.
 */
function p(t) {
  const describePath = P();
  const itPath = V();
  At({ describePath, itPath, message: t });
}
|
|
273
|
+
/**
 * Asks the model whether an input satisfies an expectation and records the
 * outcome: a "Yes" verdict reports a pass, a "No" verdict records a mismatch
 * failure, anything else records an evaluation failure.
 *
 * The verdict is taken from the first word of the reply, case-insensitively,
 * so "yes" counts as a pass and "No, ... Yes ..." counts as a failure. When
 * the reply does not start with either word, it falls back to looking for
 * "Yes" / "No" anywhere in the text.
 *
 * @param {string} t - Input to evaluate.
 * @param {string} e - Expectation described in natural language.
 * @returns {Promise<void>} Settles once the evaluation is recorded; the same promise is queued through $().
 */
async function Gt(t, e) {
  const startedAt = Date.now();
  const mismatchMessage = `expected '${t}' to satisfy '${e}'`;
  const evaluation = y(`Evaluate if the expectation is fulfilled by the input.
Expectation: "${e}".
Input:
---
${t}
---
Important: Answer with "Yes" or "No" only, without any additional text.
`).then((answer) => {
    const leading = /^\W*(yes|no)\b/i.exec(answer)?.[1]?.toLowerCase();
    let verdict = leading;
    if (verdict === undefined) {
      // No leading verdict: keep the permissive substring check as a fallback.
      if (answer.includes("Yes")) {
        verdict = "yes";
      } else if (answer.includes("No")) {
        verdict = "no";
      }
    }
    if (verdict === "yes") {
      I(true, Date.now() - startedAt, "promptCheck");
    } else if (verdict === "no") {
      p(mismatchMessage);
    } else {
      p(`failed to evaluate expectation '${e}'`);
    }
  });
  $(evaluation);
  return evaluation;
}
|
|
286
|
+
// Default minimum score used by Mt() when the caller supplies no `threshold` option.
const Ot = 3;
|
|
287
|
+
/**
 * Extracts the first standalone digit from 1 to 5 found in a reply.
 * @param {string} t - Evaluator reply.
 * @returns {number|null} The score, or null when no such digit is present.
 */
function Et(t) {
  const found = t.match(/\b([1-5])\b/);
  if (!found) {
    return null;
  }
  const digit = found[1];
  return digit ? Number(digit) : null;
}
|
|
291
|
+
/**
 * Asks the model to grade an input against a label on a 1-5 scale and
 * records the outcome: a score at or above the threshold reports a pass, a
 * lower score or an unparseable reply records a failure.
 *
 * @param {string} t - Input to classify.
 * @param {string} e - Target label.
 * @param {object} [n] - Options: `threshold` (defaults to Ot) and `model` (forwarded to y()).
 * @returns {Promise<void>} Settles once the outcome is recorded; the same promise is queued through $().
 */
async function Mt(t, e, n) {
  const startedAt = Date.now();
  const threshold = n?.threshold ?? Ot;
  const promptOptions = n?.model ? { model: n.model } : undefined;
  const instructions = `Classify the input by how "${e}" it is on a scale of 1 to 5.
1 means "not ${e}" and 5 means "very ${e}".
Return only a single number: 1, 2, 3, 4, or 5.

Input:
---
${t}
---`;
  const grading = y(instructions, promptOptions).then((reply) => {
    const score = Et(reply);
    if (score === null) {
      p(`failed to classify as '${e}'. Evaluator returned '${reply}'`);
      return;
    }
    if (score < threshold) {
      p(`expected response to be classified as '${e}' with score >= ${threshold}, got ${score}`);
      return;
    }
    I(true, Date.now() - startedAt, "toBeClassifiedAs");
  });
  $(grading);
  return grading;
}
|
|
323
|
+
/**
 * Records a failure when the value does not include the expected part.
 * @param {string} t - Value under test.
 * @param {string} e - Part that must be included.
 */
function Yt(t, e) {
  const failureMessage = `expected '${t}' to include '${e}'`;
  if (!t.includes(e)) {
    p(failureMessage);
  }
}
|
|
327
|
+
// Whether existing snapshots should be overwritten instead of compared.
let _ = false;

/**
 * Sets the snapshot-update flag.
 * @param {boolean} t - New flag value.
 */
function Dt(t) {
  _ = t;
}

/** Returns the snapshot-update flag. */
function Ht() {
  const flag = _;
  return flag;
}
|
|
334
|
+
/**
 * Derives the snapshot path for an eval file:
 * `<dir>/__snapshots__/<name>.snap.md`, where `<name>` is the file name
 * without its trailing `.eval.<ext>` suffix.
 *
 * @param {string} t - Path of the eval file.
 * @returns {string} Path of the snapshot file.
 */
function Xt(t) {
  const fileName = it(t);
  const stem = fileName.replace(/\.eval\.[^./\\]+$/, "");
  const directory = T(t);
  return st(directory, "__snapshots__", `${stem}.snap.md`);
}
|
|
342
|
+
/**
 * Splits text into lines on "\n" or "\r\n".
 * @param {string} t - Text to split.
 * @returns {string[]}
 */
function O(t) {
  const lineBreak = /\r?\n/;
  return t.split(lineBreak);
}

/**
 * Builds a position-by-position line diff of two texts. Lines that differ at
 * the same index are listed as "- <old>" followed by "+ <new>"; lines present
 * on only one side get just their own marker.
 *
 * @param {string} t - Previous text.
 * @param {string} e - New text.
 * @returns {string} The diff lines joined with "\n", or a "(no diff)" marker for identical texts.
 */
function zt(t, e) {
  if (t === e) {
    return " (no diff)";
  }
  const before = O(t);
  const after = O(e);
  const rowCount = Math.max(before.length, after.length);
  const rows = [];
  for (let index = 0; index < rowCount; index += 1) {
    const removed = before[index];
    const added = after[index];
    if (removed === added) {
      continue;
    }
    // The two differ, so at most one of them can be missing.
    if (removed !== undefined) {
      rows.push(`- ${removed}`);
    }
    if (added !== undefined) {
      rows.push(`+ ${added}`);
    }
  }
  return rows.join("\n");
}
|
|
366
|
+
function Pt(t) {
|
|
367
|
+
const e = N.getStore()?.evalFile;
|
|
368
|
+
if (!e) {
|
|
369
|
+
p(
|
|
370
|
+
"toMatchSnapshot can only be used while running an eval file."
|
|
371
|
+
);
|
|
372
|
+
return;
|
|
373
|
+
}
|
|
374
|
+
const n = Xt(e);
|
|
375
|
+
try {
|
|
376
|
+
const o = Y(n, "utf8");
|
|
377
|
+
if (o === t)
|
|
378
|
+
return;
|
|
379
|
+
if (Ht()) {
|
|
380
|
+
Z(n, t, "utf8");
|
|
381
|
+
return;
|
|
382
|
+
}
|
|
383
|
+
const s = zt(o, t);
|
|
384
|
+
p(
|
|
385
|
+
[
|
|
386
|
+
`Snapshot mismatch at ${n}`,
|
|
387
|
+
"",
|
|
388
|
+
"Diff:",
|
|
389
|
+
s,
|
|
390
|
+
"",
|
|
391
|
+
"Run katt with --update-snapshots (or -u) to accept this change."
|
|
392
|
+
].join(`
|
|
393
|
+
`)
|
|
394
|
+
);
|
|
395
|
+
} catch (o) {
|
|
396
|
+
if (o.code !== "ENOENT") {
|
|
397
|
+
p(
|
|
398
|
+
`Failed to read snapshot at ${n}: ${String(o)}`
|
|
399
|
+
);
|
|
400
|
+
return;
|
|
401
|
+
}
|
|
402
|
+
try {
|
|
403
|
+
rt(T(n), { recursive: !0 }), Z(n, t, "utf8");
|
|
404
|
+
} catch (i) {
|
|
405
|
+
p(
|
|
406
|
+
`Failed to write snapshot at ${n}: ${String(i)}`
|
|
407
|
+
);
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
/**
 * Creates the matcher object for a value.
 *
 * @param {string} t - Value under test.
 * @returns {object} Matchers: `toContain` and `toMatchSnapshot` run
 *   synchronously; `promptCheck` and `toBeClassifiedAs` return promises.
 */
function Vt(t) {
  const toContain = (expected) => {
    Yt(t, expected);
  };
  const toMatchSnapshot = () => {
    Pt(t);
  };
  const promptCheck = async (expectation) => {
    await Gt(t, expectation);
  };
  const toBeClassifiedAs = async (label, options) => {
    await Mt(t, label, options);
  };
  return { toContain, toMatchSnapshot, promptCheck, toBeClassifiedAs };
}
|
|
427
|
+
/**
 * Registers and runs a test inside a copied context.
 *
 * Synchronous bodies are reported as passed when no failure was recorded
 * while they ran. Asynchronous bodies are reported once their promise
 * settles: a rejection is a failure, and a fulfilment is a pass only if no
 * failure was recorded for this test's own suite/test path since it started.
 * Failures are matched by path because other tests may be running
 * concurrently while this one awaits.
 *
 * @param {string} t - Test description.
 * @param {Function} e - Test body; may return a promise.
 */
function Kt(t, e) {
  X(() => {
    mt();
    ft(t);
    const failuresAtStart = B();
    const startedAt = Date.now();
    const elapsed = () => Date.now() - startedAt;
    // Captured while the test scope is open, so they identify this test.
    const suitePath = P();
    const casePath = V();
    const hasNoNewFailures = () => B() === failuresAtStart;
    const hasNoOwnFailures = () =>
      !$t()
        .slice(failuresAtStart)
        .some(
          (failure) =>
            failure.describePath === suitePath && failure.itPath === casePath
        );
    try {
      const outcome = e();
      if (outcome && typeof outcome.then === "function") {
        $(
          outcome
            .then(() => {
              I(hasNoOwnFailures(), elapsed());
            })
            .catch((error) => {
              I(false, elapsed());
              throw error;
            })
            .finally(() => {
              v();
            })
        );
        return;
      }
    } catch (error) {
      I(false, elapsed());
      v();
      throw error;
    }
    I(hasNoNewFailures(), elapsed());
    v();
  }, z());
}
|
|
451
|
+
// _t matches file names ending in ".eval.js" or ".eval.ts"; Qt holds the directory names that Q() does not descend into.
const _t = /\.eval\.(js|ts)$/, Qt = /* @__PURE__ */ new Set([".git", "node_modules"]);
|
|
452
|
+
/**
 * Recursively collects the eval files below a directory. Entries of one
 * directory are processed concurrently, so the order of the result is not
 * guaranteed.
 *
 * @param {string} t - Directory to scan.
 * @returns {Promise<string[]>} Paths of files whose names match _t,
 *   excluding anything below the directories listed in Qt.
 */
async function Q(t) {
  const entries = await nt(t, { withFileTypes: true });
  const matches = [];
  const visit = async (entry) => {
    const fullPath = x(t, entry.name);
    if (!entry.isDirectory()) {
      if (entry.isFile() && _t.test(entry.name)) {
        matches.push(fullPath);
      }
      return;
    }
    if (Qt.has(entry.name)) {
      return;
    }
    matches.push(...await Q(fullPath));
  };
  await Promise.all(entries.map((entry) => visit(entry)));
  return matches;
}
|
|
467
|
+
const j = new URL("data:application/json;base64,ewogICJuYW1lIjogImthdHQiLAogICJ2ZXJzaW9uIjogIjAuMS4wIiwKICAiZGVzY3JpcHRpb24iOiAiQ0xJIHRvb2wgdGhhdCB0ZXN0cyB0aGUgb3V0cHV0IG9mIGFnZW50aWMgQUkgdG9vbHMiLAogICJrZXl3b3JkcyI6IFsKICAgICJjbGkiLAogICAgImFpIiwKICAgICJhZ2VudGljLWFpIiwKICAgICJ0ZXN0aW5nIiwKICAgICJldmFsdWF0aW9uIgogIF0sCiAgImF1dGhvciI6ICJSYXBoYWVsIFBvcnRvIChodHRwczovL2dpdGh1Yi5jb20vcmFwaGFlbHBvcikiLAogICJsaWNlbnNlIjogIk1JVCIsCiAgInByaXZhdGUiOiB0cnVlLAogICJ0eXBlIjogIm1vZHVsZSIsCiAgIm1haW4iOiAiZGlzdC9pbmRleC5qcyIsCiAgImJpbiI6IHsKICAgICJrYXR0IjogImRpc3QvaW5kZXguanMiCiAgfSwKICAic2NyaXB0cyI6IHsKICAgICJidWlsZCI6ICJ2aXRlIGJ1aWxkIiwKICAgICJkZXYiOiAidHN4IHNyYy9pbmRleC50cyIsCiAgICAibGludCI6ICJiaW9tZSBsaW50IC4vc3JjIiwKICAgICJmb3JtYXQiOiAiYmlvbWUgZm9ybWF0IC0td3JpdGUgLi9zcmMiLAogICAgInRlc3QiOiAidml0ZXN0IiwKICAgICJ0eXBlY2hlY2siOiAidHNjIC1wIHRzY29uZmlnLmpzb24gLS1ub0VtaXQiLAogICAgInRlc3Q6YnVpbGQiOiAibm9kZSAuL2Rpc3QvaW5kZXguanMiCiAgfSwKICAidHlwZXMiOiAiZGlzdC9pbmRleC5kLnRzIiwKICAiZGV2RGVwZW5kZW5jaWVzIjogewogICAgIkBiaW9tZWpzL2Jpb21lIjogIjEuOS40IiwKICAgICJAdHlwZXMvbm9kZSI6ICIyNS4yLjAiLAogICAgInRzeCI6ICI0LjIxLjAiLAogICAgInR5cGVzY3JpcHQiOiAiNS44LjIiLAogICAgInZpdGUiOiAiNy4zLjEiLAogICAgInZpdGUtcGx1Z2luLWR0cyI6ICI0LjUuNCIsCiAgICAidml0ZXN0IjogIjMuMi40IiwKICAgICJ2c2NvZGUtanNvbnJwYyI6ICJeOC4yLjEiCiAgfSwKICAiZGVwZW5kZW5jaWVzIjogewogICAgIkBnaXRodWIvY29waWxvdC1zZGsiOiAiXjAuMS4yMSIKICB9LAogICJidWdzIjogewogICAgInVybCI6ICJodHRwczovL2dpdGh1Yi5jb20vcmFwaGFlbHBvci9rYXR0L2lzc3VlcyIKICB9LAogICJob21lcGFnZSI6ICJodHRwczovL2dpdGh1Yi5jb20vcmFwaGFlbHBvci9rYXR0Igp9Cg==", import.meta.url);
|
|
468
|
+
// Cached result of qt(); undefined until the first call.
let C;

/**
 * Returns the `version` field of the package manifest referenced by `j`,
 * computing it on the first call and returning the cached value afterwards.
 * A `data:` URL is decoded with te(); any other URL is read from disk.
 *
 * @returns {string} The version, or "unknown" when it is missing, not a
 *   string, or the manifest cannot be read or parsed.
 */
function qt() {
  if (C === undefined) {
    try {
      const raw = j.protocol === "data:" ? te(j) : Y(ct(j), "utf8");
      const manifest = JSON.parse(raw);
      C = typeof manifest.version == "string" ? manifest.version : "unknown";
    } catch {
      C = "unknown";
    }
  }
  return C;
}
|
|
480
|
+
/**
 * Decodes the payload of a `data:` URL to text.
 *
 * @param {URL} t - The data URL.
 * @returns {string} The payload, base64-decoded as UTF-8 when the header
 *   contains ";base64", otherwise percent-decoded.
 * @throws {Error} When the URL has no comma separating header and payload.
 */
function te(t) {
  const { pathname } = t;
  const commaAt = pathname.indexOf(",");
  if (commaAt < 0) {
    throw new Error("Invalid data URL.");
  }
  const header = pathname.slice(0, commaAt);
  const payload = pathname.slice(commaAt + 1);
  if (header.includes(";base64")) {
    return Buffer.from(payload, "base64").toString("utf8");
  }
  return decodeURIComponent(payload);
}
|
|
487
|
+
/**
 * Prints the coloured "KATT" banner followed by the version, which is
 * centred under the first banner row. The output starts and ends with a
 * blank line.
 */
function ee() {
  // Each row is paired with the colour helper applied to it.
  const rows = [
    [m, " ██╗ ██╗ █████╗ ████████╗████████╗"],
    [m, " ██║ ██╔╝██╔══██╗╚══██╔══╝╚══██╔══╝"],
    [m, " █████╔╝ ███████║ ██║ ██║"],
    [G, " ██╔═██╗ ██╔══██║ ██║ ██║"],
    [G, " ██║ ██╗██║ ██║ ██║ ██║"],
    [xt, " ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝"],
  ];
  const versionLabel = `v${qt()}`;
  const bannerWidth = rows[0][1].length;
  const leftPad = Math.max(
    0,
    Math.floor((bannerWidth - versionLabel.length) / 2)
  );
  const versionRow = `${" ".repeat(leftPad)}${versionLabel}`;
  const painted = rows.map(([paint, text]) => paint(text));
  painted.push(m(versionRow));
  console.log(`\n${painted.join("\n")}\n`);
}
|
|
502
|
+
/**
 * Formats the local time of a date as "HH:MM:SS" with zero-padded fields.
 * @param {Date} t - Date to format.
 * @returns {string}
 */
function ne(t) {
  const twoDigits = (value) => String(value).padStart(2, "0");
  const fields = [t.getHours(), t.getMinutes(), t.getSeconds()];
  return fields.map((field) => twoDigits(field)).join(":");
}
|
|
506
|
+
/**
 * Runs the CLI: prints the banner, resets the run state, imports every eval
 * file found below the working directory, waits for queued asynchronous
 * work, then prints either the problems or a summary.
 *
 * Passing `--update-snapshots` or `-u` sets the snapshot-update flag.
 *
 * @returns {Promise<number>} 0 when everything passed; 1 when no eval file
 *   was found, an eval file failed to import, queued work rejected, or any
 *   failure was recorded.
 */
async function oe() {
  const args = process.argv.slice(2);
  const updateSnapshots = args.includes("--update-snapshots") || args.includes("-u");
  Dt(updateSnapshots);
  ee();

  const startedAt = /* @__PURE__ */ new Date();
  Tt();
  kt();
  wt();

  const evalFiles = await Q(process.cwd());
  if (evalFiles.length === 0) {
    console.log("No .eval.js or .eval.ts files found.");
    return 1;
  }

  // Import each file with its path as the async-local store so helpers can locate it.
  const importResults = await Promise.allSettled(
    evalFiles.map(
      (file) => N.run(
        { evalFile: file },
        () => import(at(file).href)
      )
    )
  );
  const failedImports = [];
  importResults.forEach((result, index) => {
    if (result.status === "rejected") {
      failedImports.push({ file: evalFiles[index], reason: result.reason });
    }
  });
  if (failedImports.length > 0) {
    for (const { file, reason } of failedImports) {
      console.error(`Error executing ${file}: ${String(reason)}`);
    }
    return 1;
  }

  const settledWork = await yt();
  const rejectedWork = settledWork.filter((entry) => entry.status === "rejected");
  if (rejectedWork.length > 0) {
    for (const entry of rejectedWork) {
      console.error(`Error executing async test: ${String(entry.reason)}`);
    }
    return 1;
  }

  const failures = $t();
  if (failures.length > 0) {
    console.error("❌ Failed tests:");
    failures.forEach((failure, index) => {
      const location = [failure.describePath, failure.itPath]
        .filter((part) => part.length > 0)
        .join(" > ");
      const prefix = location.length > 0 ? `${location}: ` : "";
      console.error(`${index + 1}. ${prefix}${failure.message}`);
    });
    return 1;
  }

  const evalCount = St();
  const elapsedMs = Date.now() - startedAt.getTime();
  const summary = [
    "---",
    `${f("Files")} ${evalFiles.length} passed`,
    `${f("Evals")} ${evalCount} passed`,
    `${f("Start at")} ${ne(startedAt)}`,
    `${f("Duration")} ${elapsedMs}ms`
  ];
  console.log(summary.join("\n"));
  return 0;
}
|
|
558
|
+
Object.assign(globalThis, { describe: bt, it: Kt, expect: Vt, prompt: y, promptFile: Bt });
|
|
559
|
+
oe().then((t) => {
|
|
560
|
+
process.exit(t);
|
|
561
|
+
}).catch((t) => {
|
|
562
|
+
console.error(`Unexpected error: ${String(t)}`), process.exit(1);
|
|
563
|
+
});
|
|
564
|
+
export {
|
|
565
|
+
oe as runCli
|
|
566
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "katt",
|
|
3
|
+
"version": "0.0.2",
|
|
4
|
+
"description": "CLI tool that tests the output of agentic AI tools",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"cli",
|
|
7
|
+
"ai",
|
|
8
|
+
"agentic-ai",
|
|
9
|
+
"testing",
|
|
10
|
+
"evaluation"
|
|
11
|
+
],
|
|
12
|
+
"author": "Raphael Porto (https://github.com/raphaelpor)",
|
|
13
|
+
"license": "MIT",
|
|
14
|
+
"type": "module",
|
|
15
|
+
"main": "dist/index.js",
|
|
16
|
+
"bin": {
|
|
17
|
+
"katt": "dist/index.js"
|
|
18
|
+
},
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "vite build",
|
|
21
|
+
"dev": "tsx src/index.ts",
|
|
22
|
+
"lint": "biome lint ./src",
|
|
23
|
+
"format": "biome format --write ./src",
|
|
24
|
+
"test": "vitest",
|
|
25
|
+
"typecheck": "tsc -p tsconfig.json --noEmit",
|
|
26
|
+
"test:build": "node ./dist/index.js"
|
|
27
|
+
},
|
|
28
|
+
"types": "dist/index.d.ts",
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@biomejs/biome": "1.9.4",
|
|
31
|
+
"@types/node": "25.2.0",
|
|
32
|
+
"tsx": "4.21.0",
|
|
33
|
+
"typescript": "5.8.2",
|
|
34
|
+
"vite": "7.3.1",
|
|
35
|
+
"vite-plugin-dts": "4.5.4",
|
|
36
|
+
"vitest": "3.2.4",
|
|
37
|
+
"vscode-jsonrpc": "^8.2.1"
|
|
38
|
+
},
|
|
39
|
+
"dependencies": {
|
|
40
|
+
"@github/copilot-sdk": "^0.1.21"
|
|
41
|
+
},
|
|
42
|
+
"bugs": {
|
|
43
|
+
"url": "https://github.com/raphaelpor/katt/issues"
|
|
44
|
+
},
|
|
45
|
+
"homepage": "https://github.com/raphaelpor/katt"
|
|
46
|
+
}
|