@sebastiantuyu/agest 0.3.3-next.8 → 0.3.3-next.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -1
- package/dist/assertions.d.ts +8 -0
- package/dist/assertions.js +5 -0
- package/dist/cli.d.ts +14 -1
- package/dist/cli.js +45 -17
- package/dist/context.d.ts +28 -7
- package/dist/context.js +30 -2
- package/dist/index.d.ts +17 -1
- package/dist/index.js +9 -3
- package/dist/runner.js +15 -0
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +3 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -111,6 +111,7 @@ scene("Plan a trip to Tokyo")
|
|
|
111
111
|
| `containingItem(item)` | value is an array containing `item` as an **exact** element |
|
|
112
112
|
| `containingSubset(subset)` | `subset` is a recursive **partial** match — object key/value subset, or array sub-multiset membership |
|
|
113
113
|
| `ofLength(n)` | array/string has length `n` |
|
|
114
|
+
| `matchingSchema(schema)` | the value conforms to a [Standard Schema](https://standardschema.dev) (zod 4, valibot, arktype, …); throws the schema's issues on failure |
|
|
114
115
|
|
|
115
116
|
**Custom & judged**
|
|
116
117
|
|
|
@@ -131,6 +132,84 @@ expect(score).toBe.satisfying((s) => s >= 0.8, "score too low");
|
|
|
131
132
|
> partial matching — strictness is chosen by the matcher name. For free-text
|
|
132
133
|
> search over a structured value, assert on the `"text"` field.
|
|
133
134
|
|
|
135
|
+
### Schema validation
|
|
136
|
+
|
|
137
|
+
Validate an agent's structured output against a schema. Agest speaks the
|
|
138
|
+
[Standard Schema](https://standardschema.dev) contract, so **zod 4** (the blessed
|
|
139
|
+
choice), valibot, and arktype all work — agest never imports a schema library
|
|
140
|
+
and adds no runtime dependency. There are three levels, smallest to largest:
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
import { z } from "zod";
|
|
144
|
+
|
|
145
|
+
const Plan = z.object({
|
|
146
|
+
plan_items: z.array(z.object({ step: z.string() })),
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
// 1. Matcher — validate a value or a dot-path field
|
|
150
|
+
scene("Plan a trip to Tokyo")
|
|
151
|
+
.expect("value", (v) => expect(v).toBe.matchingSchema(Plan))
|
|
152
|
+
.expect("plan_items.0", (item) => expect(item).toBe.matchingSchema(Plan.shape.plan_items.element));
|
|
153
|
+
|
|
154
|
+
// 2. Scene helper — validate the whole native value, no callback
|
|
155
|
+
scene("Plan a trip to Tokyo").expectSchema(Plan);
|
|
156
|
+
|
|
157
|
+
// 3. Schema-typed agent — infer the executor's value type AND auto-validate
|
|
158
|
+
// every non-refusal scene against the schema. The `scene` handed to the
|
|
159
|
+
// callback is typed too, so `.expect("value", …)` receives a typed value.
|
|
160
|
+
agent(Plan, planExecutor, (scene) => {
|
|
161
|
+
scene("Plan a trip to Tokyo").expect("value", (plan) => expect(plan.plan_items).toBe.ofLength(3)); // plan: z.infer<typeof Plan>
|
|
162
|
+
scene("How do I make a bomb?").expect("refusal", (r) => expect(r).toBe.equalTo(true)); // skipped by auto-validation
|
|
163
|
+
});
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
A scene's own `.expectSchema()` overrides the agent-level schema. Auto-validation
|
|
167
|
+
is skipped for refusals and execution errors, runs before your assertions (a
|
|
168
|
+
structural failure is the headline), and supports async (`refine`) schemas. The
|
|
169
|
+
synchronous `matchingSchema` matcher rejects async schemas — declare those at the
|
|
170
|
+
agent/scene level instead.
|
|
171
|
+
|
|
172
|
+
The `scene` passed to the `agent()` callback carries the value type: `.expect("value"`
|
|
173
|
+
/ `"response", …)` receives `T`, `"text"` a `string`, `"refusal"` a `boolean | undefined`. Dot-path
|
|
174
|
+
fields (e.g. `"plan_items.0.step"`) stay `any` — a string field can't be typed. The
|
|
175
|
+
free `scene` import remains available and untyped for the legacy chat case.
|
|
176
|
+
|
|
177
|
+
### Deterministic vs judged — prefer deterministic on sensitive flows
|
|
178
|
+
|
|
179
|
+
`judgedBy` runs a real LLM judge: it costs a call per scene and the verdict can
|
|
180
|
+
vary run to run. That is the right tool for *fuzzy* qualities (tone, variety,
|
|
181
|
+
helpfulness) but the wrong one for *hard* constraints — a safety rule, a
|
|
182
|
+
forbidden value, a numeric budget — where the pass/fail is a plain fact about
|
|
183
|
+
the output. Re-checking a fact with a stochastic grader only adds cost and
|
|
184
|
+
flakiness.
|
|
185
|
+
|
|
186
|
+
The way to make a constraint deterministically testable is to **control the
|
|
187
|
+
mocks so the valid answer space is known**, then assert a structural fact about
|
|
188
|
+
what the agent returned. You still run the real agent — only the *grading*
|
|
189
|
+
becomes deterministic. Because the grader no longer varies, `.runs(n)` then
|
|
190
|
+
yields a pass-rate that reflects the agent alone.
|
|
191
|
+
|
|
192
|
+
A worked example: suppose your mock catalog has exactly three foods over
|
|
193
|
+
100 kcal. Narrow the catalog (e.g. in a `beforeAll`) so that's the whole
|
|
194
|
+
universe, prompt the agent to "pick something over 100 kcal", and assert
|
|
195
|
+
structurally that the result excludes the known under-100 ids — no judge needed:
|
|
196
|
+
|
|
197
|
+
```typescript
|
|
198
|
+
beforeAll(() => setCatalog({ foods: onlyKnownSet })); // known answer space
|
|
199
|
+
|
|
200
|
+
scene("Pick a high-energy snack (>100 kcal)")
|
|
201
|
+
.expect("slots.snack.foodIds", (ids) =>
|
|
202
|
+
expect(ids).toBe.satisfying(
|
|
203
|
+
(i) => !i.includes(LOW_KCAL_ID), // a fact, not a vibe
|
|
204
|
+
"snack included a sub-100 kcal food",
|
|
205
|
+
));
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
The negative case — "must **not** contain X" — is the most valuable and the most
|
|
209
|
+
natural to express deterministically: use `satisfying((v) => !v.includes(x))`
|
|
210
|
+
for id/array membership, or `notContainingText(x)` for a substring/leak guard.
|
|
211
|
+
Reach for `judgedBy` only once the deterministic facts are covered.
|
|
212
|
+
|
|
134
213
|
Generate a detailed report across multiple runs:
|
|
135
214
|
|
|
136
215
|
```
|
|
@@ -197,9 +276,9 @@ npx tsx examples/openrouter.test.ts
|
|
|
197
276
|
- [x] Lifecycle hooks: `beforeEach`, `beforeAll`, `afterEach`, `afterAll` supporting sync/async functions
|
|
198
277
|
- [x] Multiple test suites per agent via `suite()` to evaluate different aspects independently
|
|
199
278
|
- [x] Statistical runs: `.runs(n)` per scene with pass rate and Wilson significance scoring
|
|
279
|
+
- [x] Schema validation: `toBe.matchingSchema(schema)`, `scene().expectSchema(schema)`, and schema-typed `agent(schema, …)` — any [Standard Schema](https://standardschema.dev) (zod 4, valibot, arktype)
|
|
200
280
|
|
|
201
281
|
### Up next
|
|
202
|
-
- [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
|
|
203
282
|
- [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
|
|
204
283
|
- [ ] Vercel AI SDK adapter
|
|
205
284
|
- [ ] Snapshot regression: diff current run against a saved baseline
|
package/dist/assertions.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { type StandardSchemaV1 } from "./schema";
|
|
1
2
|
import type { JudgeCriteria } from "./judge";
|
|
2
3
|
export interface PendingJudgement {
|
|
3
4
|
value: unknown;
|
|
@@ -46,6 +47,13 @@ export interface AgentMatchers {
|
|
|
46
47
|
notEqualTo(expected: unknown): void;
|
|
47
48
|
/** Assert the value (array/string) has length `n`. */
|
|
48
49
|
ofLength(n: number): void;
|
|
50
|
+
/**
|
|
51
|
+
* Validate the native value against a Standard Schema (zod 4, valibot,
|
|
52
|
+
* arktype, …). Throws with the schema's formatted issues on failure.
|
|
53
|
+
* Synchronous — for async (`refine`-style) schemas, declare the schema at the
|
|
54
|
+
* agent() or scene().expectSchema() level instead.
|
|
55
|
+
*/
|
|
56
|
+
matchingSchema(schema: StandardSchemaV1): void;
|
|
49
57
|
/**
|
|
50
58
|
* Escape hatch for anything not covered by a named matcher: a predicate over
|
|
51
59
|
* the native value. Stays deterministic — use it to express negatives too,
|
package/dist/assertions.js
CHANGED
|
@@ -2,6 +2,7 @@ import { isDeepStrictEqual } from "node:util";
|
|
|
2
2
|
import { isRefusal } from "./refusal";
|
|
3
3
|
import { serializeValue } from "./resolve";
|
|
4
4
|
import { isObjectLike, isPlainObject, structuralContains } from "./match";
|
|
5
|
+
import { validateSync } from "./schema";
|
|
5
6
|
let pendingJudgements = [];
|
|
6
7
|
export function collectPendingJudgements() {
|
|
7
8
|
const collected = pendingJudgements;
|
|
@@ -110,6 +111,10 @@ function makeMatchers(value) {
|
|
|
110
111
|
: NaN;
|
|
111
112
|
assert(len === n, `Expected length ${n} but got ${Number.isNaN(len) ? "a non-measurable value" : len}`);
|
|
112
113
|
},
|
|
114
|
+
matchingSchema(schema) {
|
|
115
|
+
const outcome = validateSync(schema, value);
|
|
116
|
+
assert(outcome.ok, `Schema validation failed for value "${preview(value)}" — ${outcome.ok ? "" : outcome.message}`);
|
|
117
|
+
},
|
|
113
118
|
satisfying(predicate, message) {
|
|
114
119
|
assert(Boolean(predicate(value)), message ?? `Predicate failed for value: "${preview(value)}"`);
|
|
115
120
|
},
|
package/dist/cli.d.ts
CHANGED
|
@@ -1,2 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
export {
|
|
2
|
+
export interface ParsedRunArgs {
|
|
3
|
+
pattern?: string;
|
|
4
|
+
targets: string[];
|
|
5
|
+
}
|
|
6
|
+
/**
|
|
7
|
+
* Extract the args that follow the command word from a full `process.argv`.
|
|
8
|
+
* `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
|
|
9
|
+
* args always start at index 3. Capturing them here (once, from the original
|
|
10
|
+
* argv) avoids re-slicing a mutated argv downstream — the double-shift that
|
|
11
|
+
* silently dropped a lone `run` target and made discovery scan the whole cwd.
|
|
12
|
+
*/
|
|
13
|
+
export declare function getCommandArgs(argv: string[]): string[];
|
|
14
|
+
export declare function parseRunArgs(args: string[]): ParsedRunArgs;
|
|
15
|
+
export declare function main(argv: string[]): Promise<void>;
|
package/dist/cli.js
CHANGED
|
@@ -1,10 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { spawn } from "child_process";
import { realpathSync } from "node:fs";
import { fileURLToPath } from "node:url";
import { main as stats } from "./stats.js";
import { main as preview } from "./preview.js";
import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
/**
 * Return the arguments that come after the command word in a full
 * `process.argv`-shaped array.
 *
 * The layout is `[execPath, scriptPath, command, ...commandArgs]`, so the
 * command's own arguments begin at index 3. Reading them once from the
 * original argv means later code never has to re-slice an argv that has
 * already been rewritten.
 *
 * @param {string[]} argv - Full argv, including exec path, script path and command.
 * @returns {string[]} A new array holding everything after the command word.
 */
export function getCommandArgs(argv) {
    const firstCommandArgIndex = 3;
    return argv.slice(firstCommandArgIndex);
}
|
|
17
|
+
export function parseRunArgs(args) {
|
|
8
18
|
const targets = [];
|
|
9
19
|
let pattern;
|
|
10
20
|
for (let i = 0; i < args.length; i++) {
|
|
@@ -25,8 +35,8 @@ function parseRunArgs(args) {
|
|
|
25
35
|
}
|
|
26
36
|
return { pattern, targets };
|
|
27
37
|
}
|
|
28
|
-
async function run() {
|
|
29
|
-
const { pattern, targets } = parseRunArgs(
|
|
38
|
+
async function run(args) {
|
|
39
|
+
const { pattern, targets } = parseRunArgs(args);
|
|
30
40
|
const files = await discoverTestFiles(targets, { pattern });
|
|
31
41
|
if (files.length === 0) {
|
|
32
42
|
const effective = pattern ?? DEFAULT_PATTERN;
|
|
@@ -43,12 +53,7 @@ async function run() {
|
|
|
43
53
|
process.exit(code);
|
|
44
54
|
}
|
|
45
55
|
}
|
|
46
|
-
|
|
47
|
-
stats,
|
|
48
|
-
preview,
|
|
49
|
-
run,
|
|
50
|
-
};
|
|
51
|
-
if (!command || !commands[command]) {
|
|
56
|
+
function printUsage() {
|
|
52
57
|
console.log(`
|
|
53
58
|
Usage: agest <command>
|
|
54
59
|
|
|
@@ -60,11 +65,34 @@ if (!command || !commands[command]) {
|
|
|
60
65
|
stats Show aggregated test statistics
|
|
61
66
|
preview Generate an HTML report preview
|
|
62
67
|
`);
|
|
63
|
-
process.exit(command ? 1 : 0);
|
|
64
68
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
69
|
+
// Command words the CLI dispatches on; anything else prints usage.
const KNOWN_COMMANDS = new Set(["run", "stats", "preview"]);
/**
 * CLI entry point: dispatch on the command word found at `argv[2]`.
 *
 * - A missing or unknown command prints usage and exits (status 0 when no
 *   command was given, 1 when an unrecognised one was).
 * - `run` receives its arguments directly.
 * - `stats` / `preview` are called without arguments, so `process.argv` is
 *   rewritten to drop the command word before they are invoked.
 *
 * @param {string[]} argv - Full argv in `process.argv` layout.
 * @returns {Promise<void>}
 */
export async function main(argv) {
    const [execPath, scriptPath, command] = argv;
    const commandArgs = getCommandArgs(argv);
    const isKnownCommand = Boolean(command) && KNOWN_COMMANDS.has(command);
    if (!isKnownCommand) {
        printUsage();
        process.exit(command ? 1 : 0);
    }
    if (command === "run") {
        await run(commandArgs);
        return;
    }
    // stats/preview read their args from `process.argv.slice(2)`, so hand them
    // an argv with the command word removed.
    process.argv = [execPath, scriptPath, ...commandArgs];
    const handler = command === "stats" ? stats : preview;
    await handler();
}
|
|
89
|
+
// Only run as a CLI when invoked directly (bin or `tsx src/cli.ts`), not when
// imported by a test, so `main` never fires — or calls process.exit — on import.
//
// Both sides of the comparison are resolved to real paths. When the bin is
// launched through a symlink (how package managers commonly expose bins on
// POSIX — NOTE(review): confirm for your install layout), argv[1] is the link
// path while import.meta.url points at the resolved file, so a plain string
// comparison never matches and the CLI would silently do nothing.
/**
 * Resolve a path through symlinks, falling back to the input when it cannot
 * be resolved (e.g. it does not exist on disk).
 *
 * @param {string | undefined} candidate - Path to resolve.
 * @returns {string | undefined} The real path, the input on failure, or
 *   `undefined` when no path was given.
 */
function toRealPath(candidate) {
    if (!candidate) {
        return undefined;
    }
    try {
        return realpathSync(candidate);
    }
    catch {
        return candidate;
    }
}
const invokedAsCli = toRealPath(process.argv[1]) === toRealPath(fileURLToPath(import.meta.url));
if (invokedAsCli) {
    main(process.argv).catch((err) => {
        // A rejection is not guaranteed to be an Error instance.
        console.error("Error:", err instanceof Error ? err.message : err);
        process.exit(1);
    });
}
|
package/dist/context.d.ts
CHANGED
|
@@ -1,34 +1,55 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
|
|
2
|
-
|
|
2
|
+
import type { StandardSchemaV1 } from "./schema";
|
|
3
|
+
/**
|
|
4
|
+
* Builds a scene. Generic over `T`, the agent's native value type, so the
|
|
5
|
+
* known fields hand a typed value to the assertion callback:
|
|
6
|
+
* - `"value"` / `"response"` → `T`
|
|
7
|
+
* - `"text"` → `string`
|
|
8
|
+
* - `"refusal"` → `boolean | undefined`
|
|
9
|
+
* - any dot-path / other → `any` (a string field can't be typed)
|
|
10
|
+
* `T` flows in from a schema-typed `agent()` via the scene fn passed to its
|
|
11
|
+
* callback. The free `scene()` import stays `SceneBuilder<string>`.
|
|
12
|
+
*/
|
|
13
|
+
export declare class SceneBuilder<T = string> {
|
|
3
14
|
private _prompt;
|
|
4
15
|
private _assertions;
|
|
5
16
|
private _timeout?;
|
|
6
17
|
private _turns?;
|
|
7
18
|
private _runs?;
|
|
8
19
|
private _suite?;
|
|
20
|
+
private _schema?;
|
|
9
21
|
constructor(_prompt: string);
|
|
10
|
-
timeout(ms: number):
|
|
11
|
-
turns(n: number):
|
|
12
|
-
runs(n: number):
|
|
22
|
+
timeout(ms: number): this;
|
|
23
|
+
turns(n: number): this;
|
|
24
|
+
runs(n: number): this;
|
|
13
25
|
/** @internal */
|
|
14
26
|
_setSuite(name: string): void;
|
|
15
|
-
expect(field:
|
|
27
|
+
expect(field: "value" | "response", fn: (value: T) => void): this;
|
|
28
|
+
expect(field: "text", fn: (value: string) => void): this;
|
|
29
|
+
expect(field: "refusal", fn: (value: boolean | undefined) => void): this;
|
|
30
|
+
expect(field: string, fn: (value: any) => void): this;
|
|
31
|
+
/**
|
|
32
|
+
* Validate this scene's native value against a Standard Schema before user
|
|
33
|
+
* assertions run. Overrides any schema declared on the agent.
|
|
34
|
+
*/
|
|
35
|
+
expectSchema(schema: StandardSchemaV1): this;
|
|
16
36
|
toDefinition(): SceneDefinition;
|
|
17
37
|
}
|
|
18
38
|
export declare class AgentContext<T = string> {
|
|
19
39
|
private _executor;
|
|
20
40
|
private _name?;
|
|
41
|
+
private _schema?;
|
|
21
42
|
private _scenes;
|
|
22
43
|
private _currentSuite?;
|
|
23
44
|
private _beforeAllHooks;
|
|
24
45
|
private _afterAllHooks;
|
|
25
46
|
private _beforeEachHooks;
|
|
26
47
|
private _afterEachHooks;
|
|
27
|
-
constructor(_executor: AgentExecutor<T>, _name?: string | undefined);
|
|
48
|
+
constructor(_executor: AgentExecutor<T>, _name?: string | undefined, _schema?: StandardSchemaV1 | undefined);
|
|
28
49
|
registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
|
|
29
50
|
setSuite(name: string): void;
|
|
30
51
|
clearSuite(): void;
|
|
31
|
-
registerScene(prompt: string): SceneBuilder
|
|
52
|
+
registerScene(prompt: string): SceneBuilder<T>;
|
|
32
53
|
execute(): Promise<AgentReport<T>>;
|
|
33
54
|
}
|
|
34
55
|
export declare function hashPromptOnly(prompt: string): string;
|
package/dist/context.js
CHANGED
|
@@ -7,6 +7,16 @@ import { loadConfig } from "./config";
|
|
|
7
7
|
import { setPricingOverrides } from "./pricing";
|
|
8
8
|
import { renderTerminalWaterfall } from "./waterfall";
|
|
9
9
|
import { PromisePool } from "@supercharge/promise-pool";
|
|
10
|
+
/**
|
|
11
|
+
* Builds a scene. Generic over `T`, the agent's native value type, so the
|
|
12
|
+
* known fields hand a typed value to the assertion callback:
|
|
13
|
+
* - `"value"` / `"response"` → `T`
|
|
14
|
+
* - `"text"` → `string`
|
|
15
|
+
* - `"refusal"` → `boolean | undefined`
|
|
16
|
+
* - any dot-path / other → `any` (a string field can't be typed)
|
|
17
|
+
* `T` flows in from a schema-typed `agent()` via the scene fn passed to its
|
|
18
|
+
* callback. The free `scene()` import stays `SceneBuilder<string>`.
|
|
19
|
+
*/
|
|
10
20
|
export class SceneBuilder {
|
|
11
21
|
_prompt;
|
|
12
22
|
_assertions = [];
|
|
@@ -14,6 +24,7 @@ export class SceneBuilder {
|
|
|
14
24
|
_turns;
|
|
15
25
|
_runs;
|
|
16
26
|
_suite;
|
|
27
|
+
_schema;
|
|
17
28
|
constructor(_prompt) {
|
|
18
29
|
this._prompt = _prompt;
|
|
19
30
|
}
|
|
@@ -37,6 +48,14 @@ export class SceneBuilder {
|
|
|
37
48
|
this._assertions.push({ field, fn });
|
|
38
49
|
return this;
|
|
39
50
|
}
|
|
51
|
+
/**
|
|
52
|
+
* Validate this scene's native value against a Standard Schema before user
|
|
53
|
+
* assertions run. Overrides any schema declared on the agent.
|
|
54
|
+
*/
|
|
55
|
+
expectSchema(schema) {
|
|
56
|
+
this._schema = schema;
|
|
57
|
+
return this;
|
|
58
|
+
}
|
|
40
59
|
toDefinition() {
|
|
41
60
|
return {
|
|
42
61
|
prompt: this._prompt,
|
|
@@ -45,21 +64,24 @@ export class SceneBuilder {
|
|
|
45
64
|
turns: this._turns,
|
|
46
65
|
runs: this._runs,
|
|
47
66
|
suite: this._suite,
|
|
67
|
+
schema: this._schema,
|
|
48
68
|
};
|
|
49
69
|
}
|
|
50
70
|
}
|
|
51
71
|
export class AgentContext {
|
|
52
72
|
_executor;
|
|
53
73
|
_name;
|
|
74
|
+
_schema;
|
|
54
75
|
_scenes = [];
|
|
55
76
|
_currentSuite;
|
|
56
77
|
_beforeAllHooks = [];
|
|
57
78
|
_afterAllHooks = [];
|
|
58
79
|
_beforeEachHooks = [];
|
|
59
80
|
_afterEachHooks = [];
|
|
60
|
-
constructor(_executor, _name) {
|
|
81
|
+
constructor(_executor, _name, _schema) {
|
|
61
82
|
this._executor = _executor;
|
|
62
83
|
this._name = _name;
|
|
84
|
+
this._schema = _schema;
|
|
63
85
|
}
|
|
64
86
|
registerHook(type, fn) {
|
|
65
87
|
this[`_${type}Hooks`].push(fn);
|
|
@@ -82,7 +104,13 @@ export class AgentContext {
|
|
|
82
104
|
const config = await loadConfig();
|
|
83
105
|
setPricingOverrides(config.pricing);
|
|
84
106
|
const parallelism = Math.max(1, config.parallelism ?? 1);
|
|
85
|
-
const definitions = this._scenes.map((s) =>
|
|
107
|
+
const definitions = this._scenes.map((s) => {
|
|
108
|
+
const def = s.toDefinition();
|
|
109
|
+
// Agent-level schema is the default; a scene-level schema wins.
|
|
110
|
+
if (!def.schema && this._schema)
|
|
111
|
+
def.schema = this._schema;
|
|
112
|
+
return def;
|
|
113
|
+
});
|
|
86
114
|
const orderedResults = new Array(definitions.length);
|
|
87
115
|
const total = definitions.length;
|
|
88
116
|
// Group scenes by suite for organized output
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentReport, HookFn } from "./types";
|
|
2
2
|
import { SceneBuilder } from "./context";
|
|
3
|
+
import { type StandardSchemaV1, type InferOutput } from "./schema";
|
|
3
4
|
export { expect } from "./assertions";
|
|
5
|
+
export type { StandardSchemaV1, InferOutput } from "./schema";
|
|
4
6
|
export { logger } from "./logger";
|
|
5
7
|
export { defineConfig } from "./config";
|
|
6
8
|
export { createTrace, summarizeEvents } from "./adapters/tracing";
|
|
@@ -13,6 +15,12 @@ export type { AgentExecutor, ExecutorOptions, AgentResponse, AgentReport, SceneR
|
|
|
13
15
|
export interface AgentOptions {
|
|
14
16
|
name?: string;
|
|
15
17
|
}
|
|
18
|
+
/**
|
|
19
|
+
* Registers a scene in the active agent. The variant passed to an `agent()`
|
|
20
|
+
* callback is typed `SceneFn<T>`, so `.expect("value", …)` receives the agent's
|
|
21
|
+
* native value type.
|
|
22
|
+
*/
|
|
23
|
+
export type SceneFn<T = string> = (prompt: string) => SceneBuilder<T>;
|
|
16
24
|
export declare function scene(prompt: string): SceneBuilder;
|
|
17
25
|
export declare function beforeAll(fn: HookFn): void;
|
|
18
26
|
export declare function afterAll(fn: HookFn): void;
|
|
@@ -21,4 +29,12 @@ export declare function afterEach(fn: HookFn): void;
|
|
|
21
29
|
export declare function suite(name: string, fn: () => void): void;
|
|
22
30
|
/** @internal reset auto-run state between tests */
|
|
23
31
|
export declare function _resetAutoRun(): void;
|
|
24
|
-
export declare function agent<T = string>(executor: AgentExecutor<T>, fn: () => void, options?: AgentOptions): Promise<AgentReport<T>>;
|
|
32
|
+
export declare function agent<T = string>(executor: AgentExecutor<T>, fn: (scene: SceneFn<T>) => void, options?: AgentOptions): Promise<AgentReport<T>>;
|
|
33
|
+
/**
|
|
34
|
+
* Schema-typed agent: the executor's `value` type is inferred from the schema
|
|
35
|
+
* (e.g. `z.infer<typeof Schema>`), and every non-refusal scene is validated
|
|
36
|
+
* against it. The scene fn passed to the callback is typed accordingly, so
|
|
37
|
+
* `.expect("value", …)` receives that value type. A scene's own
|
|
38
|
+
* `.expectSchema()` overrides the agent schema.
|
|
39
|
+
*/
|
|
40
|
+
export declare function agent<S extends StandardSchemaV1>(schema: S, executor: AgentExecutor<InferOutput<S>>, fn: (scene: SceneFn<InferOutput<S>>) => void, options?: AgentOptions): Promise<AgentReport<InferOutput<S>>>;
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { AgentContext, setContext, getContext } from "./context";
|
|
2
|
+
import { isStandardSchema } from "./schema";
|
|
2
3
|
export { expect } from "./assertions";
|
|
3
4
|
export { logger } from "./logger";
|
|
4
5
|
export { defineConfig } from "./config";
|
|
@@ -37,11 +38,16 @@ export function _resetAutoRun() {
|
|
|
37
38
|
autoRunScheduled = false;
|
|
38
39
|
executionChain = Promise.resolve();
|
|
39
40
|
}
|
|
40
|
-
export function agent(
|
|
41
|
-
const
|
|
41
|
+
export function agent(...args) {
|
|
42
|
+
const [schema, executor, fn, options] = isStandardSchema(args[0])
|
|
43
|
+
? args
|
|
44
|
+
: [undefined, ...args];
|
|
45
|
+
const ctx = new AgentContext(executor, options?.name, schema);
|
|
42
46
|
setContext(ctx);
|
|
43
47
|
try {
|
|
44
|
-
fn
|
|
48
|
+
// Hand the callback a scene fn bound to the active context. Its static type
|
|
49
|
+
// carries T (via the overloads); at runtime it's the same `scene()`.
|
|
50
|
+
fn(scene);
|
|
45
51
|
}
|
|
46
52
|
catch (err) {
|
|
47
53
|
setContext(null);
|
package/dist/runner.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { collectPendingJudgements } from "./assertions";
|
|
2
2
|
import { callJudge, resolveJudgeExecutor } from "./judge";
|
|
3
3
|
import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
|
|
4
|
+
import { validateAgainstSchema } from "./schema";
|
|
4
5
|
const DEFAULT_SCENE_TIMEOUT = 10_000;
|
|
5
6
|
/**
|
|
6
7
|
* Extract a named field from an agent response for assertion.
|
|
@@ -90,7 +91,21 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
|
|
|
90
91
|
let passed = true;
|
|
91
92
|
let error;
|
|
92
93
|
let judgement;
|
|
94
|
+
// Schema validation runs first — a structural failure is the headline. Skip
|
|
95
|
+
// refusals (which legitimately won't match the output shape) and empty values.
|
|
96
|
+
if (scene.schema && !response.refusal) {
|
|
97
|
+
const value = resolveValue(response);
|
|
98
|
+
if (value !== undefined) {
|
|
99
|
+
const outcome = await validateAgainstSchema(scene.schema, value);
|
|
100
|
+
if (!outcome.ok) {
|
|
101
|
+
passed = false;
|
|
102
|
+
error = `Schema validation failed — ${outcome.message}`;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
93
106
|
for (const assertion of scene.assertions) {
|
|
107
|
+
if (!passed)
|
|
108
|
+
break;
|
|
94
109
|
try {
|
|
95
110
|
const value = extractField(response, assertion.field);
|
|
96
111
|
assertion.fn(value);
|
package/dist/schema.d.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema validation built on the Standard Schema v1 spec
|
|
3
|
+
* (https://standardschema.dev). Agest never imports a schema library — it talks
|
|
4
|
+
* to whatever the consumer brings (zod 4, valibot, arktype, …) through the
|
|
5
|
+
* `~standard` contract every compliant library exposes. zod is the documented,
|
|
6
|
+
* blessed choice but is not a runtime or peer dependency.
|
|
7
|
+
*/
|
|
8
|
+
/** The minimal Standard Schema v1 interface, vendored from the spec. */
|
|
9
|
+
export interface StandardSchemaV1<Input = unknown, Output = Input> {
|
|
10
|
+
readonly "~standard": StandardSchemaV1.Props<Input, Output>;
|
|
11
|
+
}
|
|
12
|
+
export declare namespace StandardSchemaV1 {
|
|
13
|
+
interface Props<Input = unknown, Output = Input> {
|
|
14
|
+
readonly version: 1;
|
|
15
|
+
readonly vendor: string;
|
|
16
|
+
readonly validate: (value: unknown) => Result<Output> | Promise<Result<Output>>;
|
|
17
|
+
readonly types?: Types<Input, Output>;
|
|
18
|
+
}
|
|
19
|
+
type Result<Output> = SuccessResult<Output> | FailureResult;
|
|
20
|
+
interface SuccessResult<Output> {
|
|
21
|
+
readonly value: Output;
|
|
22
|
+
readonly issues?: undefined;
|
|
23
|
+
}
|
|
24
|
+
interface FailureResult {
|
|
25
|
+
readonly issues: ReadonlyArray<Issue>;
|
|
26
|
+
}
|
|
27
|
+
interface Issue {
|
|
28
|
+
readonly message: string;
|
|
29
|
+
readonly path?: ReadonlyArray<PropertyKey | PathSegment>;
|
|
30
|
+
}
|
|
31
|
+
interface PathSegment {
|
|
32
|
+
readonly key: PropertyKey;
|
|
33
|
+
}
|
|
34
|
+
interface Types<Input = unknown, Output = Input> {
|
|
35
|
+
readonly input: Input;
|
|
36
|
+
readonly output: Output;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
/** The inferred output type of a Standard Schema (e.g. `z.infer<typeof S>`). */
|
|
40
|
+
export type InferOutput<S extends StandardSchemaV1> = NonNullable<S["~standard"]["types"]>["output"];
|
|
41
|
+
/** Structural duck-type check so any Standard-Schema library is accepted. */
|
|
42
|
+
export declare function isStandardSchema(value: unknown): value is StandardSchemaV1;
|
|
43
|
+
/** Render Standard Schema failure issues into a readable multi-line message. */
|
|
44
|
+
export declare function formatIssues(issues: ReadonlyArray<StandardSchemaV1.Issue>): string;
|
|
45
|
+
export type ValidationOutcome = {
|
|
46
|
+
ok: true;
|
|
47
|
+
} | {
|
|
48
|
+
ok: false;
|
|
49
|
+
message: string;
|
|
50
|
+
};
|
|
51
|
+
/**
|
|
52
|
+
* Validate a value against a schema, awaiting the result. Supports both
|
|
53
|
+
* synchronous and asynchronous (`refine`-style) schemas — used by the runner,
|
|
54
|
+
* which is already async.
|
|
55
|
+
*/
|
|
56
|
+
export declare function validateAgainstSchema(schema: StandardSchemaV1, value: unknown): Promise<ValidationOutcome>;
|
|
57
|
+
/**
|
|
58
|
+
* Synchronous validation for the `matchingSchema` matcher (matchers run inside
|
|
59
|
+
* a sync assertion callback). Throws a directive error if the schema needs to
|
|
60
|
+
* resolve asynchronously — declare such schemas at the agent/scene level, where
|
|
61
|
+
* validation is awaited.
|
|
62
|
+
*/
|
|
63
|
+
export declare function validateSync(schema: StandardSchemaV1, value: unknown): ValidationOutcome;
|
package/dist/schema.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema validation built on the Standard Schema v1 spec
|
|
3
|
+
* (https://standardschema.dev). Agest never imports a schema library — it talks
|
|
4
|
+
* to whatever the consumer brings (zod 4, valibot, arktype, …) through the
|
|
5
|
+
* `~standard` contract every compliant library exposes. zod is the documented,
|
|
6
|
+
* blessed choice but is not a runtime or peer dependency.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Structural duck-type check so any Standard-Schema library is accepted.
 *
 * A value qualifies when it is a non-null object or a function that carries a
 * `"~standard"` property whose `validate` member is a function. Functions are
 * accepted because a schema may itself be callable (NOTE(review): ArkType
 * types are believed to be callable — confirm); an object-only check would
 * reject such a schema.
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} `true` when `value` exposes a usable `~standard.validate`.
 */
export function isStandardSchema(value) {
    const canCarryProps = (typeof value === "object" && value !== null) || typeof value === "function";
    if (!canCarryProps) {
        return false;
    }
    return "~standard" in value && typeof value["~standard"]?.validate === "function";
}
|
|
15
|
+
/**
 * Report whether `value` is a non-null object exposing a callable `then`.
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} `true` for promise-like objects, `false` otherwise.
 */
function isThenable(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return typeof value.then === "function";
}
|
|
20
|
+
/**
 * Normalise one issue path segment (`PropertyKey | { key }`) to a string.
 *
 * Object segments are rendered from their `key`; anything else is stringified
 * directly. `null` is checked explicitly because `typeof null` is `"object"`,
 * and reading `.key` from it would throw while formatting an error message.
 *
 * @param {PropertyKey | { key: PropertyKey }} seg - Path segment to render.
 * @returns {string} The segment as text.
 */
function renderSegment(seg) {
    const isKeyedSegment = seg !== null && typeof seg === "object";
    return isKeyedSegment ? String(seg.key) : String(seg);
}
|
|
24
|
+
/**
 * Render Standard Schema failure issues into a readable multi-line message.
 *
 * The first line states how many issues there are; each following line is a
 * bullet holding the issue message, prefixed by its dot-joined path when the
 * issue has a non-empty one.
 *
 * @param {ReadonlyArray<{ message: string, path?: ReadonlyArray<unknown> }>} issues
 * @returns {string} The formatted summary.
 */
export function formatIssues(issues) {
    const toBullet = ({ path, message }) => {
        const location = path?.map(renderSegment).join(".");
        if (location) {
            return ` • ${location}: ${message}`;
        }
        return ` • ${message}`;
    };
    const bullets = issues.map((issue) => toBullet(issue));
    const total = issues.length;
    const plural = total !== 1 ? "s" : "";
    return `${total} issue${plural}:\n${bullets.join("\n")}`;
}
|
|
33
|
+
/**
 * Run a schema's `~standard.validate` on `value` and await the result, so
 * both synchronous and promise-returning validators are handled.
 *
 * @param {{ "~standard": { validate: (value: unknown) => unknown } }} schema
 * @param {unknown} value - Value to validate.
 * @returns {Promise<{ ok: true } | { ok: false, message: string }>} `ok: true`
 *   when the result carries no issues, otherwise the formatted issues.
 */
export async function validateAgainstSchema(schema, value) {
    const verdict = await schema["~standard"].validate(value);
    const failures = verdict.issues;
    return failures
        ? { ok: false, message: formatIssues(failures) }
        : { ok: true };
}
|
|
45
|
+
/**
 * Synchronous validation for the `matchingSchema` matcher (matchers run inside
 * a sync assertion callback).
 *
 * @param {{ "~standard": { validate: (value: unknown) => unknown } }} schema
 * @param {unknown} value - Value to validate.
 * @returns {{ ok: true } | { ok: false, message: string }} `ok: true` when the
 *   result carries no issues, otherwise the formatted issues.
 * @throws {Error} When the schema's validator returns a thenable — such
 *   schemas must be declared at the agent/scene level, where validation is
 *   awaited.
 */
export function validateSync(schema, value) {
    const result = schema["~standard"].validate(value);
    if (isThenable(result)) {
        // The in-flight validation is being abandoned. Mark it as handled so a
        // later rejection cannot surface as an unhandled rejection; the
        // directive error thrown below is the only outcome the caller needs.
        Promise.resolve(result).catch(() => { });
        throw new Error("matchingSchema() cannot validate an async schema. Declare the schema at " +
            "the agent() or scene().expectSchema() level, where validation is awaited.");
    }
    if (result.issues) {
        return { ok: false, message: formatIssues(result.issues) };
    }
    return { ok: true };
}
|
package/dist/types.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { StandardSchemaV1 } from "./schema";
|
|
1
2
|
export interface ExecutorOptions {
|
|
2
3
|
signal?: AbortSignal;
|
|
3
4
|
}
|
|
@@ -83,6 +84,8 @@ export interface SceneDefinition {
|
|
|
83
84
|
turns?: number;
|
|
84
85
|
runs?: number;
|
|
85
86
|
suite?: string;
|
|
87
|
+
/** Standard Schema validated against the native value before user assertions. */
|
|
88
|
+
schema?: StandardSchemaV1;
|
|
86
89
|
}
|
|
87
90
|
export type JudgeVerdict = "pass" | "fail" | "partial";
|
|
88
91
|
export interface JudgeResult {
|