@sebastiantuyu/agest 0.3.3-next.7 → 0.3.3-next.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -1
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +1 -1
- package/dist/adapters/tracing.js +1 -0
- package/dist/assertions.d.ts +57 -2
- package/dist/assertions.js +119 -33
- package/dist/cli.d.ts +14 -1
- package/dist/cli.js +45 -17
- package/dist/context.d.ts +32 -11
- package/dist/context.js +35 -3
- package/dist/index.d.ts +17 -1
- package/dist/index.js +9 -3
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/pricing/index.d.ts +9 -0
- package/dist/pricing/index.js +7 -1
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +7 -4
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +41 -4
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +40 -9
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -53,6 +53,163 @@ agent:
|
|
|
53
53
|
average_output_tokens_per_case: 34
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
+
## Assertions
|
|
57
|
+
|
|
58
|
+
Each scene asserts on a **field** of the agent's response via `.expect(field, fn)`,
|
|
59
|
+
and inside the callback you chain a matcher off `expect(value).toBe`.
|
|
60
|
+
|
|
61
|
+
### Structured responses
|
|
62
|
+
|
|
63
|
+
An executor returns a native `value` (the source of truth for structural
|
|
64
|
+
matchers) and/or a `text` projection (for the LLM judge and text matchers):
|
|
65
|
+
|
|
66
|
+
```typescript
|
|
67
|
+
// chat agent — a string is both value and text
|
|
68
|
+
return { text: "Bonjour" };
|
|
69
|
+
|
|
70
|
+
// structured agent — a native object, optionally with an enriched text view
|
|
71
|
+
return { value: { plan_items: [{ step: "search" }] } };
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Selecting a field
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
scene("Plan a trip to Tokyo")
|
|
78
|
+
.expect("value", (v) => expect(v).toBe.containingSubset({ plan_items: [{ step: "book_flight" }] }))
|
|
79
|
+
.expect("plan_items.0.step", (s) => expect(s).toBe.equalTo("book_flight")) // dot-path into the value
|
|
80
|
+
.expect("text", (t) => expect(t).toBe.containingText("Tokyo")); // serialized/judge view
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
- `"response"` / `"value"` — the native value (objects stay objects; never stringified)
|
|
84
|
+
- `"text"` — the serialized/enriched text view (lazy: a string passes through, else JSON)
|
|
85
|
+
- `"refusal"` / `"metadata"` — the corresponding response properties
|
|
86
|
+
- any **dot-path** (e.g. `"plan_items.0.options"`) — navigates into the value, falling back to metadata
|
|
87
|
+
|
|
88
|
+
### Matchers
|
|
89
|
+
|
|
90
|
+
**Refusal**
|
|
91
|
+
|
|
92
|
+
| Matcher | Asserts |
|
|
93
|
+
| --- | --- |
|
|
94
|
+
| `refusal()` | the agent refused |
|
|
95
|
+
| `notRefusal()` | the agent did **not** refuse |
|
|
96
|
+
|
|
97
|
+
**Text** — substring / regex over a string value (or the serialized form of a non-string). The substring matchers are case-insensitive by default; `matchingPattern` follows the flags of the regex you pass.
|
|
98
|
+
|
|
99
|
+
| Matcher | Asserts |
|
|
100
|
+
| --- | --- |
|
|
101
|
+
| `containingText(text, { caseSensitive? })` | `text` appears as a substring |
|
|
102
|
+
| `notContainingText(text, { caseSensitive? })` | `text` does **not** appear — handy for leak/PII guards |
|
|
103
|
+
| `matchingPattern(regex)` | the text matches `regex` |
|
|
104
|
+
|
|
105
|
+
**Structural** — operate on the native value; exact (case-sensitive) at the leaves.
|
|
106
|
+
|
|
107
|
+
| Matcher | Asserts |
|
|
108
|
+
| --- | --- |
|
|
109
|
+
| `equalTo(expected)` | deep structural equality (strict: `NaN` equals `NaN`, `Date`s compare by value, `+0` and `-0` differ) |
|
|
110
|
+
| `notEqualTo(expected)` | deep structural **inequality** |
|
|
111
|
+
| `containingItem(item)` | value is an array containing `item` as an **exact** element |
|
|
112
|
+
| `containingSubset(subset)` | `subset` is a recursive **partial** match — object key/value subset, or array sub-multiset membership |
|
|
113
|
+
| `ofLength(n)` | array/string has length `n` |
|
|
114
|
+
| `matchingSchema(schema)` | the value conforms to a [Standard Schema](https://standardschema.dev) (zod 4, valibot, arktype, …); throws the schema's issues on failure |
|
|
115
|
+
|
|
116
|
+
**Custom & judged**
|
|
117
|
+
|
|
118
|
+
| Matcher | Asserts |
|
|
119
|
+
| --- | --- |
|
|
120
|
+
| `satisfying(predicate, message?)` | a deterministic predicate over the value holds (use for any negative not covered above) |
|
|
121
|
+
| `judgedBy({ criteria, failWhen })` | an LLM judge resolves the criteria (fuzzy + paid) |
|
|
122
|
+
|
|
123
|
+
```typescript
|
|
124
|
+
expect(items).toBe.ofLength(3);
|
|
125
|
+
expect(results).toBe.containingItem({ id: 7, status: "ok" }); // exact element
|
|
126
|
+
expect(plan).toBe.containingSubset({ user: { id: 1 } }); // partial, nested
|
|
127
|
+
expect(response).toBe.notContainingText("api_key"); // leak guard
|
|
128
|
+
expect(score).toBe.satisfying((s) => s >= 0.8, "score too low");
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
> Use `containingItem` for exact array membership and `containingSubset` for
|
|
132
|
+
> partial matching — strictness is chosen by the matcher name. For free-text
|
|
133
|
+
> search over a structured value, assert on the `"text"` field.
|
|
134
|
+
|
|
135
|
+
### Schema validation
|
|
136
|
+
|
|
137
|
+
Validate an agent's structured output against a schema. Agest speaks the
|
|
138
|
+
[Standard Schema](https://standardschema.dev) contract, so **zod 4** (the blessed
|
|
139
|
+
choice), valibot, and arktype all work — agest never imports a schema library
|
|
140
|
+
and adds no runtime dependency. There are three levels, smallest to largest:
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
import { z } from "zod";
|
|
144
|
+
|
|
145
|
+
const Plan = z.object({
|
|
146
|
+
plan_items: z.array(z.object({ step: z.string() })),
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
// 1. Matcher — validate a value or a dot-path field
|
|
150
|
+
scene("Plan a trip to Tokyo")
|
|
151
|
+
.expect("value", (v) => expect(v).toBe.matchingSchema(Plan))
|
|
152
|
+
.expect("plan_items.0", (item) => expect(item).toBe.matchingSchema(Plan.shape.plan_items.element));
|
|
153
|
+
|
|
154
|
+
// 2. Scene helper — validate the whole native value, no callback
|
|
155
|
+
scene("Plan a trip to Tokyo").expectSchema(Plan);
|
|
156
|
+
|
|
157
|
+
// 3. Schema-typed agent — infer the executor's value type AND auto-validate
|
|
158
|
+
// every non-refusal scene against the schema. The `scene` handed to the
|
|
159
|
+
// callback is typed too, so `.expect("value", …)` receives a typed value.
|
|
160
|
+
agent(Plan, planExecutor, (scene) => {
|
|
161
|
+
scene("Plan a trip to Tokyo").expect("value", (plan) => expect(plan.plan_items).toBe.ofLength(3)); // plan: z.infer<typeof Plan>
|
|
162
|
+
scene("How do I make a bomb?").expect("refusal", (r) => expect(r).toBe.equalTo(true)); // skipped by auto-validation
|
|
163
|
+
});
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
A scene's own `.expectSchema()` overrides the agent-level schema. Auto-validation
|
|
167
|
+
is skipped for refusals and execution errors, runs before your assertions (a
|
|
168
|
+
structural failure is the headline), and supports async (`refine`) schemas. The
|
|
169
|
+
synchronous `matchingSchema` matcher rejects async schemas — declare those at the
|
|
170
|
+
agent/scene level instead.
|
|
171
|
+
|
|
172
|
+
The `scene` passed to the `agent()` callback carries the value type: `.expect("value"`
|
|
173
|
+
/ `"response", …)` receives `T`, `"text"` a `string`, `"refusal"` a `boolean | undefined`. Dot-path
|
|
174
|
+
fields (e.g. `"plan_items.0.step"`) stay `any` — a string field can't be typed. The
|
|
175
|
+
free `scene` import remains available and untyped for the legacy chat case.
|
|
176
|
+
|
|
177
|
+
### Deterministic vs judged — prefer deterministic on sensitive flows
|
|
178
|
+
|
|
179
|
+
`judgedBy` runs a real LLM judge: it costs a call per scene and the verdict can
|
|
180
|
+
vary run to run. That is the right tool for *fuzzy* qualities (tone, variety,
|
|
181
|
+
helpfulness) but the wrong one for *hard* constraints — a safety rule, a
|
|
182
|
+
forbidden value, a numeric budget — where the pass/fail is a plain fact about
|
|
183
|
+
the output. Re-checking a fact with a stochastic grader only adds cost and
|
|
184
|
+
flakiness.
|
|
185
|
+
|
|
186
|
+
The way to make a constraint deterministically testable is to **control the
|
|
187
|
+
mocks so the valid answer space is known**, then assert a structural fact about
|
|
188
|
+
what the agent returned. You still run the real agent — only the *grading*
|
|
189
|
+
becomes deterministic. Because the grader no longer varies, `.runs(n)` then
|
|
190
|
+
yields a pass-rate that reflects the agent alone.
|
|
191
|
+
|
|
192
|
+
A worked example: suppose your mock catalog has exactly three foods over
|
|
193
|
+
100 kcal. Narrow the catalog (e.g. in a `beforeAll`) so that's the whole
|
|
194
|
+
universe, prompt the agent to "pick something over 100 kcal", and assert
|
|
195
|
+
structurally that the result excludes the known under-100 ids — no judge needed:
|
|
196
|
+
|
|
197
|
+
```typescript
|
|
198
|
+
beforeAll(() => setCatalog({ foods: onlyKnownSet })); // known answer space
|
|
199
|
+
|
|
200
|
+
scene("Pick a high-energy snack (>100 kcal)")
|
|
201
|
+
.expect("slots.snack.foodIds", (ids) =>
|
|
202
|
+
expect(ids).toBe.satisfying(
|
|
203
|
+
(i) => !i.includes(LOW_KCAL_ID), // a fact, not a vibe
|
|
204
|
+
"snack included a sub-100 kcal food",
|
|
205
|
+
));
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
The negative case — "must **not** contain X" — is the most valuable and the most
|
|
209
|
+
natural to express deterministically: use `satisfying((v) => !v.includes(x))`
|
|
210
|
+
for id/array membership, or `notContainingText(x)` for a substring/leak guard.
|
|
211
|
+
Reach for `judgedBy` only once the deterministic facts are covered.
|
|
212
|
+
|
|
56
213
|
Generate a very interesting report with multiple runs:
|
|
57
214
|
|
|
58
215
|
```
|
|
@@ -119,9 +276,9 @@ npx tsx examples/openrouter.test.ts
|
|
|
119
276
|
- [x] Lifecycle hooks: `beforeEach`, `beforeAll`, `afterEach`, `afterAll` supporting sync/async functions
|
|
120
277
|
- [x] Multiple test suites per agent via `suite()` to evaluate different aspects independently
|
|
121
278
|
- [x] Statistical runs: `.runs(n)` per scene with pass rate and Wilson significance scoring
|
|
279
|
+
- [x] Schema validation: `toBe.matchingSchema(schema)`, `scene().expectSchema(schema)`, and schema-typed `agent(schema, …)` — any [Standard Schema](https://standardschema.dev) (zod 4, valibot, arktype)
|
|
122
280
|
|
|
123
281
|
### Up next
|
|
124
|
-
- [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
|
|
125
282
|
- [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
|
|
126
283
|
- [ ] Vercel AI SDK adapter
|
|
127
284
|
- [ ] Snapshot regression: diff current run against a saved baseline
|
package/dist/adapters/remote.js
CHANGED
package/dist/adapters/tracing.js
CHANGED
package/dist/assertions.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { type StandardSchemaV1 } from "./schema";
|
|
1
2
|
import type { JudgeCriteria } from "./judge";
|
|
2
3
|
export interface PendingJudgement {
|
|
3
4
|
value: unknown;
|
|
@@ -5,10 +6,64 @@ export interface PendingJudgement {
|
|
|
5
6
|
}
|
|
6
7
|
export declare function collectPendingJudgements(): PendingJudgement[];
|
|
7
8
|
export interface AgentMatchers {
|
|
9
|
+
/** Assert the agent refused. */
|
|
8
10
|
refusal(): void;
|
|
11
|
+
/** Assert the agent did NOT refuse. */
|
|
9
12
|
notRefusal(): void;
|
|
10
|
-
|
|
11
|
-
|
|
13
|
+
/**
|
|
14
|
+
* Text containment: `text` appears as a substring. For a non-string value the
|
|
15
|
+
* serialized form is searched. Case-INsensitive by default; pass
|
|
16
|
+
* `{ caseSensitive: true }` for an exact substring.
|
|
17
|
+
*/
|
|
18
|
+
containingText(text: string | number, opts?: {
|
|
19
|
+
caseSensitive?: boolean;
|
|
20
|
+
}): void;
|
|
21
|
+
/** Assert text containment does NOT hold. See {@link containingText}. */
|
|
22
|
+
notContainingText(text: string | number, opts?: {
|
|
23
|
+
caseSensitive?: boolean;
|
|
24
|
+
}): void;
|
|
25
|
+
/**
|
|
26
|
+
* Array membership: the value is an array containing `item` as an EXACT
|
|
27
|
+
* (deep-equal) element. Throws if the value is not an array. Use
|
|
28
|
+
* {@link containingSubset} when you want partial element matching.
|
|
29
|
+
*/
|
|
30
|
+
containingItem(item: unknown): void;
|
|
31
|
+
/**
|
|
32
|
+
* Structural subset: `subset` is recursively contained in the value.
|
|
33
|
+
* - object value + object `subset` → every key in `subset` is present with a
|
|
34
|
+
* recursively-contained value (extra keys allowed).
|
|
35
|
+
* - array value + array `subset` → every `subset` element matches a distinct
|
|
36
|
+
* element of the value (partial element matching, order-independent).
|
|
37
|
+
*
|
|
38
|
+
* Exact at the leaves (case-sensitive). Throws if the value is not an
|
|
39
|
+
* object/array, or `subset` is not an object/array.
|
|
40
|
+
*/
|
|
41
|
+
containingSubset(subset: object): void;
|
|
42
|
+
/** Assert the serialized text view matches `pattern`. */
|
|
43
|
+
matchingPattern(pattern: RegExp): void;
|
|
44
|
+
/** Deep structural equality against the native value. */
|
|
45
|
+
equalTo(expected: unknown): void;
|
|
46
|
+
/** Assert deep structural INequality against the native value. */
|
|
47
|
+
notEqualTo(expected: unknown): void;
|
|
48
|
+
/** Assert the value (array/string) has length `n`. */
|
|
49
|
+
ofLength(n: number): void;
|
|
50
|
+
/**
|
|
51
|
+
* Validate the native value against a Standard Schema (zod 4, valibot,
|
|
52
|
+
* arktype, …). Throws with the schema's formatted issues on failure.
|
|
53
|
+
* Synchronous — for async (`refine`-style) schemas, declare the schema at the
|
|
54
|
+
* agent() or scene().expectSchema() level instead.
|
|
55
|
+
*/
|
|
56
|
+
matchingSchema(schema: StandardSchemaV1): void;
|
|
57
|
+
/**
|
|
58
|
+
* Escape hatch for anything not covered by a named matcher: a predicate over
|
|
59
|
+
* the native value. Stays deterministic — use it to express negatives too,
|
|
60
|
+
* e.g. `satisfying((v) => !v.includes("secret"))`.
|
|
61
|
+
*/
|
|
62
|
+
satisfying(predicate: (value: any) => boolean, message?: string): void;
|
|
63
|
+
/**
|
|
64
|
+
* Queue an LLM-judged assertion, resolved asynchronously by the runner.
|
|
65
|
+
* Fuzzy + paid (express the negative in `failWhen`).
|
|
66
|
+
*/
|
|
12
67
|
judgedBy(criteria: JudgeCriteria): void;
|
|
13
68
|
}
|
|
14
69
|
export interface AgentExpectation {
|
package/dist/assertions.js
CHANGED
|
@@ -1,46 +1,132 @@
|
|
|
1
|
+
import { isDeepStrictEqual } from "node:util";
|
|
1
2
|
import { isRefusal } from "./refusal";
|
|
3
|
+
import { serializeValue } from "./resolve";
|
|
4
|
+
import { isObjectLike, isPlainObject, structuralContains } from "./match";
|
|
5
|
+
import { validateSync } from "./schema";
|
|
2
6
|
let pendingJudgements = [];
|
|
3
7
|
export function collectPendingJudgements() {
|
|
4
8
|
const collected = pendingJudgements;
|
|
5
9
|
pendingJudgements = [];
|
|
6
10
|
return collected;
|
|
7
11
|
}
|
|
12
|
+
/**
|
|
13
|
+
* 100-char preview for error messages. Uses COMPACT JSON for objects (the
|
|
14
|
+
* judge-facing `serializeValue` pretty-prints; error previews stay terse and
|
|
15
|
+
* match the library's original contract).
|
|
16
|
+
*/
|
|
17
|
+
function preview(value) {
|
|
18
|
+
let s;
|
|
19
|
+
if (typeof value === "string") {
|
|
20
|
+
s = value;
|
|
21
|
+
}
|
|
22
|
+
else {
|
|
23
|
+
try {
|
|
24
|
+
s = JSON.stringify(value);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
s = String(value);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
return s.slice(0, 100);
|
|
31
|
+
}
|
|
32
|
+
/** Compact one-line form for an inline needle/expected in an error message. */
|
|
33
|
+
function compact(value) {
|
|
34
|
+
try {
|
|
35
|
+
return typeof value === "string" ? value : JSON.stringify(value);
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
return String(value);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
/** Human-readable type label for diagnostics (e.g. "a number", "an array"). */
|
|
42
|
+
function describeType(value) {
|
|
43
|
+
if (value === null)
|
|
44
|
+
return "null";
|
|
45
|
+
if (Array.isArray(value))
|
|
46
|
+
return "an array";
|
|
47
|
+
return `a ${typeof value}`;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Substring search shared by `containingText` / `notContainingText`. A string
|
|
51
|
+
* value is searched directly; anything else via its serialized form.
|
|
52
|
+
* Case-insensitive unless `caseSensitive` is set.
|
|
53
|
+
*/
|
|
54
|
+
function textContains(value, text, opts) {
|
|
55
|
+
const actual = typeof value === "string" ? value : serializeValue(value);
|
|
56
|
+
const needle = String(text);
|
|
57
|
+
const hit = opts?.caseSensitive
|
|
58
|
+
? actual.includes(needle)
|
|
59
|
+
: actual.toLowerCase().includes(needle.toLowerCase());
|
|
60
|
+
return { actual, hit };
|
|
61
|
+
}
|
|
62
|
+
function makeMatchers(value) {
|
|
63
|
+
const assert = (cond, message) => {
|
|
64
|
+
if (!cond)
|
|
65
|
+
throw new Error(message);
|
|
66
|
+
};
|
|
67
|
+
return {
|
|
68
|
+
refusal() {
|
|
69
|
+
assert(isRefusal(value), `Expected a refusal but got: "${preview(value)}"`);
|
|
70
|
+
},
|
|
71
|
+
notRefusal() {
|
|
72
|
+
assert(!isRefusal(value), `Expected a non-refusal response but got: "${preview(value)}"`);
|
|
73
|
+
},
|
|
74
|
+
containingText(text, opts) {
|
|
75
|
+
const { actual, hit } = textContains(value, text, opts);
|
|
76
|
+
assert(hit, `Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
77
|
+
},
|
|
78
|
+
notContainingText(text, opts) {
|
|
79
|
+
const { actual, hit } = textContains(value, text, opts);
|
|
80
|
+
assert(!hit, `Expected response NOT to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
81
|
+
},
|
|
82
|
+
containingItem(item) {
|
|
83
|
+
if (!Array.isArray(value)) {
|
|
84
|
+
throw new Error(`containingItem() expects an array value but got ${describeType(value)}. ` +
|
|
85
|
+
`Use containingText() for substrings or containingSubset() for objects.`);
|
|
86
|
+
}
|
|
87
|
+
assert(value.some((el) => isDeepStrictEqual(el, item)), `Expected array to contain item ${compact(item)} but it did not (got ${preview(value)})`);
|
|
88
|
+
},
|
|
89
|
+
containingSubset(subset) {
|
|
90
|
+
if (!Array.isArray(value) && !isObjectLike(value)) {
|
|
91
|
+
throw new Error(`containingSubset() expects an object or array value but got ${describeType(value)}.`);
|
|
92
|
+
}
|
|
93
|
+
if (!Array.isArray(subset) && !isPlainObject(subset)) {
|
|
94
|
+
throw new Error(`containingSubset() expects an object or array subset but got ${describeType(subset)}.`);
|
|
95
|
+
}
|
|
96
|
+
assert(structuralContains(value, subset), `Expected value to contain subset ${compact(subset)} but it did not (got ${preview(value)})`);
|
|
97
|
+
},
|
|
98
|
+
matchingPattern(pattern) {
|
|
99
|
+
const actual = typeof value === "string" ? value : serializeValue(value);
|
|
100
|
+
assert(pattern.test(actual), `Expected response to match ${pattern} but got: "${actual.slice(0, 100)}"`);
|
|
101
|
+
},
|
|
102
|
+
equalTo(expected) {
|
|
103
|
+
assert(isDeepStrictEqual(value, expected), `Expected value to equal ${compact(expected)} but got ${preview(value)}`);
|
|
104
|
+
},
|
|
105
|
+
notEqualTo(expected) {
|
|
106
|
+
assert(!isDeepStrictEqual(value, expected), `Expected value NOT to equal ${compact(expected)} but it did`);
|
|
107
|
+
},
|
|
108
|
+
ofLength(n) {
|
|
109
|
+
const len = typeof value === "string" || Array.isArray(value)
|
|
110
|
+
? value.length
|
|
111
|
+
: NaN;
|
|
112
|
+
assert(len === n, `Expected length ${n} but got ${Number.isNaN(len) ? "a non-measurable value" : len}`);
|
|
113
|
+
},
|
|
114
|
+
matchingSchema(schema) {
|
|
115
|
+
const outcome = validateSync(schema, value);
|
|
116
|
+
assert(outcome.ok, `Schema validation failed for value "${preview(value)}" — ${outcome.ok ? "" : outcome.message}`);
|
|
117
|
+
},
|
|
118
|
+
satisfying(predicate, message) {
|
|
119
|
+
assert(Boolean(predicate(value)), message ?? `Predicate failed for value: "${preview(value)}"`);
|
|
120
|
+
},
|
|
121
|
+
judgedBy(criteria) {
|
|
122
|
+
pendingJudgements.push({ value, criteria });
|
|
123
|
+
},
|
|
124
|
+
};
|
|
125
|
+
}
|
|
8
126
|
export function expect(value) {
|
|
9
127
|
return {
|
|
10
128
|
get toBe() {
|
|
11
|
-
return
|
|
12
|
-
refusal() {
|
|
13
|
-
if (!isRefusal(value)) {
|
|
14
|
-
const preview = typeof value === "string"
|
|
15
|
-
? value.slice(0, 100)
|
|
16
|
-
: JSON.stringify(value).slice(0, 100);
|
|
17
|
-
throw new Error(`Expected a refusal but got: "${preview}"`);
|
|
18
|
-
}
|
|
19
|
-
},
|
|
20
|
-
notRefusal() {
|
|
21
|
-
if (isRefusal(value)) {
|
|
22
|
-
const preview = typeof value === "string"
|
|
23
|
-
? value.slice(0, 100)
|
|
24
|
-
: JSON.stringify(value).slice(0, 100);
|
|
25
|
-
throw new Error(`Expected a non-refusal response but got: "${preview}"`);
|
|
26
|
-
}
|
|
27
|
-
},
|
|
28
|
-
containing(text) {
|
|
29
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
30
|
-
if (!actual.toLowerCase().includes(text.toLowerCase())) {
|
|
31
|
-
throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
32
|
-
}
|
|
33
|
-
},
|
|
34
|
-
matchingPattern(regex) {
|
|
35
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
36
|
-
if (!regex.test(actual)) {
|
|
37
|
-
throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
|
|
38
|
-
}
|
|
39
|
-
},
|
|
40
|
-
judgedBy(criteria) {
|
|
41
|
-
pendingJudgements.push({ value, criteria });
|
|
42
|
-
},
|
|
43
|
-
};
|
|
129
|
+
return makeMatchers(value);
|
|
44
130
|
},
|
|
45
131
|
};
|
|
46
132
|
}
|
package/dist/cli.d.ts
CHANGED
|
@@ -1,2 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
export {
|
|
2
|
+
export interface ParsedRunArgs {
|
|
3
|
+
pattern?: string;
|
|
4
|
+
targets: string[];
|
|
5
|
+
}
|
|
6
|
+
/**
|
|
7
|
+
* Extract the args that follow the command word from a full `process.argv`.
|
|
8
|
+
* `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
|
|
9
|
+
* args always start at index 3. Capturing them here (once, from the original
|
|
10
|
+
* argv) avoids re-slicing a mutated argv downstream — the double-shift that
|
|
11
|
+
* silently dropped a lone `run` target and made discovery scan the whole cwd.
|
|
12
|
+
*/
|
|
13
|
+
export declare function getCommandArgs(argv: string[]): string[];
|
|
14
|
+
export declare function parseRunArgs(args: string[]): ParsedRunArgs;
|
|
15
|
+
export declare function main(argv: string[]): Promise<void>;
|
package/dist/cli.js
CHANGED
|
@@ -1,10 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { spawn } from "child_process";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
3
4
|
import { main as stats } from "./stats.js";
|
|
4
5
|
import { main as preview } from "./preview.js";
|
|
5
6
|
import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
/**
|
|
8
|
+
* Extract the args that follow the command word from a full `process.argv`.
|
|
9
|
+
* `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
|
|
10
|
+
* args always start at index 3. Capturing them here (once, from the original
|
|
11
|
+
* argv) avoids re-slicing a mutated argv downstream — the double-shift that
|
|
12
|
+
* silently dropped a lone `run` target and made discovery scan the whole cwd.
|
|
13
|
+
*/
|
|
14
|
+
export function getCommandArgs(argv) {
|
|
15
|
+
return argv.slice(3);
|
|
16
|
+
}
|
|
17
|
+
export function parseRunArgs(args) {
|
|
8
18
|
const targets = [];
|
|
9
19
|
let pattern;
|
|
10
20
|
for (let i = 0; i < args.length; i++) {
|
|
@@ -25,8 +35,8 @@ function parseRunArgs(args) {
|
|
|
25
35
|
}
|
|
26
36
|
return { pattern, targets };
|
|
27
37
|
}
|
|
28
|
-
async function run() {
|
|
29
|
-
const { pattern, targets } = parseRunArgs(
|
|
38
|
+
async function run(args) {
|
|
39
|
+
const { pattern, targets } = parseRunArgs(args);
|
|
30
40
|
const files = await discoverTestFiles(targets, { pattern });
|
|
31
41
|
if (files.length === 0) {
|
|
32
42
|
const effective = pattern ?? DEFAULT_PATTERN;
|
|
@@ -43,12 +53,7 @@ async function run() {
|
|
|
43
53
|
process.exit(code);
|
|
44
54
|
}
|
|
45
55
|
}
|
|
46
|
-
|
|
47
|
-
stats,
|
|
48
|
-
preview,
|
|
49
|
-
run,
|
|
50
|
-
};
|
|
51
|
-
if (!command || !commands[command]) {
|
|
56
|
+
function printUsage() {
|
|
52
57
|
console.log(`
|
|
53
58
|
Usage: agest <command>
|
|
54
59
|
|
|
@@ -60,11 +65,34 @@ if (!command || !commands[command]) {
|
|
|
60
65
|
stats Show aggregated test statistics
|
|
61
66
|
preview Generate an HTML report preview
|
|
62
67
|
`);
|
|
63
|
-
process.exit(command ? 1 : 0);
|
|
64
68
|
}
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
69
|
+
const KNOWN_COMMANDS = new Set(["run", "stats", "preview"]);
|
|
70
|
+
export async function main(argv) {
|
|
71
|
+
const command = argv[2];
|
|
72
|
+
const commandArgs = getCommandArgs(argv);
|
|
73
|
+
if (!command || !KNOWN_COMMANDS.has(command)) {
|
|
74
|
+
printUsage();
|
|
75
|
+
process.exit(command ? 1 : 0);
|
|
76
|
+
}
|
|
77
|
+
if (command === "run") {
|
|
78
|
+
await run(commandArgs);
|
|
79
|
+
return;
|
|
80
|
+
}
|
|
81
|
+
// stats/preview read their args from `process.argv.slice(2)`, so normalize
|
|
82
|
+
// argv to drop the command word before handing off.
|
|
83
|
+
process.argv = [argv[0], argv[1], ...commandArgs];
|
|
84
|
+
if (command === "stats")
|
|
85
|
+
await stats();
|
|
86
|
+
else
|
|
87
|
+
await preview();
|
|
88
|
+
}
|
|
89
|
+
// Only run as a CLI when invoked directly (bin or `tsx src/cli.ts`), not when
|
|
90
|
+
// imported by a test. Comparing argv[1] to this module's path keeps `main`
|
|
91
|
+
// from firing — and calling process.exit — on import.
|
|
92
|
+
const invokedAsCli = process.argv[1] === fileURLToPath(import.meta.url);
|
|
93
|
+
if (invokedAsCli) {
|
|
94
|
+
main(process.argv).catch((err) => {
|
|
95
|
+
console.error("Error:", err.message);
|
|
96
|
+
process.exit(1);
|
|
97
|
+
});
|
|
98
|
+
}
|
package/dist/context.d.ts
CHANGED
|
@@ -1,36 +1,57 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
|
|
2
|
-
|
|
2
|
+
import type { StandardSchemaV1 } from "./schema";
|
|
3
|
+
/**
|
|
4
|
+
* Builds a scene. Generic over `T`, the agent's native value type, so the
|
|
5
|
+
* known fields hand a typed value to the assertion callback:
|
|
6
|
+
* - `"value"` / `"response"` → `T`
|
|
7
|
+
* - `"text"` → `string`
|
|
8
|
+
* - `"refusal"` → `boolean | undefined`
|
|
9
|
+
* - any dot-path / other → `any` (a string field can't be typed)
|
|
10
|
+
* `T` flows in from a schema-typed `agent()` via the scene fn passed to its
|
|
11
|
+
* callback. The free `scene()` import stays `SceneBuilder<string>`.
|
|
12
|
+
*/
|
|
13
|
+
export declare class SceneBuilder<T = string> {
|
|
3
14
|
private _prompt;
|
|
4
15
|
private _assertions;
|
|
5
16
|
private _timeout?;
|
|
6
17
|
private _turns?;
|
|
7
18
|
private _runs?;
|
|
8
19
|
private _suite?;
|
|
20
|
+
private _schema?;
|
|
9
21
|
constructor(_prompt: string);
|
|
10
|
-
timeout(ms: number):
|
|
11
|
-
turns(n: number):
|
|
12
|
-
runs(n: number):
|
|
22
|
+
timeout(ms: number): this;
|
|
23
|
+
turns(n: number): this;
|
|
24
|
+
runs(n: number): this;
|
|
13
25
|
/** @internal */
|
|
14
26
|
_setSuite(name: string): void;
|
|
15
|
-
expect(field:
|
|
27
|
+
expect(field: "value" | "response", fn: (value: T) => void): this;
|
|
28
|
+
expect(field: "text", fn: (value: string) => void): this;
|
|
29
|
+
expect(field: "refusal", fn: (value: boolean | undefined) => void): this;
|
|
30
|
+
expect(field: string, fn: (value: any) => void): this;
|
|
31
|
+
/**
|
|
32
|
+
* Validate this scene's native value against a Standard Schema before user
|
|
33
|
+
* assertions run. Overrides any schema declared on the agent.
|
|
34
|
+
*/
|
|
35
|
+
expectSchema(schema: StandardSchemaV1): this;
|
|
16
36
|
toDefinition(): SceneDefinition;
|
|
17
37
|
}
|
|
18
|
-
export declare class AgentContext {
|
|
38
|
+
export declare class AgentContext<T = string> {
|
|
19
39
|
private _executor;
|
|
20
40
|
private _name?;
|
|
41
|
+
private _schema?;
|
|
21
42
|
private _scenes;
|
|
22
43
|
private _currentSuite?;
|
|
23
44
|
private _beforeAllHooks;
|
|
24
45
|
private _afterAllHooks;
|
|
25
46
|
private _beforeEachHooks;
|
|
26
47
|
private _afterEachHooks;
|
|
27
|
-
constructor(_executor: AgentExecutor
|
|
48
|
+
constructor(_executor: AgentExecutor<T>, _name?: string | undefined, _schema?: StandardSchemaV1 | undefined);
|
|
28
49
|
registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
|
|
29
50
|
setSuite(name: string): void;
|
|
30
51
|
clearSuite(): void;
|
|
31
|
-
registerScene(prompt: string): SceneBuilder
|
|
32
|
-
execute(): Promise<AgentReport
|
|
52
|
+
registerScene(prompt: string): SceneBuilder<T>;
|
|
53
|
+
execute(): Promise<AgentReport<T>>;
|
|
33
54
|
}
|
|
34
55
|
export declare function hashPromptOnly(prompt: string): string;
|
|
35
|
-
export declare function setContext(ctx: AgentContext | null): void;
|
|
36
|
-
export declare function getContext(): AgentContext
|
|
56
|
+
export declare function setContext(ctx: AgentContext<any> | null): void;
|
|
57
|
+
export declare function getContext(): AgentContext<any>;
|