@sebastiantuyu/agest 0.3.3-next.7 → 0.3.3-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -0
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +1 -1
- package/dist/adapters/tracing.js +1 -0
- package/dist/assertions.d.ts +49 -2
- package/dist/assertions.js +114 -33
- package/dist/context.d.ts +5 -5
- package/dist/context.js +5 -1
- package/dist/index.d.ts +1 -1
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/pricing/index.d.ts +9 -0
- package/dist/pricing/index.js +7 -1
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +7 -4
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +26 -4
- package/dist/types.d.ts +37 -9
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -53,6 +53,84 @@ agent:
|
|
|
53
53
|
average_output_tokens_per_case: 34
|
|
54
54
|
```
|
|
55
55
|
|
|
56
|
+
## Assertions
|
|
57
|
+
|
|
58
|
+
Each scene asserts on a **field** of the agent's response via `.expect(field, fn)`,
|
|
59
|
+
and inside the callback you chain a matcher off `expect(value).toBe`.
|
|
60
|
+
|
|
61
|
+
### Structured responses
|
|
62
|
+
|
|
63
|
+
An executor returns a native `value` (the source of truth for structural
|
|
64
|
+
matchers) and/or a `text` projection (for the LLM judge and text matchers):
|
|
65
|
+
|
|
66
|
+
```typescript
|
|
67
|
+
// chat agent — a string is both value and text
|
|
68
|
+
return { text: "Bonjour" };
|
|
69
|
+
|
|
70
|
+
// structured agent — a native object, optionally with an enriched text view
|
|
71
|
+
return { value: { plan_items: [{ step: "search" }] } };
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Selecting a field
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
scene("Plan a trip to Tokyo")
|
|
78
|
+
.expect("value", (v) => expect(v).toBe.containingSubset({ plan_items: [{ step: "book_flight" }] }))
|
|
79
|
+
.expect("plan_items.0.step", (s) => expect(s).toBe.equalTo("book_flight")) // dot-path into the value
|
|
80
|
+
.expect("text", (t) => expect(t).toBe.containingText("Tokyo")); // serialized/judge view
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
- `"response"` / `"value"` — the native value (objects stay objects; never stringified)
|
|
84
|
+
- `"text"` — the serialized/enriched text view (lazy: a string passes through, else JSON)
|
|
85
|
+
- `"refusal"` / `"metadata"` — the corresponding response properties
|
|
86
|
+
- any **dot-path** (e.g. `"plan_items.0.options"`) — navigates into the value, falling back to metadata
|
|
87
|
+
|
|
88
|
+
### Matchers
|
|
89
|
+
|
|
90
|
+
**Refusal**
|
|
91
|
+
|
|
92
|
+
| Matcher | Asserts |
|
|
93
|
+
| --- | --- |
|
|
94
|
+
| `refusal()` | the agent refused |
|
|
95
|
+
| `notRefusal()` | the agent did **not** refuse |
|
|
96
|
+
|
|
97
|
+
**Text** — substring / regex over a string value (or the serialized form of a non-string). Case-insensitive by default.
|
|
98
|
+
|
|
99
|
+
| Matcher | Asserts |
|
|
100
|
+
| --- | --- |
|
|
101
|
+
| `containingText(text, { caseSensitive? })` | `text` appears as a substring |
|
|
102
|
+
| `notContainingText(text, { caseSensitive? })` | `text` does **not** appear — handy for leak/PII guards |
|
|
103
|
+
| `matchingPattern(regex)` | the text matches `regex` |
|
|
104
|
+
|
|
105
|
+
**Structural** — operate on the native value; exact (case-sensitive) at the leaves.
|
|
106
|
+
|
|
107
|
+
| Matcher | Asserts |
|
|
108
|
+
| --- | --- |
|
|
109
|
+
| `equalTo(expected)` | deep structural equality (NaN / Date / ±0 correct) |
|
|
110
|
+
| `notEqualTo(expected)` | deep structural **inequality** |
|
|
111
|
+
| `containingItem(item)` | value is an array containing `item` as an **exact** element |
|
|
112
|
+
| `containingSubset(subset)` | `subset` is a recursive **partial** match — object key/value subset, or array sub-multiset membership |
|
|
113
|
+
| `ofLength(n)` | array/string has length `n` |
|
|
114
|
+
|
|
115
|
+
**Custom & judged**
|
|
116
|
+
|
|
117
|
+
| Matcher | Asserts |
|
|
118
|
+
| --- | --- |
|
|
119
|
+
| `satisfying(predicate, message?)` | a deterministic predicate over the value holds (use for any negative not covered above) |
|
|
120
|
+
| `judgedBy({ criteria, failWhen })` | an LLM judge resolves the criteria (fuzzy + paid) |
|
|
121
|
+
|
|
122
|
+
```typescript
|
|
123
|
+
expect(items).toBe.ofLength(3);
|
|
124
|
+
expect(results).toBe.containingItem({ id: 7, status: "ok" }); // exact element
|
|
125
|
+
expect(plan).toBe.containingSubset({ user: { id: 1 } }); // partial, nested
|
|
126
|
+
expect(response).toBe.notContainingText("api_key"); // leak guard
|
|
127
|
+
expect(score).toBe.satisfying((s) => s >= 0.8, "score too low");
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
> Use `containingItem` for exact array membership and `containingSubset` for
|
|
131
|
+
> partial matching — strictness is chosen by the matcher name. For free-text
|
|
132
|
+
> search over a structured value, assert on the `"text"` field.
|
|
133
|
+
|
|
56
134
|
Generate a very interesting report with multiple runs!:
|
|
57
135
|
|
|
58
136
|
```
|
package/dist/adapters/remote.js
CHANGED
package/dist/adapters/tracing.js
CHANGED
package/dist/assertions.d.ts
CHANGED
|
@@ -5,10 +5,57 @@ export interface PendingJudgement {
|
|
|
5
5
|
}
|
|
6
6
|
export declare function collectPendingJudgements(): PendingJudgement[];
|
|
7
7
|
export interface AgentMatchers {
|
|
8
|
+
/** Assert the agent refused. */
|
|
8
9
|
refusal(): void;
|
|
10
|
+
/** Assert the agent did NOT refuse. */
|
|
9
11
|
notRefusal(): void;
|
|
10
|
-
|
|
11
|
-
|
|
12
|
+
/**
|
|
13
|
+
* Text containment: `text` appears as a substring. For a non-string value the
|
|
14
|
+
* serialized form is searched. Case-INsensitive by default; pass
|
|
15
|
+
* `{ caseSensitive: true }` for an exact substring.
|
|
16
|
+
*/
|
|
17
|
+
containingText(text: string | number, opts?: {
|
|
18
|
+
caseSensitive?: boolean;
|
|
19
|
+
}): void;
|
|
20
|
+
/** Assert text containment does NOT hold. See {@link containingText}. */
|
|
21
|
+
notContainingText(text: string | number, opts?: {
|
|
22
|
+
caseSensitive?: boolean;
|
|
23
|
+
}): void;
|
|
24
|
+
/**
|
|
25
|
+
* Array membership: the value is an array containing `item` as an EXACT
|
|
26
|
+
* (deep-equal) element. Throws if the value is not an array. Use
|
|
27
|
+
* {@link containingSubset} when you want partial element matching.
|
|
28
|
+
*/
|
|
29
|
+
containingItem(item: unknown): void;
|
|
30
|
+
/**
|
|
31
|
+
* Structural subset: `subset` is recursively contained in the value.
|
|
32
|
+
* - object value + object `subset` → every key in `subset` is present with a
|
|
33
|
+
* recursively-contained value (extra keys allowed).
|
|
34
|
+
* - array value + array `subset` → every `subset` element matches a distinct
|
|
35
|
+
* element of the value (partial element matching, order-independent).
|
|
36
|
+
*
|
|
37
|
+
* Exact at the leaves (case-sensitive). Throws if the value is not an
|
|
38
|
+
* object/array, or `subset` is not an object/array.
|
|
39
|
+
*/
|
|
40
|
+
containingSubset(subset: object): void;
|
|
41
|
+
/** Assert the serialized text view matches `pattern`. */
|
|
42
|
+
matchingPattern(pattern: RegExp): void;
|
|
43
|
+
/** Deep structural equality against the native value. */
|
|
44
|
+
equalTo(expected: unknown): void;
|
|
45
|
+
/** Assert deep structural INequality against the native value. */
|
|
46
|
+
notEqualTo(expected: unknown): void;
|
|
47
|
+
/** Assert the value (array/string) has length `n`. */
|
|
48
|
+
ofLength(n: number): void;
|
|
49
|
+
/**
|
|
50
|
+
* Escape hatch for anything not covered by a named matcher: a predicate over
|
|
51
|
+
* the native value. Stays deterministic — use it to express negatives too,
|
|
52
|
+
* e.g. `satisfying((v) => !v.includes("secret"))`.
|
|
53
|
+
*/
|
|
54
|
+
satisfying(predicate: (value: any) => boolean, message?: string): void;
|
|
55
|
+
/**
|
|
56
|
+
* Queue an LLM-judged assertion, resolved asynchronously by the runner.
|
|
57
|
+
* Fuzzy + paid (express the negative in `failWhen`).
|
|
58
|
+
*/
|
|
12
59
|
judgedBy(criteria: JudgeCriteria): void;
|
|
13
60
|
}
|
|
14
61
|
export interface AgentExpectation {
|
package/dist/assertions.js
CHANGED
|
@@ -1,46 +1,127 @@
|
|
|
1
|
+
import { isDeepStrictEqual } from "node:util";
|
|
1
2
|
import { isRefusal } from "./refusal";
|
|
3
|
+
import { serializeValue } from "./resolve";
|
|
4
|
+
import { isObjectLike, isPlainObject, structuralContains } from "./match";
|
|
2
5
|
let pendingJudgements = [];
|
|
3
6
|
export function collectPendingJudgements() {
|
|
4
7
|
const collected = pendingJudgements;
|
|
5
8
|
pendingJudgements = [];
|
|
6
9
|
return collected;
|
|
7
10
|
}
|
|
11
|
+
/**
|
|
12
|
+
* 100-char preview for error messages. Uses COMPACT JSON for objects (the
|
|
13
|
+
* judge-facing `serializeValue` pretty-prints; error previews stay terse and
|
|
14
|
+
* match the library's original contract).
|
|
15
|
+
*/
|
|
16
|
+
function preview(value) {
|
|
17
|
+
let s;
|
|
18
|
+
if (typeof value === "string") {
|
|
19
|
+
s = value;
|
|
20
|
+
}
|
|
21
|
+
else {
|
|
22
|
+
try {
|
|
23
|
+
s = JSON.stringify(value);
|
|
24
|
+
}
|
|
25
|
+
catch {
|
|
26
|
+
s = String(value);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
return s.slice(0, 100);
|
|
30
|
+
}
|
|
31
|
+
/** Compact one-line form for an inline needle/expected in an error message. */
|
|
32
|
+
function compact(value) {
|
|
33
|
+
try {
|
|
34
|
+
return typeof value === "string" ? value : JSON.stringify(value);
|
|
35
|
+
}
|
|
36
|
+
catch {
|
|
37
|
+
return String(value);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
/** Human-readable type label for diagnostics (e.g. "a number", "an array"). */
|
|
41
|
+
function describeType(value) {
|
|
42
|
+
if (value === null)
|
|
43
|
+
return "null";
|
|
44
|
+
if (Array.isArray(value))
|
|
45
|
+
return "an array";
|
|
46
|
+
return `a ${typeof value}`;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Substring search shared by `containingText` / `notContainingText`. A string
|
|
50
|
+
* value is searched directly; anything else via its serialized form.
|
|
51
|
+
* Case-insensitive unless `caseSensitive` is set.
|
|
52
|
+
*/
|
|
53
|
+
function textContains(value, text, opts) {
|
|
54
|
+
const actual = typeof value === "string" ? value : serializeValue(value);
|
|
55
|
+
const needle = String(text);
|
|
56
|
+
const hit = opts?.caseSensitive
|
|
57
|
+
? actual.includes(needle)
|
|
58
|
+
: actual.toLowerCase().includes(needle.toLowerCase());
|
|
59
|
+
return { actual, hit };
|
|
60
|
+
}
|
|
61
|
+
function makeMatchers(value) {
|
|
62
|
+
const assert = (cond, message) => {
|
|
63
|
+
if (!cond)
|
|
64
|
+
throw new Error(message);
|
|
65
|
+
};
|
|
66
|
+
return {
|
|
67
|
+
refusal() {
|
|
68
|
+
assert(isRefusal(value), `Expected a refusal but got: "${preview(value)}"`);
|
|
69
|
+
},
|
|
70
|
+
notRefusal() {
|
|
71
|
+
assert(!isRefusal(value), `Expected a non-refusal response but got: "${preview(value)}"`);
|
|
72
|
+
},
|
|
73
|
+
containingText(text, opts) {
|
|
74
|
+
const { actual, hit } = textContains(value, text, opts);
|
|
75
|
+
assert(hit, `Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
76
|
+
},
|
|
77
|
+
notContainingText(text, opts) {
|
|
78
|
+
const { actual, hit } = textContains(value, text, opts);
|
|
79
|
+
assert(!hit, `Expected response NOT to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
80
|
+
},
|
|
81
|
+
containingItem(item) {
|
|
82
|
+
if (!Array.isArray(value)) {
|
|
83
|
+
throw new Error(`containingItem() expects an array value but got ${describeType(value)}. ` +
|
|
84
|
+
`Use containingText() for substrings or containingSubset() for objects.`);
|
|
85
|
+
}
|
|
86
|
+
assert(value.some((el) => isDeepStrictEqual(el, item)), `Expected array to contain item ${compact(item)} but it did not (got ${preview(value)})`);
|
|
87
|
+
},
|
|
88
|
+
containingSubset(subset) {
|
|
89
|
+
if (!Array.isArray(value) && !isObjectLike(value)) {
|
|
90
|
+
throw new Error(`containingSubset() expects an object or array value but got ${describeType(value)}.`);
|
|
91
|
+
}
|
|
92
|
+
if (!Array.isArray(subset) && !isPlainObject(subset)) {
|
|
93
|
+
throw new Error(`containingSubset() expects an object or array subset but got ${describeType(subset)}.`);
|
|
94
|
+
}
|
|
95
|
+
assert(structuralContains(value, subset), `Expected value to contain subset ${compact(subset)} but it did not (got ${preview(value)})`);
|
|
96
|
+
},
|
|
97
|
+
matchingPattern(pattern) {
|
|
98
|
+
const actual = typeof value === "string" ? value : serializeValue(value);
|
|
99
|
+
assert(pattern.test(actual), `Expected response to match ${pattern} but got: "${actual.slice(0, 100)}"`);
|
|
100
|
+
},
|
|
101
|
+
equalTo(expected) {
|
|
102
|
+
assert(isDeepStrictEqual(value, expected), `Expected value to equal ${compact(expected)} but got ${preview(value)}`);
|
|
103
|
+
},
|
|
104
|
+
notEqualTo(expected) {
|
|
105
|
+
assert(!isDeepStrictEqual(value, expected), `Expected value NOT to equal ${compact(expected)} but it did`);
|
|
106
|
+
},
|
|
107
|
+
ofLength(n) {
|
|
108
|
+
const len = typeof value === "string" || Array.isArray(value)
|
|
109
|
+
? value.length
|
|
110
|
+
: NaN;
|
|
111
|
+
assert(len === n, `Expected length ${n} but got ${Number.isNaN(len) ? "a non-measurable value" : len}`);
|
|
112
|
+
},
|
|
113
|
+
satisfying(predicate, message) {
|
|
114
|
+
assert(Boolean(predicate(value)), message ?? `Predicate failed for value: "${preview(value)}"`);
|
|
115
|
+
},
|
|
116
|
+
judgedBy(criteria) {
|
|
117
|
+
pendingJudgements.push({ value, criteria });
|
|
118
|
+
},
|
|
119
|
+
};
|
|
120
|
+
}
|
|
8
121
|
export function expect(value) {
|
|
9
122
|
return {
|
|
10
123
|
get toBe() {
|
|
11
|
-
return
|
|
12
|
-
refusal() {
|
|
13
|
-
if (!isRefusal(value)) {
|
|
14
|
-
const preview = typeof value === "string"
|
|
15
|
-
? value.slice(0, 100)
|
|
16
|
-
: JSON.stringify(value).slice(0, 100);
|
|
17
|
-
throw new Error(`Expected a refusal but got: "${preview}"`);
|
|
18
|
-
}
|
|
19
|
-
},
|
|
20
|
-
notRefusal() {
|
|
21
|
-
if (isRefusal(value)) {
|
|
22
|
-
const preview = typeof value === "string"
|
|
23
|
-
? value.slice(0, 100)
|
|
24
|
-
: JSON.stringify(value).slice(0, 100);
|
|
25
|
-
throw new Error(`Expected a non-refusal response but got: "${preview}"`);
|
|
26
|
-
}
|
|
27
|
-
},
|
|
28
|
-
containing(text) {
|
|
29
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
30
|
-
if (!actual.toLowerCase().includes(text.toLowerCase())) {
|
|
31
|
-
throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
32
|
-
}
|
|
33
|
-
},
|
|
34
|
-
matchingPattern(regex) {
|
|
35
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
36
|
-
if (!regex.test(actual)) {
|
|
37
|
-
throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
|
|
38
|
-
}
|
|
39
|
-
},
|
|
40
|
-
judgedBy(criteria) {
|
|
41
|
-
pendingJudgements.push({ value, criteria });
|
|
42
|
-
},
|
|
43
|
-
};
|
|
124
|
+
return makeMatchers(value);
|
|
44
125
|
},
|
|
45
126
|
};
|
|
46
127
|
}
|
package/dist/context.d.ts
CHANGED
|
@@ -15,7 +15,7 @@ export declare class SceneBuilder {
|
|
|
15
15
|
expect(field: string, fn: (value: any) => void): SceneBuilder;
|
|
16
16
|
toDefinition(): SceneDefinition;
|
|
17
17
|
}
|
|
18
|
-
export declare class AgentContext {
|
|
18
|
+
export declare class AgentContext<T = string> {
|
|
19
19
|
private _executor;
|
|
20
20
|
private _name?;
|
|
21
21
|
private _scenes;
|
|
@@ -24,13 +24,13 @@ export declare class AgentContext {
|
|
|
24
24
|
private _afterAllHooks;
|
|
25
25
|
private _beforeEachHooks;
|
|
26
26
|
private _afterEachHooks;
|
|
27
|
-
constructor(_executor: AgentExecutor
|
|
27
|
+
constructor(_executor: AgentExecutor<T>, _name?: string | undefined);
|
|
28
28
|
registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
|
|
29
29
|
setSuite(name: string): void;
|
|
30
30
|
clearSuite(): void;
|
|
31
31
|
registerScene(prompt: string): SceneBuilder;
|
|
32
|
-
execute(): Promise<AgentReport
|
|
32
|
+
execute(): Promise<AgentReport<T>>;
|
|
33
33
|
}
|
|
34
34
|
export declare function hashPromptOnly(prompt: string): string;
|
|
35
|
-
export declare function setContext(ctx: AgentContext | null): void;
|
|
36
|
-
export declare function getContext(): AgentContext
|
|
35
|
+
export declare function setContext(ctx: AgentContext<any> | null): void;
|
|
36
|
+
export declare function getContext(): AgentContext<any>;
|
package/dist/context.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { createHash } from "crypto";
|
|
2
2
|
import { executeScene } from "./runner";
|
|
3
|
+
import { resolveText } from "./resolve";
|
|
3
4
|
import { formatReport, writeReport, writeDiffEntry } from "./reporter";
|
|
4
5
|
import { logger, c } from "./logger";
|
|
5
6
|
import { loadConfig } from "./config";
|
|
@@ -142,7 +143,7 @@ export class AgentContext {
|
|
|
142
143
|
logger.info(line);
|
|
143
144
|
}
|
|
144
145
|
}
|
|
145
|
-
logger.debug(`${indent} response: ${result.response.
|
|
146
|
+
logger.debug(`${indent} response: ${resolveText(result.response).slice(0, 120)}`);
|
|
146
147
|
};
|
|
147
148
|
if (hasSuites) {
|
|
148
149
|
// Execute suite by suite — print header once, then run all scenes in that suite
|
|
@@ -256,6 +257,9 @@ function hashPrompt(prompt, model) {
|
|
|
256
257
|
export function hashPromptOnly(prompt) {
|
|
257
258
|
return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
|
|
258
259
|
}
|
|
260
|
+
// The active context is a runtime singleton holding an executor of arbitrary
|
|
261
|
+
// value type, so `any` is the honest type for the holder. The generic flows
|
|
262
|
+
// through `agent()` → `AgentContext<T>` → the report at the call site.
|
|
259
263
|
let currentContext = null;
|
|
260
264
|
export function setContext(ctx) {
|
|
261
265
|
currentContext = ctx;
|
package/dist/index.d.ts
CHANGED
|
@@ -21,4 +21,4 @@ export declare function afterEach(fn: HookFn): void;
|
|
|
21
21
|
export declare function suite(name: string, fn: () => void): void;
|
|
22
22
|
/** @internal reset auto-run state between tests */
|
|
23
23
|
export declare function _resetAutoRun(): void;
|
|
24
|
-
export declare function agent(executor: AgentExecutor
|
|
24
|
+
export declare function agent<T = string>(executor: AgentExecutor<T>, fn: () => void, options?: AgentOptions): Promise<AgentReport<T>>;
|
package/dist/match.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural matching primitives for deterministic assertions. Kept in their
|
|
3
|
+
* own module — they are correctness-critical (a wrong result here is a false
|
|
4
|
+
* test pass) and deserve isolated, exhaustive unit tests.
|
|
5
|
+
*/
|
|
6
|
+
/** Any non-null, non-array object — including class instances, Map, Date, etc. */
|
|
7
|
+
export declare function isObjectLike(value: unknown): value is Record<string, unknown>;
|
|
8
|
+
/**
|
|
9
|
+
* A "record" object — a plain `{...}` literal (prototype is Object.prototype or
|
|
10
|
+
* null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
|
|
11
|
+
* compared as opaque leaves rather than recursed into.
|
|
12
|
+
*/
|
|
13
|
+
export declare function isPlainObject(value: unknown): value is Record<string, unknown>;
|
|
14
|
+
/**
|
|
15
|
+
* Recursive containment: is `expected` structurally present within `actual`?
|
|
16
|
+
*
|
|
17
|
+
* - `expected` array → `actual` is an array and the expected elements can be
|
|
18
|
+
* matched one-to-one to DISTINCT actual elements (order-independent
|
|
19
|
+
* multiset/sub-multiset membership — duplicates require distinct matches).
|
|
20
|
+
* - `expected` plain object → `actual` is object-like and every key in
|
|
21
|
+
* `expected` exists in `actual` with a recursively-contained value (extra
|
|
22
|
+
* keys in `actual` are allowed — that is the "partial").
|
|
23
|
+
* - anything else (primitive, Date, Map, RegExp, class instance) → strict
|
|
24
|
+
* deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
|
|
25
|
+
*
|
|
26
|
+
* Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
|
|
27
|
+
*/
|
|
28
|
+
export declare function structuralContains(actual: unknown, expected: unknown): boolean;
|
package/dist/match.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { isDeepStrictEqual } from "node:util";
|
|
2
|
+
/**
|
|
3
|
+
* Structural matching primitives for deterministic assertions. Kept in their
|
|
4
|
+
* own module — they are correctness-critical (a wrong result here is a false
|
|
5
|
+
* test pass) and deserve isolated, exhaustive unit tests.
|
|
6
|
+
*/
|
|
7
|
+
/** Any non-null, non-array object — including class instances, Map, Date, etc. */
|
|
8
|
+
export function isObjectLike(value) {
|
|
9
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* A "record" object — a plain `{...}` literal (prototype is Object.prototype or
|
|
13
|
+
* null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
|
|
14
|
+
* compared as opaque leaves rather than recursed into.
|
|
15
|
+
*/
|
|
16
|
+
export function isPlainObject(value) {
|
|
17
|
+
if (!isObjectLike(value))
|
|
18
|
+
return false;
|
|
19
|
+
const proto = Object.getPrototypeOf(value);
|
|
20
|
+
return proto === Object.prototype || proto === null;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Recursive containment: is `expected` structurally present within `actual`?
|
|
24
|
+
*
|
|
25
|
+
* - `expected` array → `actual` is an array and the expected elements can be
|
|
26
|
+
* matched one-to-one to DISTINCT actual elements (order-independent
|
|
27
|
+
* multiset/sub-multiset membership — duplicates require distinct matches).
|
|
28
|
+
* - `expected` plain object → `actual` is object-like and every key in
|
|
29
|
+
* `expected` exists in `actual` with a recursively-contained value (extra
|
|
30
|
+
* keys in `actual` are allowed — that is the "partial").
|
|
31
|
+
* - anything else (primitive, Date, Map, RegExp, class instance) → strict
|
|
32
|
+
* deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
|
|
33
|
+
*
|
|
34
|
+
* Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
|
|
35
|
+
*/
|
|
36
|
+
export function structuralContains(actual, expected) {
|
|
37
|
+
if (Array.isArray(expected)) {
|
|
38
|
+
if (!Array.isArray(actual))
|
|
39
|
+
return false;
|
|
40
|
+
// Greedy one-to-one matching: each expected element must claim a DISTINCT
|
|
41
|
+
// actual element, so `[1]` does not contain `[1, 1]`.
|
|
42
|
+
const claimed = new Set();
|
|
43
|
+
return expected.every((e) => {
|
|
44
|
+
const idx = actual.findIndex((a, i) => !claimed.has(i) && structuralContains(a, e));
|
|
45
|
+
if (idx === -1)
|
|
46
|
+
return false;
|
|
47
|
+
claimed.add(idx);
|
|
48
|
+
return true;
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
if (isPlainObject(expected)) {
|
|
52
|
+
if (!isObjectLike(actual))
|
|
53
|
+
return false;
|
|
54
|
+
return Object.keys(expected).every((key) => key in actual && structuralContains(actual[key], expected[key]));
|
|
55
|
+
}
|
|
56
|
+
return isDeepStrictEqual(actual, expected);
|
|
57
|
+
}
|
package/dist/pricing/index.d.ts
CHANGED
|
@@ -3,7 +3,14 @@ export interface ModelPrice {
|
|
|
3
3
|
input: number;
|
|
4
4
|
/** USD per 1M output tokens */
|
|
5
5
|
output: number;
|
|
6
|
+
/**
|
|
7
|
+
* USD per 1M cached (prompt-cache-hit) input tokens. When omitted, cached
|
|
8
|
+
* tokens are billed at `DEFAULT_CACHE_MULTIPLIER` × the input rate.
|
|
9
|
+
*/
|
|
10
|
+
cachedInput?: number;
|
|
6
11
|
}
|
|
12
|
+
/** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
|
|
13
|
+
export declare const DEFAULT_CACHE_MULTIPLIER = 0.1;
|
|
7
14
|
export type CostSource = "provider" | "table" | "unavailable";
|
|
8
15
|
export interface CostBreakdown {
|
|
9
16
|
inputUsd?: number;
|
|
@@ -17,6 +24,8 @@ export interface ComputeCostInput {
|
|
|
17
24
|
model?: string;
|
|
18
25
|
inputTokens?: number;
|
|
19
26
|
outputTokens?: number;
|
|
27
|
+
/** Cache-hit input tokens (subset of inputTokens), billed at the cached rate. */
|
|
28
|
+
cachedInputTokens?: number;
|
|
20
29
|
/** USD cost the provider already reported (takes precedence) */
|
|
21
30
|
providerCost?: number;
|
|
22
31
|
}
|
package/dist/pricing/index.js
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { readFileSync } from "fs";
|
|
2
2
|
import { fileURLToPath } from "url";
|
|
3
3
|
import { dirname, join } from "path";
|
|
4
|
+
/** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
|
|
5
|
+
export const DEFAULT_CACHE_MULTIPLIER = 0.1;
|
|
4
6
|
const here = dirname(fileURLToPath(import.meta.url));
|
|
5
7
|
const builtIn = JSON.parse(readFileSync(join(here, "models.json"), "utf-8"));
|
|
6
8
|
let overrides = {};
|
|
@@ -31,7 +33,11 @@ export function computeCost(input) {
|
|
|
31
33
|
const price = lookupPrice(input.model);
|
|
32
34
|
if (!price)
|
|
33
35
|
return { source: "unavailable" };
|
|
34
|
-
const
|
|
36
|
+
const totalInput = input.inputTokens ?? 0;
|
|
37
|
+
const cached = Math.min(input.cachedInputTokens ?? 0, totalInput);
|
|
38
|
+
const uncached = totalInput - cached;
|
|
39
|
+
const cachedRate = price.cachedInput ?? price.input * DEFAULT_CACHE_MULTIPLIER;
|
|
40
|
+
const inputUsd = (uncached / 1_000_000) * price.input + (cached / 1_000_000) * cachedRate;
|
|
35
41
|
const outputUsd = ((input.outputTokens ?? 0) / 1_000_000) * price.output;
|
|
36
42
|
return {
|
|
37
43
|
inputUsd,
|
package/dist/reporter.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import type { AgentReport } from "./types";
|
|
2
|
-
export declare function formatReport(report: AgentReport): string;
|
|
2
|
+
export declare function formatReport(report: AgentReport<unknown>): string;
|
|
3
3
|
export declare function writeReport(content: string, timestamp: string, name?: string, dimensions?: Record<string, string>): Promise<string>;
|
|
4
4
|
export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
|
package/dist/reporter.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { access, mkdir, writeFile } from "fs/promises";
|
|
2
2
|
import { createHash } from "crypto";
|
|
3
3
|
import { join } from "path";
|
|
4
|
+
import { resolveText } from "./resolve";
|
|
4
5
|
export function formatReport(report) {
|
|
5
6
|
const lines = ["agent:"];
|
|
6
7
|
if (report.name)
|
|
@@ -24,8 +25,9 @@ export function formatReport(report) {
|
|
|
24
25
|
lines.push(` reason: "${reason}"`);
|
|
25
26
|
}
|
|
26
27
|
const result = report.results.find((r) => r.prompt === c);
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
const responseText = result ? resolveText(result.response) : "";
|
|
29
|
+
if (responseText) {
|
|
30
|
+
const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
29
31
|
lines.push(` response: "${escaped}"`);
|
|
30
32
|
}
|
|
31
33
|
}
|
|
@@ -51,8 +53,9 @@ export function formatReport(report) {
|
|
|
51
53
|
if (r.error) {
|
|
52
54
|
lines.push(` reason: "${r.error}"`);
|
|
53
55
|
}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
const responseText = resolveText(r.response);
|
|
57
|
+
if (responseText) {
|
|
58
|
+
const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
56
59
|
lines.push(` response: "${escaped}"`);
|
|
57
60
|
}
|
|
58
61
|
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { AgentResponse } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Serialize an arbitrary agent value to the string view the judge model and
|
|
4
|
+
* the text matchers consume. Strings pass through untouched; everything else
|
|
5
|
+
* is JSON. This is the ONLY place a structured value is forced to a string,
|
|
6
|
+
* and it happens lazily — never before a matcher actually needs text.
|
|
7
|
+
*/
|
|
8
|
+
export declare function serializeValue(value: unknown): string;
|
|
9
|
+
/**
|
|
10
|
+
* The agent's native output — the source of truth for deterministic,
|
|
11
|
+
* structural assertions. Tolerates a legacy `{ text }`-only response (no
|
|
12
|
+
* `value`) so executors can migrate incrementally.
|
|
13
|
+
*/
|
|
14
|
+
export declare function resolveValue<T>(response: AgentResponse<T>): T | string | undefined;
|
|
15
|
+
/**
|
|
16
|
+
* The string view for the judge and text matchers. An explicit `text` wins
|
|
17
|
+
* (it's the enriched projection the executor chose to expose); otherwise we
|
|
18
|
+
* serialize `value` on demand.
|
|
19
|
+
*/
|
|
20
|
+
export declare function resolveText<T>(response: AgentResponse<T>): string;
|
|
21
|
+
/**
|
|
22
|
+
* Walk a dot-path (with numeric array indices) into an arbitrary object.
|
|
23
|
+
* Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
|
|
24
|
+
*/
|
|
25
|
+
export declare function navigatePath(root: unknown, path: string): unknown;
|
package/dist/resolve.js
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Serialize an arbitrary agent value to the string view the judge model and
|
|
3
|
+
* the text matchers consume. Strings pass through untouched; everything else
|
|
4
|
+
* is JSON. This is the ONLY place a structured value is forced to a string,
|
|
5
|
+
* and it happens lazily — never before a matcher actually needs text.
|
|
6
|
+
*/
|
|
7
|
+
export function serializeValue(value) {
|
|
8
|
+
if (typeof value === "string")
|
|
9
|
+
return value;
|
|
10
|
+
if (value === null || value === undefined)
|
|
11
|
+
return "";
|
|
12
|
+
try {
|
|
13
|
+
return JSON.stringify(value, null, 2);
|
|
14
|
+
}
|
|
15
|
+
catch {
|
|
16
|
+
return String(value);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* The agent's native output — the source of truth for deterministic,
|
|
21
|
+
* structural assertions. Tolerates a legacy `{ text }`-only response (no
|
|
22
|
+
* `value`) so executors can migrate incrementally.
|
|
23
|
+
*/
|
|
24
|
+
export function resolveValue(response) {
|
|
25
|
+
if (response.value !== undefined)
|
|
26
|
+
return response.value;
|
|
27
|
+
return response.text;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* The string view for the judge and text matchers. An explicit `text` wins
|
|
31
|
+
* (it's the enriched projection the executor chose to expose); otherwise we
|
|
32
|
+
* serialize `value` on demand.
|
|
33
|
+
*/
|
|
34
|
+
export function resolveText(response) {
|
|
35
|
+
if (typeof response.text === "string")
|
|
36
|
+
return response.text;
|
|
37
|
+
return serializeValue(response.value);
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Walk a dot-path (with numeric array indices) into an arbitrary object.
|
|
41
|
+
* Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
|
|
42
|
+
*/
|
|
43
|
+
export function navigatePath(root, path) {
|
|
44
|
+
let cur = root;
|
|
45
|
+
for (const seg of path.split(".")) {
|
|
46
|
+
if (cur == null)
|
|
47
|
+
return undefined;
|
|
48
|
+
if (Array.isArray(cur)) {
|
|
49
|
+
const idx = Number(seg);
|
|
50
|
+
if (!Number.isInteger(idx))
|
|
51
|
+
return undefined;
|
|
52
|
+
cur = cur[idx];
|
|
53
|
+
}
|
|
54
|
+
else if (typeof cur === "object" && seg in cur) {
|
|
55
|
+
cur = cur[seg];
|
|
56
|
+
}
|
|
57
|
+
else {
|
|
58
|
+
return undefined;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
return cur;
|
|
62
|
+
}
|
package/dist/runner.d.ts
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
|
|
2
2
|
import type { JudgeConfig } from "./config";
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Extract a named field from an agent response for assertion.
|
|
5
|
+
* - "response" / "value" → the native structured value (deterministic matchers)
|
|
6
|
+
* - "text" → the serialized/judge view (lazy; text matchers)
|
|
7
|
+
* - "metadata"/"refusal" → the corresponding response property
|
|
8
|
+
* - dot-path → navigated into the structured value first
|
|
9
|
+
* (e.g. "plan_items.0.options"), falling back to
|
|
10
|
+
* metadata so existing metadata paths keep resolving.
|
|
11
|
+
*/
|
|
12
|
+
export declare function extractField<T>(response: AgentResponse<T>, field: string): unknown;
|
|
13
|
+
export declare function executeScene<T = string>(executor: AgentExecutor<T>, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult<T>>;
|
package/dist/runner.js
CHANGED
|
@@ -1,16 +1,33 @@
|
|
|
1
1
|
import { collectPendingJudgements } from "./assertions";
|
|
2
2
|
import { callJudge, resolveJudgeExecutor } from "./judge";
|
|
3
|
+
import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
|
|
3
4
|
const DEFAULT_SCENE_TIMEOUT = 10_000;
|
|
5
|
+
/**
 * Look up the part of an agent response that an assertion targets.
 *
 * Reserved field names:
 * - "response" / "value" → whatever `resolveValue(response)` returns
 * - "text"               → whatever `resolveText(response)` returns
 * - "metadata"           → `response.metadata`
 * - "refusal"            → `response.refusal`
 *
 * Any other name is treated as a dot-path (e.g. "plan_items.0.options"). It
 * is navigated into the resolved value first; only when that yields
 * `undefined` is the same path tried against `response.metadata` (an empty
 * object when metadata is absent).
 *
 * @param response - The agent response to read from.
 * @param field - A reserved field name or a dot-path.
 * @returns The extracted value, or `undefined` when nothing matches.
 */
export function extractField(response, field) {
    if (field === "response" || field === "value")
        return resolveValue(response);
    if (field === "text")
        return resolveText(response);
    if (field === "metadata")
        return response.metadata;
    if (field === "refusal")
        return response.refusal;
    // Not a reserved name: value takes precedence, metadata is the fallback.
    const viaValue = navigatePath(resolveValue(response), field);
    return viaValue !== undefined
        ? viaValue
        : navigatePath(response.metadata ?? {}, field);
}
|
|
16
33
|
/**
|
|
@@ -31,6 +48,9 @@ function wilsonSignificance(passes, total) {
|
|
|
31
48
|
return Math.max(0, Math.min(1, lower));
|
|
32
49
|
}
|
|
33
50
|
async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
|
|
51
|
+
// The empty sentinel uses the `text` branch of the union so it is a valid
|
|
52
|
+
// AgentResponse<T> for ANY T (there is no native value yet — the executor
|
|
53
|
+
// hasn't run). Using `{ value: "" }` would wrongly assume T = string.
|
|
34
54
|
let response = { text: "" };
|
|
35
55
|
let duration;
|
|
36
56
|
try {
|
|
@@ -91,7 +111,9 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
|
|
|
91
111
|
const judgeExecutor = resolveJudgeExecutor(judgeConfig);
|
|
92
112
|
for (const p of pending) {
|
|
93
113
|
try {
|
|
94
|
-
|
|
114
|
+
// Hand the judge the serialized text view — NOT String(value),
|
|
115
|
+
// which would render a structured value as "[object Object]".
|
|
116
|
+
const result = await callJudge(serializeValue(p.value), p.criteria, judgeExecutor);
|
|
95
117
|
judgement = result;
|
|
96
118
|
if (result.verdict === "fail" || result.verdict === "partial") {
|
|
97
119
|
passed = false;
|
package/dist/types.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export interface ExecutorOptions {
|
|
2
2
|
signal?: AbortSignal;
|
|
3
3
|
}
|
|
4
|
-
export type AgentExecutor = (input: string, options?: ExecutorOptions) => Promise<AgentResponse
|
|
4
|
+
export type AgentExecutor<T = string> = (input: string, options?: ExecutorOptions) => Promise<AgentResponse<T>>;
|
|
5
5
|
export type CostSource = "provider" | "table" | "unavailable";
|
|
6
6
|
export interface CostBreakdown {
|
|
7
7
|
inputUsd?: number;
|
|
@@ -28,8 +28,35 @@ export interface TimelineEvent {
|
|
|
28
28
|
runIndex?: number;
|
|
29
29
|
error?: string;
|
|
30
30
|
}
|
|
31
|
-
|
|
31
|
+
/**
|
|
32
|
+
* The result an executor hands back. AT LEAST ONE of `value` / `text` is
|
|
33
|
+
* required (both may be present); the rest are optional.
|
|
34
|
+
*
|
|
35
|
+
* `value` is the agent's NATIVE output and the source of truth for
|
|
36
|
+
* deterministic, structural assertions — a string for a chat agent, an object
|
|
37
|
+
* for a structured agent (a plan, a tool-call payload, parsed JSON). It is
|
|
38
|
+
* never coerced to a string before a matcher asks for text.
|
|
39
|
+
*
|
|
40
|
+
* `text` is a pre-serialized projection for the judge model and the text
|
|
41
|
+
* matchers (`containing`, `matchingPattern`, `refusal`). A string-producing
|
|
42
|
+
* agent can return ONLY `text` (the legacy/common case) — it is then also used
|
|
43
|
+
* as `value`. A structured agent returns `value` and, optionally, an enriched
|
|
44
|
+
* `text` when the judge needs a view the raw value can't give cheaply (e.g.
|
|
45
|
+
* resolving opaque ids to names). When `text` is omitted, agest serializes
|
|
46
|
+
* `value` lazily (string passthrough, else JSON). See `resolve.ts`.
|
|
47
|
+
*
|
|
48
|
+
* The generic defaults to `string`, so the common chat case stays
|
|
49
|
+
* `{ text: "..." }` or `{ value: "..." }` with no type ceremony.
|
|
50
|
+
*/
|
|
51
|
+
export type AgentResponse<T = string> = AgentResponseBase<T> & ({
|
|
52
|
+
value: T;
|
|
53
|
+
} | {
|
|
32
54
|
text: string;
|
|
55
|
+
});
|
|
56
|
+
interface AgentResponseBase<T = string> {
|
|
57
|
+
value?: T;
|
|
58
|
+
/** Pre-serialized view for the judge / text matchers. */
|
|
59
|
+
text?: string;
|
|
33
60
|
refusal?: boolean;
|
|
34
61
|
executionError?: string;
|
|
35
62
|
metadata?: {
|
|
@@ -63,22 +90,22 @@ export interface JudgeResult {
|
|
|
63
90
|
reasoning: string;
|
|
64
91
|
criteria: string;
|
|
65
92
|
}
|
|
66
|
-
export interface RunResult {
|
|
93
|
+
export interface RunResult<T = string> {
|
|
67
94
|
passed: boolean;
|
|
68
95
|
error?: string;
|
|
69
|
-
response: AgentResponse
|
|
96
|
+
response: AgentResponse<T>;
|
|
70
97
|
duration: number;
|
|
71
98
|
judgement?: JudgeResult;
|
|
72
99
|
}
|
|
73
|
-
export interface SceneResult {
|
|
100
|
+
export interface SceneResult<T = string> {
|
|
74
101
|
prompt: string;
|
|
75
|
-
response: AgentResponse
|
|
102
|
+
response: AgentResponse<T>;
|
|
76
103
|
duration: number;
|
|
77
104
|
passed: boolean;
|
|
78
105
|
error?: string;
|
|
79
106
|
judgement?: JudgeResult;
|
|
80
107
|
suite?: string;
|
|
81
|
-
runs?: RunResult[];
|
|
108
|
+
runs?: RunResult<T>[];
|
|
82
109
|
passRate?: number;
|
|
83
110
|
statisticalSignificance?: number;
|
|
84
111
|
/** Aggregate tokens across all runs of this scene */
|
|
@@ -92,7 +119,7 @@ export interface SceneResult {
|
|
|
92
119
|
/** Ordered timeline events from every run of the scene */
|
|
93
120
|
events?: TimelineEvent[];
|
|
94
121
|
}
|
|
95
|
-
export interface AgentReport {
|
|
122
|
+
export interface AgentReport<T = string> {
|
|
96
123
|
name?: string;
|
|
97
124
|
model?: string;
|
|
98
125
|
systemPromptHash?: string;
|
|
@@ -110,5 +137,6 @@ export interface AgentReport {
|
|
|
110
137
|
totalInputTokens?: number;
|
|
111
138
|
totalOutputTokens?: number;
|
|
112
139
|
totalCostUsd?: number;
|
|
113
|
-
results: SceneResult[];
|
|
140
|
+
results: SceneResult<T>[];
|
|
114
141
|
}
|
|
142
|
+
export {};
|