@sebastiantuyu/agest 0.3.3-next.7 → 0.3.3-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -53,6 +53,84 @@ agent:
53
53
  average_output_tokens_per_case: 34
54
54
  ```
55
55
 
56
+ ## Assertions
57
+
58
+ Each scene asserts on a **field** of the agent's response via `.expect(field, fn)`,
59
+ and inside the callback you chain a matcher off `expect(value).toBe`.
60
+
61
+ ### Structured responses
62
+
63
+ An executor returns a native `value` (the source of truth for structural
64
+ matchers) and/or a `text` projection (for the LLM judge and text matchers):
65
+
66
+ ```typescript
67
+ // chat agent — a string is both value and text
68
+ return { text: "Bonjour" };
69
+
70
+ // structured agent — a native object, optionally with an enriched text view
71
+ return { value: { plan_items: [{ step: "search" }] } };
72
+ ```
73
+
74
+ ### Selecting a field
75
+
76
+ ```typescript
77
+ scene("Plan a trip to Tokyo")
78
+ .expect("value", (v) => expect(v).toBe.containingSubset({ plan_items: [{ step: "book_flight" }] }))
79
+ .expect("plan_items.0.step", (s) => expect(s).toBe.equalTo("book_flight")) // dot-path into the value
80
+ .expect("text", (t) => expect(t).toBe.containingText("Tokyo")); // serialized/judge view
81
+ ```
82
+
83
+ - `"response"` / `"value"` — the native value (objects stay objects; never stringified)
84
+ - `"text"` — the serialized/enriched text view (lazy: a string passes through, else JSON)
85
+ - `"refusal"` / `"metadata"` — the corresponding response properties
86
+ - any **dot-path** (e.g. `"plan_items.0.options"`) — navigates into the value, falling back to metadata
87
+
88
+ ### Matchers
89
+
90
+ **Refusal**
91
+
92
+ | Matcher | Asserts |
93
+ | --- | --- |
94
+ | `refusal()` | the agent refused |
95
+ | `notRefusal()` | the agent did **not** refuse |
96
+
97
+ **Text** — substring / regex over a string value (or the serialized form of a non-string). Case-insensitive by default.
98
+
99
+ | Matcher | Asserts |
100
+ | --- | --- |
101
+ | `containingText(text, { caseSensitive? })` | `text` appears as a substring |
102
+ | `notContainingText(text, { caseSensitive? })` | `text` does **not** appear — handy for leak/PII guards |
103
+ | `matchingPattern(regex)` | the text matches `regex` |
104
+
105
+ **Structural** — operate on the native value; exact (case-sensitive) at the leaves.
106
+
107
+ | Matcher | Asserts |
108
+ | --- | --- |
109
+ | `equalTo(expected)` | deep structural equality (NaN / Date / ±0 correct) |
110
+ | `notEqualTo(expected)` | deep structural **inequality** |
111
+ | `containingItem(item)` | value is an array containing `item` as an **exact** element |
112
+ | `containingSubset(subset)` | `subset` is a recursive **partial** match — object key/value subset, or array sub-multiset membership |
113
+ | `ofLength(n)` | array/string has length `n` |
114
+
115
+ **Custom & judged**
116
+
117
+ | Matcher | Asserts |
118
+ | --- | --- |
119
+ | `satisfying(predicate, message?)` | a deterministic predicate over the value holds (use for any negative not covered above) |
120
+ | `judgedBy({ criteria, failWhen })` | an LLM judge resolves the criteria (fuzzy + paid) |
121
+
122
+ ```typescript
123
+ expect(items).toBe.ofLength(3);
124
+ expect(results).toBe.containingItem({ id: 7, status: "ok" }); // exact element
125
+ expect(plan).toBe.containingSubset({ user: { id: 1 } }); // partial, nested
126
+ expect(response).toBe.notContainingText("api_key"); // leak guard
127
+ expect(score).toBe.satisfying((s) => s >= 0.8, "score too low");
128
+ ```
129
+
130
+ > Use `containingItem` for exact array membership and `containingSubset` for
131
+ > partial matching — strictness is chosen by the matcher name. For free-text
132
+ > search over a structured value, assert on the `"text"` field.
133
+
56
134
  Generate a very interesting report with multiple runs!:
57
135
 
58
136
  ```
@@ -56,7 +56,7 @@ export interface RemoteAdapterOptions {
56
56
  *
57
57
  * await agent(executor, () => {
58
58
  * scene("What is 2+2?").expect("response", (r) => {
59
- * expect(r).toBe.containing("4");
59
+ * expect(r).toBe.containingText("4");
60
60
  * });
61
61
  * });
62
62
  * ```
@@ -17,7 +17,7 @@
17
17
  *
18
18
  * await agent(executor, () => {
19
19
  * scene("What is 2+2?").expect("response", (r) => {
20
- * expect(r).toBe.containing("4");
20
+ * expect(r).toBe.containingText("4");
21
21
  * });
22
22
  * });
23
23
  * ```
@@ -56,6 +56,7 @@ export async function createTracingHandle(baselineMs) {
56
56
  model: name,
57
57
  inputTokens: tokens?.input,
58
58
  outputTokens: tokens?.output,
59
+ cachedInputTokens,
59
60
  providerCost,
60
61
  });
61
62
  events.push({
@@ -5,10 +5,57 @@ export interface PendingJudgement {
5
5
  }
6
6
  export declare function collectPendingJudgements(): PendingJudgement[];
7
7
  export interface AgentMatchers {
8
+ /** Assert the agent refused. */
8
9
  refusal(): void;
10
+ /** Assert the agent did NOT refuse. */
9
11
  notRefusal(): void;
10
- containing(text: string): void;
11
- matchingPattern(regex: RegExp): void;
12
+ /**
13
+ * Text containment: `text` appears as a substring. For a non-string value the
14
+ * serialized form is searched. Case-INsensitive by default; pass
15
+ * `{ caseSensitive: true }` for an exact substring.
16
+ */
17
+ containingText(text: string | number, opts?: {
18
+ caseSensitive?: boolean;
19
+ }): void;
20
+ /** Assert text containment does NOT hold. See {@link containingText}. */
21
+ notContainingText(text: string | number, opts?: {
22
+ caseSensitive?: boolean;
23
+ }): void;
24
+ /**
25
+ * Array membership: the value is an array containing `item` as an EXACT
26
+ * (deep-equal) element. Throws if the value is not an array. Use
27
+ * {@link containingSubset} when you want partial element matching.
28
+ */
29
+ containingItem(item: unknown): void;
30
+ /**
31
+ * Structural subset: `subset` is recursively contained in the value.
32
+ * - object value + object `subset` → every key in `subset` is present with a
33
+ * recursively-contained value (extra keys allowed).
34
+ * - array value + array `subset` → every `subset` element matches a distinct
35
+ * element of the value (partial element matching, order-independent).
36
+ *
37
+ * Exact at the leaves (case-sensitive). Throws if the value is not an
38
+ * object/array, or `subset` is not an object/array.
39
+ */
40
+ containingSubset(subset: object): void;
41
+ /** Assert the serialized text view matches `pattern`. */
42
+ matchingPattern(pattern: RegExp): void;
43
+ /** Deep structural equality against the native value. */
44
+ equalTo(expected: unknown): void;
45
+ /** Assert deep structural INequality against the native value. */
46
+ notEqualTo(expected: unknown): void;
47
+ /** Assert the value (array/string) has length `n`. */
48
+ ofLength(n: number): void;
49
+ /**
50
+ * Escape hatch for anything not covered by a named matcher: a predicate over
51
+ * the native value. Stays deterministic — use it to express negatives too,
52
+ * e.g. `satisfying((v) => !v.includes("secret"))`.
53
+ */
54
+ satisfying(predicate: (value: any) => boolean, message?: string): void;
55
+ /**
56
+ * Queue an LLM-judged assertion, resolved asynchronously by the runner.
57
+ * Fuzzy + paid (express the negative in `failWhen`).
58
+ */
12
59
  judgedBy(criteria: JudgeCriteria): void;
13
60
  }
14
61
  export interface AgentExpectation {
@@ -1,46 +1,127 @@
1
+ import { isDeepStrictEqual } from "node:util";
1
2
  import { isRefusal } from "./refusal";
3
+ import { serializeValue } from "./resolve";
4
+ import { isObjectLike, isPlainObject, structuralContains } from "./match";
2
5
  let pendingJudgements = [];
3
6
  export function collectPendingJudgements() {
4
7
  const collected = pendingJudgements;
5
8
  pendingJudgements = [];
6
9
  return collected;
7
10
  }
11
+ /**
12
+ * 100-char preview for error messages. Uses COMPACT JSON for objects (the
13
+ * judge-facing `serializeValue` pretty-prints; error previews stay terse and
14
+ * match the library's original contract).
15
+ */
16
+ function preview(value) {
17
+ let s;
18
+ if (typeof value === "string") {
19
+ s = value;
20
+ }
21
+ else {
22
+ try {
23
+ s = JSON.stringify(value);
24
+ }
25
+ catch {
26
+ s = String(value);
27
+ }
28
+ }
29
+ return s.slice(0, 100);
30
+ }
31
+ /** Compact one-line form for an inline needle/expected in an error message. */
32
+ function compact(value) {
33
+ try {
34
+ return typeof value === "string" ? value : JSON.stringify(value);
35
+ }
36
+ catch {
37
+ return String(value);
38
+ }
39
+ }
40
/**
 * Human-readable type label for diagnostics, including a grammatical article
 * (e.g. "a number", "an array", "an object"). `null` and `undefined` are
 * reported bare, since "a undefined" reads like a typo in an error message.
 */
function describeType(value) {
    if (value === null)
        return "null";
    if (value === undefined)
        return "undefined";
    if (Array.isArray(value))
        return "an array";
    const kind = typeof value;
    // Of the remaining `typeof` results only "object" needs "an".
    return `${kind === "object" ? "an" : "a"} ${kind}`;
}
48
/**
 * Substring check backing `containingText` / `notContainingText`.
 *
 * The haystack is `value` itself when it is a string, otherwise its
 * `serializeValue` form; the needle is `String(text)`. Both sides are
 * lower-cased before comparing unless `opts.caseSensitive` is truthy.
 *
 * Returns `{ actual, hit }`: the haystack that was searched and whether the
 * needle was found in it.
 */
function textContains(value, text, opts) {
    const actual = typeof value === "string" ? value : serializeValue(value);
    const needle = String(text);
    let hit;
    if (opts?.caseSensitive) {
        hit = actual.includes(needle);
    }
    else {
        hit = actual.toLowerCase().includes(needle.toLowerCase());
    }
    return { actual, hit };
}
61
/**
 * Build the matcher set bound to `value` (what `expect(value).toBe` returns).
 *
 * Every matcher throws an `Error` when its assertion fails, except `judgedBy`,
 * which only queues `{ value, criteria }` on `pendingJudgements`.
 *
 * Failure messages are built lazily: a passing assertion never serializes the
 * value, so it cannot fail (or waste work) while formatting a message nobody
 * will see.
 */
function makeMatchers(value) {
    // `message` is a thunk, evaluated only when the condition does not hold.
    const assert = (cond, message) => {
        if (!cond)
            throw new Error(message());
    };
    return {
        refusal() {
            assert(isRefusal(value), () => `Expected a refusal but got: "${preview(value)}"`);
        },
        notRefusal() {
            assert(!isRefusal(value), () => `Expected a non-refusal response but got: "${preview(value)}"`);
        },
        containingText(text, opts) {
            const { actual, hit } = textContains(value, text, opts);
            assert(hit, () => `Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
        },
        notContainingText(text, opts) {
            const { actual, hit } = textContains(value, text, opts);
            assert(!hit, () => `Expected response NOT to contain "${text}" but got: "${actual.slice(0, 100)}"`);
        },
        containingItem(item) {
            // A non-array value is a usage error, reported separately from a miss.
            if (!Array.isArray(value)) {
                throw new Error(`containingItem() expects an array value but got ${describeType(value)}. ` +
                    `Use containingText() for substrings or containingSubset() for objects.`);
            }
            assert(value.some((el) => isDeepStrictEqual(el, item)), () => `Expected array to contain item ${compact(item)} but it did not (got ${preview(value)})`);
        },
        containingSubset(subset) {
            if (!Array.isArray(value) && !isObjectLike(value)) {
                throw new Error(`containingSubset() expects an object or array value but got ${describeType(value)}.`);
            }
            if (!Array.isArray(subset) && !isPlainObject(subset)) {
                throw new Error(`containingSubset() expects an object or array subset but got ${describeType(subset)}.`);
            }
            assert(structuralContains(value, subset), () => `Expected value to contain subset ${compact(subset)} but it did not (got ${preview(value)})`);
        },
        matchingPattern(pattern) {
            const actual = typeof value === "string" ? value : serializeValue(value);
            // A global/sticky RegExp resumes from its `lastIndex`. Rewind it so a
            // pattern object reused across calls always searches from the start
            // instead of producing alternating pass/fail results.
            pattern.lastIndex = 0;
            assert(pattern.test(actual), () => `Expected response to match ${pattern} but got: "${actual.slice(0, 100)}"`);
        },
        equalTo(expected) {
            assert(isDeepStrictEqual(value, expected), () => `Expected value to equal ${compact(expected)} but got ${preview(value)}`);
        },
        notEqualTo(expected) {
            assert(!isDeepStrictEqual(value, expected), () => `Expected value NOT to equal ${compact(expected)} but it did`);
        },
        ofLength(n) {
            // Only strings and arrays are measurable; anything else yields NaN,
            // which never equals `n` and is reported as non-measurable.
            const len = typeof value === "string" || Array.isArray(value)
                ? value.length
                : NaN;
            assert(len === n, () => `Expected length ${n} but got ${Number.isNaN(len) ? "a non-measurable value" : len}`);
        },
        satisfying(predicate, message) {
            assert(Boolean(predicate(value)), () => message ?? `Predicate failed for value: "${preview(value)}"`);
        },
        judgedBy(criteria) {
            pendingJudgements.push({ value, criteria });
        },
    };
}
8
121
  export function expect(value) {
9
122
  return {
10
123
  get toBe() {
11
- return {
12
- refusal() {
13
- if (!isRefusal(value)) {
14
- const preview = typeof value === "string"
15
- ? value.slice(0, 100)
16
- : JSON.stringify(value).slice(0, 100);
17
- throw new Error(`Expected a refusal but got: "${preview}"`);
18
- }
19
- },
20
- notRefusal() {
21
- if (isRefusal(value)) {
22
- const preview = typeof value === "string"
23
- ? value.slice(0, 100)
24
- : JSON.stringify(value).slice(0, 100);
25
- throw new Error(`Expected a non-refusal response but got: "${preview}"`);
26
- }
27
- },
28
- containing(text) {
29
- const actual = typeof value === "string" ? value : String(value);
30
- if (!actual.toLowerCase().includes(text.toLowerCase())) {
31
- throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
32
- }
33
- },
34
- matchingPattern(regex) {
35
- const actual = typeof value === "string" ? value : String(value);
36
- if (!regex.test(actual)) {
37
- throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
38
- }
39
- },
40
- judgedBy(criteria) {
41
- pendingJudgements.push({ value, criteria });
42
- },
43
- };
124
+ return makeMatchers(value);
44
125
  },
45
126
  };
46
127
  }
package/dist/context.d.ts CHANGED
@@ -15,7 +15,7 @@ export declare class SceneBuilder {
15
15
  expect(field: string, fn: (value: any) => void): SceneBuilder;
16
16
  toDefinition(): SceneDefinition;
17
17
  }
18
- export declare class AgentContext {
18
+ export declare class AgentContext<T = string> {
19
19
  private _executor;
20
20
  private _name?;
21
21
  private _scenes;
@@ -24,13 +24,13 @@ export declare class AgentContext {
24
24
  private _afterAllHooks;
25
25
  private _beforeEachHooks;
26
26
  private _afterEachHooks;
27
- constructor(_executor: AgentExecutor, _name?: string | undefined);
27
+ constructor(_executor: AgentExecutor<T>, _name?: string | undefined);
28
28
  registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
29
29
  setSuite(name: string): void;
30
30
  clearSuite(): void;
31
31
  registerScene(prompt: string): SceneBuilder;
32
- execute(): Promise<AgentReport>;
32
+ execute(): Promise<AgentReport<T>>;
33
33
  }
34
34
  export declare function hashPromptOnly(prompt: string): string;
35
- export declare function setContext(ctx: AgentContext | null): void;
36
- export declare function getContext(): AgentContext;
35
+ export declare function setContext(ctx: AgentContext<any> | null): void;
36
+ export declare function getContext(): AgentContext<any>;
package/dist/context.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import { createHash } from "crypto";
2
2
  import { executeScene } from "./runner";
3
+ import { resolveText } from "./resolve";
3
4
  import { formatReport, writeReport, writeDiffEntry } from "./reporter";
4
5
  import { logger, c } from "./logger";
5
6
  import { loadConfig } from "./config";
@@ -142,7 +143,7 @@ export class AgentContext {
142
143
  logger.info(line);
143
144
  }
144
145
  }
145
- logger.debug(`${indent} response: ${result.response.text?.slice(0, 120)}`);
146
+ logger.debug(`${indent} response: ${resolveText(result.response).slice(0, 120)}`);
146
147
  };
147
148
  if (hasSuites) {
148
149
  // Execute suite by suite — print header once, then run all scenes in that suite
@@ -256,6 +257,9 @@ function hashPrompt(prompt, model) {
256
257
  export function hashPromptOnly(prompt) {
257
258
  return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
258
259
  }
260
+ // The active context is a runtime singleton holding an executor of arbitrary
261
+ // value type, so `any` is the honest type for the holder. The generic flows
262
+ // through `agent()` → `AgentContext<T>` → the report at the call site.
259
263
  let currentContext = null;
260
264
  export function setContext(ctx) {
261
265
  currentContext = ctx;
package/dist/index.d.ts CHANGED
@@ -21,4 +21,4 @@ export declare function afterEach(fn: HookFn): void;
21
21
  export declare function suite(name: string, fn: () => void): void;
22
22
  /** @internal reset auto-run state between tests */
23
23
  export declare function _resetAutoRun(): void;
24
- export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
24
+ export declare function agent<T = string>(executor: AgentExecutor<T>, fn: () => void, options?: AgentOptions): Promise<AgentReport<T>>;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Structural matching primitives for deterministic assertions. Kept in their
3
+ * own module — they are correctness-critical (a wrong result here is a false
4
+ * test pass) and deserve isolated, exhaustive unit tests.
5
+ */
6
+ /** Any non-null, non-array object — including class instances, Map, Date, etc. */
7
+ export declare function isObjectLike(value: unknown): value is Record<string, unknown>;
8
+ /**
9
+ * A "record" object — a plain `{...}` literal (prototype is Object.prototype or
10
+ * null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
11
+ * compared as opaque leaves rather than recursed into.
12
+ */
13
+ export declare function isPlainObject(value: unknown): value is Record<string, unknown>;
14
+ /**
15
+ * Recursive containment: is `expected` structurally present within `actual`?
16
+ *
17
+ * - `expected` array → `actual` is an array and the expected elements can be
18
+ * matched one-to-one to DISTINCT actual elements (order-independent
19
+ * multiset/sub-multiset membership — duplicates require distinct matches).
20
+ * - `expected` plain object → `actual` is object-like and every key in
21
+ * `expected` exists in `actual` with a recursively-contained value (extra
22
+ * keys in `actual` are allowed — that is the "partial").
23
+ * - anything else (primitive, Date, Map, RegExp, class instance) → strict
24
+ * deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
25
+ *
26
+ * Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
27
+ */
28
+ export declare function structuralContains(actual: unknown, expected: unknown): boolean;
package/dist/match.js ADDED
@@ -0,0 +1,57 @@
1
+ import { isDeepStrictEqual } from "node:util";
2
+ /**
3
+ * Structural matching primitives for deterministic assertions. Kept in their
4
+ * own module — they are correctness-critical (a wrong result here is a false
5
+ * test pass) and deserve isolated, exhaustive unit tests.
6
+ */
7
/** Any non-null, non-array object — including class instances, Map, Date, etc. */
export function isObjectLike(value) {
    return typeof value === "object" && value !== null && !Array.isArray(value);
}
/**
 * A "record" object — a plain `{...}` literal (prototype is Object.prototype or
 * null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
 * compared as opaque leaves rather than recursed into.
 */
export function isPlainObject(value) {
    if (!isObjectLike(value))
        return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
}
/**
 * Decide whether every element of `expected` can be paired with a DISTINCT
 * element of `actual` that structurally contains it.
 *
 * Uses augmenting-path bipartite matching rather than first-fit. With partial
 * matching, first-fit can let a loose expected element (`{a:1}`) claim the only
 * actual element that a stricter one (`{a:1,b:2}`) needs, reporting a miss even
 * though a valid assignment exists.
 */
function matchDistinctElements(actual, expected) {
    if (expected.length > actual.length)
        return false;
    // candidates[e] = indices of the actual elements that contain expected[e].
    const candidates = expected.map((e) => {
        const hits = [];
        actual.forEach((a, i) => {
            if (structuralContains(a, e))
                hits.push(i);
        });
        return hits;
    });
    // owner: actual index -> index of the expected element currently holding it.
    const owner = new Map();
    const tryAssign = (e, seen) => {
        for (const i of candidates[e]) {
            if (seen.has(i))
                continue;
            seen.add(i);
            // Take a free slot, or evict the current holder if it can move elsewhere.
            if (!owner.has(i) || tryAssign(owner.get(i), seen)) {
                owner.set(i, e);
                return true;
            }
        }
        return false;
    };
    return expected.every((_, e) => tryAssign(e, new Set()));
}
/**
 * Recursive containment: is `expected` structurally present within `actual`?
 *
 * - `expected` array → `actual` is an array and the expected elements can be
 *   matched one-to-one to DISTINCT actual elements (order-independent
 *   multiset/sub-multiset membership — duplicates require distinct matches,
 *   so `[1]` does not contain `[1, 1]`).
 * - `expected` plain object → `actual` is object-like and every key in
 *   `expected` exists in `actual` with a recursively-contained value (extra
 *   keys in `actual` are allowed — that is the "partial").
 * - anything else (primitive, Date, Map, RegExp, class instance) → strict
 *   deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
 *
 * Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
 */
export function structuralContains(actual, expected) {
    if (Array.isArray(expected)) {
        if (!Array.isArray(actual))
            return false;
        return matchDistinctElements(actual, expected);
    }
    if (isPlainObject(expected)) {
        if (!isObjectLike(actual))
            return false;
        return Object.keys(expected).every((key) => key in actual && structuralContains(actual[key], expected[key]));
    }
    return isDeepStrictEqual(actual, expected);
}
@@ -3,7 +3,14 @@ export interface ModelPrice {
3
3
  input: number;
4
4
  /** USD per 1M output tokens */
5
5
  output: number;
6
+ /**
7
+ * USD per 1M cached (prompt-cache-hit) input tokens. When omitted, cached
8
+ * tokens are billed at `DEFAULT_CACHE_MULTIPLIER` × the input rate.
9
+ */
10
+ cachedInput?: number;
6
11
  }
12
+ /** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
13
+ export declare const DEFAULT_CACHE_MULTIPLIER = 0.1;
7
14
  export type CostSource = "provider" | "table" | "unavailable";
8
15
  export interface CostBreakdown {
9
16
  inputUsd?: number;
@@ -17,6 +24,8 @@ export interface ComputeCostInput {
17
24
  model?: string;
18
25
  inputTokens?: number;
19
26
  outputTokens?: number;
27
+ /** Cache-hit input tokens (subset of inputTokens), billed at the cached rate. */
28
+ cachedInputTokens?: number;
20
29
  /** USD cost the provider already reported (takes precedence) */
21
30
  providerCost?: number;
22
31
  }
@@ -1,6 +1,8 @@
1
1
  import { readFileSync } from "fs";
2
2
  import { fileURLToPath } from "url";
3
3
  import { dirname, join } from "path";
4
+ /** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
5
+ export const DEFAULT_CACHE_MULTIPLIER = 0.1;
4
6
  const here = dirname(fileURLToPath(import.meta.url));
5
7
  const builtIn = JSON.parse(readFileSync(join(here, "models.json"), "utf-8"));
6
8
  let overrides = {};
@@ -31,7 +33,11 @@ export function computeCost(input) {
31
33
  const price = lookupPrice(input.model);
32
34
  if (!price)
33
35
  return { source: "unavailable" };
34
- const inputUsd = ((input.inputTokens ?? 0) / 1_000_000) * price.input;
36
+ const totalInput = input.inputTokens ?? 0;
37
+ const cached = Math.min(input.cachedInputTokens ?? 0, totalInput);
38
+ const uncached = totalInput - cached;
39
+ const cachedRate = price.cachedInput ?? price.input * DEFAULT_CACHE_MULTIPLIER;
40
+ const inputUsd = (uncached / 1_000_000) * price.input + (cached / 1_000_000) * cachedRate;
35
41
  const outputUsd = ((input.outputTokens ?? 0) / 1_000_000) * price.output;
36
42
  return {
37
43
  inputUsd,
@@ -1,4 +1,4 @@
1
1
  import type { AgentReport } from "./types";
2
- export declare function formatReport(report: AgentReport): string;
2
+ export declare function formatReport(report: AgentReport<unknown>): string;
3
3
  export declare function writeReport(content: string, timestamp: string, name?: string, dimensions?: Record<string, string>): Promise<string>;
4
4
  export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
package/dist/reporter.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { access, mkdir, writeFile } from "fs/promises";
2
2
  import { createHash } from "crypto";
3
3
  import { join } from "path";
4
+ import { resolveText } from "./resolve";
4
5
  export function formatReport(report) {
5
6
  const lines = ["agent:"];
6
7
  if (report.name)
@@ -24,8 +25,9 @@ export function formatReport(report) {
24
25
  lines.push(` reason: "${reason}"`);
25
26
  }
26
27
  const result = report.results.find((r) => r.prompt === c);
27
- if (result?.response.text) {
28
- const escaped = result.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
28
+ const responseText = result ? resolveText(result.response) : "";
29
+ if (responseText) {
30
+ const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
29
31
  lines.push(` response: "${escaped}"`);
30
32
  }
31
33
  }
@@ -51,8 +53,9 @@ export function formatReport(report) {
51
53
  if (r.error) {
52
54
  lines.push(` reason: "${r.error}"`);
53
55
  }
54
- if (r.response.text) {
55
- const escaped = r.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
56
+ const responseText = resolveText(r.response);
57
+ if (responseText) {
58
+ const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
56
59
  lines.push(` response: "${escaped}"`);
57
60
  }
58
61
  }
@@ -0,0 +1,25 @@
1
+ import type { AgentResponse } from "./types";
2
+ /**
3
+ * Serialize an arbitrary agent value to the string view the judge model and
4
+ * the text matchers consume. Strings pass through untouched; everything else
5
+ * is JSON. This is the ONLY place a structured value is forced to a string,
6
+ * and it happens lazily — never before a matcher actually needs text.
7
+ */
8
+ export declare function serializeValue(value: unknown): string;
9
+ /**
10
+ * The agent's native output — the source of truth for deterministic,
11
+ * structural assertions. Tolerates a legacy `{ text }`-only response (no
12
+ * `value`) so executors can migrate incrementally.
13
+ */
14
+ export declare function resolveValue<T>(response: AgentResponse<T>): T | string | undefined;
15
+ /**
16
+ * The string view for the judge and text matchers. An explicit `text` wins
17
+ * (it's the enriched projection the executor chose to expose); otherwise we
18
+ * serialize `value` on demand.
19
+ */
20
+ export declare function resolveText<T>(response: AgentResponse<T>): string;
21
+ /**
22
+ * Walk a dot-path (with numeric array indices) into an arbitrary object.
23
+ * Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
24
+ */
25
+ export declare function navigatePath(root: unknown, path: string): unknown;
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Serialize an arbitrary agent value to the string view the judge model and
3
+ * the text matchers consume. Strings pass through untouched; everything else
4
+ * is JSON. This is the ONLY place a structured value is forced to a string,
5
+ * and it happens lazily — never before a matcher actually needs text.
6
+ */
7
+ export function serializeValue(value) {
8
+ if (typeof value === "string")
9
+ return value;
10
+ if (value === null || value === undefined)
11
+ return "";
12
+ try {
13
+ return JSON.stringify(value, null, 2);
14
+ }
15
+ catch {
16
+ return String(value);
17
+ }
18
+ }
19
/**
 * Pick the agent's native output from a response: `response.value` whenever it
 * is not `undefined` (so `null`, `0`, `""` and `false` are kept), otherwise
 * `response.text`. The fallback lets a legacy `{ text }`-only response keep
 * working with structural assertions.
 */
export function resolveValue(response) {
    return response.value === undefined ? response.text : response.value;
}
29
/**
 * Produce the string view of a response for the judge and the text matchers.
 * A `text` property that is a string is returned as-is (including the empty
 * string); in every other case `response.value` is run through
 * `serializeValue`.
 */
export function resolveText(response) {
    const explicit = response.text;
    return typeof explicit === "string" ? explicit : serializeValue(response.value);
}
39
/**
 * Walk a dot-path (with numeric array indices) into an arbitrary object,
 * e.g. "plan_items.0.options".
 *
 * Returns `undefined` as soon as a segment cannot be resolved: the current
 * node is null/undefined, an array is addressed with something other than a
 * plain decimal index, or an object does not have the key. Primitive nodes
 * (strings, numbers, ...) are never descended into.
 */
export function navigatePath(root, path) {
    // Array segments must be plain decimal digits. `Number(seg)` alone would
    // coerce "" to 0 (so "items." or "items..x" silently select element 0) and
    // would also accept forms such as "1e1", "0x10" or " 1".
    const arrayIndex = /^\d+$/;
    let cur = root;
    for (const seg of path.split(".")) {
        if (cur == null)
            return undefined;
        if (Array.isArray(cur)) {
            if (!arrayIndex.test(seg))
                return undefined;
            cur = cur[Number(seg)];
        }
        else if (typeof cur === "object" && seg in cur) {
            cur = cur[seg];
        }
        else {
            return undefined;
        }
    }
    return cur;
}
package/dist/runner.d.ts CHANGED
@@ -1,4 +1,13 @@
1
1
  import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
2
2
  import type { JudgeConfig } from "./config";
3
- export declare function extractField(response: AgentResponse, field: string): unknown;
4
- export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult>;
3
+ /**
4
+ * Extract a named field from an agent response for assertion.
5
+ * - "response" / "value" → the native structured value (deterministic matchers)
6
+ * - "text" → the serialized/judge view (lazy; text matchers)
7
+ * - "metadata"/"refusal" → the corresponding response property
8
+ * - dot-path → navigated into the structured value first
9
+ * (e.g. "plan_items.0.options"), falling back to
10
+ * metadata so existing metadata paths keep resolving.
11
+ */
12
+ export declare function extractField<T>(response: AgentResponse<T>, field: string): unknown;
13
+ export declare function executeScene<T = string>(executor: AgentExecutor<T>, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult<T>>;
package/dist/runner.js CHANGED
@@ -1,16 +1,33 @@
1
1
  import { collectPendingJudgements } from "./assertions";
2
2
  import { callJudge, resolveJudgeExecutor } from "./judge";
3
+ import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
3
4
  const DEFAULT_SCENE_TIMEOUT = 10_000;
5
+ /**
6
+ * Extract a named field from an agent response for assertion.
7
+ * - "response" / "value" → the native structured value (deterministic matchers)
8
+ * - "text" → the serialized/judge view (lazy; text matchers)
9
+ * - "metadata"/"refusal" → the corresponding response property
10
+ * - dot-path → navigated into the structured value first
11
+ * (e.g. "plan_items.0.options"), falling back to
12
+ * metadata so existing metadata paths keep resolving.
13
+ */
4
14
  export function extractField(response, field) {
5
15
  switch (field) {
6
16
  case "response":
7
- return response.text;
17
+ case "value":
18
+ return resolveValue(response);
19
+ case "text":
20
+ return resolveText(response);
8
21
  case "metadata":
9
22
  return response.metadata;
10
23
  case "refusal":
11
24
  return response.refusal;
12
- default:
13
- return response.metadata?.[field];
25
+ default: {
26
+ const fromValue = navigatePath(resolveValue(response), field);
27
+ if (fromValue !== undefined)
28
+ return fromValue;
29
+ return navigatePath(response.metadata ?? {}, field);
30
+ }
14
31
  }
15
32
  }
16
33
  /**
@@ -31,6 +48,9 @@ function wilsonSignificance(passes, total) {
31
48
  return Math.max(0, Math.min(1, lower));
32
49
  }
33
50
  async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
51
+ // The empty sentinel uses the `text` branch of the union so it is a valid
52
+ // AgentResponse<T> for ANY T (there is no native value yet — the executor
53
+ // hasn't run). Using `{ value: "" }` would wrongly assume T = string.
34
54
  let response = { text: "" };
35
55
  let duration;
36
56
  try {
@@ -91,7 +111,9 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
91
111
  const judgeExecutor = resolveJudgeExecutor(judgeConfig);
92
112
  for (const p of pending) {
93
113
  try {
94
- const result = await callJudge(String(p.value), p.criteria, judgeExecutor);
114
+ // Hand the judge the serialized text view — NOT String(value),
115
+ // which would render a structured value as "[object Object]".
116
+ const result = await callJudge(serializeValue(p.value), p.criteria, judgeExecutor);
95
117
  judgement = result;
96
118
  if (result.verdict === "fail" || result.verdict === "partial") {
97
119
  passed = false;
package/dist/types.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  export interface ExecutorOptions {
2
2
  signal?: AbortSignal;
3
3
  }
4
- export type AgentExecutor = (input: string, options?: ExecutorOptions) => Promise<AgentResponse>;
4
+ export type AgentExecutor<T = string> = (input: string, options?: ExecutorOptions) => Promise<AgentResponse<T>>;
5
5
  export type CostSource = "provider" | "table" | "unavailable";
6
6
  export interface CostBreakdown {
7
7
  inputUsd?: number;
@@ -28,8 +28,35 @@ export interface TimelineEvent {
28
28
  runIndex?: number;
29
29
  error?: string;
30
30
  }
31
- export interface AgentResponse {
31
+ /**
32
+ * The result an executor hands back. AT LEAST ONE of `value` / `text` is
33
+ * required (both may be present); the rest are optional.
34
+ *
35
+ * `value` is the agent's NATIVE output and the source of truth for
36
+ * deterministic, structural assertions — a string for a chat agent, an object
37
+ * for a structured agent (a plan, a tool-call payload, parsed JSON). It is
38
+ * never coerced to a string before a matcher asks for text.
39
+ *
40
+ * `text` is a pre-serialized projection for the judge model and the text
41
+ * matchers (`containing`, `matchingPattern`, `refusal`). A string-producing
42
+ * agent can return ONLY `text` (the legacy/common case) — it is then also used
43
+ * as `value`. A structured agent returns `value` and, optionally, an enriched
44
+ * `text` when the judge needs a view the raw value can't give cheaply (e.g.
45
+ * resolving opaque ids to names). When `text` is omitted, agest serializes
46
+ * `value` lazily (string passthrough, else JSON). See `resolve.ts`.
47
+ *
48
+ * The generic defaults to `string`, so the common chat case stays
49
+ * `{ text: "..." }` or `{ value: "..." }` with no type ceremony.
50
+ */
51
+ export type AgentResponse<T = string> = AgentResponseBase<T> & ({
52
+ value: T;
53
+ } | {
32
54
  text: string;
55
+ });
56
+ interface AgentResponseBase<T = string> {
57
+ value?: T;
58
+ /** Pre-serialized view for the judge / text matchers. */
59
+ text?: string;
33
60
  refusal?: boolean;
34
61
  executionError?: string;
35
62
  metadata?: {
@@ -63,22 +90,22 @@ export interface JudgeResult {
63
90
  reasoning: string;
64
91
  criteria: string;
65
92
  }
66
- export interface RunResult {
93
+ export interface RunResult<T = string> {
67
94
  passed: boolean;
68
95
  error?: string;
69
- response: AgentResponse;
96
+ response: AgentResponse<T>;
70
97
  duration: number;
71
98
  judgement?: JudgeResult;
72
99
  }
73
- export interface SceneResult {
100
+ export interface SceneResult<T = string> {
74
101
  prompt: string;
75
- response: AgentResponse;
102
+ response: AgentResponse<T>;
76
103
  duration: number;
77
104
  passed: boolean;
78
105
  error?: string;
79
106
  judgement?: JudgeResult;
80
107
  suite?: string;
81
- runs?: RunResult[];
108
+ runs?: RunResult<T>[];
82
109
  passRate?: number;
83
110
  statisticalSignificance?: number;
84
111
  /** Aggregate tokens across all runs of this scene */
@@ -92,7 +119,7 @@ export interface SceneResult {
92
119
  /** Ordered timeline events from every run of the scene */
93
120
  events?: TimelineEvent[];
94
121
  }
95
- export interface AgentReport {
122
+ export interface AgentReport<T = string> {
96
123
  name?: string;
97
124
  model?: string;
98
125
  systemPromptHash?: string;
@@ -110,5 +137,6 @@ export interface AgentReport {
110
137
  totalInputTokens?: number;
111
138
  totalOutputTokens?: number;
112
139
  totalCostUsd?: number;
113
- results: SceneResult[];
140
+ results: SceneResult<T>[];
114
141
  }
142
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sebastiantuyu/agest",
3
- "version": "0.3.3-next.7",
3
+ "version": "0.3.3-next.8",
4
4
  "description": "A testing library for agents",
5
5
  "repository": {
6
6
  "type": "git",