@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,43 +1,122 @@
1
1
  #!/usr/bin/env node
2
2
  import { spawn } from "child_process";
3
+ import { fileURLToPath } from "node:url";
4
+ import { realpathSync } from "node:fs";
3
5
  import { main as stats } from "./stats.js";
4
6
  import { main as preview } from "./preview.js";
5
- const command = process.argv[2];
6
- async function run() {
7
- const files = process.argv.slice(3);
7
+ import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
8
/**
 * Return the arguments that come after the command word in a full
 * `process.argv`-shaped array.
 *
 * The layout is `[execPath, scriptPath, command, ...commandArgs]`, so the
 * first three entries are skipped. Reading them once from the original argv
 * means nothing downstream has to re-slice an argv that was already rewritten.
 *
 * @param {string[]} argv Full argv array, including exec path and script path.
 * @returns {string[]} A new array holding only the command's own arguments.
 */
export function getCommandArgs(argv) {
    const [, , , ...commandArgs] = argv;
    return commandArgs;
}
18
/**
 * Parse the arguments that follow `agest run`.
 *
 * Recognised flags:
 *  - `--pattern <glob>`, `-p <glob>`, `--pattern=<glob>`: glob used for discovery
 *  - `--full`: request the full report
 * Every other argument is collected, in order, as a discovery target.
 *
 * A pattern flag with a missing OR empty value is rejected: an empty string
 * is not `undefined`, so it would otherwise slip past the caller's
 * `pattern ?? DEFAULT_PATTERN` fallback and be used as the glob itself.
 *
 * @param {string[]} args Arguments after the `run` command word.
 * @returns {{ pattern: string | undefined, targets: string[], full: boolean }}
 *   Exits the process with code 1 when a pattern flag has no usable value.
 */
export function parseRunArgs(args) {
    const targets = [];
    let pattern;
    let full = false;
    // Shared validation for all three spellings of the pattern flag.
    const requirePatternValue = (value) => {
        if (value === undefined || value === "") {
            console.error(" Error: --pattern requires a value");
            process.exit(1);
        }
        return value;
    };
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        if (arg === "--pattern" || arg === "-p") {
            // The value is the next argument; consume it so it is not a target.
            pattern = requirePatternValue(args[++i]);
        }
        else if (arg.startsWith("--pattern=")) {
            pattern = requirePatternValue(arg.slice("--pattern=".length));
        }
        else if (arg === "--full") {
            full = true;
        }
        else {
            targets.push(arg);
        }
    }
    return { pattern, targets, full };
}
43
/**
 * Discover the test files selected by `args` and execute each one with
 * `npx tsx`, one at a time, stopping at the first non-zero exit code.
 *
 * Exits the process with code 1 when discovery finds nothing, and with the
 * child's exit code as soon as a test file fails.
 *
 * @param {string[]} args Arguments after the `run` command word.
 * @returns {Promise<void>} Resolves once every file has exited with code 0.
 */
async function run(args) {
    const { pattern, targets, full } = parseRunArgs(args);
    const files = await discoverTestFiles(targets, { pattern });
    if (files.length === 0) {
        const effective = pattern ?? DEFAULT_PATTERN;
        console.error(` No test files found (pattern: ${effective})`);
        process.exit(1);
    }
    // With `shell: true` Node joins the args into one command line WITHOUT
    // escaping, so a discovered path containing spaces or shell metacharacters
    // would be split or interpreted by the shell. Only Windows needs a shell
    // (npx is a .cmd shim there); elsewhere the path is passed verbatim as a
    // single argv entry.
    const useShell = process.platform === "win32";
    // The test file renders its own output in a child process; propagate the
    // --full flag through the environment so it can emit the full report.
    const env = full ? { ...process.env, AGEST_FULL: "1" } : process.env;
    for (const file of files) {
        // Double quotes cannot appear in Windows file names, so wrapping the
        // path in them is enough to keep it a single argument under cmd.exe.
        const fileArg = useShell ? `"${file}"` : file;
        const child = spawn("npx", ["tsx", fileArg], {
            stdio: "inherit",
            shell: useShell,
            env,
        });
        const code = await new Promise((resolve) => {
            // A failed launch emits "error" and may never emit "close"; treat it
            // as a failure instead of leaving this promise pending forever.
            child.on("error", (err) => {
                console.error(` Error: could not start "npx tsx" for ${file}: ${err.message}`);
                resolve(1);
            });
            child.on("close", (c) => resolve(c ?? 1));
        });
        if (code !== 0) {
            process.exit(code);
        }
    }
}
22
- const commands = {
23
- stats,
24
- preview,
25
- run,
26
- };
27
- if (!command || !commands[command]) {
65
+ function printUsage() {
28
66
  console.log(`
29
67
  Usage: agest <command>
30
68
 
31
69
  Commands:
32
- run Run test file(s) agest run tests/*.test.ts
70
+ run Run test file(s), directories, or glob patterns
71
+ agest run tests/ # walks for ${DEFAULT_PATTERN}
72
+ agest run src/agest --pattern "**/*.test.ts"
73
+ agest run "tests/**/*.agest.ts" path/to/file.agest.ts
74
+ agest run tests/ --full # also print waterfall + full report
33
75
  stats Show aggregated test statistics
34
76
  preview Generate an HTML report preview
35
77
  `);
36
- process.exit(command ? 1 : 0);
37
78
  }
38
- // Forward remaining args so subcommands see them at process.argv[2+]
39
- process.argv = [process.argv[0], process.argv[1], ...process.argv.slice(3)];
40
- commands[command]().catch((err) => {
41
- console.error("Error:", err.message);
42
- process.exit(1);
43
- });
79
// Command words the CLI dispatches on; anything else prints the usage text.
const KNOWN_COMMANDS = new Set(["run", "stats", "preview"]);
/**
 * CLI entry point: pick the command word out of `argv` and dispatch to it.
 *
 * A missing command prints usage and exits 0; an unrecognised one prints
 * usage and exits 1. `run` receives its arguments directly, while `stats` and
 * `preview` are invoked after `process.argv` has been rewritten without the
 * command word.
 *
 * @param {string[]} argv Full argv (`[execPath, scriptPath, command, ...args]`).
 * @returns {Promise<void>}
 */
export async function main(argv) {
    const [, , command] = argv;
    const commandArgs = getCommandArgs(argv);
    const isKnown = Boolean(command) && KNOWN_COMMANDS.has(command);
    if (!isKnown) {
        printUsage();
        process.exit(command ? 1 : 0);
    }
    if (command === "run") {
        await run(commandArgs);
        return;
    }
    // stats/preview read their args from `process.argv.slice(2)`, so drop the
    // command word from argv before handing control to them.
    process.argv = [argv[0], argv[1], ...commandArgs];
    const handler = command === "stats" ? stats : preview;
    await handler();
}
99
/**
 * Report whether this module is the script Node was started with, as opposed
 * to being imported by another module (for example a test).
 *
 * The entry path in `process.argv[1]` may be a symlink to this file (package
 * managers expose bins that way), so both paths are resolved with
 * `realpathSync` before comparing. If resolution throws, the raw strings are
 * compared instead.
 *
 * @returns {boolean} `true` when `process.argv[1]` refers to this file.
 */
function isInvokedAsCli() {
    const [, entry] = process.argv;
    if (!entry) {
        return false;
    }
    const self = fileURLToPath(import.meta.url);
    let sameFile;
    try {
        sameFile = realpathSync(entry) === realpathSync(self);
    }
    catch {
        // One of the paths could not be resolved; fall back to a literal match.
        sameFile = entry === self;
    }
    return sameFile;
}
117
// Start the CLI only when this file is the process entry script; importing
// the module must not trigger `main` (which can call process.exit).
if (isInvokedAsCli()) {
    main(process.argv).catch((err) => {
        // A rejection value is not guaranteed to be an Error: reading
        // `.message` off `null`/`undefined` would throw inside this handler,
        // and off a string would print `undefined`. Print the value itself
        // in those cases.
        console.error("Error:", err instanceof Error ? err.message : err);
        process.exit(1);
    });
}
package/dist/config.d.ts CHANGED
@@ -15,6 +15,15 @@ export interface AgestConfig {
15
15
  turns?: number;
16
16
  runs?: number;
17
17
  judge?: JudgeConfig;
18
+ /**
19
+ * Per-model pricing override (USD per 1M tokens). Merged on top of the
20
+ * built-in `src/pricing/models.json` table. Provide entries for any model
21
+ * you use that isn't already in the table, or to override a default.
22
+ */
23
+ pricing?: Record<string, {
24
+ input: number;
25
+ output: number;
26
+ }>;
18
27
  }
19
28
  export declare function defineConfig(config: AgestConfig): AgestConfig;
20
29
  export declare function loadConfig(): Promise<AgestConfig>;
package/dist/context.d.ts CHANGED
@@ -1,36 +1,57 @@
1
1
  import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
2
- export declare class SceneBuilder {
2
+ import type { StandardSchemaV1 } from "./schema";
3
+ /**
4
+ * Builds a scene. Generic over `T`, the agent's native value type, so the
5
+ * known fields hand a typed value to the assertion callback:
6
+ * - `"value"` / `"response"` → `T`
7
+ * - `"text"` → `string`
8
+ * - `"refusal"` → `boolean | undefined`
9
+ * - any dot-path / other → `any` (a string field can't be typed)
10
+ * `T` flows in from a schema-typed `agent()` via the scene fn passed to its
11
+ * callback. The free `scene()` import stays `SceneBuilder<string>`.
12
+ */
13
+ export declare class SceneBuilder<T = string> {
3
14
  private _prompt;
4
15
  private _assertions;
5
16
  private _timeout?;
6
17
  private _turns?;
7
18
  private _runs?;
8
19
  private _suite?;
20
+ private _schema?;
9
21
  constructor(_prompt: string);
10
- timeout(ms: number): SceneBuilder;
11
- turns(n: number): SceneBuilder;
12
- runs(n: number): SceneBuilder;
22
+ timeout(ms: number): this;
23
+ turns(n: number): this;
24
+ runs(n: number): this;
13
25
  /** @internal */
14
26
  _setSuite(name: string): void;
15
- expect(field: string, fn: (value: any) => void): SceneBuilder;
27
+ expect(field: "value" | "response", fn: (value: T) => void): this;
28
+ expect(field: "text", fn: (value: string) => void): this;
29
+ expect(field: "refusal", fn: (value: boolean | undefined) => void): this;
30
+ expect(field: string, fn: (value: any) => void): this;
31
+ /**
32
+ * Validate this scene's native value against a Standard Schema before user
33
+ * assertions run. Overrides any schema declared on the agent.
34
+ */
35
+ expectSchema(schema: StandardSchemaV1): this;
16
36
  toDefinition(): SceneDefinition;
17
37
  }
18
- export declare class AgentContext {
38
+ export declare class AgentContext<T = string> {
19
39
  private _executor;
20
40
  private _name?;
41
+ private _schema?;
21
42
  private _scenes;
22
43
  private _currentSuite?;
23
44
  private _beforeAllHooks;
24
45
  private _afterAllHooks;
25
46
  private _beforeEachHooks;
26
47
  private _afterEachHooks;
27
- constructor(_executor: AgentExecutor, _name?: string | undefined);
48
+ constructor(_executor: AgentExecutor<T>, _name?: string | undefined, _schema?: StandardSchemaV1 | undefined);
28
49
  registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
29
50
  setSuite(name: string): void;
30
51
  clearSuite(): void;
31
- registerScene(prompt: string): SceneBuilder;
32
- execute(): Promise<AgentReport>;
52
+ registerScene(prompt: string): SceneBuilder<T>;
53
+ execute(): Promise<AgentReport<T>>;
33
54
  }
34
55
  export declare function hashPromptOnly(prompt: string): string;
35
- export declare function setContext(ctx: AgentContext | null): void;
36
- export declare function getContext(): AgentContext;
56
+ export declare function setContext(ctx: AgentContext<any> | null): void;
57
+ export declare function getContext(): AgentContext<any>;
package/dist/context.js CHANGED
@@ -1,9 +1,22 @@
1
1
  import { createHash } from "crypto";
2
2
  import { executeScene } from "./runner";
3
+ import { resolveText } from "./resolve";
3
4
  import { formatReport, writeReport, writeDiffEntry } from "./reporter";
4
5
  import { logger, c } from "./logger";
5
6
  import { loadConfig } from "./config";
7
+ import { setPricingOverrides } from "./pricing";
8
+ import { renderTerminalWaterfall } from "./waterfall";
6
9
  import { PromisePool } from "@supercharge/promise-pool";
10
+ /**
11
+ * Builds a scene. Generic over `T`, the agent's native value type, so the
12
+ * known fields hand a typed value to the assertion callback:
13
+ * - `"value"` / `"response"` → `T`
14
+ * - `"text"` → `string`
15
+ * - `"refusal"` → `boolean | undefined`
16
+ * - any dot-path / other → `any` (a string field can't be typed)
17
+ * `T` flows in from a schema-typed `agent()` via the scene fn passed to its
18
+ * callback. The free `scene()` import stays `SceneBuilder<string>`.
19
+ */
7
20
  export class SceneBuilder {
8
21
  _prompt;
9
22
  _assertions = [];
@@ -11,6 +24,7 @@ export class SceneBuilder {
11
24
  _turns;
12
25
  _runs;
13
26
  _suite;
27
+ _schema;
14
28
  constructor(_prompt) {
15
29
  this._prompt = _prompt;
16
30
  }
@@ -34,6 +48,14 @@ export class SceneBuilder {
34
48
  this._assertions.push({ field, fn });
35
49
  return this;
36
50
  }
51
+ /**
52
+ * Validate this scene's native value against a Standard Schema before user
53
+ * assertions run. Overrides any schema declared on the agent.
54
+ */
55
+ expectSchema(schema) {
56
+ this._schema = schema;
57
+ return this;
58
+ }
37
59
  toDefinition() {
38
60
  return {
39
61
  prompt: this._prompt,
@@ -42,21 +64,24 @@ export class SceneBuilder {
42
64
  turns: this._turns,
43
65
  runs: this._runs,
44
66
  suite: this._suite,
67
+ schema: this._schema,
45
68
  };
46
69
  }
47
70
  }
48
71
  export class AgentContext {
49
72
  _executor;
50
73
  _name;
74
+ _schema;
51
75
  _scenes = [];
52
76
  _currentSuite;
53
77
  _beforeAllHooks = [];
54
78
  _afterAllHooks = [];
55
79
  _beforeEachHooks = [];
56
80
  _afterEachHooks = [];
57
- constructor(_executor, _name) {
81
+ constructor(_executor, _name, _schema) {
58
82
  this._executor = _executor;
59
83
  this._name = _name;
84
+ this._schema = _schema;
60
85
  }
61
86
  registerHook(type, fn) {
62
87
  this[`_${type}Hooks`].push(fn);
@@ -76,9 +101,20 @@ export class AgentContext {
76
101
  return builder;
77
102
  }
78
103
  async execute() {
104
+ // `--full` flows in via the CLI runner (AGEST_FULL env) or directly on argv
105
+ // when a test file is run standalone (`tsx foo.test.ts --full`). Default is
106
+ // lean output: per-scene results only, no waterfall, no full report dump.
107
+ const full = process.env.AGEST_FULL === "1" || process.argv.includes("--full");
79
108
  const config = await loadConfig();
109
+ setPricingOverrides(config.pricing);
80
110
  const parallelism = Math.max(1, config.parallelism ?? 1);
81
- const definitions = this._scenes.map((s) => s.toDefinition());
111
+ const definitions = this._scenes.map((s) => {
112
+ const def = s.toDefinition();
113
+ // Agent-level schema is the default; a scene-level schema wins.
114
+ if (!def.schema && this._schema)
115
+ def.schema = this._schema;
116
+ return def;
117
+ });
82
118
  const orderedResults = new Array(definitions.length);
83
119
  const total = definitions.length;
84
120
  // Group scenes by suite for organized output
@@ -127,7 +163,19 @@ export class AgentContext {
127
163
  const sigColor = sig >= 0.95 ? c.green : sig >= 0.80 ? c.yellow : c.red;
128
164
  logger.info(`${indent} ${c.dim("significance:")} ${sigColor(`${(sig * 100).toFixed(1)}%`)} ${c.dim(`(pass rate: ${((result.passRate ?? 0) * 100).toFixed(1)}%)`)}`);
129
165
  }
130
- logger.debug(`${indent} response: ${result.response.text?.slice(0, 120)}`);
166
+ if (full && result.events && result.events.length > 0) {
167
+ const costLabel = result.costUsd != null
168
+ ? ` ${c.dim("·")} ${c.green(`$${Number(result.costUsd.toFixed(4))}`)}`
169
+ : "";
170
+ const tokLabel = result.tokens
171
+ ? ` ${c.dim(`(${result.tokens.input}→${result.tokens.output} tok)`)}`
172
+ : "";
173
+ logger.info(`${indent} ${c.dim("waterfall:")}${tokLabel}${costLabel}`);
174
+ for (const line of renderTerminalWaterfall(result.events, { indent: `${indent} ` })) {
175
+ logger.info(line);
176
+ }
177
+ }
178
+ logger.debug(`${indent} response: ${resolveText(result.response).slice(0, 120)}`);
131
179
  };
132
180
  if (hasSuites) {
133
181
  // Execute suite by suite — print header once, then run all scenes in that suite
@@ -170,14 +218,25 @@ export class AgentContext {
170
218
  const successRate = results.length > 0
171
219
  ? Number((results.filter((r) => r.passed).length / results.length).toFixed(2))
172
220
  : 0;
173
- const tokensAvailable = results.some((r) => r.response.metadata?.tokens != null);
221
+ const sceneTokens = results
222
+ .map((r) => r.tokens ?? r.response.metadata?.tokens)
223
+ .filter((t) => t != null);
174
224
  let averageInputTokensPerCase;
175
225
  let averageOutputTokensPerCase;
176
- if (tokensAvailable) {
177
- const withTokens = results.filter((r) => r.response.metadata?.tokens != null);
178
- averageInputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.input ?? 0), 0) / withTokens.length);
179
- averageOutputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.output ?? 0), 0) / withTokens.length);
226
+ let totalInputTokens;
227
+ let totalOutputTokens;
228
+ if (sceneTokens.length > 0) {
229
+ totalInputTokens = sceneTokens.reduce((s, t) => s + (t.input ?? 0), 0);
230
+ totalOutputTokens = sceneTokens.reduce((s, t) => s + (t.output ?? 0), 0);
231
+ averageInputTokensPerCase = Math.round(totalInputTokens / sceneTokens.length);
232
+ averageOutputTokensPerCase = Math.round(totalOutputTokens / sceneTokens.length);
180
233
  }
234
+ const sceneCosts = results
235
+ .map((r) => r.costUsd)
236
+ .filter((c) => typeof c === "number");
237
+ const totalCostUsd = sceneCosts.length > 0
238
+ ? sceneCosts.reduce((s, c) => s + c, 0)
239
+ : undefined;
181
240
  const firstMeta = results.find((r) => r.response.metadata)?.response
182
241
  .metadata;
183
242
  const dimensions = {};
@@ -208,15 +267,27 @@ export class AgentContext {
208
267
  totalCases: results.length,
209
268
  averageInputTokensPerCase,
210
269
  averageOutputTokensPerCase,
270
+ totalInputTokens,
271
+ totalOutputTokens,
272
+ totalCostUsd,
211
273
  results,
212
274
  };
213
275
  if (report.systemPromptHash && firstMeta?.systemPrompt) {
214
276
  await writeDiffEntry(report.systemPromptHash, firstMeta.systemPrompt, report.tools ?? [], report.model);
215
277
  }
216
278
  const formatted = formatReport(report);
217
- logger.info(formatted);
279
+ // Default mode prints a one-line summary; `--full` dumps the whole report.
280
+ if (full) {
281
+ logger.info(formatted);
282
+ }
283
+ else {
284
+ const passed = results.filter((r) => r.passed).length;
285
+ const rateColor = successRate >= 0.95 ? c.green : successRate >= 0.5 ? c.yellow : c.red;
286
+ const costSummary = totalCostUsd != null ? ` ${c.dim("·")} ${c.green(`$${Number(totalCostUsd.toFixed(4))}`)}` : "";
287
+ logger.info(`${rateColor(`${passed}/${results.length} passed`)} ${c.dim(`(${(successRate * 100).toFixed(0)}%)`)} ${c.dim("·")} ${c.dim(`${Math.round(totalDuration)}ms`)}${costSummary}`);
288
+ }
218
289
  const filepath = await writeReport(formatted, report.timestamp, report.name, report.dimensions);
219
- logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
290
+ logger.info(`${c.dim("Report saved to:")} ${c.cyan(filepath)}${full ? "" : c.dim(" (run with --full to print it)")}`);
220
291
  return report;
221
292
  }
222
293
  }
@@ -227,6 +298,9 @@ function hashPrompt(prompt, model) {
227
298
  export function hashPromptOnly(prompt) {
228
299
  return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
229
300
  }
301
+ // The active context is a runtime singleton holding an executor of arbitrary
302
+ // value type, so `any` is the honest type for the holder. The generic flows
303
+ // through `agent()` → `AgentContext<T>` → the report at the call site.
230
304
  let currentContext = null;
231
305
  export function setContext(ctx) {
232
306
  currentContext = ctx;
@@ -0,0 +1,16 @@
1
+ export declare const DEFAULT_PATTERN = "**/*.agest.ts";
2
+ export interface DiscoverOptions {
3
+ pattern?: string;
4
+ cwd?: string;
5
+ }
6
+ /**
7
+ * Resolve a mix of file paths, directories, and glob patterns into a
8
+ * deduplicated, sorted list of absolute file paths.
9
+ *
10
+ * Rules per target:
11
+ * - directory: search recursively for `pattern` (default `**\/*.agest.ts`)
12
+ * - glob (contains *, ?, [], {}): expand it
13
+ * - file: use as-is
14
+ * - anything else: try as glob (zero matches is fine)
15
+ */
16
+ export declare function discoverTestFiles(targets: string[], options?: DiscoverOptions): Promise<string[]>;
@@ -0,0 +1,62 @@
1
+ import { promises as fs } from "node:fs";
2
+ import { isAbsolute, resolve } from "node:path";
3
+ export const DEFAULT_PATTERN = "**/*.agest.ts";
4
// Characters that mark a target as a glob pattern: * ? [ ] { }
const GLOB_CHARS = /[*?[\]{}]/;
/**
 * Tell whether `value` contains at least one glob metacharacter.
 *
 * @param {string} value Candidate path or pattern.
 * @returns {boolean} `true` when any of `* ? [ ] { }` occurs in `value`.
 */
function hasGlobChars(value) {
    return GLOB_CHARS.exec(value) !== null;
}
8
/**
 * Stat `path` without ever rejecting.
 *
 * @param {string} path Filesystem path to inspect.
 * @returns {Promise<{ isFile: boolean, isDir: boolean }>} Flags taken from
 *   `fs.stat`; both are `false` when the stat call fails for any reason.
 */
async function statSafe(path) {
    const info = await fs.stat(path).catch(() => null);
    if (info === null) {
        return { isFile: false, isDir: false };
    }
    return { isFile: info.isFile(), isDir: info.isDirectory() };
}
17
/**
 * Expand a glob `pattern` relative to `cwd` and return every match as an
 * absolute path, in the order `fs.glob` yields them.
 *
 * Relies on `fs.promises.glob`, which is available in Node >= 22.
 *
 * @param {string} pattern Glob pattern to expand.
 * @param {string} cwd Directory the pattern is evaluated against.
 * @returns {Promise<string[]>} Absolute paths of the matches (possibly empty).
 */
async function expandGlob(pattern, cwd) {
    const toAbsolute = (match) => (isAbsolute(match) ? match : resolve(cwd, match));
    return Array.fromAsync(fs.glob(pattern, { cwd }), toAbsolute);
}
25
/**
 * Resolve a mix of file paths, directories, and glob patterns into a
 * deduplicated, sorted list of absolute file paths.
 *
 * Rules per target, checked in this order:
 *  - existing file: use as-is
 *  - existing directory: search it recursively for `pattern`
 *    (default `**\/*.agest.ts`)
 *  - anything else (a glob, or a path that does not exist): expand it as a
 *    glob relative to `cwd`; zero matches is fine
 *
 * The filesystem is consulted BEFORE the target is treated as a glob, so a
 * real path whose name contains glob metacharacters (for example
 * `routes/[id]/x.agest.ts`) is taken literally instead of being expanded as
 * a character class that can never match it.
 *
 * @param {string[]} targets Files, directories and/or globs; empty means `cwd`.
 * @param {{ pattern?: string, cwd?: string }} [options]
 *   `pattern` is the glob applied inside directory targets; `cwd` is the base
 *   for relative targets (defaults to `process.cwd()`).
 * @returns {Promise<string[]>} Unique absolute paths, sorted.
 */
export async function discoverTestFiles(targets, options = {}) {
    const cwd = options.cwd ?? process.cwd();
    const pattern = options.pattern ?? DEFAULT_PATTERN;
    const work = targets.length === 0 ? ["."] : targets;
    const found = new Set();
    for (const target of work) {
        const absolute = isAbsolute(target) ? target : resolve(cwd, target);
        const stat = await statSafe(absolute);
        if (stat.isFile) {
            found.add(absolute);
            continue;
        }
        if (stat.isDir) {
            // Glob from inside the directory rather than splicing its name into
            // the pattern: the directory name is then never parsed as glob
            // syntax, and trailing separators need no special handling.
            for (const f of await expandGlob(pattern, absolute)) {
                found.add(f);
            }
            continue;
        }
        // Not an existing path: treat it as a glob relative to cwd.
        for (const f of await expandGlob(target, cwd)) {
            found.add(f);
        }
    }
    return [...found].sort();
}
package/dist/index.d.ts CHANGED
@@ -1,16 +1,26 @@
1
1
  import type { AgentExecutor, AgentReport, HookFn } from "./types";
2
2
  import { SceneBuilder } from "./context";
3
+ import { type StandardSchemaV1, type InferOutput } from "./schema";
3
4
  export { expect } from "./assertions";
5
+ export type { StandardSchemaV1, InferOutput } from "./schema";
4
6
  export { logger } from "./logger";
5
7
  export { defineConfig } from "./config";
8
+ export { createTrace, summarizeEvents } from "./adapters/tracing";
9
+ export type { Trace } from "./adapters/tracing";
6
10
  export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
7
11
  export type { LogLevel } from "./logger";
8
12
  export type { AgentExpectation, AgentMatchers } from "./assertions";
9
13
  export type { JudgeCriteria } from "./judge";
10
- export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
14
+ export type { AgentExecutor, ExecutorOptions, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, TimelineEvent, TimelineEventKind, CostBreakdown, CostSource, } from "./types";
11
15
  export interface AgentOptions {
12
16
  name?: string;
13
17
  }
18
+ /**
19
+ * Registers a scene in the active agent. The variant passed to an `agent()`
20
+ * callback is typed `SceneFn<T>`, so `.expect("value", …)` receives the agent's
21
+ * native value type.
22
+ */
23
+ export type SceneFn<T = string> = (prompt: string) => SceneBuilder<T>;
14
24
  export declare function scene(prompt: string): SceneBuilder;
15
25
  export declare function beforeAll(fn: HookFn): void;
16
26
  export declare function afterAll(fn: HookFn): void;
@@ -19,4 +29,12 @@ export declare function afterEach(fn: HookFn): void;
19
29
  export declare function suite(name: string, fn: () => void): void;
20
30
  /** @internal reset auto-run state between tests */
21
31
  export declare function _resetAutoRun(): void;
22
- export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
32
+ export declare function agent<T = string>(executor: AgentExecutor<T>, fn: (scene: SceneFn<T>) => void, options?: AgentOptions): Promise<AgentReport<T>>;
33
+ /**
34
+ * Schema-typed agent: the executor's `value` type is inferred from the schema
35
+ * (e.g. `z.infer<typeof Schema>`), and every non-refusal scene is validated
36
+ * against it. The scene fn passed to the callback is typed accordingly, so
37
+ * `.expect("value", …)` receives that value type. A scene's own
38
+ * `.expectSchema()` overrides the agent schema.
39
+ */
40
+ export declare function agent<S extends StandardSchemaV1>(schema: S, executor: AgentExecutor<InferOutput<S>>, fn: (scene: SceneFn<InferOutput<S>>) => void, options?: AgentOptions): Promise<AgentReport<InferOutput<S>>>;
package/dist/index.js CHANGED
@@ -1,7 +1,9 @@
1
1
  import { AgentContext, setContext, getContext } from "./context";
2
+ import { isStandardSchema } from "./schema";
2
3
  export { expect } from "./assertions";
3
4
  export { logger } from "./logger";
4
5
  export { defineConfig } from "./config";
6
+ export { createTrace, summarizeEvents } from "./adapters/tracing";
5
7
  export function scene(prompt) {
6
8
  return getContext().registerScene(prompt);
7
9
  }
@@ -36,11 +38,16 @@ export function _resetAutoRun() {
36
38
  autoRunScheduled = false;
37
39
  executionChain = Promise.resolve();
38
40
  }
39
- export function agent(executor, fn, options) {
40
- const ctx = new AgentContext(executor, options?.name);
41
+ export function agent(...args) {
42
+ const [schema, executor, fn, options] = isStandardSchema(args[0])
43
+ ? args
44
+ : [undefined, ...args];
45
+ const ctx = new AgentContext(executor, options?.name, schema);
41
46
  setContext(ctx);
42
47
  try {
43
- fn();
48
+ // Hand the callback a scene fn bound to the active context. Its static type
49
+ // carries T (via the overloads); at runtime it's the same `scene()`.
50
+ fn(scene);
44
51
  }
45
52
  catch (err) {
46
53
  setContext(null);
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Structural matching primitives for deterministic assertions. Kept in their
3
+ * own module — they are correctness-critical (a wrong result here is a false
4
+ * test pass) and deserve isolated, exhaustive unit tests.
5
+ */
6
+ /** Any non-null, non-array object — including class instances, Map, Date, etc. */
7
+ export declare function isObjectLike(value: unknown): value is Record<string, unknown>;
8
+ /**
9
+ * A "record" object — a plain `{...}` literal (prototype is Object.prototype or
10
+ * null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
11
+ * compared as opaque leaves rather than recursed into.
12
+ */
13
+ export declare function isPlainObject(value: unknown): value is Record<string, unknown>;
14
+ /**
15
+ * Recursive containment: is `expected` structurally present within `actual`?
16
+ *
17
+ * - `expected` array → `actual` is an array and the expected elements can be
18
+ * matched one-to-one to DISTINCT actual elements (order-independent
19
+ * multiset/sub-multiset membership — duplicates require distinct matches).
20
+ * - `expected` plain object → `actual` is object-like and every key in
21
+ * `expected` exists in `actual` with a recursively-contained value (extra
22
+ * keys in `actual` are allowed — that is the "partial").
23
+ * - anything else (primitive, Date, Map, RegExp, class instance) → strict
24
+ * deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
25
+ *
26
+ * Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
27
+ */
28
+ export declare function structuralContains(actual: unknown, expected: unknown): boolean;
package/dist/match.js ADDED
@@ -0,0 +1,57 @@
1
+ import { isDeepStrictEqual } from "node:util";
2
+ /**
3
+ * Structural matching primitives for deterministic assertions. Kept in their
4
+ * own module — they are correctness-critical (a wrong result here is a false
5
+ * test pass) and deserve isolated, exhaustive unit tests.
6
+ */
7
/**
 * Type guard for "some kind of object that is not an array".
 *
 * @param {unknown} value Value to classify.
 * @returns {boolean} `true` for any non-null value whose `typeof` is
 *   `"object"` and that is not an array (class instances, Map, Date, ...).
 */
export function isObjectLike(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
11
/**
 * Type guard for a "record" object: one whose prototype is either
 * `Object.prototype` or `null` (a `{...}` literal or `Object.create(null)`).
 *
 * Objects with any other prototype (class instances, Map, Date, RegExp, ...)
 * are not plain, and neither are arrays or non-objects.
 *
 * @param {unknown} value Value to classify.
 * @returns {boolean} `true` only for plain record objects.
 */
export function isPlainObject(value) {
    if (isObjectLike(value)) {
        const prototype = Object.getPrototypeOf(value);
        return prototype === null || prototype === Object.prototype;
    }
    return false;
}
22
/**
 * Recursive containment: is `expected` structurally present within `actual`?
 *
 * - `expected` array → `actual` must be an array and the expected elements
 *   must be assignable one-to-one to DISTINCT actual elements, in any order
 *   (sub-multiset membership — duplicates require distinct matches).
 * - `expected` plain object → `actual` must be object-like and every key of
 *   `expected` must exist in `actual` with a recursively-contained value
 *   (extra keys in `actual` are allowed — that is the "partial").
 * - anything else (primitive, Date, Map, RegExp, class instance) → strict
 *   deep equality via `isDeepStrictEqual`.
 *
 * Leaf comparison is exact and case-sensitive. Only the shape recurses.
 *
 * @param {unknown} actual Value produced by the code under test.
 * @param {unknown} expected Shape that must be contained in `actual`.
 * @returns {boolean} `true` when `expected` is contained in `actual`.
 */
export function structuralContains(actual, expected) {
    if (Array.isArray(expected)) {
        if (!Array.isArray(actual) || expected.length > actual.length) {
            return false;
        }
        // candidates[e] lists the indexes of the actual elements that contain
        // expected[e]. An expected element with no candidate fails immediately.
        const candidates = [];
        for (const wanted of expected) {
            const hits = [];
            for (let i = 0; i < actual.length; i++) {
                if (structuralContains(actual[i], wanted)) {
                    hits.push(i);
                }
            }
            if (hits.length === 0) {
                return false;
            }
            candidates.push(hits);
        }
        // First-fit claiming is not enough: a loose expected element can grab
        // the only actual element a stricter one needs (expected
        // [{a:1},{a:1,b:2}] vs actual [{a:1,b:2},{a:1}]). Use augmenting
        // paths so an earlier claim is re-routed whenever that frees a slot.
        const owner = new Map(); // actual index -> expected index holding it
        const claim = (e, visited) => {
            for (const i of candidates[e]) {
                if (visited.has(i)) {
                    continue;
                }
                visited.add(i);
                if (!owner.has(i) || claim(owner.get(i), visited)) {
                    owner.set(i, e);
                    return true;
                }
            }
            return false;
        };
        return candidates.every((_, e) => claim(e, new Set()));
    }
    if (isPlainObject(expected)) {
        if (!isObjectLike(actual)) {
            return false;
        }
        return Object.keys(expected).every((key) => key in actual && structuralContains(actual[key], expected[key]));
    }
    return isDeepStrictEqual(actual, expected);
}