@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -1
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +80 -11
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +3 -2
- package/dist/adapters/tracing.d.ts +73 -0
- package/dist/adapters/tracing.js +338 -0
- package/dist/assertions.d.ts +57 -2
- package/dist/assertions.js +119 -33
- package/dist/cli.d.ts +15 -1
- package/dist/cli.js +97 -18
- package/dist/config.d.ts +9 -0
- package/dist/context.d.ts +32 -11
- package/dist/context.js +84 -10
- package/dist/discover.d.ts +16 -0
- package/dist/discover.js +62 -0
- package/dist/index.d.ts +20 -2
- package/dist/index.js +10 -3
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/preview.js +93 -0
- package/dist/pricing/index.d.ts +32 -0
- package/dist/pricing/index.js +48 -0
- package/dist/pricing/models.json +21 -0
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +77 -4
- package/dist/reports.d.ts +37 -0
- package/dist/reports.js +126 -0
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +97 -11
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +84 -9
- package/dist/waterfall.d.ts +11 -0
- package/dist/waterfall.js +46 -0
- package/package.json +24 -15
package/dist/cli.js
CHANGED
|
@@ -1,43 +1,122 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { spawn } from "child_process";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import { realpathSync } from "node:fs";
|
|
3
5
|
import { main as stats } from "./stats.js";
|
|
4
6
|
import { main as preview } from "./preview.js";
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
|
|
8
|
+
/**
|
|
9
|
+
* Extract the args that follow the command word from a full `process.argv`.
|
|
10
|
+
* `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
|
|
11
|
+
* args always start at index 3. Capturing them here (once, from the original
|
|
12
|
+
* argv) avoids re-slicing a mutated argv downstream — the double-shift that
|
|
13
|
+
* silently dropped a lone `run` target and made discovery scan the whole cwd.
|
|
14
|
+
*/
|
|
15
|
+
export function getCommandArgs(argv) {
|
|
16
|
+
return argv.slice(3);
|
|
17
|
+
}
|
|
18
|
+
/**
 * Parse the arguments that follow `agest run`.
 *
 * Recognised flags:
 *   - `--pattern <glob>`, `-p <glob>`, `--pattern=<glob>`: glob used when a
 *     target is a directory.
 *   - `--full`: request the waterfall and full report output.
 * Every other argument is collected, in order, as a target (file, directory,
 * or glob).
 *
 * A pattern flag with a missing or empty value prints an error and exits the
 * process with code 1. An empty string is rejected because it is not nullish
 * and would silently defeat a later `pattern ?? DEFAULT_PATTERN` fallback.
 *
 * @param {string[]} args - Arguments after the command word.
 * @returns {{ pattern: string | undefined, targets: string[], full: boolean }}
 */
export function parseRunArgs(args) {
    const PATTERN_PREFIX = "--pattern=";
    const targets = [];
    let pattern;
    let full = false;
    // Shared validation for all three spellings of the pattern flag.
    const requirePatternValue = (value) => {
        if (value === undefined || value === "") {
            console.error(" Error: --pattern requires a value");
            process.exit(1);
        }
        return value;
    };
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        if (arg === "--pattern" || arg === "-p") {
            // The value is the next argument; consume it so it is not also
            // recorded as a target.
            i += 1;
            pattern = requirePatternValue(args[i]);
        }
        else if (arg.startsWith(PATTERN_PREFIX)) {
            pattern = requirePatternValue(arg.slice(PATTERN_PREFIX.length));
        }
        else if (arg === "--full") {
            full = true;
        }
        else {
            targets.push(arg);
        }
    }
    return { pattern, targets, full };
}
|
|
43
|
+
/**
 * `agest run`: discover test files and execute each one with `npx tsx`.
 *
 * Files run sequentially, each in its own child process with inherited stdio.
 * The first non-zero exit code terminates this process with that code. When
 * no file is discovered, an error naming the effective pattern is printed and
 * the process exits with code 1.
 *
 * @param {string[]} args - Arguments after the `run` command word.
 * @returns {Promise<void>}
 */
async function run(args) {
    const { pattern, targets, full } = parseRunArgs(args);
    const files = await discoverTestFiles(targets, { pattern });
    if (files.length === 0) {
        const effective = pattern ?? DEFAULT_PATTERN;
        console.error(` No test files found (pattern: ${effective})`);
        process.exit(1);
    }
    // With `shell: true` the arguments are concatenated into one command line
    // without escaping, so a path containing spaces or shell metacharacters
    // would be split or interpreted. Only Windows needs a shell (to launch
    // `npx.cmd`); there the path is wrapped in double quotes. Elsewhere the
    // path is passed as a real argv entry with no shell involved.
    const useShell = process.platform === "win32";
    // The test file renders its own output in a child process; propagate the
    // --full flag through the environment so it knows to emit the waterfall
    // and full report rather than just per-scene results.
    const env = full ? { ...process.env, AGEST_FULL: "1" } : process.env;
    for (const file of files) {
        const child = spawn("npx", ["tsx", useShell ? `"${file}"` : file], {
            stdio: "inherit",
            shell: useShell,
            env,
        });
        const code = await new Promise((resolve) => {
            // Without a shell, a missing `npx` surfaces as an 'error' event;
            // treat it as a failed run instead of an unhandled exception.
            child.once("error", (err) => {
                console.error("Error:", err.message);
                resolve(1);
            });
            // A null exit code (terminated by signal) counts as a failure.
            child.once("close", (c) => resolve(c ?? 1));
        });
        if (code !== 0)
            process.exit(code);
    }
}
|
|
22
|
-
|
|
23
|
-
stats,
|
|
24
|
-
preview,
|
|
25
|
-
run,
|
|
26
|
-
};
|
|
27
|
-
if (!command || !commands[command]) {
|
|
65
|
+
function printUsage() {
|
|
28
66
|
console.log(`
|
|
29
67
|
Usage: agest <command>
|
|
30
68
|
|
|
31
69
|
Commands:
|
|
32
|
-
run Run test file(s)
|
|
70
|
+
run Run test file(s), directories, or glob patterns
|
|
71
|
+
agest run tests/ # walks for ${DEFAULT_PATTERN}
|
|
72
|
+
agest run src/agest --pattern "**/*.test.ts"
|
|
73
|
+
agest run "tests/**/*.agest.ts" path/to/file.agest.ts
|
|
74
|
+
agest run tests/ --full # also print waterfall + full report
|
|
33
75
|
stats Show aggregated test statistics
|
|
34
76
|
preview Generate an HTML report preview
|
|
35
77
|
`);
|
|
36
|
-
process.exit(command ? 1 : 0);
|
|
37
78
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
79
|
+
const KNOWN_COMMANDS = new Set(["run", "stats", "preview"]);
|
|
80
|
+
export async function main(argv) {
|
|
81
|
+
const command = argv[2];
|
|
82
|
+
const commandArgs = getCommandArgs(argv);
|
|
83
|
+
if (!command || !KNOWN_COMMANDS.has(command)) {
|
|
84
|
+
printUsage();
|
|
85
|
+
process.exit(command ? 1 : 0);
|
|
86
|
+
}
|
|
87
|
+
if (command === "run") {
|
|
88
|
+
await run(commandArgs);
|
|
89
|
+
return;
|
|
90
|
+
}
|
|
91
|
+
// stats/preview read their args from `process.argv.slice(2)`, so normalize
|
|
92
|
+
// argv to drop the command word before handing off.
|
|
93
|
+
process.argv = [argv[0], argv[1], ...commandArgs];
|
|
94
|
+
if (command === "stats")
|
|
95
|
+
await stats();
|
|
96
|
+
else
|
|
97
|
+
await preview();
|
|
98
|
+
}
|
|
99
|
+
// Only run as a CLI when invoked directly (bin or `tsx src/cli.ts`), not when
|
|
100
|
+
// imported by a test — that keeps `main` from firing (and calling
|
|
101
|
+
// process.exit) on import. Package managers expose the bin as a symlink
|
|
102
|
+
// (node_modules/.bin/agest), so argv[1] is the symlink path while
|
|
103
|
+
// import.meta.url is the real file; realpath both sides before comparing or
|
|
104
|
+
// the CLI silently no-ops when invoked through the symlink.
|
|
105
|
+
function isInvokedAsCli() {
|
|
106
|
+
const entry = process.argv[1];
|
|
107
|
+
if (!entry)
|
|
108
|
+
return false;
|
|
109
|
+
const self = fileURLToPath(import.meta.url);
|
|
110
|
+
try {
|
|
111
|
+
return realpathSync(entry) === realpathSync(self);
|
|
112
|
+
}
|
|
113
|
+
catch {
|
|
114
|
+
return entry === self;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
if (isInvokedAsCli()) {
|
|
118
|
+
main(process.argv).catch((err) => {
|
|
119
|
+
console.error("Error:", err.message);
|
|
120
|
+
process.exit(1);
|
|
121
|
+
});
|
|
122
|
+
}
|
package/dist/config.d.ts
CHANGED
|
@@ -15,6 +15,15 @@ export interface AgestConfig {
|
|
|
15
15
|
turns?: number;
|
|
16
16
|
runs?: number;
|
|
17
17
|
judge?: JudgeConfig;
|
|
18
|
+
/**
|
|
19
|
+
* Per-model pricing override (USD per 1M tokens). Merged on top of the
|
|
20
|
+
* built-in `src/pricing/models.json` table. Provide entries for any model
|
|
21
|
+
* you use that isn't already in the table, or to override a default.
|
|
22
|
+
*/
|
|
23
|
+
pricing?: Record<string, {
|
|
24
|
+
input: number;
|
|
25
|
+
output: number;
|
|
26
|
+
}>;
|
|
18
27
|
}
|
|
19
28
|
export declare function defineConfig(config: AgestConfig): AgestConfig;
|
|
20
29
|
export declare function loadConfig(): Promise<AgestConfig>;
|
package/dist/context.d.ts
CHANGED
|
@@ -1,36 +1,57 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentReport, HookFn, SceneDefinition } from "./types";
|
|
2
|
-
|
|
2
|
+
import type { StandardSchemaV1 } from "./schema";
|
|
3
|
+
/**
|
|
4
|
+
* Builds a scene. Generic over `T`, the agent's native value type, so the
|
|
5
|
+
* known fields hand a typed value to the assertion callback:
|
|
6
|
+
* - `"value"` / `"response"` → `T`
|
|
7
|
+
* - `"text"` → `string`
|
|
8
|
+
* - `"refusal"` → `boolean | undefined`
|
|
9
|
+
* - any dot-path / other → `any` (a string field can't be typed)
|
|
10
|
+
* `T` flows in from a schema-typed `agent()` via the scene fn passed to its
|
|
11
|
+
* callback. The free `scene()` import stays `SceneBuilder<string>`.
|
|
12
|
+
*/
|
|
13
|
+
export declare class SceneBuilder<T = string> {
|
|
3
14
|
private _prompt;
|
|
4
15
|
private _assertions;
|
|
5
16
|
private _timeout?;
|
|
6
17
|
private _turns?;
|
|
7
18
|
private _runs?;
|
|
8
19
|
private _suite?;
|
|
20
|
+
private _schema?;
|
|
9
21
|
constructor(_prompt: string);
|
|
10
|
-
timeout(ms: number):
|
|
11
|
-
turns(n: number):
|
|
12
|
-
runs(n: number):
|
|
22
|
+
timeout(ms: number): this;
|
|
23
|
+
turns(n: number): this;
|
|
24
|
+
runs(n: number): this;
|
|
13
25
|
/** @internal */
|
|
14
26
|
_setSuite(name: string): void;
|
|
15
|
-
expect(field:
|
|
27
|
+
expect(field: "value" | "response", fn: (value: T) => void): this;
|
|
28
|
+
expect(field: "text", fn: (value: string) => void): this;
|
|
29
|
+
expect(field: "refusal", fn: (value: boolean | undefined) => void): this;
|
|
30
|
+
expect(field: string, fn: (value: any) => void): this;
|
|
31
|
+
/**
|
|
32
|
+
* Validate this scene's native value against a Standard Schema before user
|
|
33
|
+
* assertions run. Overrides any schema declared on the agent.
|
|
34
|
+
*/
|
|
35
|
+
expectSchema(schema: StandardSchemaV1): this;
|
|
16
36
|
toDefinition(): SceneDefinition;
|
|
17
37
|
}
|
|
18
|
-
export declare class AgentContext {
|
|
38
|
+
export declare class AgentContext<T = string> {
|
|
19
39
|
private _executor;
|
|
20
40
|
private _name?;
|
|
41
|
+
private _schema?;
|
|
21
42
|
private _scenes;
|
|
22
43
|
private _currentSuite?;
|
|
23
44
|
private _beforeAllHooks;
|
|
24
45
|
private _afterAllHooks;
|
|
25
46
|
private _beforeEachHooks;
|
|
26
47
|
private _afterEachHooks;
|
|
27
|
-
constructor(_executor: AgentExecutor
|
|
48
|
+
constructor(_executor: AgentExecutor<T>, _name?: string | undefined, _schema?: StandardSchemaV1 | undefined);
|
|
28
49
|
registerHook(type: "beforeAll" | "afterAll" | "beforeEach" | "afterEach", fn: HookFn): void;
|
|
29
50
|
setSuite(name: string): void;
|
|
30
51
|
clearSuite(): void;
|
|
31
|
-
registerScene(prompt: string): SceneBuilder
|
|
32
|
-
execute(): Promise<AgentReport
|
|
52
|
+
registerScene(prompt: string): SceneBuilder<T>;
|
|
53
|
+
execute(): Promise<AgentReport<T>>;
|
|
33
54
|
}
|
|
34
55
|
export declare function hashPromptOnly(prompt: string): string;
|
|
35
|
-
export declare function setContext(ctx: AgentContext | null): void;
|
|
36
|
-
export declare function getContext(): AgentContext
|
|
56
|
+
export declare function setContext(ctx: AgentContext<any> | null): void;
|
|
57
|
+
export declare function getContext(): AgentContext<any>;
|
package/dist/context.js
CHANGED
|
@@ -1,9 +1,22 @@
|
|
|
1
1
|
import { createHash } from "crypto";
|
|
2
2
|
import { executeScene } from "./runner";
|
|
3
|
+
import { resolveText } from "./resolve";
|
|
3
4
|
import { formatReport, writeReport, writeDiffEntry } from "./reporter";
|
|
4
5
|
import { logger, c } from "./logger";
|
|
5
6
|
import { loadConfig } from "./config";
|
|
7
|
+
import { setPricingOverrides } from "./pricing";
|
|
8
|
+
import { renderTerminalWaterfall } from "./waterfall";
|
|
6
9
|
import { PromisePool } from "@supercharge/promise-pool";
|
|
10
|
+
/**
|
|
11
|
+
* Builds a scene. Generic over `T`, the agent's native value type, so the
|
|
12
|
+
* known fields hand a typed value to the assertion callback:
|
|
13
|
+
* - `"value"` / `"response"` → `T`
|
|
14
|
+
* - `"text"` → `string`
|
|
15
|
+
* - `"refusal"` → `boolean | undefined`
|
|
16
|
+
* - any dot-path / other → `any` (a string field can't be typed)
|
|
17
|
+
* `T` flows in from a schema-typed `agent()` via the scene fn passed to its
|
|
18
|
+
* callback. The free `scene()` import stays `SceneBuilder<string>`.
|
|
19
|
+
*/
|
|
7
20
|
export class SceneBuilder {
|
|
8
21
|
_prompt;
|
|
9
22
|
_assertions = [];
|
|
@@ -11,6 +24,7 @@ export class SceneBuilder {
|
|
|
11
24
|
_turns;
|
|
12
25
|
_runs;
|
|
13
26
|
_suite;
|
|
27
|
+
_schema;
|
|
14
28
|
constructor(_prompt) {
|
|
15
29
|
this._prompt = _prompt;
|
|
16
30
|
}
|
|
@@ -34,6 +48,14 @@ export class SceneBuilder {
|
|
|
34
48
|
this._assertions.push({ field, fn });
|
|
35
49
|
return this;
|
|
36
50
|
}
|
|
51
|
+
/**
|
|
52
|
+
* Validate this scene's native value against a Standard Schema before user
|
|
53
|
+
* assertions run. Overrides any schema declared on the agent.
|
|
54
|
+
*/
|
|
55
|
+
expectSchema(schema) {
|
|
56
|
+
this._schema = schema;
|
|
57
|
+
return this;
|
|
58
|
+
}
|
|
37
59
|
toDefinition() {
|
|
38
60
|
return {
|
|
39
61
|
prompt: this._prompt,
|
|
@@ -42,21 +64,24 @@ export class SceneBuilder {
|
|
|
42
64
|
turns: this._turns,
|
|
43
65
|
runs: this._runs,
|
|
44
66
|
suite: this._suite,
|
|
67
|
+
schema: this._schema,
|
|
45
68
|
};
|
|
46
69
|
}
|
|
47
70
|
}
|
|
48
71
|
export class AgentContext {
|
|
49
72
|
_executor;
|
|
50
73
|
_name;
|
|
74
|
+
_schema;
|
|
51
75
|
_scenes = [];
|
|
52
76
|
_currentSuite;
|
|
53
77
|
_beforeAllHooks = [];
|
|
54
78
|
_afterAllHooks = [];
|
|
55
79
|
_beforeEachHooks = [];
|
|
56
80
|
_afterEachHooks = [];
|
|
57
|
-
constructor(_executor, _name) {
|
|
81
|
+
constructor(_executor, _name, _schema) {
|
|
58
82
|
this._executor = _executor;
|
|
59
83
|
this._name = _name;
|
|
84
|
+
this._schema = _schema;
|
|
60
85
|
}
|
|
61
86
|
registerHook(type, fn) {
|
|
62
87
|
this[`_${type}Hooks`].push(fn);
|
|
@@ -76,9 +101,20 @@ export class AgentContext {
|
|
|
76
101
|
return builder;
|
|
77
102
|
}
|
|
78
103
|
async execute() {
|
|
104
|
+
// `--full` flows in via the CLI runner (AGEST_FULL env) or directly on argv
|
|
105
|
+
// when a test file is run standalone (`tsx foo.test.ts --full`). Default is
|
|
106
|
+
// lean output: per-scene results only, no waterfall, no full report dump.
|
|
107
|
+
const full = process.env.AGEST_FULL === "1" || process.argv.includes("--full");
|
|
79
108
|
const config = await loadConfig();
|
|
109
|
+
setPricingOverrides(config.pricing);
|
|
80
110
|
const parallelism = Math.max(1, config.parallelism ?? 1);
|
|
81
|
-
const definitions = this._scenes.map((s) =>
|
|
111
|
+
const definitions = this._scenes.map((s) => {
|
|
112
|
+
const def = s.toDefinition();
|
|
113
|
+
// Agent-level schema is the default; a scene-level schema wins.
|
|
114
|
+
if (!def.schema && this._schema)
|
|
115
|
+
def.schema = this._schema;
|
|
116
|
+
return def;
|
|
117
|
+
});
|
|
82
118
|
const orderedResults = new Array(definitions.length);
|
|
83
119
|
const total = definitions.length;
|
|
84
120
|
// Group scenes by suite for organized output
|
|
@@ -127,7 +163,19 @@ export class AgentContext {
|
|
|
127
163
|
const sigColor = sig >= 0.95 ? c.green : sig >= 0.80 ? c.yellow : c.red;
|
|
128
164
|
logger.info(`${indent} ${c.dim("significance:")} ${sigColor(`${(sig * 100).toFixed(1)}%`)} ${c.dim(`(pass rate: ${((result.passRate ?? 0) * 100).toFixed(1)}%)`)}`);
|
|
129
165
|
}
|
|
130
|
-
|
|
166
|
+
if (full && result.events && result.events.length > 0) {
|
|
167
|
+
const costLabel = result.costUsd != null
|
|
168
|
+
? ` ${c.dim("·")} ${c.green(`$${Number(result.costUsd.toFixed(4))}`)}`
|
|
169
|
+
: "";
|
|
170
|
+
const tokLabel = result.tokens
|
|
171
|
+
? ` ${c.dim(`(${result.tokens.input}→${result.tokens.output} tok)`)}`
|
|
172
|
+
: "";
|
|
173
|
+
logger.info(`${indent} ${c.dim("waterfall:")}${tokLabel}${costLabel}`);
|
|
174
|
+
for (const line of renderTerminalWaterfall(result.events, { indent: `${indent} ` })) {
|
|
175
|
+
logger.info(line);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
logger.debug(`${indent} response: ${resolveText(result.response).slice(0, 120)}`);
|
|
131
179
|
};
|
|
132
180
|
if (hasSuites) {
|
|
133
181
|
// Execute suite by suite — print header once, then run all scenes in that suite
|
|
@@ -170,14 +218,25 @@ export class AgentContext {
|
|
|
170
218
|
const successRate = results.length > 0
|
|
171
219
|
? Number((results.filter((r) => r.passed).length / results.length).toFixed(2))
|
|
172
220
|
: 0;
|
|
173
|
-
const
|
|
221
|
+
const sceneTokens = results
|
|
222
|
+
.map((r) => r.tokens ?? r.response.metadata?.tokens)
|
|
223
|
+
.filter((t) => t != null);
|
|
174
224
|
let averageInputTokensPerCase;
|
|
175
225
|
let averageOutputTokensPerCase;
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
226
|
+
let totalInputTokens;
|
|
227
|
+
let totalOutputTokens;
|
|
228
|
+
if (sceneTokens.length > 0) {
|
|
229
|
+
totalInputTokens = sceneTokens.reduce((s, t) => s + (t.input ?? 0), 0);
|
|
230
|
+
totalOutputTokens = sceneTokens.reduce((s, t) => s + (t.output ?? 0), 0);
|
|
231
|
+
averageInputTokensPerCase = Math.round(totalInputTokens / sceneTokens.length);
|
|
232
|
+
averageOutputTokensPerCase = Math.round(totalOutputTokens / sceneTokens.length);
|
|
180
233
|
}
|
|
234
|
+
const sceneCosts = results
|
|
235
|
+
.map((r) => r.costUsd)
|
|
236
|
+
.filter((c) => typeof c === "number");
|
|
237
|
+
const totalCostUsd = sceneCosts.length > 0
|
|
238
|
+
? sceneCosts.reduce((s, c) => s + c, 0)
|
|
239
|
+
: undefined;
|
|
181
240
|
const firstMeta = results.find((r) => r.response.metadata)?.response
|
|
182
241
|
.metadata;
|
|
183
242
|
const dimensions = {};
|
|
@@ -208,15 +267,27 @@ export class AgentContext {
|
|
|
208
267
|
totalCases: results.length,
|
|
209
268
|
averageInputTokensPerCase,
|
|
210
269
|
averageOutputTokensPerCase,
|
|
270
|
+
totalInputTokens,
|
|
271
|
+
totalOutputTokens,
|
|
272
|
+
totalCostUsd,
|
|
211
273
|
results,
|
|
212
274
|
};
|
|
213
275
|
if (report.systemPromptHash && firstMeta?.systemPrompt) {
|
|
214
276
|
await writeDiffEntry(report.systemPromptHash, firstMeta.systemPrompt, report.tools ?? [], report.model);
|
|
215
277
|
}
|
|
216
278
|
const formatted = formatReport(report);
|
|
217
|
-
|
|
279
|
+
// Default mode prints a one-line summary; `--full` dumps the whole report.
|
|
280
|
+
if (full) {
|
|
281
|
+
logger.info(formatted);
|
|
282
|
+
}
|
|
283
|
+
else {
|
|
284
|
+
const passed = results.filter((r) => r.passed).length;
|
|
285
|
+
const rateColor = successRate >= 0.95 ? c.green : successRate >= 0.5 ? c.yellow : c.red;
|
|
286
|
+
const costSummary = totalCostUsd != null ? ` ${c.dim("·")} ${c.green(`$${Number(totalCostUsd.toFixed(4))}`)}` : "";
|
|
287
|
+
logger.info(`${rateColor(`${passed}/${results.length} passed`)} ${c.dim(`(${(successRate * 100).toFixed(0)}%)`)} ${c.dim("·")} ${c.dim(`${Math.round(totalDuration)}ms`)}${costSummary}`);
|
|
288
|
+
}
|
|
218
289
|
const filepath = await writeReport(formatted, report.timestamp, report.name, report.dimensions);
|
|
219
|
-
logger.info(
|
|
290
|
+
logger.info(`${c.dim("Report saved to:")} ${c.cyan(filepath)}${full ? "" : c.dim(" (run with --full to print it)")}`);
|
|
220
291
|
return report;
|
|
221
292
|
}
|
|
222
293
|
}
|
|
@@ -227,6 +298,9 @@ function hashPrompt(prompt, model) {
|
|
|
227
298
|
export function hashPromptOnly(prompt) {
|
|
228
299
|
return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
|
|
229
300
|
}
|
|
301
|
+
// The active context is a runtime singleton holding an executor of arbitrary
|
|
302
|
+
// value type, so `any` is the honest type for the holder. The generic flows
|
|
303
|
+
// through `agent()` → `AgentContext<T>` → the report at the call site.
|
|
230
304
|
let currentContext = null;
|
|
231
305
|
export function setContext(ctx) {
|
|
232
306
|
currentContext = ctx;
|
|
package/dist/discover.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
export declare const DEFAULT_PATTERN = "**/*.agest.ts";
|
|
2
|
+
export interface DiscoverOptions {
|
|
3
|
+
pattern?: string;
|
|
4
|
+
cwd?: string;
|
|
5
|
+
}
|
|
6
|
+
/**
|
|
7
|
+
* Resolve a mix of file paths, directories, and glob patterns into a
|
|
8
|
+
* deduplicated, sorted list of absolute file paths.
|
|
9
|
+
*
|
|
10
|
+
* Rules per target:
|
|
11
|
+
* - directory: search recursively for `pattern` (default `**\/*.agest.ts`)
|
|
12
|
+
* - glob (contains *, ?, [], {}): expand it
|
|
13
|
+
* - file: use as-is
|
|
14
|
+
* - anything else: try as glob (zero matches is fine)
|
|
15
|
+
*/
|
|
16
|
+
export declare function discoverTestFiles(targets: string[], options?: DiscoverOptions): Promise<string[]>;
|
package/dist/discover.js
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { promises as fs } from "node:fs";
|
|
2
|
+
import { isAbsolute, resolve } from "node:path";
|
|
3
|
+
/** Glob applied inside a directory target when no `pattern` option is given. */
export const DEFAULT_PATTERN = "**/*.agest.ts";

/**
 * `fs.stat` that never throws: a path that cannot be stat'ed is reported as
 * neither a file nor a directory.
 *
 * @param {string} path
 * @returns {Promise<{ isFile: boolean, isDir: boolean }>}
 */
async function statSafe(path) {
    try {
        const stat = await fs.stat(path);
        return { isFile: stat.isFile(), isDir: stat.isDirectory() };
    }
    catch {
        return { isFile: false, isDir: false };
    }
}

/**
 * Expand `pattern` relative to `cwd` and return absolute paths.
 *
 * @param {string} pattern - Glob pattern.
 * @param {string} cwd - Directory the pattern is resolved against.
 * @returns {Promise<string[]>}
 */
async function expandGlob(pattern, cwd) {
    const out = [];
    // fs.promises.glob is available in Node >= 22 (the package's required engine).
    for await (const match of fs.glob(pattern, { cwd })) {
        out.push(isAbsolute(match) ? match : resolve(cwd, match));
    }
    return out;
}

/**
 * Resolve a mix of file paths, directories, and glob patterns into a
 * deduplicated, sorted list of absolute file paths.
 *
 * Rules per target (checked in this order):
 * - existing directory: search it recursively for `pattern`
 *   (default `**\/*.agest.ts`)
 * - existing file: use as-is
 * - anything else: expand it as a glob relative to `cwd` (zero matches is fine)
 *
 * The filesystem is consulted before the target is interpreted as a glob, so
 * a real path whose name contains glob characters (for example `app/[id]/`)
 * is found literally instead of being treated as a pattern. A directory is
 * searched with the directory itself as the glob `cwd`, so its name is never
 * parsed as part of the pattern.
 *
 * @param {string[]} targets - Files, directories, or globs; empty means `"."`.
 * @param {{ pattern?: string, cwd?: string }} [options]
 * @returns {Promise<string[]>} Sorted absolute paths without duplicates.
 */
export async function discoverTestFiles(targets, options = {}) {
    const cwd = options.cwd ?? process.cwd();
    const pattern = options.pattern ?? DEFAULT_PATTERN;
    const work = targets.length === 0 ? ["."] : targets;
    const found = new Set();
    for (const target of work) {
        const absolute = isAbsolute(target) ? target : resolve(cwd, target);
        const stat = await statSafe(absolute);
        if (stat.isDir) {
            for (const f of await expandGlob(pattern, absolute))
                found.add(f);
            continue;
        }
        if (stat.isFile) {
            found.add(absolute);
            continue;
        }
        for (const f of await expandGlob(target, cwd))
            found.add(f);
    }
    return [...found].sort();
}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,16 +1,26 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentReport, HookFn } from "./types";
|
|
2
2
|
import { SceneBuilder } from "./context";
|
|
3
|
+
import { type StandardSchemaV1, type InferOutput } from "./schema";
|
|
3
4
|
export { expect } from "./assertions";
|
|
5
|
+
export type { StandardSchemaV1, InferOutput } from "./schema";
|
|
4
6
|
export { logger } from "./logger";
|
|
5
7
|
export { defineConfig } from "./config";
|
|
8
|
+
export { createTrace, summarizeEvents } from "./adapters/tracing";
|
|
9
|
+
export type { Trace } from "./adapters/tracing";
|
|
6
10
|
export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
|
|
7
11
|
export type { LogLevel } from "./logger";
|
|
8
12
|
export type { AgentExpectation, AgentMatchers } from "./assertions";
|
|
9
13
|
export type { JudgeCriteria } from "./judge";
|
|
10
|
-
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
|
|
14
|
+
export type { AgentExecutor, ExecutorOptions, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, TimelineEvent, TimelineEventKind, CostBreakdown, CostSource, } from "./types";
|
|
11
15
|
export interface AgentOptions {
|
|
12
16
|
name?: string;
|
|
13
17
|
}
|
|
18
|
+
/**
|
|
19
|
+
* Registers a scene in the active agent. The variant passed to an `agent()`
|
|
20
|
+
* callback is typed `SceneFn<T>`, so `.expect("value", …)` receives the agent's
|
|
21
|
+
* native value type.
|
|
22
|
+
*/
|
|
23
|
+
export type SceneFn<T = string> = (prompt: string) => SceneBuilder<T>;
|
|
14
24
|
export declare function scene(prompt: string): SceneBuilder;
|
|
15
25
|
export declare function beforeAll(fn: HookFn): void;
|
|
16
26
|
export declare function afterAll(fn: HookFn): void;
|
|
@@ -19,4 +29,12 @@ export declare function afterEach(fn: HookFn): void;
|
|
|
19
29
|
export declare function suite(name: string, fn: () => void): void;
|
|
20
30
|
/** @internal reset auto-run state between tests */
|
|
21
31
|
export declare function _resetAutoRun(): void;
|
|
22
|
-
export declare function agent(executor: AgentExecutor
|
|
32
|
+
export declare function agent<T = string>(executor: AgentExecutor<T>, fn: (scene: SceneFn<T>) => void, options?: AgentOptions): Promise<AgentReport<T>>;
|
|
33
|
+
/**
|
|
34
|
+
* Schema-typed agent: the executor's `value` type is inferred from the schema
|
|
35
|
+
* (e.g. `z.infer<typeof Schema>`), and every non-refusal scene is validated
|
|
36
|
+
* against it. The scene fn passed to the callback is typed accordingly, so
|
|
37
|
+
* `.expect("value", …)` receives that value type. A scene's own
|
|
38
|
+
* `.expectSchema()` overrides the agent schema.
|
|
39
|
+
*/
|
|
40
|
+
export declare function agent<S extends StandardSchemaV1>(schema: S, executor: AgentExecutor<InferOutput<S>>, fn: (scene: SceneFn<InferOutput<S>>) => void, options?: AgentOptions): Promise<AgentReport<InferOutput<S>>>;
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import { AgentContext, setContext, getContext } from "./context";
|
|
2
|
+
import { isStandardSchema } from "./schema";
|
|
2
3
|
export { expect } from "./assertions";
|
|
3
4
|
export { logger } from "./logger";
|
|
4
5
|
export { defineConfig } from "./config";
|
|
6
|
+
export { createTrace, summarizeEvents } from "./adapters/tracing";
|
|
5
7
|
export function scene(prompt) {
|
|
6
8
|
return getContext().registerScene(prompt);
|
|
7
9
|
}
|
|
@@ -36,11 +38,16 @@ export function _resetAutoRun() {
|
|
|
36
38
|
autoRunScheduled = false;
|
|
37
39
|
executionChain = Promise.resolve();
|
|
38
40
|
}
|
|
39
|
-
export function agent(
|
|
40
|
-
const
|
|
41
|
+
export function agent(...args) {
|
|
42
|
+
const [schema, executor, fn, options] = isStandardSchema(args[0])
|
|
43
|
+
? args
|
|
44
|
+
: [undefined, ...args];
|
|
45
|
+
const ctx = new AgentContext(executor, options?.name, schema);
|
|
41
46
|
setContext(ctx);
|
|
42
47
|
try {
|
|
43
|
-
fn
|
|
48
|
+
// Hand the callback a scene fn bound to the active context. Its static type
|
|
49
|
+
// carries T (via the overloads); at runtime it's the same `scene()`.
|
|
50
|
+
fn(scene);
|
|
44
51
|
}
|
|
45
52
|
catch (err) {
|
|
46
53
|
setContext(null);
|
package/dist/match.d.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural matching primitives for deterministic assertions. Kept in their
|
|
3
|
+
* own module — they are correctness-critical (a wrong result here is a false
|
|
4
|
+
* test pass) and deserve isolated, exhaustive unit tests.
|
|
5
|
+
*/
|
|
6
|
+
/** Any non-null, non-array object — including class instances, Map, Date, etc. */
|
|
7
|
+
export declare function isObjectLike(value: unknown): value is Record<string, unknown>;
|
|
8
|
+
/**
|
|
9
|
+
* A "record" object — a plain `{...}` literal (prototype is Object.prototype or
|
|
10
|
+
* null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
|
|
11
|
+
* compared as opaque leaves rather than recursed into.
|
|
12
|
+
*/
|
|
13
|
+
export declare function isPlainObject(value: unknown): value is Record<string, unknown>;
|
|
14
|
+
/**
|
|
15
|
+
* Recursive containment: is `expected` structurally present within `actual`?
|
|
16
|
+
*
|
|
17
|
+
* - `expected` array → `actual` is an array and the expected elements can be
|
|
18
|
+
* matched one-to-one to DISTINCT actual elements (order-independent
|
|
19
|
+
* multiset/sub-multiset membership — duplicates require distinct matches).
|
|
20
|
+
* - `expected` plain object → `actual` is object-like and every key in
|
|
21
|
+
* `expected` exists in `actual` with a recursively-contained value (extra
|
|
22
|
+
* keys in `actual` are allowed — that is the "partial").
|
|
23
|
+
* - anything else (primitive, Date, Map, RegExp, class instance) → strict
|
|
24
|
+
* deep equality via `isDeepStrictEqual` (correct for NaN / Date / ±0).
|
|
25
|
+
*
|
|
26
|
+
* Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
|
|
27
|
+
*/
|
|
28
|
+
export declare function structuralContains(actual: unknown, expected: unknown): boolean;
|
package/dist/match.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { isDeepStrictEqual } from "node:util";
|
|
2
|
+
/**
|
|
3
|
+
* Structural matching primitives for deterministic assertions. Kept in their
|
|
4
|
+
* own module — they are correctness-critical (a wrong result here is a false
|
|
5
|
+
* test pass) and deserve isolated, exhaustive unit tests.
|
|
6
|
+
*/
|
|
7
|
+
/** Any non-null, non-array object — including class instances, Map, Date, etc. */
export function isObjectLike(value) {
    return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * A "record" object — a plain `{...}` literal (prototype is Object.prototype or
 * null). Class instances, Map, Date, RegExp, etc. are NOT plain: they are
 * compared as opaque leaves rather than recursed into.
 */
export function isPlainObject(value) {
    if (!isObjectLike(value))
        return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
}

/**
 * Decide whether every element of `expected` can be paired with a DISTINCT
 * element of `actual` that structurally contains it.
 *
 * First-fit greedy pairing is not enough: an early, permissive expected
 * element can claim the only actual element a later, stricter one needs
 * (actual `[{a:1,b:2},{a:1}]`, expected `[{a:1},{a:1,b:2}]`). This uses
 * augmenting paths: when the wanted slot is taken, the current owner is asked
 * to move to another compatible slot.
 *
 * @param {unknown[]} actual
 * @param {unknown[]} expected
 * @returns {boolean}
 */
function hasDistinctMatching(actual, expected) {
    if (expected.length > actual.length)
        return false;
    // candidates[e] = indices of actual elements that contain expected[e].
    // Computed once so the recursive comparison is not repeated while pairing.
    const candidates = [];
    for (let e = 0; e < expected.length; e++) {
        const indices = [];
        for (let i = 0; i < actual.length; i++) {
            if (structuralContains(actual[i], expected[e]))
                indices.push(i);
        }
        if (indices.length === 0)
            return false;
        candidates.push(indices);
    }
    // owner[i] = index of the expected element currently paired with actual[i].
    const owner = new Array(actual.length).fill(-1);
    const tryAssign = (e, visited) => {
        for (const i of candidates[e]) {
            if (visited.has(i))
                continue;
            visited.add(i);
            if (owner[i] === -1 || tryAssign(owner[i], visited)) {
                owner[i] = e;
                return true;
            }
        }
        return false;
    };
    for (let e = 0; e < expected.length; e++) {
        if (!tryAssign(e, new Set()))
            return false;
    }
    return true;
}

/**
 * Recursive containment: is `expected` structurally present within `actual`?
 *
 * - `expected` array → `actual` is an array and the expected elements can be
 *   matched one-to-one to DISTINCT actual elements (order-independent
 *   multiset/sub-multiset membership — duplicates require distinct matches,
 *   so `[1]` does not contain `[1, 1]`).
 * - `expected` plain object → `actual` is object-like and every key in
 *   `expected` exists in `actual` with a recursively-contained value (extra
 *   keys in `actual` are allowed — that is the "partial").
 * - anything else (primitive, Date, Map, RegExp, class instance) → strict
 *   deep equality via `isDeepStrictEqual`.
 *
 * Leaf comparison is EXACT and case-sensitive. Only the shape recurses.
 *
 * @param {unknown} actual
 * @param {unknown} expected
 * @returns {boolean}
 */
export function structuralContains(actual, expected) {
    if (Array.isArray(expected)) {
        return Array.isArray(actual) && hasDistinctMatching(actual, expected);
    }
    if (isPlainObject(expected)) {
        if (!isObjectLike(actual))
            return false;
        return Object.keys(expected).every((key) => key in actual && structuralContains(actual[key], expected[key]));
    }
    return isDeepStrictEqual(actual, expected);
}
|