@sebastiantuyu/agest 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -6
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/remote.d.ts +64 -0
- package/dist/adapters/remote.js +133 -0
- package/dist/assertions.d.ts +7 -0
- package/dist/assertions.js +9 -0
- package/dist/config.d.ts +19 -0
- package/dist/config.js +19 -0
- package/dist/context.d.ts +7 -1
- package/dist/context.js +60 -18
- package/dist/index.d.ts +10 -2
- package/dist/index.js +27 -4
- package/dist/judge.d.ts +9 -0
- package/dist/judge.js +101 -0
- package/dist/preview.d.ts +1 -0
- package/dist/preview.js +777 -0
- package/dist/reporter.d.ts +2 -1
- package/dist/reporter.js +49 -14
- package/dist/reports.d.ts +78 -0
- package/dist/reports.js +278 -0
- package/dist/runner.d.ts +2 -1
- package/dist/runner.js +49 -4
- package/dist/stats.js +222 -65
- package/dist/types.d.ts +12 -0
- package/package.json +12 -3
package/README.md
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Agest
|
|
2
2
|
|
|
3
|
+
[![Publish](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml/badge.svg)](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml)
|
|
4
|
+
|
|
3
5
|
A quantitative testing library for agents using a Jest-like syntax.
|
|
4
6
|
Batteries included.
|
|
5
7
|
|
|
@@ -108,12 +110,28 @@ npx tsx examples/openrouter.test.ts
|
|
|
108
110
|
|
|
109
111
|
## Roadmap
|
|
110
112
|
|
|
111
|
-
|
|
112
|
-
- [
|
|
113
|
-
- [
|
|
114
|
-
- [
|
|
115
|
-
- [
|
|
116
|
-
- [
|
|
113
|
+
### Shipped
|
|
114
|
+
- [x] Multi-turn support: `.turns(n)` per scene
|
|
115
|
+
- [x] LLM-as-judge: `.judgedBy({ criteria, failWhen })`
|
|
116
|
+
- [x] Remote HTTP adapter for framework-agnostic testing
|
|
117
|
+
- [x] Report persistence to `.reports/` with YAML format
|
|
118
|
+
- [x] Stats CLI with multi-model comparison and dimension analysis
|
|
119
|
+
|
|
120
|
+
### Up next
|
|
121
|
+
- [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
|
|
122
|
+
- [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
|
|
123
|
+
- [ ] Statistical runs: `.runs(n)` per scene with mean/stddev reporting
|
|
124
|
+
- [ ] Vercel AI SDK adapter
|
|
125
|
+
- [ ] Snapshot regression: diff current run against a saved baseline
|
|
126
|
+
|
|
127
|
+
### Planned
|
|
128
|
+
- [ ] Cost estimation per scene (token count to dollar cost)
|
|
129
|
+
- [ ] CI/CD reporter (GitHub Actions PR comments)
|
|
130
|
+
- [ ] Tool-call trajectory assertions
|
|
131
|
+
- [ ] Watch mode for TDD-style iteration
|
|
132
|
+
- [ ] OpenAI Agents SDK adapter
|
|
133
|
+
- [ ] Webhook/n8n adapter for no-code agent sources
|
|
134
|
+
- [ ] Jest/Vitest custom matcher export
|
|
117
135
|
|
|
118
136
|
## Development requirements
|
|
119
137
|
- Node 22+
|
package/dist/adapters/index.d.ts
CHANGED
package/dist/adapters/index.js
CHANGED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import type { AgentExecutor, AgentResponse } from "../types";
|
|
2
|
+
/** Options accepted by the `remote` HTTP adapter. */
export interface RemoteAdapterOptions {
    /** Extra HTTP headers sent with every request (for example `Authorization`). */
    headers?: Record<string, string>;
    /**
     * HTTP verb used for the request; `POST` when omitted.
     * With `GET` no request body is built or sent, so `buildRequest` is not called.
     */
    method?: "POST" | "PUT" | "GET";
    /**
     * Additional fields folded into the request body.
     * They sit *underneath* the value returned by `buildRequest`, so on a key
     * clash the `buildRequest` value is the one that is sent.
     * Has no effect when `method` is `GET`.
     */
    body?: Record<string, unknown>;
    /**
     * Turns the input prompt into the request payload.
     * When omitted the payload is `{ prompt: input }`.
     */
    buildRequest?: (input: string) => unknown;
    /**
     * Converts the decoded response body into an `AgentResponse`.
     * Without it the adapter probes a few well-known shapes:
     * - a plain string body
     * - `{ text }` / `{ response }` / `{ output }` / `{ message }` / `{ content }` / `{ answer }`
     * - the same keys nested one level down under `data` or `result`
     */
    parseResponse?: (body: unknown) => AgentResponse;
    /**
     * Fixed metadata describing the remote agent.
     * The endpoint is opaque to the adapter, so details such as the model
     * name, available tools and system prompt have to be declared here by hand.
     */
    metadata?: {
        model?: string;
        tokens?: {
            input: number;
            output: number;
        };
        tools?: string[];
        systemPrompt?: string;
        [key: string]: unknown;
    };
}
|
|
40
|
+
/**
 * Creates an executor that talks to an agent served behind an HTTP endpoint.
 *
 * The remote agent is treated as a black box: anything the report should
 * know about it (model, tools, system prompt, ...) has to be passed in via
 * `options.metadata`. When the endpoint reports token usage or similar data
 * in its response, supply `options.parseResponse` to pull it out.
 *
 * @param endpoint URL the request is sent to.
 * @param options  Request/response customisation; see `RemoteAdapterOptions`.
 * @returns An executor suitable for passing to `agent(...)`.
 *
 * @example
 * ```ts
 * import { remote } from "agest/adapters";
 *
 * const executor = remote("https://my-agent.example.com/chat", {
 *   headers: { Authorization: "Bearer sk-..." },
 *   metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
 * });
 *
 * await agent(executor, () => {
 *   scene("What is 2+2?").expect("response", (r) => {
 *     expect(r).toBe.containing("4");
 *   });
 * });
 * ```
 */
export declare function remote(endpoint: string, options?: RemoteAdapterOptions): AgentExecutor;
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
 * Creates an executor that forwards each prompt to a remote agent over HTTP.
 *
 * The remote agent is a black box, so metadata (model, tools, etc.) must be
 * supplied through `options.metadata`. If the endpoint returns token usage or
 * other metadata, provide `options.parseResponse` to extract it.
 *
 * The returned executor never rejects for transport-level problems: a failed
 * request, a non-2xx status, or an undecodable response body all resolve to
 * `{ text: "", executionError, metadata }`. An exception thrown by a
 * user-supplied `parseResponse` is NOT caught and propagates to the caller.
 *
 * @param endpoint URL the request is sent to.
 * @param options  `headers`, `method` (default "POST"), `body`, `buildRequest`,
 *                 `parseResponse` and static `metadata`.
 * @returns async `(input) => response` executor.
 *
 * @example
 * ```ts
 * import { remote } from "agest/adapters";
 *
 * const executor = remote("https://my-agent.example.com/chat", {
 *   headers: { Authorization: "Bearer sk-..." },
 *   metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
 * });
 *
 * await agent(executor, () => {
 *   scene("What is 2+2?").expect("response", (r) => {
 *     expect(r).toBe.containing("4");
 *   });
 * });
 * ```
 */
export function remote(endpoint, options = {}) {
    const { headers = {}, method = "POST", body: extraBody, buildRequest = defaultBuildRequest, parseResponse, metadata: staticMetadata, } = options;
    // Shared shape for every failure path: empty text plus a description.
    const failure = (message) => ({
        text: "",
        executionError: message,
        metadata: staticMetadata,
    });
    // Anything can be thrown in JavaScript; only Error instances carry `.message`.
    const describe = (err) => (err instanceof Error ? err.message : String(err));
    // `options.body` goes underneath the built payload, so the payload wins on conflicts.
    const mergeBody = (built) => {
        if (!extraBody)
            return built;
        if (typeof built !== "object")
            return { ...extraBody, prompt: built };
        if (built === null)
            return built;
        return { ...extraBody, ...built };
    };
    return async (input) => {
        let res;
        try {
            const init = {
                method,
                headers: { "Content-Type": "application/json", ...headers },
            };
            // GET requests carry no body, so the payload is not even built for them.
            res = await fetch(endpoint, method === "GET"
                ? init
                : { ...init, body: JSON.stringify(mergeBody(buildRequest(input))) });
        }
        catch (err) {
            return failure(`Request failed: ${describe(err)}`);
        }
        if (!res.ok) {
            return failure(`HTTP ${res.status}: ${res.statusText}`);
        }
        let body;
        try {
            // Media types are case-insensitive, so compare in lower case.
            const contentType = (res.headers.get("content-type") ?? "").toLowerCase();
            body = contentType.includes("application/json")
                ? await res.json()
                : await res.text();
        }
        catch (err) {
            // e.g. a JSON content-type with a malformed payload
            return failure(`Invalid response body: ${describe(err)}`);
        }
        if (parseResponse) {
            const parsed = parseResponse(body);
            return {
                ...parsed,
                // Values extracted from the response override the static ones.
                metadata: { ...staticMetadata, ...parsed.metadata },
            };
        }
        return {
            text: extractText(body),
            metadata: {
                ...staticMetadata,
                ...extractResponseMetadata(body),
            },
        };
    };
}
|
|
84
|
+
/**
 * Fallback request builder used when `options.buildRequest` is not given:
 * wraps the prompt under a single `prompt` key.
 */
function defaultBuildRequest(input) {
    const payload = { prompt: input };
    return payload;
}
|
|
87
|
+
/**
 * Best-effort extraction of the reply text from a decoded response body.
 *
 * Lookup order:
 *  1. a string body is returned unchanged;
 *  2. any other non-object (or null) is stringified with `String`;
 *  3. the first string found under text/response/output/message/content/answer;
 *  4. the same keys one level down inside `data`, then `result`;
 *  5. otherwise the whole body serialised as JSON.
 */
function extractText(body) {
    if (typeof body === "string")
        return body;
    if (body === null || typeof body !== "object")
        return String(body);
    const textKeys = ["text", "response", "output", "message", "content", "answer"];
    // Returns the first string-valued candidate key of `source`, or undefined.
    const pick = (source) => {
        const hit = textKeys.find((key) => typeof source[key] === "string");
        return hit === undefined ? undefined : source[hit];
    };
    const direct = pick(body);
    if (direct !== undefined)
        return direct;
    // Try nested: { data: { text } }, { result: { output } }
    for (const wrapper of ["data", "result"]) {
        const inner = body[wrapper];
        if (inner === null || typeof inner !== "object")
            continue;
        const nested = pick(inner);
        if (nested !== undefined)
            return nested;
    }
    return JSON.stringify(body);
}
|
|
109
|
+
/**
 * Pulls commonly used metadata out of a decoded response body.
 *
 * Recognised fields:
 *  - `model` (string)
 *  - token usage, taken from the first object-valued field among `usage`,
 *    `token_usage`, `tokens`, `metadata.usage`, `metadata.tokens`;
 *    input is read from input_tokens / prompt_tokens / promptTokens / input,
 *    output from output_tokens / completion_tokens / completionTokens / output
 *  - `refusal` (boolean)
 *
 * Token counts are only accepted when they are finite numbers (numeric
 * strings are converted), so `tokens.input` / `tokens.output` are always
 * numbers. A side that is missing while the other is present defaults to 0.
 *
 * @param body decoded response body of any type
 * @returns an object with the fields that were found, or `undefined` when
 *          the body is not an object or nothing was recognised
 */
function extractResponseMetadata(body) {
    if (body === null || typeof body !== "object")
        return undefined;
    const isObject = (value) => value !== null && typeof value === "object";
    // Normalises one candidate to a finite number, or undefined when unusable.
    const toCount = (value) => {
        if (typeof value === "number")
            return Number.isFinite(value) ? value : undefined;
        if (typeof value === "string" && value.trim() !== "") {
            const numeric = Number(value);
            return Number.isFinite(numeric) ? numeric : undefined;
        }
        return undefined;
    };
    const firstCount = (candidates) => {
        for (const candidate of candidates) {
            const count = toCount(candidate);
            if (count !== undefined)
                return count;
        }
        return undefined;
    };
    const nested = isObject(body.metadata) ? body.metadata : {};
    // Skip non-object candidates (e.g. `tokens: 42`) so a later, usable
    // usage object is still found.
    const usage = [body.usage, body.token_usage, body.tokens, nested.usage, nested.tokens].find(isObject);
    const entries = [];
    if (typeof body.model === "string")
        entries.push(["model", body.model]);
    if (usage !== undefined) {
        const input = firstCount([usage.input_tokens, usage.prompt_tokens, usage.promptTokens, usage.input]);
        const output = firstCount([usage.output_tokens, usage.completion_tokens, usage.completionTokens, usage.output]);
        if (input !== undefined || output !== undefined) {
            entries.push(["tokens", { input: input ?? 0, output: output ?? 0 }]);
        }
    }
    if (typeof body.refusal === "boolean")
        entries.push(["refusal", body.refusal]);
    return entries.length > 0 ? Object.fromEntries(entries) : undefined;
}
|
package/dist/assertions.d.ts
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
|
+
import type { JudgeCriteria } from "./judge";
|
|
2
|
+
export interface PendingJudgement {
|
|
3
|
+
value: unknown;
|
|
4
|
+
criteria: JudgeCriteria;
|
|
5
|
+
}
|
|
6
|
+
export declare function collectPendingJudgements(): PendingJudgement[];
|
|
1
7
|
export interface AgentMatchers {
|
|
2
8
|
refusal(): void;
|
|
3
9
|
notRefusal(): void;
|
|
4
10
|
containing(text: string): void;
|
|
5
11
|
matchingPattern(regex: RegExp): void;
|
|
12
|
+
judgedBy(criteria: JudgeCriteria): void;
|
|
6
13
|
}
|
|
7
14
|
export interface AgentExpectation {
|
|
8
15
|
readonly toBe: AgentMatchers;
|
package/dist/assertions.js
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
import { isRefusal } from "./refusal";
|
|
2
|
+
// Judgements queued by `judgedBy`, stored as `{ value, criteria }` entries
// until the next collectPendingJudgements() call drains them.
let pendingJudgements = [];
/**
 * Drains the queue: returns every judgement recorded since the previous call
 * and leaves a fresh, empty queue behind.
 */
export function collectPendingJudgements() {
    try {
        return pendingJudgements;
    }
    finally {
        // Runs after the return value is captured, so the caller gets the old array.
        pendingJudgements = [];
    }
}
|
|
2
8
|
export function expect(value) {
|
|
3
9
|
return {
|
|
4
10
|
get toBe() {
|
|
@@ -31,6 +37,9 @@ export function expect(value) {
|
|
|
31
37
|
throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
|
|
32
38
|
}
|
|
33
39
|
},
|
|
40
|
+
judgedBy(criteria) {
|
|
41
|
+
pendingJudgements.push({ value, criteria });
|
|
42
|
+
},
|
|
34
43
|
};
|
|
35
44
|
},
|
|
36
45
|
};
|
package/dist/config.d.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Sends a fully rendered judge prompt to a model and resolves with its raw text reply. */
export type JudgeExecutor = (prompt: string) => Promise<string>;
/** Settings for the LLM-as-judge backend. */
export interface JudgeConfig {
    /** Model id sent to the OpenAI-compatible API; `"openai/gpt-oss-20b"` when omitted. */
    model?: string;
    /** API key; falls back to the `OPENROUTER_API_KEY`, then `OPENAI_API_KEY` environment variables. */
    apiKey?: string;
    /** Base URL of the chat completions API; `"https://openrouter.ai/api/v1"` when omitted. */
    baseUrl?: string;
    /** Custom judge function. When set, `model`, `apiKey` and `baseUrl` are not used. */
    executor?: JudgeExecutor;
}
/** Project-level configuration, read from `agest.config.ts` / `agest.config.js`. */
export interface AgestConfig {
    /** Number of scenes run concurrently; 1 when omitted, and values below 1 are raised to 1. */
    parallelism?: number;
    /** Default scene timeout handed to the runner (presumably milliseconds; confirm against the runner). */
    timeout?: number;
    /** Default turn count handed to the runner for each scene. */
    turns?: number;
    /** Judge backend settings used for `judgedBy` assertions. */
    judge?: JudgeConfig;
}
/** Identity helper that gives a config literal the `AgestConfig` type. */
export declare function defineConfig(config: AgestConfig): AgestConfig;
/**
 * Loads the first importable of `agest.config.ts` and `agest.config.js` from
 * the current working directory; resolves to `{}` when neither can be loaded.
 */
export declare function loadConfig(): Promise<AgestConfig>;
|
package/dist/config.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
/**
 * Identity helper for config files: returns the very object it was given.
 * It exists so a config literal can be wrapped in a call and picked up by
 * editor tooling.
 */
export function defineConfig(config) {
    const definition = config;
    return definition;
}
|
|
5
|
+
/**
 * Loads the project configuration from the current working directory.
 *
 * Tries `agest.config.ts` first and `agest.config.js` second, returning the
 * module's default export (or the module namespace when there is none) of
 * the first candidate that imports successfully.
 *
 * Loading is deliberately best-effort: a candidate that is missing or cannot
 * be imported is skipped silently so the next candidate still gets a chance
 * (e.g. falling back from `.ts` to `.js` when the `.ts` file cannot be
 * imported). NOTE(review): this also hides genuine errors thrown from inside
 * a config file.
 *
 * @returns the loaded config, or `{}` when no candidate could be loaded
 */
export async function loadConfig() {
    // import() parses its argument as a URL, so a raw Windows path ("C:\...")
    // is read as scheme "c:" and rejected, and characters such as "#" or "?"
    // in any path are misinterpreted. A file:// URL avoids both problems.
    const { pathToFileURL } = await import("node:url");
    const cwd = process.cwd();
    for (const filename of ["agest.config.ts", "agest.config.js"]) {
        const specifier = pathToFileURL(path.join(cwd, filename)).href;
        try {
            const mod = await import(specifier);
            return (mod.default ?? mod);
        }
        catch {
            // Not present or not importable: try the next candidate.
        }
    }
    return {};
}
|
package/dist/context.d.ts
CHANGED
|
@@ -2,16 +2,22 @@ import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
|
|
|
2
2
|
export declare class SceneBuilder {
|
|
3
3
|
private _prompt;
|
|
4
4
|
private _assertions;
|
|
5
|
+
private _timeout?;
|
|
6
|
+
private _turns?;
|
|
5
7
|
constructor(_prompt: string);
|
|
8
|
+
timeout(ms: number): SceneBuilder;
|
|
9
|
+
turns(n: number): SceneBuilder;
|
|
6
10
|
expect(field: string, fn: (value: any) => void): SceneBuilder;
|
|
7
11
|
toDefinition(): SceneDefinition;
|
|
8
12
|
}
|
|
9
13
|
export declare class AgentContext {
|
|
10
14
|
private _executor;
|
|
15
|
+
private _name?;
|
|
11
16
|
private _scenes;
|
|
12
|
-
constructor(_executor: AgentExecutor);
|
|
17
|
+
constructor(_executor: AgentExecutor, _name?: string | undefined);
|
|
13
18
|
registerScene(prompt: string): SceneBuilder;
|
|
14
19
|
execute(): Promise<AgentReport>;
|
|
15
20
|
}
|
|
21
|
+
export declare function hashPromptOnly(prompt: string): string;
|
|
16
22
|
export declare function setContext(ctx: AgentContext | null): void;
|
|
17
23
|
export declare function getContext(): AgentContext;
|
package/dist/context.js
CHANGED
|
@@ -1,26 +1,40 @@
|
|
|
1
1
|
import { createHash } from "crypto";
|
|
2
2
|
import { executeScene } from "./runner";
|
|
3
|
-
import { formatReport, writeReport } from "./reporter";
|
|
3
|
+
import { formatReport, writeReport, writeDiffEntry } from "./reporter";
|
|
4
4
|
import { logger, c } from "./logger";
|
|
5
|
+
import { loadConfig } from "./config";
|
|
6
|
+
import { PromisePool } from "@supercharge/promise-pool";
|
|
5
7
|
export class SceneBuilder {
|
|
6
8
|
_prompt;
|
|
7
9
|
_assertions = [];
|
|
10
|
+
_timeout;
|
|
11
|
+
_turns;
|
|
8
12
|
constructor(_prompt) {
|
|
9
13
|
this._prompt = _prompt;
|
|
10
14
|
}
|
|
15
|
+
timeout(ms) {
|
|
16
|
+
this._timeout = ms;
|
|
17
|
+
return this;
|
|
18
|
+
}
|
|
19
|
+
turns(n) {
|
|
20
|
+
this._turns = n;
|
|
21
|
+
return this;
|
|
22
|
+
}
|
|
11
23
|
expect(field, fn) {
|
|
12
24
|
this._assertions.push({ field, fn });
|
|
13
25
|
return this;
|
|
14
26
|
}
|
|
15
27
|
toDefinition() {
|
|
16
|
-
return { prompt: this._prompt, assertions: [...this._assertions] };
|
|
28
|
+
return { prompt: this._prompt, assertions: [...this._assertions], timeout: this._timeout, turns: this._turns };
|
|
17
29
|
}
|
|
18
30
|
}
|
|
19
31
|
export class AgentContext {
|
|
20
32
|
_executor;
|
|
33
|
+
_name;
|
|
21
34
|
_scenes = [];
|
|
22
|
-
constructor(_executor) {
|
|
35
|
+
constructor(_executor, _name) {
|
|
23
36
|
this._executor = _executor;
|
|
37
|
+
this._name = _name;
|
|
24
38
|
}
|
|
25
39
|
registerScene(prompt) {
|
|
26
40
|
const builder = new SceneBuilder(prompt);
|
|
@@ -28,32 +42,39 @@ export class AgentContext {
|
|
|
28
42
|
return builder;
|
|
29
43
|
}
|
|
30
44
|
async execute() {
|
|
45
|
+
const config = await loadConfig();
|
|
46
|
+
const parallelism = Math.max(1, config.parallelism ?? 1);
|
|
31
47
|
const definitions = this._scenes.map((s) => s.toDefinition());
|
|
32
|
-
const
|
|
33
|
-
let totalDuration = 0;
|
|
48
|
+
const orderedResults = new Array(definitions.length);
|
|
34
49
|
const total = definitions.length;
|
|
35
|
-
logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}...\n`));
|
|
36
|
-
|
|
37
|
-
const scene = definitions[i];
|
|
50
|
+
logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
|
|
51
|
+
const tasks = definitions.map((scene, i) => async () => {
|
|
38
52
|
const label = scene.prompt.length > 60
|
|
39
53
|
? scene.prompt.slice(0, 57) + "..."
|
|
40
54
|
: scene.prompt;
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
results.push(result);
|
|
44
|
-
totalDuration += result.duration;
|
|
55
|
+
const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
|
|
56
|
+
orderedResults[i] = result;
|
|
45
57
|
const ms = result.duration.toFixed(0);
|
|
46
58
|
if (result.passed) {
|
|
47
|
-
logger.info(c.
|
|
59
|
+
logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}`);
|
|
60
|
+
}
|
|
61
|
+
else if (result.judgement?.verdict === "partial") {
|
|
62
|
+
logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}`);
|
|
63
|
+
if (result.error) {
|
|
64
|
+
logger.info(` ${c.yellow(result.error)}`);
|
|
65
|
+
}
|
|
48
66
|
}
|
|
49
67
|
else {
|
|
50
|
-
logger.info(c.
|
|
68
|
+
logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}`);
|
|
51
69
|
if (result.error) {
|
|
52
70
|
logger.info(` ${c.red(result.error)}`);
|
|
53
71
|
}
|
|
54
72
|
}
|
|
55
73
|
logger.debug(` response: ${result.response.text?.slice(0, 120)}`);
|
|
56
|
-
}
|
|
74
|
+
});
|
|
75
|
+
await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
|
|
76
|
+
const results = orderedResults;
|
|
77
|
+
let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
|
|
57
78
|
logger.info("");
|
|
58
79
|
const failedResults = results.filter((r) => !r.passed);
|
|
59
80
|
const failedCases = failedResults.map((r) => r.prompt);
|
|
@@ -75,11 +96,25 @@ export class AgentContext {
|
|
|
75
96
|
}
|
|
76
97
|
const firstMeta = results.find((r) => r.response.metadata)?.response
|
|
77
98
|
.metadata;
|
|
99
|
+
const dimensions = {};
|
|
100
|
+
if (firstMeta?.model)
|
|
101
|
+
dimensions.model = firstMeta.model;
|
|
102
|
+
if (firstMeta?.systemPrompt)
|
|
103
|
+
dimensions.prompt = hashPromptOnly(firstMeta.systemPrompt);
|
|
104
|
+
if (firstMeta?.tools?.length)
|
|
105
|
+
dimensions.tools = [...firstMeta.tools].sort().join(",");
|
|
106
|
+
else
|
|
107
|
+
dimensions.tools = "none";
|
|
78
108
|
const report = {
|
|
109
|
+
name: this._name,
|
|
79
110
|
model: firstMeta?.model,
|
|
80
111
|
systemPromptHash: firstMeta?.systemPrompt
|
|
81
|
-
? hashPrompt(firstMeta.systemPrompt)
|
|
112
|
+
? hashPrompt(firstMeta.systemPrompt, firstMeta.model)
|
|
113
|
+
: undefined,
|
|
114
|
+
promptHash: firstMeta?.systemPrompt
|
|
115
|
+
? hashPromptOnly(firstMeta.systemPrompt)
|
|
82
116
|
: undefined,
|
|
117
|
+
dimensions,
|
|
83
118
|
tools: firstMeta?.tools,
|
|
84
119
|
successRate,
|
|
85
120
|
failedCases,
|
|
@@ -91,14 +126,21 @@ export class AgentContext {
|
|
|
91
126
|
averageOutputTokensPerCase,
|
|
92
127
|
results,
|
|
93
128
|
};
|
|
129
|
+
if (report.systemPromptHash && firstMeta?.systemPrompt) {
|
|
130
|
+
await writeDiffEntry(report.systemPromptHash, firstMeta.systemPrompt, report.tools ?? [], report.model);
|
|
131
|
+
}
|
|
94
132
|
const formatted = formatReport(report);
|
|
95
133
|
logger.info(formatted);
|
|
96
|
-
const filepath = await writeReport(formatted, report.timestamp);
|
|
134
|
+
const filepath = await writeReport(formatted, report.timestamp, report.name, report.dimensions);
|
|
97
135
|
logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
|
|
98
136
|
return report;
|
|
99
137
|
}
|
|
100
138
|
}
|
|
101
|
-
function hashPrompt(prompt) {
|
|
139
|
+
/**
 * Short fingerprint of a system prompt, optionally scoped to a model.
 * With a truthy `model` the hashed text is `"<model>:<prompt>"`, otherwise
 * just the prompt. Returns the first 12 hex characters of the SHA-256 digest.
 */
function hashPrompt(prompt, model) {
    const hasher = createHash("sha256");
    hasher.update(model ? `${model}:${prompt}` : prompt);
    const hex = hasher.digest("hex");
    return hex.slice(0, 12);
}
|
|
143
|
+
/**
 * Model-independent fingerprint of a prompt: the first 12 hex characters of
 * its SHA-256 digest.
 */
export function hashPromptOnly(prompt) {
    const hasher = createHash("sha256");
    hasher.update(prompt);
    const hex = hasher.digest("hex");
    return hex.slice(0, 12);
}
|
|
104
146
|
let currentContext = null;
|
package/dist/index.d.ts
CHANGED
|
@@ -2,8 +2,16 @@ import type { AgentExecutor, AgentReport } from "./types";
|
|
|
2
2
|
import { SceneBuilder } from "./context";
|
|
3
3
|
export { expect } from "./assertions";
|
|
4
4
|
export { logger } from "./logger";
|
|
5
|
+
export { defineConfig } from "./config";
|
|
6
|
+
export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
|
|
5
7
|
export type { LogLevel } from "./logger";
|
|
6
8
|
export type { AgentExpectation, AgentMatchers } from "./assertions";
|
|
7
|
-
export type {
|
|
9
|
+
export type { JudgeCriteria } from "./judge";
|
|
10
|
+
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
|
|
11
|
+
export interface AgentOptions {
|
|
12
|
+
name?: string;
|
|
13
|
+
}
|
|
8
14
|
export declare function scene(prompt: string): SceneBuilder;
|
|
9
|
-
|
|
15
|
+
/** @internal reset auto-run state between tests */
|
|
16
|
+
export declare function _resetAutoRun(): void;
|
|
17
|
+
export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
|
package/dist/index.js
CHANGED
|
@@ -1,17 +1,40 @@
|
|
|
1
1
|
import { AgentContext, setContext, getContext } from "./context";
|
|
2
2
|
export { expect } from "./assertions";
|
|
3
3
|
export { logger } from "./logger";
|
|
4
|
+
export { defineConfig } from "./config";
|
|
4
5
|
/**
 * Registers a scene for `prompt` on the context returned by `getContext()`
 * and hands back the builder produced by `registerScene`.
 */
export function scene(prompt) {
    const activeContext = getContext();
    return activeContext.registerScene(prompt);
}
|
|
7
|
-
|
|
8
|
-
|
|
8
|
+
// Promises returned by `ctx.execute()` for the `agent()` calls made so far.
const pendingAgents = [];
// Set once the nextTick hook that awaits `pendingAgents` has been scheduled.
let autoRunScheduled = false;
/** @internal reset auto-run state between tests */
export function _resetAutoRun() {
    autoRunScheduled = false;
    // Empty the array in place; other code keeps referring to this same array.
    pendingAgents.splice(0, pendingAgents.length);
}
|
|
15
|
+
export function agent(executor, fn, options) {
|
|
16
|
+
const ctx = new AgentContext(executor, options?.name);
|
|
9
17
|
setContext(ctx);
|
|
10
18
|
try {
|
|
11
19
|
fn();
|
|
12
20
|
}
|
|
13
|
-
|
|
21
|
+
catch (err) {
|
|
14
22
|
setContext(null);
|
|
23
|
+
return Promise.reject(err);
|
|
24
|
+
}
|
|
25
|
+
setContext(null);
|
|
26
|
+
const promise = ctx.execute();
|
|
27
|
+
pendingAgents.push(promise);
|
|
28
|
+
if (!autoRunScheduled) {
|
|
29
|
+
autoRunScheduled = true;
|
|
30
|
+
process.nextTick(async () => {
|
|
31
|
+
try {
|
|
32
|
+
await Promise.all(pendingAgents);
|
|
33
|
+
}
|
|
34
|
+
catch {
|
|
35
|
+
process.exitCode = 1;
|
|
36
|
+
}
|
|
37
|
+
});
|
|
15
38
|
}
|
|
16
|
-
return
|
|
39
|
+
return promise;
|
|
17
40
|
}
|
package/dist/judge.d.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { JudgeResult } from "./types";
|
|
2
|
+
import type { JudgeConfig, JudgeExecutor } from "./config";
|
|
3
|
+
/**
 * Chooses the function used to query the judge: `config.executor` when one is
 * provided, otherwise an executor that POSTs to `<baseUrl>/chat/completions`
 * using `config.model` / `config.apiKey`.
 */
export declare function resolveJudgeExecutor(config: JudgeConfig): JudgeExecutor;
/** What the judge is asked to evaluate an agent response against. */
export interface JudgeCriteria {
    /** Shown to the judge under the "Success Criteria" heading. */
    criteria: string;
    /** Shown to the judge under the "Failure Conditions" heading. */
    failWhen: string;
    /** Optional background, appended under "Additional Context" when non-empty. */
    context?: string;
}
/**
 * Renders the judge prompt for `response`, runs it through `executor` and
 * parses the reply into a verdict. If the reply cannot be parsed the judge is
 * asked once more; when that also fails the first parse error is thrown.
 *
 * @throws Error when the executor fails on the first call or no valid verdict is obtained.
 */
export declare function callJudge(response: string, criteria: JudgeCriteria, executor: JudgeExecutor): Promise<JudgeResult>;
|
package/dist/judge.js
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
const DEFAULT_JUDGE_MODEL = "openai/gpt-oss-20b";
const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
/**
 * Builds a judge executor backed by an OpenAI-compatible chat completions API.
 *
 * Defaults: model `DEFAULT_JUDGE_MODEL`, base URL `DEFAULT_BASE_URL`, API key
 * from `OPENROUTER_API_KEY`, then `OPENAI_API_KEY`, then an empty string.
 *
 * @param config judge settings (`model`, `baseUrl`, `apiKey`)
 * @returns async `(prompt) => string` resolving to the first choice's message
 *          content, or `""` when the first choice has no content
 * @throws (from the returned executor) Error on a non-2xx status or when the
 *         response body carries no `choices` array
 */
function buildFetchExecutor(config) {
    const model = config.model ?? DEFAULT_JUDGE_MODEL;
    // Strip trailing slashes so ".../v1/" does not become ".../v1//chat/completions".
    const baseUrl = (config.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
    const apiKey = config.apiKey ??
        process.env.OPENROUTER_API_KEY ??
        process.env.OPENAI_API_KEY ??
        "";
    return async (prompt) => {
        const res = await fetch(`${baseUrl}/chat/completions`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${apiKey}`,
            },
            body: JSON.stringify({
                model,
                messages: [{ role: "user", content: prompt }],
                temperature: 0,
            }),
        });
        if (!res.ok) {
            const text = await res.text();
            throw new Error(`Judge API error ${res.status}: ${text.slice(0, 200)}`);
        }
        const data = (await res.json());
        // A body without a `choices` array would otherwise fail with an opaque
        // TypeError on `choices[0]`; report what was actually received instead.
        if (!Array.isArray(data?.choices)) {
            throw new Error(`Judge API returned no choices: ${String(JSON.stringify(data)).slice(0, 200)}`);
        }
        return data.choices[0]?.message?.content ?? "";
    };
}
|
|
31
|
+
/**
 * Returns the judge executor to use: the caller-supplied `config.executor`
 * when present, otherwise one built from `config` by `buildFetchExecutor`.
 */
export function resolveJudgeExecutor(config) {
    return config.executor || buildFetchExecutor(config);
}
|
|
36
|
+
// Instructions placed at the top of every judge prompt: they define the
// verdict vocabulary and ask for a bare JSON object as the reply.
const JUDGE_SYSTEM_PROMPT = `You are an evaluation judge for an AI agent's response. Evaluate the response against the provided criteria.
Return EXACTLY one JSON object with these fields:
- "verdict": one of "pass", "fail", or "partial"
- "reasoning": a brief explanation (1-2 sentences)

Rules:
- "pass": The response fully satisfies the success criteria with no issues.
- "partial": The response partially meets the criteria but has notable gaps or minor issues.
- "fail": The response meets the failure conditions or fundamentally misses the criteria.

Respond with ONLY the JSON object, no other text.`;
/**
 * Renders the full judge prompt: the instructions, then one markdown section
 * each for the agent response, the success criteria and the failure
 * conditions, plus an "Additional Context" section when `criteria.context`
 * is non-empty. Sections are separated by a blank line.
 */
function buildJudgePrompt(response, criteria) {
    const sections = [
        JUDGE_SYSTEM_PROMPT,
        `## Agent Response\n${response}`,
        `## Success Criteria\n${criteria.criteria}`,
        `## Failure Conditions\n${criteria.failWhen}`,
    ];
    if (criteria.context) {
        sections.push(`## Additional Context\n${criteria.context}`);
    }
    return sections.join("\n\n");
}
|
|
63
|
+
/**
 * Extracts the verdict object from the judge's raw reply.
 *
 * The reply may wrap the JSON in other text (for example a code fence):
 * everything from the first `{` to the last `}` is parsed. The verdict is
 * compared case-insensitively and with surrounding whitespace ignored, and
 * is always returned in its canonical lower-case form.
 *
 * @param raw      raw text returned by the judge executor
 * @param criteria value stored unchanged in the result's `criteria` field
 * @returns `{ verdict, reasoning, criteria }`; `reasoning` is `""` when absent
 * @throws Error when no JSON object is found, the JSON is malformed, or the
 *         verdict is not one of "pass" / "fail" / "partial"
 */
function parseJudgeResponse(raw, criteria) {
    const jsonMatch = raw.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
        throw new Error(`Judge returned no JSON object: "${raw.slice(0, 200)}"`);
    }
    let parsed;
    try {
        parsed = JSON.parse(jsonMatch[0]);
    }
    catch (err) {
        // Surface what the judge actually said instead of a bare SyntaxError.
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Judge returned invalid JSON (${reason}): "${jsonMatch[0].slice(0, 200)}"`);
    }
    // Accept harmless variations such as "Pass" or " fail ".
    const verdict = typeof parsed.verdict === "string"
        ? parsed.verdict.trim().toLowerCase()
        : parsed.verdict;
    if (verdict !== "pass" && verdict !== "fail" && verdict !== "partial") {
        throw new Error(`Judge returned invalid verdict: "${parsed.verdict}"`);
    }
    return {
        verdict: verdict,
        reasoning: String(parsed.reasoning ?? ""),
        criteria,
    };
}
|
|
79
|
+
/**
 * Asks the judge to evaluate `response` against `criteria`.
 *
 * Flow: render the prompt, call `executor`, parse the reply. When the reply
 * cannot be parsed the judge is queried one more time with the same prompt;
 * if that second attempt fails too (executor error or parse error) the
 * ORIGINAL parse error is thrown.
 *
 * @param response agent response text to evaluate
 * @param criteria judge criteria; `criteria.criteria` is stored in the result
 * @param executor function that sends the prompt to the judge model
 * @returns the parsed judge result
 * @throws Error "Judge executor failed: ..." when the first executor call fails,
 *         or the first parse error when both attempts yield no valid verdict
 */
export async function callJudge(response, criteria, executor) {
    const prompt = buildJudgePrompt(response, criteria);
    let raw;
    try {
        raw = await executor(prompt);
    }
    catch (err) {
        // Custom executors may throw non-Error values, which have no `.message`.
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Judge executor failed: ${reason}`);
    }
    try {
        return parseJudgeResponse(raw, criteria.criteria);
    }
    catch (firstErr) {
        // Retry once on parse failure
        try {
            raw = await executor(prompt);
            return parseJudgeResponse(raw, criteria.criteria);
        }
        catch {
            throw firstErr;
        }
    }
}
|