@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -1
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +80 -11
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +3 -2
- package/dist/adapters/tracing.d.ts +73 -0
- package/dist/adapters/tracing.js +338 -0
- package/dist/assertions.d.ts +57 -2
- package/dist/assertions.js +119 -33
- package/dist/cli.d.ts +15 -1
- package/dist/cli.js +97 -18
- package/dist/config.d.ts +9 -0
- package/dist/context.d.ts +32 -11
- package/dist/context.js +84 -10
- package/dist/discover.d.ts +16 -0
- package/dist/discover.js +62 -0
- package/dist/index.d.ts +20 -2
- package/dist/index.js +10 -3
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/preview.js +93 -0
- package/dist/pricing/index.d.ts +32 -0
- package/dist/pricing/index.js +48 -0
- package/dist/pricing/models.json +21 -0
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +77 -4
- package/dist/reports.d.ts +37 -0
- package/dist/reports.js +126 -0
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +97 -11
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +84 -9
- package/dist/waterfall.d.ts +11 -0
- package/dist/waterfall.js +46 -0
- package/package.json +24 -15
package/dist/runner.js
CHANGED
|
@@ -1,16 +1,34 @@
|
|
|
1
1
|
import { collectPendingJudgements } from "./assertions";
|
|
2
2
|
import { callJudge, resolveJudgeExecutor } from "./judge";
|
|
3
|
+
import { resolveValue, resolveText, serializeValue, navigatePath } from "./resolve";
|
|
4
|
+
import { validateAgainstSchema } from "./schema";
|
|
3
5
|
const DEFAULT_SCENE_TIMEOUT = 10_000;
|
|
6
|
+
/**
|
|
7
|
+
* Extract a named field from an agent response for assertion.
|
|
8
|
+
* - "response" / "value" → the native structured value (deterministic matchers)
|
|
9
|
+
* - "text" → the serialized/judge view (lazy; text matchers)
|
|
10
|
+
* - "metadata"/"refusal" → the corresponding response property
|
|
11
|
+
* - dot-path → navigated into the structured value first
|
|
12
|
+
* (e.g. "plan_items.0.options"), falling back to
|
|
13
|
+
* metadata so existing metadata paths keep resolving.
|
|
14
|
+
*/
|
|
4
15
|
export function extractField(response, field) {
    // "response" and "value" are two names for the same thing: whatever
    // resolveValue() yields for this response.
    if (field === "response" || field === "value") {
        return resolveValue(response);
    }
    // "text" goes through resolveText() instead.
    if (field === "text") {
        return resolveText(response);
    }
    // These two are read straight off the response object.
    if (field === "metadata") {
        return response.metadata;
    }
    if (field === "refusal") {
        return response.refusal;
    }
    // Any other name is handed to navigatePath(): first against the resolved
    // value, and only when that yields `undefined` against the metadata
    // (an empty object stands in when the response carries no metadata).
    const viaValue = navigatePath(resolveValue(response), field);
    if (viaValue === undefined) {
        return navigatePath(response.metadata ?? {}, field);
    }
    return viaValue;
}
|
|
16
34
|
/**
|
|
@@ -31,19 +49,29 @@ function wilsonSignificance(passes, total) {
|
|
|
31
49
|
return Math.max(0, Math.min(1, lower));
|
|
32
50
|
}
|
|
33
51
|
async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig) {
|
|
52
|
+
// The empty sentinel uses the `text` branch of the union so it is a valid
|
|
53
|
+
// AgentResponse<T> for ANY T (there is no native value yet — the executor
|
|
54
|
+
// hasn't run). Using `{ value: "" }` would wrongly assume T = string.
|
|
34
55
|
let response = { text: "" };
|
|
35
56
|
let duration;
|
|
36
57
|
try {
|
|
37
58
|
const start = performance.now();
|
|
38
59
|
const input = scene.prompt;
|
|
39
60
|
for (let t = 0; t < turns; t++) {
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
61
|
+
const controller = new AbortController();
|
|
62
|
+
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
63
|
+
try {
|
|
64
|
+
response = await executor(input, { signal: controller.signal });
|
|
65
|
+
}
|
|
66
|
+
catch (err) {
|
|
67
|
+
if (err.name === "AbortError" || controller.signal.aborted) {
|
|
68
|
+
throw new Error(`Scene timed out after ${timeoutMs}ms`);
|
|
69
|
+
}
|
|
70
|
+
throw err;
|
|
71
|
+
}
|
|
72
|
+
finally {
|
|
73
|
+
clearTimeout(timer);
|
|
74
|
+
}
|
|
47
75
|
if (response.executionError)
|
|
48
76
|
break;
|
|
49
77
|
}
|
|
@@ -63,7 +91,21 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
|
|
|
63
91
|
let passed = true;
|
|
64
92
|
let error;
|
|
65
93
|
let judgement;
|
|
94
|
+
// Schema validation runs first — a structural failure is the headline. Skip
|
|
95
|
+
// refusals (which legitimately won't match the output shape) and empty values.
|
|
96
|
+
if (scene.schema && !response.refusal) {
|
|
97
|
+
const value = resolveValue(response);
|
|
98
|
+
if (value !== undefined) {
|
|
99
|
+
const outcome = await validateAgainstSchema(scene.schema, value);
|
|
100
|
+
if (!outcome.ok) {
|
|
101
|
+
passed = false;
|
|
102
|
+
error = `Schema validation failed — ${outcome.message}`;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
66
106
|
for (const assertion of scene.assertions) {
|
|
107
|
+
if (!passed)
|
|
108
|
+
break;
|
|
67
109
|
try {
|
|
68
110
|
const value = extractField(response, assertion.field);
|
|
69
111
|
assertion.fn(value);
|
|
@@ -84,7 +126,9 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
|
|
|
84
126
|
const judgeExecutor = resolveJudgeExecutor(judgeConfig);
|
|
85
127
|
for (const p of pending) {
|
|
86
128
|
try {
|
|
87
|
-
|
|
129
|
+
// Hand the judge the serialized text view — NOT String(value),
|
|
130
|
+
// which would render a structured value as "[object Object]".
|
|
131
|
+
const result = await callJudge(serializeValue(p.value), p.criteria, judgeExecutor);
|
|
88
132
|
judgement = result;
|
|
89
133
|
if (result.verdict === "fail" || result.verdict === "partial") {
|
|
90
134
|
passed = false;
|
|
@@ -109,6 +153,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
109
153
|
// Single run — original fast path
|
|
110
154
|
if (numRuns <= 1) {
|
|
111
155
|
const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
|
|
156
|
+
const tokens = run.response.metadata?.tokens;
|
|
157
|
+
const cost = run.response.metadata?.cost;
|
|
158
|
+
const events = run.response.metadata?.events;
|
|
112
159
|
return {
|
|
113
160
|
prompt: scene.prompt,
|
|
114
161
|
response: run.response,
|
|
@@ -117,6 +164,10 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
117
164
|
error: run.error,
|
|
118
165
|
judgement: run.judgement,
|
|
119
166
|
suite: scene.suite,
|
|
167
|
+
tokens: tokens ? { input: tokens.input, output: tokens.output } : undefined,
|
|
168
|
+
costUsd: cost?.totalUsd,
|
|
169
|
+
costSource: cost?.source,
|
|
170
|
+
events: events && events.length ? events : undefined,
|
|
120
171
|
};
|
|
121
172
|
}
|
|
122
173
|
// Multiple runs — execute N times and aggregate
|
|
@@ -136,6 +187,37 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
136
187
|
const error = overallPassed
|
|
137
188
|
? undefined
|
|
138
189
|
: failedRuns[0]?.error ?? "Majority of runs failed";
|
|
190
|
+
// Aggregate tokens, cost, events across runs
|
|
191
|
+
let inputTokens = 0;
|
|
192
|
+
let outputTokens = 0;
|
|
193
|
+
let hasTokens = false;
|
|
194
|
+
let costTotal = 0;
|
|
195
|
+
let hasCost = false;
|
|
196
|
+
let costSource;
|
|
197
|
+
const allEvents = [];
|
|
198
|
+
runs.forEach((r, runIndex) => {
|
|
199
|
+
const meta = r.response.metadata;
|
|
200
|
+
if (meta?.tokens) {
|
|
201
|
+
hasTokens = true;
|
|
202
|
+
inputTokens += meta.tokens.input;
|
|
203
|
+
outputTokens += meta.tokens.output;
|
|
204
|
+
}
|
|
205
|
+
if (meta?.cost?.totalUsd != null) {
|
|
206
|
+
hasCost = true;
|
|
207
|
+
costTotal += meta.cost.totalUsd;
|
|
208
|
+
// Promote weakest source: provider > table > unavailable
|
|
209
|
+
if (costSource !== "table")
|
|
210
|
+
costSource = meta.cost.source;
|
|
211
|
+
if (meta.cost.source === "table" && costSource !== "table") {
|
|
212
|
+
costSource = "table";
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
if (meta?.events?.length) {
|
|
216
|
+
for (const e of meta.events) {
|
|
217
|
+
allEvents.push({ ...e, runIndex });
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
});
|
|
139
221
|
return {
|
|
140
222
|
prompt: scene.prompt,
|
|
141
223
|
response: lastRun.response,
|
|
@@ -147,5 +229,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
147
229
|
runs,
|
|
148
230
|
passRate,
|
|
149
231
|
statisticalSignificance,
|
|
232
|
+
tokens: hasTokens ? { input: inputTokens, output: outputTokens } : undefined,
|
|
233
|
+
costUsd: hasCost ? costTotal : undefined,
|
|
234
|
+
costSource,
|
|
235
|
+
events: allEvents.length ? allEvents : undefined,
|
|
150
236
|
};
|
|
151
237
|
}
|
package/dist/schema.d.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema validation built on the Standard Schema v1 spec
|
|
3
|
+
* (https://standardschema.dev). Agest never imports a schema library — it talks
|
|
4
|
+
* to whatever the consumer brings (zod 4, valibot, arktype, …) through the
|
|
5
|
+
* `~standard` contract every compliant library exposes. zod is the documented,
|
|
6
|
+
* blessed choice but is not a runtime or peer dependency.
|
|
7
|
+
*/
|
|
8
|
+
/** The minimal Standard Schema v1 interface, vendored from the spec. */
|
|
9
|
+
export interface StandardSchemaV1<Input = unknown, Output = Input> {
|
|
10
|
+
readonly "~standard": StandardSchemaV1.Props<Input, Output>;
|
|
11
|
+
}
|
|
12
|
+
export declare namespace StandardSchemaV1 {
|
|
13
|
+
interface Props<Input = unknown, Output = Input> {
|
|
14
|
+
readonly version: 1;
|
|
15
|
+
readonly vendor: string;
|
|
16
|
+
readonly validate: (value: unknown) => Result<Output> | Promise<Result<Output>>;
|
|
17
|
+
readonly types?: Types<Input, Output>;
|
|
18
|
+
}
|
|
19
|
+
type Result<Output> = SuccessResult<Output> | FailureResult;
|
|
20
|
+
interface SuccessResult<Output> {
|
|
21
|
+
readonly value: Output;
|
|
22
|
+
readonly issues?: undefined;
|
|
23
|
+
}
|
|
24
|
+
interface FailureResult {
|
|
25
|
+
readonly issues: ReadonlyArray<Issue>;
|
|
26
|
+
}
|
|
27
|
+
interface Issue {
|
|
28
|
+
readonly message: string;
|
|
29
|
+
readonly path?: ReadonlyArray<PropertyKey | PathSegment>;
|
|
30
|
+
}
|
|
31
|
+
interface PathSegment {
|
|
32
|
+
readonly key: PropertyKey;
|
|
33
|
+
}
|
|
34
|
+
interface Types<Input = unknown, Output = Input> {
|
|
35
|
+
readonly input: Input;
|
|
36
|
+
readonly output: Output;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
/** The inferred output type of a Standard Schema (e.g. `z.infer<typeof S>`). */
|
|
40
|
+
export type InferOutput<S extends StandardSchemaV1> = NonNullable<S["~standard"]["types"]>["output"];
|
|
41
|
+
/** Structural duck-type check so any Standard-Schema library is accepted. */
|
|
42
|
+
export declare function isStandardSchema(value: unknown): value is StandardSchemaV1;
|
|
43
|
+
/** Render Standard Schema failure issues into a readable multi-line message. */
|
|
44
|
+
export declare function formatIssues(issues: ReadonlyArray<StandardSchemaV1.Issue>): string;
|
|
45
|
+
export type ValidationOutcome = {
|
|
46
|
+
ok: true;
|
|
47
|
+
} | {
|
|
48
|
+
ok: false;
|
|
49
|
+
message: string;
|
|
50
|
+
};
|
|
51
|
+
/**
|
|
52
|
+
* Validate a value against a schema, awaiting the result. Supports both
|
|
53
|
+
* synchronous and asynchronous (`refine`-style) schemas — used by the runner,
|
|
54
|
+
* which is already async.
|
|
55
|
+
*/
|
|
56
|
+
export declare function validateAgainstSchema(schema: StandardSchemaV1, value: unknown): Promise<ValidationOutcome>;
|
|
57
|
+
/**
|
|
58
|
+
* Synchronous validation for the `matchingSchema` matcher (matchers run inside
|
|
59
|
+
* a sync assertion callback). Throws a directive error if the schema needs to
|
|
60
|
+
* resolve asynchronously — declare such schemas at the agent/scene level, where
|
|
61
|
+
* validation is awaited.
|
|
62
|
+
*/
|
|
63
|
+
export declare function validateSync(schema: StandardSchemaV1, value: unknown): ValidationOutcome;
|
package/dist/schema.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema validation built on the Standard Schema v1 spec
|
|
3
|
+
* (https://standardschema.dev). Agest never imports a schema library — it talks
|
|
4
|
+
* to whatever the consumer brings (zod 4, valibot, arktype, …) through the
|
|
5
|
+
* `~standard` contract every compliant library exposes. zod is the documented,
|
|
6
|
+
* blessed choice but is not a runtime or peer dependency.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Structural duck-type check so any Standard-Schema library is accepted.
 *
 * A value qualifies when it is a non-null object OR a function and exposes a
 * `~standard` property whose `validate` member is callable. Functions must be
 * accepted because some libraries ship callable schemas (e.g. ArkType types
 * can be invoked directly), and `typeof` reports those as "function".
 *
 * @param value - Anything; never throws regardless of input.
 * @returns `true` when `value` looks like a Standard Schema, else `false`.
 */
export function isStandardSchema(value) {
    const kind = typeof value;
    if (value === null || (kind !== "object" && kind !== "function")) {
        return false;
    }
    // Optional chaining covers a missing or nullish `~standard` property.
    return typeof value["~standard"]?.validate === "function";
}
|
|
15
|
+
/** Report whether `value` is a non-null object that exposes a callable `then`. */
function isThenable(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    return typeof value.then === "function";
}
|
|
20
|
+
/** Turn one issue path segment — a bare key or a `{ key }` wrapper — into a string. */
function renderSegment(seg) {
    // Object segments carry the real key under `.key`; everything else is the key itself.
    const rawKey = typeof seg === "object" ? seg.key : seg;
    return String(rawKey);
}
|
|
24
|
+
/**
 * Render Standard Schema failure issues into a readable multi-line message.
 *
 * The first line states how many issues there are; each following line is one
 * bullet, prefixed with the issue's dotted path when it has a non-empty one.
 *
 * @param issues - Array of `{ message, path? }` issue records.
 * @returns The assembled message string.
 */
export function formatIssues(issues) {
    const toBullet = (issue) => {
        // Path segments are stringified individually, then joined with dots.
        const location = issue.path?.map(renderSegment).join(".");
        if (location) {
            return ` • ${location}: ${issue.message}`;
        }
        return ` • ${issue.message}`;
    };
    const bullets = issues.map(toBullet).join("\n");
    const total = issues.length;
    const plural = total === 1 ? "" : "s";
    return `${total} issue${plural}:\n${bullets}`;
}
|
|
33
|
+
/**
 * Validate a value against a schema and await the outcome, so schemas whose
 * `validate` returns a promise work as well as synchronous ones.
 *
 * @param schema - Object exposing `~standard.validate`.
 * @param value - The value to validate.
 * @returns Promise of `{ ok: true }` when the result carries no issues,
 *          otherwise `{ ok: false, message }` with the formatted issues.
 */
export async function validateAgainstSchema(schema, value) {
    const outcome = await schema["~standard"].validate(value);
    const failures = outcome.issues;
    if (!failures) {
        return { ok: true };
    }
    return { ok: false, message: formatIssues(failures) };
}
|
|
45
|
+
/**
 * Validate a value against a schema without awaiting anything, for callers
 * that must stay synchronous (the `matchingSchema` matcher).
 *
 * @param schema - Object exposing `~standard.validate`.
 * @param value - The value to validate.
 * @returns `{ ok: true }` when the result carries no issues, otherwise
 *          `{ ok: false, message }` with the formatted issues.
 * @throws {Error} When `validate` hands back a thenable, i.e. the schema
 *         resolves asynchronously and cannot be checked here.
 */
export function validateSync(schema, value) {
    const outcome = schema["~standard"].validate(value);
    if (isThenable(outcome)) {
        // An async result cannot be unwrapped here; point the user at the awaited path.
        throw new Error("matchingSchema() cannot validate an async schema. Declare the schema at " +
            "the agent() or scene().expectSchema() level, where validation is awaited.");
    }
    return outcome.issues
        ? { ok: false, message: formatIssues(outcome.issues) }
        : { ok: true };
}
|
package/dist/types.d.ts
CHANGED
|
@@ -1,6 +1,63 @@
|
|
|
1
|
-
|
|
2
|
-
export interface
|
|
1
|
+
import type { StandardSchemaV1 } from "./schema";
|
|
2
|
+
export interface ExecutorOptions {
|
|
3
|
+
signal?: AbortSignal;
|
|
4
|
+
}
|
|
5
|
+
export type AgentExecutor<T = string> = (input: string, options?: ExecutorOptions) => Promise<AgentResponse<T>>;
|
|
6
|
+
export type CostSource = "provider" | "table" | "unavailable";
|
|
7
|
+
export interface CostBreakdown {
|
|
8
|
+
inputUsd?: number;
|
|
9
|
+
outputUsd?: number;
|
|
10
|
+
totalUsd?: number;
|
|
11
|
+
source: CostSource;
|
|
12
|
+
}
|
|
13
|
+
export type TimelineEventKind = "model" | "tool";
|
|
14
|
+
export interface TimelineEvent {
|
|
15
|
+
kind: TimelineEventKind;
|
|
16
|
+
name: string;
|
|
17
|
+
/** ms relative to the scene start */
|
|
18
|
+
startMs: number;
|
|
19
|
+
endMs: number;
|
|
20
|
+
durationMs: number;
|
|
21
|
+
tokens?: {
|
|
22
|
+
input: number;
|
|
23
|
+
output: number;
|
|
24
|
+
};
|
|
25
|
+
/** Prompt-cache-hit input tokens (subset of tokens.input), when reported by the provider */
|
|
26
|
+
cachedInputTokens?: number;
|
|
27
|
+
cost?: CostBreakdown;
|
|
28
|
+
/** Index of the run this event belongs to (only set when aggregating across multi-run scenes) */
|
|
29
|
+
runIndex?: number;
|
|
30
|
+
error?: string;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* The result an executor hands back. AT LEAST ONE of `value` / `text` is
|
|
34
|
+
* required (both may be present); the rest are optional.
|
|
35
|
+
*
|
|
36
|
+
* `value` is the agent's NATIVE output and the source of truth for
|
|
37
|
+
* deterministic, structural assertions — a string for a chat agent, an object
|
|
38
|
+
* for a structured agent (a plan, a tool-call payload, parsed JSON). It is
|
|
39
|
+
* never coerced to a string before a matcher asks for text.
|
|
40
|
+
*
|
|
41
|
+
* `text` is a pre-serialized projection for the judge model and the text
|
|
42
|
+
* matchers (`containing`, `matchingPattern`, `refusal`). A string-producing
|
|
43
|
+
* agent can return ONLY `text` (the legacy/common case) — it is then also used
|
|
44
|
+
* as `value`. A structured agent returns `value` and, optionally, an enriched
|
|
45
|
+
* `text` when the judge needs a view the raw value can't give cheaply (e.g.
|
|
46
|
+
* resolving opaque ids to names). When `text` is omitted, agest serializes
|
|
47
|
+
* `value` lazily (string passthrough, else JSON). See `resolve.ts`.
|
|
48
|
+
*
|
|
49
|
+
* The generic defaults to `string`, so the common chat case stays
|
|
50
|
+
* `{ text: "..." }` or `{ value: "..." }` with no type ceremony.
|
|
51
|
+
*/
|
|
52
|
+
export type AgentResponse<T = string> = AgentResponseBase<T> & ({
|
|
53
|
+
value: T;
|
|
54
|
+
} | {
|
|
3
55
|
text: string;
|
|
56
|
+
});
|
|
57
|
+
interface AgentResponseBase<T = string> {
|
|
58
|
+
value?: T;
|
|
59
|
+
/** Pre-serialized view for the judge / text matchers. */
|
|
60
|
+
text?: string;
|
|
4
61
|
refusal?: boolean;
|
|
5
62
|
executionError?: string;
|
|
6
63
|
metadata?: {
|
|
@@ -11,6 +68,8 @@ export interface AgentResponse {
|
|
|
11
68
|
};
|
|
12
69
|
tools?: string[];
|
|
13
70
|
systemPrompt?: string;
|
|
71
|
+
events?: TimelineEvent[];
|
|
72
|
+
cost?: CostBreakdown;
|
|
14
73
|
[key: string]: unknown;
|
|
15
74
|
};
|
|
16
75
|
}
|
|
@@ -25,6 +84,8 @@ export interface SceneDefinition {
|
|
|
25
84
|
turns?: number;
|
|
26
85
|
runs?: number;
|
|
27
86
|
suite?: string;
|
|
87
|
+
/** Standard Schema validated against the native value before user assertions. */
|
|
88
|
+
schema?: StandardSchemaV1;
|
|
28
89
|
}
|
|
29
90
|
export type JudgeVerdict = "pass" | "fail" | "partial";
|
|
30
91
|
export interface JudgeResult {
|
|
@@ -32,26 +93,36 @@ export interface JudgeResult {
|
|
|
32
93
|
reasoning: string;
|
|
33
94
|
criteria: string;
|
|
34
95
|
}
|
|
35
|
-
export interface RunResult {
|
|
96
|
+
export interface RunResult<T = string> {
|
|
36
97
|
passed: boolean;
|
|
37
98
|
error?: string;
|
|
38
|
-
response: AgentResponse
|
|
99
|
+
response: AgentResponse<T>;
|
|
39
100
|
duration: number;
|
|
40
101
|
judgement?: JudgeResult;
|
|
41
102
|
}
|
|
42
|
-
export interface SceneResult {
|
|
103
|
+
export interface SceneResult<T = string> {
|
|
43
104
|
prompt: string;
|
|
44
|
-
response: AgentResponse
|
|
105
|
+
response: AgentResponse<T>;
|
|
45
106
|
duration: number;
|
|
46
107
|
passed: boolean;
|
|
47
108
|
error?: string;
|
|
48
109
|
judgement?: JudgeResult;
|
|
49
110
|
suite?: string;
|
|
50
|
-
runs?: RunResult[];
|
|
111
|
+
runs?: RunResult<T>[];
|
|
51
112
|
passRate?: number;
|
|
52
113
|
statisticalSignificance?: number;
|
|
114
|
+
/** Aggregate tokens across all runs of this scene */
|
|
115
|
+
tokens?: {
|
|
116
|
+
input: number;
|
|
117
|
+
output: number;
|
|
118
|
+
};
|
|
119
|
+
/** Aggregate USD cost across all runs of this scene */
|
|
120
|
+
costUsd?: number;
|
|
121
|
+
costSource?: CostSource;
|
|
122
|
+
/** Ordered timeline events from every run of the scene */
|
|
123
|
+
events?: TimelineEvent[];
|
|
53
124
|
}
|
|
54
|
-
export interface AgentReport {
|
|
125
|
+
export interface AgentReport<T = string> {
|
|
55
126
|
name?: string;
|
|
56
127
|
model?: string;
|
|
57
128
|
systemPromptHash?: string;
|
|
@@ -66,5 +137,9 @@ export interface AgentReport {
|
|
|
66
137
|
totalCases: number;
|
|
67
138
|
averageInputTokensPerCase?: number;
|
|
68
139
|
averageOutputTokensPerCase?: number;
|
|
69
|
-
|
|
140
|
+
totalInputTokens?: number;
|
|
141
|
+
totalOutputTokens?: number;
|
|
142
|
+
totalCostUsd?: number;
|
|
143
|
+
results: SceneResult<T>[];
|
|
70
144
|
}
|
|
145
|
+
export {};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { TimelineEvent } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Render a Chrome-DevTools-style waterfall of timeline events as colored
|
|
4
|
+
* terminal lines. Bars are positioned by `startMs` and sized by `durationMs`
|
|
5
|
+
* relative to the full span of the scene. Returns one string per event row
|
|
6
|
+
* (already indented), or `[]` when there's nothing to draw.
|
|
7
|
+
*/
|
|
8
|
+
export declare function renderTerminalWaterfall(events: TimelineEvent[], opts?: {
|
|
9
|
+
width?: number;
|
|
10
|
+
indent?: string;
|
|
11
|
+
}): string[];
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { c } from "./logger";
|
|
2
|
+
const BLOCK = "█";
|
|
3
|
+
const THIN = "▏";
|
|
4
|
+
/**
 * Shorten `s` when it is longer than `n` characters: keep the first `n - 1`
 * characters and append an ellipsis. Shorter strings come back untouched.
 */
function truncate(s, n) {
    if (s.length > n) {
        return `${s.slice(0, n - 1)}…`;
    }
    return s;
}
|
|
7
|
+
/**
 * Format a dollar amount with up to four decimal places; trailing zeros are
 * dropped by the round-trip through Number. Exactly zero renders as "$0".
 */
function fmtUsd(n) {
    if (n !== 0) {
        const rounded = Number(n.toFixed(4));
        return `$${rounded}`;
    }
    return "$0";
}
|
|
12
|
+
/**
 * Draw timeline events as terminal rows, one string per event, each holding a
 * kind label, a fixed-width name, a bar and the duration, plus optional cost,
 * cached-token and error suffixes.
 *
 * Bars are placed by `startMs` and sized by `durationMs` relative to the span
 * between the earliest start and the latest end across all events.
 *
 * @param events - Timeline events; a missing or empty list yields `[]`.
 * @param opts - Optional `width` (bar columns, default 28) and `indent`
 *               (row prefix, default "").
 * @returns One rendered line per event, in input order.
 */
export function renderTerminalWaterfall(events, opts = {}) {
    if (!events || events.length === 0) {
        return [];
    }
    const width = opts.width ?? 28;
    const indent = opts.indent ?? "";
    const nameWidth = 16;
    const starts = events.map((e) => e.startMs);
    const ends = events.map((e) => e.endMs);
    const t0 = Math.min(...starts);
    const tEnd = Math.max(...ends);
    // Never let the span drop below 1 so the divisions below stay finite.
    const span = Math.max(1, tEnd - t0);
    const renderRow = (e) => {
        // Column where the bar starts, kept inside the drawable area.
        const lead = Math.min(width - 1, Math.round(((e.startMs - t0) / span) * width));
        const barLen = Math.max(1, Math.round((e.durationMs / span) * width));
        // Zero-length events get a thin tick instead of a block.
        const fill = e.durationMs === 0 ? THIN : BLOCK.repeat(Math.min(barLen, width - lead));
        const cells = Array(width).fill(" ");
        [...fill].forEach((glyph, offset) => {
            if (lead + offset < width) {
                cells[lead + offset] = glyph;
            }
        });
        // Errors win over the kind-based colour.
        let paint;
        if (e.error) {
            paint = c.red;
        }
        else if (e.kind === "model") {
            paint = c.cyan;
        }
        else {
            paint = c.yellow;
        }
        const bar = paint(cells.join(""));
        const kindLabel = (e.kind === "model" ? "model" : "tool ").padEnd(5);
        const nameLabel = truncate(e.name, nameWidth).padEnd(nameWidth);
        const dur = `${Math.round(e.durationMs)}ms`.padStart(7);
        let cost = "";
        if (e.cost?.totalUsd != null) {
            cost = ` ${fmtUsd(e.cost.totalUsd)}`;
        }
        let cached = "";
        if (e.cachedInputTokens) {
            cached = ` ${c.dim(`(${e.cachedInputTokens} cached)`)}`;
        }
        let err = "";
        if (e.error) {
            err = ` ${c.red("✗ " + truncate(e.error, 40))}`;
        }
        return `${indent}${c.dim(kindLabel)} ${nameLabel} ${bar} ${c.dim(dur)}${c.dim(cost)}${cached}${err}`;
    };
    return events.map((e) => renderRow(e));
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sebastiantuyu/agest",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.3-next.10",
|
|
4
4
|
"description": "A testing library for agents",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
}
|
|
27
27
|
},
|
|
28
28
|
"scripts": {
|
|
29
|
-
"build": "tsc -p tsconfig.build.json",
|
|
29
|
+
"build": "tsc -p tsconfig.build.json && mkdir -p dist/pricing && cp src/pricing/models.json dist/pricing/models.json",
|
|
30
30
|
"test": "vitest run",
|
|
31
31
|
"test:watch": "vitest",
|
|
32
32
|
"test:coverage": "vitest run --coverage",
|
|
@@ -37,25 +37,34 @@
|
|
|
37
37
|
"site:preview": "npx serve site -p 3000",
|
|
38
38
|
"release:patch": "npm version patch && git push && git push --tags",
|
|
39
39
|
"release:minor": "npm version minor && git push && git push --tags",
|
|
40
|
-
"release:major": "npm version major && git push && git push --tags"
|
|
40
|
+
"release:major": "npm version major && git push && git push --tags",
|
|
41
|
+
"release:next": "npm version prerelease --preid=next && git push && git push --tags"
|
|
41
42
|
},
|
|
42
43
|
"engines": {
|
|
43
44
|
"node": ">=22.0.0"
|
|
44
45
|
},
|
|
45
46
|
"devDependencies": {
|
|
46
|
-
"@langchain/core": "
|
|
47
|
-
"@langchain/langgraph": "
|
|
48
|
-
"@langchain/openai": "
|
|
49
|
-
"@types/node": "
|
|
50
|
-
"@vitest/coverage-v8": "
|
|
51
|
-
"dotenv": "
|
|
52
|
-
"langchain": "
|
|
53
|
-
"tsx": "
|
|
54
|
-
"typescript": "
|
|
55
|
-
"vitest": "
|
|
56
|
-
"zod": "
|
|
47
|
+
"@langchain/core": "1.1.39",
|
|
48
|
+
"@langchain/langgraph": "1.2.8",
|
|
49
|
+
"@langchain/openai": "1.4.4",
|
|
50
|
+
"@types/node": "22.19.17",
|
|
51
|
+
"@vitest/coverage-v8": "3.2.4",
|
|
52
|
+
"dotenv": "17.4.1",
|
|
53
|
+
"langchain": "1.3.1",
|
|
54
|
+
"tsx": "4.21.0",
|
|
55
|
+
"typescript": "5.9.3",
|
|
56
|
+
"vitest": "3.2.4",
|
|
57
|
+
"zod": "4.3.6"
|
|
57
58
|
},
|
|
58
59
|
"dependencies": {
|
|
59
|
-
"@supercharge/promise-pool": "
|
|
60
|
+
"@supercharge/promise-pool": "3.3.0"
|
|
61
|
+
},
|
|
62
|
+
"peerDependencies": {
|
|
63
|
+
"@langchain/core": ">=0.3.0 <2.0.0"
|
|
64
|
+
},
|
|
65
|
+
"peerDependenciesMeta": {
|
|
66
|
+
"@langchain/core": {
|
|
67
|
+
"optional": true
|
|
68
|
+
}
|
|
60
69
|
}
|
|
61
70
|
}
|