@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -1
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +80 -11
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +3 -2
- package/dist/adapters/tracing.d.ts +73 -0
- package/dist/adapters/tracing.js +338 -0
- package/dist/assertions.d.ts +57 -2
- package/dist/assertions.js +119 -33
- package/dist/cli.d.ts +15 -1
- package/dist/cli.js +97 -18
- package/dist/config.d.ts +9 -0
- package/dist/context.d.ts +32 -11
- package/dist/context.js +84 -10
- package/dist/discover.d.ts +16 -0
- package/dist/discover.js +62 -0
- package/dist/index.d.ts +20 -2
- package/dist/index.js +10 -3
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/preview.js +93 -0
- package/dist/pricing/index.d.ts +32 -0
- package/dist/pricing/index.js +48 -0
- package/dist/pricing/models.json +21 -0
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +77 -4
- package/dist/reports.d.ts +37 -0
- package/dist/reports.js +126 -0
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +97 -11
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +84 -9
- package/dist/waterfall.d.ts +11 -0
- package/dist/waterfall.js +46 -0
- package/package.json +24 -15
package/dist/preview.js
CHANGED
|
@@ -189,6 +189,91 @@ function renderFailedCases(cases) {
|
|
|
189
189
|
</ul>
|
|
190
190
|
</details>`;
|
|
191
191
|
}
|
|
192
|
+
// Waterfall palette. WF_MODEL colours the bar, row icon and legend entry for "model" events.
const WF_MODEL = "#38bdf8";
|
|
193
|
+
// WF_TOOL colours every non-model (tool) event and its legend entry.
const WF_TOOL = "#facc15";
|
|
194
|
+
// WF_ERROR overrides the kind colour for any event that carries an error.
const WF_ERROR = "#f87171";
|
|
195
|
+
/**
 * Format a USD amount for the HTML preview.
 *
 * Amounts are shown with up to 4 decimal places (trailing zeros trimmed).
 * When a non-zero amount would round to zero at that precision, it is shown
 * with up to 6 decimal places instead, so a small but real cost is not
 * displayed as "$0".
 *
 * @param n Amount in USD.
 * @returns The amount prefixed with "$".
 */
function fmtUsdHtml(n) {
    if (n === 0)
        return "$0";
    const rounded = Number(n.toFixed(4));
    // Per-event costs for cheap models are routinely below $0.00005; widen the
    // precision rather than report them as free.
    const shown = rounded !== 0 ? rounded : Number(n.toFixed(6));
    return "$" + shown.toString();
}
|
|
200
|
+
/**
 * Chrome-DevTools-style waterfall for a report's per-scene timelines.
 *
 * Each scene with a non-empty `timeline` becomes a group of rows; every row
 * holds a bar positioned inside its track as a percentage of the scene's time
 * span (earliest start to latest end). Bars get a minimum width of 0.6% so
 * very short events stay visible, and `left + width` is clamped to 100% so
 * that minimum never pushes a bar outside the track.
 *
 * @param report Parsed report; only `report.scenes` is read.
 * @returns An HTML `<details>` fragment, or "" when no scene carries timeline
 *          data (older reports / non-traced executors).
 */
function renderWaterfallHtml(report) {
    const scenes = (report.scenes ?? []).filter((s) => s.timeline && s.timeline.length > 0);
    if (scenes.length === 0)
        return "";
    const sceneBlocks = scenes
        .map((scene) => {
        const events = scene.timeline;
        // Plain loop instead of Math.min(...array): no argument-count limit on
        // very long timelines, same result.
        let t0 = Infinity;
        let tEnd = -Infinity;
        for (const e of events) {
            t0 = Math.min(t0, e.startMs);
            tEnd = Math.max(tEnd, e.startMs + e.durationMs);
        }
        // Never divide by zero when every event is instantaneous.
        const span = Math.max(1, tEnd - t0);
        const rows = events
            .map((e) => {
            const width = Math.min(100, Math.max(0.6, (e.durationMs / span) * 100));
            // Keep the whole bar inside the track even when the minimum width
            // was applied to an event at the very end of the span.
            const left = Math.max(0, Math.min(((e.startMs - t0) / span) * 100, 100 - width));
            const bg = e.error ? WF_ERROR : e.kind === "model" ? WF_MODEL : WF_TOOL;
            const icon = e.kind === "model" ? "◆" : "▸";
            const cost = e.costUsd != null ? fmtUsdHtml(e.costUsd) : "";
            // Tooltip: only the parts that are present, joined with a middle dot.
            const tip = [
                `${e.kind}: ${e.name}`,
                `start ${Math.round(e.startMs)}ms · ${Math.round(e.durationMs)}ms`,
                e.tokens ? `${e.tokens.input}→${e.tokens.output} tok` : "",
                e.cachedInputTokens ? `${e.cachedInputTokens} cached` : "",
                cost,
                e.error ? `error: ${e.error}` : "",
            ]
                .filter(Boolean)
                .join(" · ");
            return `
          <div class="flex items-center gap-2 text-[11px] leading-5">
            <span class="w-44 shrink-0 truncate ${e.error ? "text-red-400" : "text-zinc-400"}" title="${escHtml(e.name)}">
              <span style="color:${bg}">${icon}</span> ${escHtml(e.name)}
            </span>
            <div class="relative flex-1 h-3 bg-zinc-800/40 rounded">
              <div class="absolute top-0 h-3 rounded" style="left:${left.toFixed(2)}%;width:${width.toFixed(2)}%;background:${bg}" title="${escHtml(tip)}"></div>
            </div>
            <span class="w-16 shrink-0 text-right text-zinc-500">${Math.round(e.durationMs)}ms</span>
            <span class="w-16 shrink-0 text-right text-zinc-500">${cost}</span>
          </div>`;
        })
            .join("\n");
        // Scene header summary: tokens, cost, cost source, duration (when known).
        const meta = [
            scene.tokens ? `${scene.tokens.input}→${scene.tokens.output} tok` : "",
            scene.costUsd != null ? fmtUsdHtml(scene.costUsd) : "",
            scene.costSource ? scene.costSource : "",
            scene.durationMs != null ? `${Math.round(scene.durationMs)}ms` : "",
        ]
            .filter(Boolean)
            .join(" · ");
        return `
      <div>
        <div class="flex items-center justify-between mb-1.5">
          <span class="text-xs text-zinc-300 truncate" title="${escHtml(scene.prompt)}">${escHtml(scene.prompt)}</span>
          <span class="text-[11px] text-zinc-500 shrink-0 ml-3">${escHtml(meta)}</span>
        </div>
        <div class="space-y-1">${rows}</div>
      </div>`;
    })
        .join("\n");
    return `
    <details class="mt-2" open>
      <summary class="text-xs text-sky-400 cursor-pointer hover:text-sky-300 select-none">
        waterfall · ${scenes.length} scene${scenes.length !== 1 ? "s" : ""}
      </summary>
      <div class="mt-3 mb-2 pl-3 border-l border-zinc-800 space-y-5">
        <div class="flex gap-4 text-[10px] text-zinc-500">
          <span><span style="color:${WF_MODEL}">◆</span> model</span>
          <span><span style="color:${WF_TOOL}">▸</span> tool</span>
        </div>
        ${sceneBlocks}
      </div>
    </details>`;
}
|
|
192
277
|
function renderRunRow(entry, idx) {
|
|
193
278
|
const { report, delta, diffLines } = entry;
|
|
194
279
|
const pct = report.successRate * 100;
|
|
@@ -233,6 +318,7 @@ function renderRunRow(entry, idx) {
|
|
|
233
318
|
</div>
|
|
234
319
|
<div class="ml-10 mt-0.5 flex gap-3 flex-wrap">${dimTags}</div>
|
|
235
320
|
${diffHtml}
|
|
321
|
+
<div class="ml-10">${renderWaterfallHtml(report)}</div>
|
|
236
322
|
</div>`;
|
|
237
323
|
}
|
|
238
324
|
// ---------------------------------------------------------------------------
|
|
@@ -990,6 +1076,12 @@ function renderSingleRun(report) {
|
|
|
990
1076
|
<p class="text-zinc-300">${Math.round(report.averageOutputTokensPerCase)}</p>
|
|
991
1077
|
</div>`
|
|
992
1078
|
: ""}
|
|
1079
|
+
${report.totalCostUsd != null
|
|
1080
|
+
? `<div>
|
|
1081
|
+
<span class="text-zinc-500">Total Cost</span>
|
|
1082
|
+
<p class="text-zinc-300">${fmtUsdHtml(report.totalCostUsd)}${report.totalInputTokens != null ? ` <span class="text-zinc-600">· ${report.totalInputTokens}→${report.totalOutputTokens} tok</span>` : ""}</p>
|
|
1083
|
+
</div>`
|
|
1084
|
+
: ""}
|
|
993
1085
|
${report.tools && report.tools.length > 0
|
|
994
1086
|
? `<div>
|
|
995
1087
|
<span class="text-zinc-500">Tools</span>
|
|
@@ -997,6 +1089,7 @@ function renderSingleRun(report) {
|
|
|
997
1089
|
</div>`
|
|
998
1090
|
: ""}
|
|
999
1091
|
</div>
|
|
1092
|
+
${renderWaterfallHtml(report)}
|
|
1000
1093
|
${failedSection}
|
|
1001
1094
|
</div>`;
|
|
1002
1095
|
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
export interface ModelPrice {
|
|
2
|
+
/** USD per 1M input tokens */
|
|
3
|
+
input: number;
|
|
4
|
+
/** USD per 1M output tokens */
|
|
5
|
+
output: number;
|
|
6
|
+
/**
|
|
7
|
+
* USD per 1M cached (prompt-cache-hit) input tokens. When omitted, cached
|
|
8
|
+
* tokens are billed at `DEFAULT_CACHE_MULTIPLIER` × the input rate.
|
|
9
|
+
*/
|
|
10
|
+
cachedInput?: number;
|
|
11
|
+
}
|
|
12
|
+
/** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
|
|
13
|
+
export declare const DEFAULT_CACHE_MULTIPLIER = 0.1;
|
|
14
|
+
export type CostSource = "provider" | "table" | "unavailable";
|
|
15
|
+
/**
 * Result of `computeCost`. Which amounts are present depends on `source`:
 * "provider" carries only `totalUsd`, "table" carries all three amounts, and
 * "unavailable" carries none.
 */
export interface CostBreakdown {
|
|
16
|
+
/** USD charged for input tokens (uncached plus cached); set only when priced from the table. */
inputUsd?: number;
|
|
17
|
+
/** USD charged for output tokens; set only when priced from the table. */
outputUsd?: number;
|
|
18
|
+
/** Total USD: the provider-reported cost, or `inputUsd + outputUsd` when priced from the table. */
totalUsd?: number;
|
|
19
|
+
/** Where the figures came from. */
source: CostSource;
|
|
20
|
+
}
|
|
21
|
+
export declare function setPricingOverrides(table?: Record<string, ModelPrice>): void;
|
|
22
|
+
export declare function lookupPrice(model: string | undefined): ModelPrice | undefined;
|
|
23
|
+
export interface ComputeCostInput {
|
|
24
|
+
model?: string;
|
|
25
|
+
inputTokens?: number;
|
|
26
|
+
outputTokens?: number;
|
|
27
|
+
/** Cache-hit input tokens (subset of inputTokens), billed at the cached rate. */
|
|
28
|
+
cachedInputTokens?: number;
|
|
29
|
+
/** USD cost the provider already reported (takes precedence) */
|
|
30
|
+
providerCost?: number;
|
|
31
|
+
}
|
|
32
|
+
export declare function computeCost(input: ComputeCostInput): CostBreakdown;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { readFileSync } from "fs";
|
|
2
|
+
import { fileURLToPath } from "url";
|
|
3
|
+
import { dirname, join } from "path";
|
|
4
|
+
/** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
|
|
5
|
+
export const DEFAULT_CACHE_MULTIPLIER = 0.1;
|
|
6
|
+
const here = dirname(fileURLToPath(import.meta.url));
|
|
7
|
+
const builtIn = JSON.parse(readFileSync(join(here, "models.json"), "utf-8"));
|
|
8
|
+
let overrides = {};
|
|
9
|
+
/**
 * Replace the user-supplied price table consulted by `lookupPrice`.
 * Passing `null`/`undefined` clears all overrides.
 */
export function setPricingOverrides(table) {
    overrides = table == null ? {} : table;
}
|
|
12
|
+
/**
 * Find the price entry for a model id.
 *
 * Resolution order:
 *   1. exact key in the user overrides,
 *   2. exact key in the built-in table,
 *   3. loose, case-insensitive match: any key that contains the model id or
 *      is contained in it; the longest such key wins.
 *
 * Only the tables' own keys are consulted, so ids that collide with
 * `Object.prototype` members ("constructor", "toString", ...) are treated as
 * unknown instead of resolving to an inherited function.
 *
 * @param model Model id as reported by the provider; may be undefined.
 * @returns The matching price entry, or `undefined` when nothing matches.
 */
export function lookupPrice(model) {
    if (!model)
        return undefined;
    const own = (table, key) => Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
    const fromOverrides = own(overrides, model);
    if (fromOverrides)
        return fromOverrides;
    const fromBuiltIn = own(builtIn, model);
    if (fromBuiltIn)
        return fromBuiltIn;
    // Loose suffix/prefix match — pick the longest matching key
    const lowered = model.toLowerCase();
    const keys = Object.keys({ ...builtIn, ...overrides })
        .filter((k) => {
        const key = k.toLowerCase();
        return lowered.includes(key) || key.includes(lowered);
    })
        .sort((a, b) => b.length - a.length);
    if (keys.length > 0) {
        return own(overrides, keys[0]) ?? own(builtIn, keys[0]);
    }
    return undefined;
}
|
|
29
|
+
/**
 * Compute the USD cost of one model call.
 *
 * A finite provider-reported cost takes precedence and is returned as-is with
 * source "provider". Otherwise the model is priced from the lookup table
 * (rates are USD per 1M tokens); an unknown model yields source "unavailable".
 * Cached input tokens are a subset of the input tokens (capped at the input
 * count) and are billed at the entry's cached rate, or at
 * `DEFAULT_CACHE_MULTIPLIER` × the input rate when the entry has none.
 */
export function computeCost(input) {
    const reported = input.providerCost;
    if (reported != null && Number.isFinite(reported)) {
        return { totalUsd: reported, source: "provider" };
    }
    const price = lookupPrice(input.model);
    if (!price) {
        return { source: "unavailable" };
    }
    const PER_MILLION = 1_000_000;
    const promptTokens = input.inputTokens ?? 0;
    const cacheHits = Math.min(input.cachedInputTokens ?? 0, promptTokens);
    const freshTokens = promptTokens - cacheHits;
    const cacheRate = price.cachedInput ?? price.input * DEFAULT_CACHE_MULTIPLIER;
    const inputUsd = (freshTokens / PER_MILLION) * price.input + (cacheHits / PER_MILLION) * cacheRate;
    const outputUsd = ((input.outputTokens ?? 0) / PER_MILLION) * price.output;
    const totalUsd = inputUsd + outputUsd;
    return { inputUsd, outputUsd, totalUsd, source: "table" };
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"claude-opus-4-7": { "input": 15, "output": 75 },
|
|
3
|
+
"claude-opus-4-6": { "input": 15, "output": 75 },
|
|
4
|
+
"claude-opus-4-5": { "input": 15, "output": 75 },
|
|
5
|
+
"claude-sonnet-4-6": { "input": 3, "output": 15 },
|
|
6
|
+
"claude-sonnet-4-5": { "input": 3, "output": 15 },
|
|
7
|
+
"claude-haiku-4-5": { "input": 1, "output": 5 },
|
|
8
|
+
"claude-3-5-sonnet-20241022": { "input": 3, "output": 15 },
|
|
9
|
+
"claude-3-5-haiku-20241022": { "input": 0.8, "output": 4 },
|
|
10
|
+
"claude-3-opus-20240229": { "input": 15, "output": 75 },
|
|
11
|
+
"gpt-4o": { "input": 2.5, "output": 10 },
|
|
12
|
+
"gpt-4o-mini": { "input": 0.15, "output": 0.6 },
|
|
13
|
+
"gpt-4.1": { "input": 2, "output": 8 },
|
|
14
|
+
"gpt-4.1-mini": { "input": 0.4, "output": 1.6 },
|
|
15
|
+
"gpt-4.1-nano": { "input": 0.1, "output": 0.4 },
|
|
16
|
+
"gpt-5": { "input": 1.25, "output": 10 },
|
|
17
|
+
"gpt-5-mini": { "input": 0.25, "output": 2 },
|
|
18
|
+
"o1": { "input": 15, "output": 60 },
|
|
19
|
+
"o1-mini": { "input": 1.1, "output": 4.4 },
|
|
20
|
+
"o3-mini": { "input": 1.1, "output": 4.4 }
|
|
21
|
+
}
|
package/dist/reporter.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
import type { AgentReport } from "./types";
|
|
2
|
-
export declare function formatReport(report: AgentReport): string;
|
|
2
|
+
export declare function formatReport(report: AgentReport<unknown>): string;
|
|
3
3
|
export declare function writeReport(content: string, timestamp: string, name?: string, dimensions?: Record<string, string>): Promise<string>;
|
|
4
4
|
export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
|
package/dist/reporter.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { access, mkdir, writeFile } from "fs/promises";
|
|
2
2
|
import { createHash } from "crypto";
|
|
3
3
|
import { join } from "path";
|
|
4
|
+
import { resolveText } from "./resolve";
|
|
4
5
|
export function formatReport(report) {
|
|
5
6
|
const lines = ["agent:"];
|
|
6
7
|
if (report.name)
|
|
@@ -24,8 +25,9 @@ export function formatReport(report) {
|
|
|
24
25
|
lines.push(` reason: "${reason}"`);
|
|
25
26
|
}
|
|
26
27
|
const result = report.results.find((r) => r.prompt === c);
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
const responseText = result ? resolveText(result.response) : "";
|
|
29
|
+
if (responseText) {
|
|
30
|
+
const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
29
31
|
lines.push(` response: "${escaped}"`);
|
|
30
32
|
}
|
|
31
33
|
}
|
|
@@ -51,8 +53,9 @@ export function formatReport(report) {
|
|
|
51
53
|
if (r.error) {
|
|
52
54
|
lines.push(` reason: "${r.error}"`);
|
|
53
55
|
}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
const responseText = resolveText(r.response);
|
|
57
|
+
if (responseText) {
|
|
58
|
+
const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
|
|
56
59
|
lines.push(` response: "${escaped}"`);
|
|
57
60
|
}
|
|
58
61
|
}
|
|
@@ -78,8 +81,78 @@ export function formatReport(report) {
|
|
|
78
81
|
if (report.averageOutputTokensPerCase != null) {
|
|
79
82
|
lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
|
|
80
83
|
}
|
|
84
|
+
if (report.totalInputTokens != null) {
|
|
85
|
+
lines.push(` total_input_tokens: ${report.totalInputTokens}`);
|
|
86
|
+
}
|
|
87
|
+
if (report.totalOutputTokens != null) {
|
|
88
|
+
lines.push(` total_output_tokens: ${report.totalOutputTokens}`);
|
|
89
|
+
}
|
|
90
|
+
if (report.totalCostUsd != null) {
|
|
91
|
+
lines.push(` total_cost_usd: ${formatUsd(report.totalCostUsd)}`);
|
|
92
|
+
}
|
|
93
|
+
const observedScenes = report.results.filter((r) => r.tokens || r.costUsd != null || (r.events && r.events.length));
|
|
94
|
+
if (observedScenes.length > 0) {
|
|
95
|
+
lines.push(` scenes:`);
|
|
96
|
+
for (const r of observedScenes) {
|
|
97
|
+
lines.push(...renderSceneObservability(r));
|
|
98
|
+
}
|
|
99
|
+
}
|
|
81
100
|
return lines.join("\n");
|
|
82
101
|
}
|
|
102
|
+
function renderSceneObservability(r) {
|
|
103
|
+
const out = [];
|
|
104
|
+
const promptLabel = r.prompt.length > 80 ? r.prompt.slice(0, 77) + "..." : r.prompt;
|
|
105
|
+
out.push(` - prompt: "${escapeYaml(promptLabel)}"`);
|
|
106
|
+
out.push(` duration_ms: ${Math.round(r.duration)}`);
|
|
107
|
+
if (r.tokens) {
|
|
108
|
+
out.push(` tokens: { input: ${r.tokens.input}, output: ${r.tokens.output} }`);
|
|
109
|
+
}
|
|
110
|
+
if (r.costUsd != null) {
|
|
111
|
+
const source = r.costSource ?? "table";
|
|
112
|
+
out.push(` cost_usd: ${formatUsd(r.costUsd)}`);
|
|
113
|
+
out.push(` cost_source: ${source}`);
|
|
114
|
+
}
|
|
115
|
+
if (r.events && r.events.length) {
|
|
116
|
+
out.push(` timeline:`);
|
|
117
|
+
for (const e of r.events) {
|
|
118
|
+
out.push(...renderTimelineEvent(e));
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
return out;
|
|
122
|
+
}
|
|
123
|
+
function renderTimelineEvent(e) {
|
|
124
|
+
const out = [];
|
|
125
|
+
out.push(` - kind: ${e.kind}`);
|
|
126
|
+
out.push(` name: "${escapeYaml(e.name)}"`);
|
|
127
|
+
out.push(` start_ms: ${Math.round(e.startMs)}`);
|
|
128
|
+
out.push(` duration_ms: ${Math.round(e.durationMs)}`);
|
|
129
|
+
if (e.tokens) {
|
|
130
|
+
out.push(` tokens: { input: ${e.tokens.input}, output: ${e.tokens.output} }`);
|
|
131
|
+
}
|
|
132
|
+
if (e.cachedInputTokens != null && e.cachedInputTokens > 0) {
|
|
133
|
+
out.push(` cached_input_tokens: ${e.cachedInputTokens}`);
|
|
134
|
+
}
|
|
135
|
+
if (e.cost?.totalUsd != null) {
|
|
136
|
+
out.push(` cost_usd: ${formatUsd(e.cost.totalUsd)}`);
|
|
137
|
+
out.push(` cost_source: ${e.cost.source}`);
|
|
138
|
+
}
|
|
139
|
+
if (e.runIndex != null) {
|
|
140
|
+
out.push(` run_index: ${e.runIndex}`);
|
|
141
|
+
}
|
|
142
|
+
if (e.error) {
|
|
143
|
+
out.push(` error: "${escapeYaml(e.error)}"`);
|
|
144
|
+
}
|
|
145
|
+
return out;
|
|
146
|
+
}
|
|
147
|
+
function formatUsd(n) {
|
|
148
|
+
if (n === 0)
|
|
149
|
+
return "0";
|
|
150
|
+
// Up to 6 decimal places, but trim trailing zeros for compactness
|
|
151
|
+
return Number(n.toFixed(6)).toString();
|
|
152
|
+
}
|
|
153
|
+
function escapeYaml(s) {
|
|
154
|
+
return s.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
|
|
155
|
+
}
|
|
83
156
|
export async function writeReport(content, timestamp, name, dimensions) {
|
|
84
157
|
const reportsDir = join(process.cwd(), ".reports");
|
|
85
158
|
await mkdir(reportsDir, { recursive: true });
|
package/dist/reports.d.ts
CHANGED
|
@@ -9,6 +9,32 @@ export interface ParsedSuiteResult {
|
|
|
9
9
|
response?: string;
|
|
10
10
|
}>;
|
|
11
11
|
}
|
|
12
|
+
export interface ParsedTimelineEvent {
|
|
13
|
+
kind: "model" | "tool";
|
|
14
|
+
name: string;
|
|
15
|
+
startMs: number;
|
|
16
|
+
durationMs: number;
|
|
17
|
+
tokens?: {
|
|
18
|
+
input: number;
|
|
19
|
+
output: number;
|
|
20
|
+
};
|
|
21
|
+
cachedInputTokens?: number;
|
|
22
|
+
costUsd?: number;
|
|
23
|
+
costSource?: string;
|
|
24
|
+
runIndex?: number;
|
|
25
|
+
error?: string;
|
|
26
|
+
}
|
|
27
|
+
/** One entry of a report's `scenes:` block, as recovered by `parseScenes`. */
export interface ParsedScene {
|
|
28
|
+
/** Scene prompt as written in the report (the reporter truncates long prompts to 80 characters). */
prompt: string;
|
|
29
|
+
/** Scene duration in milliseconds, from `duration_ms`. */
durationMs?: number;
|
|
30
|
+
/** Scene token counts, from the inline `tokens: { input, output }` map. */
tokens?: {
|
|
31
|
+
input: number;
|
|
32
|
+
output: number;
|
|
33
|
+
};
|
|
34
|
+
/** Scene cost in USD, from `cost_usd`. */
costUsd?: number;
|
|
35
|
+
/** Raw `cost_source` value from the report. */
costSource?: string;
|
|
36
|
+
/** Timeline events listed under the scene's `timeline:` key; absent when the scene has none. */
timeline?: ParsedTimelineEvent[];
|
|
37
|
+
}
|
|
12
38
|
export interface ParsedReport {
|
|
13
39
|
name?: string;
|
|
14
40
|
systemPromptHash?: string;
|
|
@@ -28,6 +54,10 @@ export interface ParsedReport {
|
|
|
28
54
|
timestamp: string;
|
|
29
55
|
averageInputTokensPerCase?: number;
|
|
30
56
|
averageOutputTokensPerCase?: number;
|
|
57
|
+
totalInputTokens?: number;
|
|
58
|
+
totalOutputTokens?: number;
|
|
59
|
+
totalCostUsd?: number;
|
|
60
|
+
scenes?: ParsedScene[];
|
|
31
61
|
suites?: ParsedSuiteResult[];
|
|
32
62
|
source: string;
|
|
33
63
|
}
|
|
@@ -44,6 +74,13 @@ export declare function parseFailedCases(content: string): Array<{
|
|
|
44
74
|
}>;
|
|
45
75
|
export declare function parseDimensions(content: string): Record<string, string> | undefined;
|
|
46
76
|
export declare function parseSuites(content: string): ParsedSuiteResult[] | undefined;
|
|
77
|
+
/**
|
|
78
|
+
* Parse the `scenes:` block (per-scene tokens/cost + timeline waterfall) from a
|
|
79
|
+
* report. The emitted format is fixed (see reporter.ts `renderSceneObservability`),
|
|
80
|
+
* so this hand-parses by indentation: scenes start at 8 spaces, scene fields at
|
|
81
|
+
* 10, timeline events at 14, event fields at 16.
|
|
82
|
+
*/
|
|
83
|
+
export declare function parseScenes(content: string): ParsedScene[] | undefined;
|
|
47
84
|
export declare function parseReport(content: string, source: string): ParsedReport;
|
|
48
85
|
export declare function findReports(dir: string, depth?: number): Promise<string[]>;
|
|
49
86
|
export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
|
package/dist/reports.js
CHANGED
|
@@ -124,6 +124,124 @@ export function parseSuites(content) {
|
|
|
124
124
|
suites.push(current);
|
|
125
125
|
return suites.length > 0 ? suites : undefined;
|
|
126
126
|
}
|
|
127
|
+
function parseTokens(raw) {
|
|
128
|
+
const m = raw.match(/input:\s*(\d+),\s*output:\s*(\d+)/);
|
|
129
|
+
if (!m)
|
|
130
|
+
return undefined;
|
|
131
|
+
return { input: parseInt(m[1], 10), output: parseInt(m[2], 10) };
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Parse the `scenes:` block (per-scene tokens/cost + timeline waterfall) from a
|
|
135
|
+
* report. The emitted format is fixed (see reporter.ts `renderSceneObservability`),
|
|
136
|
+
* so this hand-parses by indentation: scenes start at 8 spaces, scene fields at
|
|
137
|
+
* 10, timeline events at 14, event fields at 16.
|
|
138
|
+
*/
|
|
139
|
+
export function parseScenes(content) {
|
|
140
|
+
const lines = content.split("\n");
|
|
141
|
+
const startIdx = lines.findIndex((l) => l === " scenes:");
|
|
142
|
+
if (startIdx === -1)
|
|
143
|
+
return undefined;
|
|
144
|
+
const scenes = [];
|
|
145
|
+
let scene;
|
|
146
|
+
let inTimeline = false;
|
|
147
|
+
let event;
|
|
148
|
+
const pushEvent = () => {
|
|
149
|
+
if (event && scene) {
|
|
150
|
+
(scene.timeline ??= []).push(event);
|
|
151
|
+
event = undefined;
|
|
152
|
+
}
|
|
153
|
+
};
|
|
154
|
+
const pushScene = () => {
|
|
155
|
+
pushEvent();
|
|
156
|
+
if (scene)
|
|
157
|
+
scenes.push(scene);
|
|
158
|
+
scene = undefined;
|
|
159
|
+
inTimeline = false;
|
|
160
|
+
};
|
|
161
|
+
for (let i = startIdx + 1; i < lines.length; i++) {
|
|
162
|
+
const line = lines[i];
|
|
163
|
+
if (line.trim() === "")
|
|
164
|
+
continue;
|
|
165
|
+
const indent = line.length - line.trimStart().length;
|
|
166
|
+
// A new top-level agent field (<= 4 spaces, not part of scenes) ends the block.
|
|
167
|
+
if (indent <= 4)
|
|
168
|
+
break;
|
|
169
|
+
const sceneStart = line.match(/^ - prompt: "(.*)"$/);
|
|
170
|
+
if (sceneStart) {
|
|
171
|
+
pushScene();
|
|
172
|
+
scene = { prompt: sceneStart[1].replace(/\\"/g, '"').replace(/\\n/g, "\n") };
|
|
173
|
+
continue;
|
|
174
|
+
}
|
|
175
|
+
if (!scene)
|
|
176
|
+
continue;
|
|
177
|
+
const eventStart = line.match(/^ - kind: (model|tool)$/);
|
|
178
|
+
if (eventStart) {
|
|
179
|
+
pushEvent();
|
|
180
|
+
event = { kind: eventStart[1], name: "", startMs: 0, durationMs: 0 };
|
|
181
|
+
inTimeline = true;
|
|
182
|
+
continue;
|
|
183
|
+
}
|
|
184
|
+
if (line.match(/^ timeline:$/)) {
|
|
185
|
+
inTimeline = true;
|
|
186
|
+
continue;
|
|
187
|
+
}
|
|
188
|
+
const trimmed = line.trim();
|
|
189
|
+
const target = inTimeline && event ? "event" : "scene";
|
|
190
|
+
const kv = trimmed.match(/^([a-z_]+):\s*(.*)$/);
|
|
191
|
+
if (!kv)
|
|
192
|
+
continue;
|
|
193
|
+
const [, key, value] = kv;
|
|
194
|
+
if (target === "event" && event) {
|
|
195
|
+
switch (key) {
|
|
196
|
+
case "name":
|
|
197
|
+
event.name = value.replace(/^"|"$/g, "").replace(/\\"/g, '"');
|
|
198
|
+
break;
|
|
199
|
+
case "start_ms":
|
|
200
|
+
event.startMs = parseFloat(value);
|
|
201
|
+
break;
|
|
202
|
+
case "duration_ms":
|
|
203
|
+
event.durationMs = parseFloat(value);
|
|
204
|
+
break;
|
|
205
|
+
case "tokens":
|
|
206
|
+
event.tokens = parseTokens(value);
|
|
207
|
+
break;
|
|
208
|
+
case "cached_input_tokens":
|
|
209
|
+
event.cachedInputTokens = parseInt(value, 10);
|
|
210
|
+
break;
|
|
211
|
+
case "cost_usd":
|
|
212
|
+
event.costUsd = parseFloat(value);
|
|
213
|
+
break;
|
|
214
|
+
case "cost_source":
|
|
215
|
+
event.costSource = value;
|
|
216
|
+
break;
|
|
217
|
+
case "run_index":
|
|
218
|
+
event.runIndex = parseInt(value, 10);
|
|
219
|
+
break;
|
|
220
|
+
case "error":
|
|
221
|
+
event.error = value.replace(/^"|"$/g, "").replace(/\\"/g, '"');
|
|
222
|
+
break;
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
else if (scene) {
|
|
226
|
+
switch (key) {
|
|
227
|
+
case "duration_ms":
|
|
228
|
+
scene.durationMs = parseFloat(value);
|
|
229
|
+
break;
|
|
230
|
+
case "tokens":
|
|
231
|
+
scene.tokens = parseTokens(value);
|
|
232
|
+
break;
|
|
233
|
+
case "cost_usd":
|
|
234
|
+
scene.costUsd = parseFloat(value);
|
|
235
|
+
break;
|
|
236
|
+
case "cost_source":
|
|
237
|
+
scene.costSource = value;
|
|
238
|
+
break;
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
pushScene();
|
|
243
|
+
return scenes.length > 0 ? scenes : undefined;
|
|
244
|
+
}
|
|
127
245
|
export function parseReport(content, source) {
|
|
128
246
|
const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
|
|
129
247
|
const avgIn = extractField(content, "average_input_tokens_per_case");
|
|
@@ -158,9 +276,17 @@ export function parseReport(content, source) {
|
|
|
158
276
|
timestamp: extractField(content, "timestamp") ?? "",
|
|
159
277
|
averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
|
|
160
278
|
averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
|
|
279
|
+
totalInputTokens: optNum("total_input_tokens"),
|
|
280
|
+
totalOutputTokens: optNum("total_output_tokens"),
|
|
281
|
+
totalCostUsd: optNum("total_cost_usd"),
|
|
282
|
+
scenes: parseScenes(content),
|
|
161
283
|
suites: parseSuites(content),
|
|
162
284
|
source,
|
|
163
285
|
};
|
|
286
|
+
function optNum(key) {
|
|
287
|
+
const v = extractField(content, key);
|
|
288
|
+
return v != null ? parseFloat(v) : undefined;
|
|
289
|
+
}
|
|
164
290
|
}
|
|
165
291
|
export async function findReports(dir, depth = 0) {
|
|
166
292
|
if (depth > 6)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { AgentResponse } from "./types";
|
|
2
|
+
/**
|
|
3
|
+
* Serialize an arbitrary agent value to the string view the judge model and
|
|
4
|
+
* the text matchers consume. Strings pass through untouched; everything else
|
|
5
|
+
* is JSON. This is the ONLY place a structured value is forced to a string,
|
|
6
|
+
* and it happens lazily — never before a matcher actually needs text.
|
|
7
|
+
*/
|
|
8
|
+
export declare function serializeValue(value: unknown): string;
|
|
9
|
+
/**
|
|
10
|
+
* The agent's native output — the source of truth for deterministic,
|
|
11
|
+
* structural assertions. Tolerates a legacy `{ text }`-only response (no
|
|
12
|
+
* `value`) so executors can migrate incrementally.
|
|
13
|
+
*/
|
|
14
|
+
export declare function resolveValue<T>(response: AgentResponse<T>): T | string | undefined;
|
|
15
|
+
/**
|
|
16
|
+
* The string view for the judge and text matchers. An explicit `text` wins
|
|
17
|
+
* (it's the enriched projection the executor chose to expose); otherwise we
|
|
18
|
+
* serialize `value` on demand.
|
|
19
|
+
*/
|
|
20
|
+
export declare function resolveText<T>(response: AgentResponse<T>): string;
|
|
21
|
+
/**
|
|
22
|
+
* Walk a dot-path (with numeric array indices) into an arbitrary object.
|
|
23
|
+
* Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
|
|
24
|
+
*/
|
|
25
|
+
export declare function navigatePath(root: unknown, path: string): unknown;
|
package/dist/resolve.js
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Serialize an arbitrary agent value to the string view the judge model and
 * the text matchers consume. Strings pass through untouched, `null` and
 * `undefined` become "", and everything else is pretty-printed JSON.
 *
 * Values JSON cannot represent fall back to `String(value)`: both those for
 * which `JSON.stringify` throws (e.g. BigInt, circular structures) and those
 * for which it silently returns `undefined` (functions, symbols). The result
 * is therefore always a string.
 *
 * @param value Any agent output.
 * @returns The string view of `value`.
 */
export function serializeValue(value) {
    if (typeof value === "string")
        return value;
    if (value === null || value === undefined)
        return "";
    try {
        const json = JSON.stringify(value, null, 2);
        // stringify yields undefined (without throwing) for functions and symbols.
        return typeof json === "string" ? json : String(value);
    }
    catch {
        return String(value);
    }
}
|
|
19
|
+
/**
 * Return the agent's native output, the source of truth for deterministic,
 * structural assertions. `value` is used whenever it is not `undefined`
 * (so `null`, `0` and "" count as real values); otherwise the legacy `text`
 * field is returned, which lets `{ text }`-only executors migrate
 * incrementally.
 */
export function resolveValue(response) {
    return response.value === undefined ? response.text : response.value;
}
|
|
29
|
+
/**
 * Return the string view used by the judge and the text matchers. When the
 * response carries a string `text` it is used as-is (it is the projection the
 * executor chose to expose); otherwise `value` is serialized on demand.
 */
export function resolveText(response) {
    const { text } = response;
    return typeof text === "string" ? text : serializeValue(response.value);
}
|
|
39
|
+
/**
 * Walk a dot-path (with numeric array indices) into an arbitrary object.
 * e.g. "plan_items.0.options".
 *
 * Array segments must be plain decimal digits. Anything else — an empty
 * segment, surrounding whitespace, a sign, exponent or hex notation — is
 * treated as a missing segment rather than being coerced to an index.
 *
 * @param root Value to navigate.
 * @param path Dot-separated path.
 * @returns The value at `path`, or `undefined` if any segment is missing.
 */
export function navigatePath(root, path) {
    const arrayIndex = /^\d+$/;
    let cur = root;
    for (const seg of path.split(".")) {
        if (cur == null)
            return undefined;
        if (Array.isArray(cur)) {
            // Number("") is 0 and Number("1e1") is 10, so validate the text first.
            if (!arrayIndex.test(seg))
                return undefined;
            cur = cur[Number(seg)];
        }
        else if (typeof cur === "object" && seg in cur) {
            cur = cur[seg];
        }
        else {
            return undefined;
        }
    }
    return cur;
}
|
package/dist/runner.d.ts
CHANGED
|
@@ -1,4 +1,13 @@
|
|
|
1
1
|
import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
|
|
2
2
|
import type { JudgeConfig } from "./config";
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
/**
|
|
4
|
+
* Extract a named field from an agent response for assertion.
|
|
5
|
+
* - "response" / "value" → the native structured value (deterministic matchers)
|
|
6
|
+
* - "text" → the serialized/judge view (lazy; text matchers)
|
|
7
|
+
* - "metadata"/"refusal" → the corresponding response property
|
|
8
|
+
* - dot-path → navigated into the structured value first
|
|
9
|
+
* (e.g. "plan_items.0.options"), falling back to
|
|
10
|
+
* metadata so existing metadata paths keep resolving.
|
|
11
|
+
*/
|
|
12
|
+
export declare function extractField<T>(response: AgentResponse<T>, field: string): unknown;
|
|
13
|
+
export declare function executeScene<T = string>(executor: AgentExecutor<T>, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult<T>>;
|