@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/preview.js CHANGED
@@ -189,6 +189,91 @@ function renderFailedCases(cases) {
189
189
  </ul>
190
190
  </details>`;
191
191
  }
192
+ const WF_MODEL = "#38bdf8";
193
+ const WF_TOOL = "#facc15";
194
+ const WF_ERROR = "#f87171";
195
/**
 * Format a USD amount for the HTML preview.
 *
 * Amounts are rounded to four decimal places with trailing zeros trimmed
 * (`0.5` -> "$0.5"). A non-zero amount that is too small to survive that
 * rounding is printed with just enough decimals to keep two significant
 * digits, so a tiny per-event cost is not displayed as an exact "$0".
 *
 * @param n Amount in USD.
 * @returns The amount prefixed with "$".
 */
function fmtUsdHtml(n) {
    if (n === 0)
        return "$0";
    const rounded = Number(n.toFixed(4));
    // NaN/Infinity also land here (they are never equal to 0) and print as before.
    if (rounded !== 0)
        return "$" + rounded.toString();
    // |n| < 0.00005: widen the precision instead of collapsing to zero.
    // toFixed is used (not toString) so the result never switches to exponent notation.
    const decimals = Math.min(20, Math.ceil(-Math.log10(Math.abs(n))) + 1);
    return "$" + n.toFixed(decimals).replace(/0+$/, "");
}
200
+ /**
201
+ * Chrome-DevTools-style waterfall for a report's per-scene timelines. Bars are
202
+ * absolutely positioned within a track by start_ms / duration_ms. Returns "" if
203
+ * the report carries no timeline data (older reports / non-traced executors).
204
+ */
205
+ function renderWaterfallHtml(report) {
206
+ const scenes = (report.scenes ?? []).filter((s) => s.timeline && s.timeline.length > 0);
207
+ if (scenes.length === 0)
208
+ return "";
209
+ const sceneBlocks = scenes
210
+ .map((scene) => {
211
+ const events = scene.timeline;
212
+ const t0 = Math.min(...events.map((e) => e.startMs));
213
+ const tEnd = Math.max(...events.map((e) => e.startMs + e.durationMs));
214
+ const span = Math.max(1, tEnd - t0);
215
+ const rows = events
216
+ .map((e) => {
217
+ const left = ((e.startMs - t0) / span) * 100;
218
+ const width = Math.max(0.6, (e.durationMs / span) * 100);
219
+ const bg = e.error ? WF_ERROR : e.kind === "model" ? WF_MODEL : WF_TOOL;
220
+ const icon = e.kind === "model" ? "◆" : "▸";
221
+ const tip = [
222
+ `${e.kind}: ${e.name}`,
223
+ `start ${Math.round(e.startMs)}ms · ${Math.round(e.durationMs)}ms`,
224
+ e.tokens ? `${e.tokens.input}→${e.tokens.output} tok` : "",
225
+ e.cachedInputTokens ? `${e.cachedInputTokens} cached` : "",
226
+ e.costUsd != null ? fmtUsdHtml(e.costUsd) : "",
227
+ e.error ? `error: ${e.error}` : "",
228
+ ]
229
+ .filter(Boolean)
230
+ .join(" · ");
231
+ const cost = e.costUsd != null ? fmtUsdHtml(e.costUsd) : "";
232
+ return `
233
+ <div class="flex items-center gap-2 text-[11px] leading-5">
234
+ <span class="w-44 shrink-0 truncate ${e.error ? "text-red-400" : "text-zinc-400"}" title="${escHtml(e.name)}">
235
+ <span style="color:${bg}">${icon}</span> ${escHtml(e.name)}
236
+ </span>
237
+ <div class="relative flex-1 h-3 bg-zinc-800/40 rounded">
238
+ <div class="absolute top-0 h-3 rounded" style="left:${left.toFixed(2)}%;width:${width.toFixed(2)}%;background:${bg}" title="${escHtml(tip)}"></div>
239
+ </div>
240
+ <span class="w-16 shrink-0 text-right text-zinc-500">${Math.round(e.durationMs)}ms</span>
241
+ <span class="w-16 shrink-0 text-right text-zinc-500">${cost}</span>
242
+ </div>`;
243
+ })
244
+ .join("\n");
245
+ const meta = [
246
+ scene.tokens ? `${scene.tokens.input}→${scene.tokens.output} tok` : "",
247
+ scene.costUsd != null ? fmtUsdHtml(scene.costUsd) : "",
248
+ scene.costSource ? scene.costSource : "",
249
+ scene.durationMs != null ? `${Math.round(scene.durationMs)}ms` : "",
250
+ ]
251
+ .filter(Boolean)
252
+ .join(" · ");
253
+ return `
254
+ <div>
255
+ <div class="flex items-center justify-between mb-1.5">
256
+ <span class="text-xs text-zinc-300 truncate" title="${escHtml(scene.prompt)}">${escHtml(scene.prompt)}</span>
257
+ <span class="text-[11px] text-zinc-500 shrink-0 ml-3">${escHtml(meta)}</span>
258
+ </div>
259
+ <div class="space-y-1">${rows}</div>
260
+ </div>`;
261
+ })
262
+ .join("\n");
263
+ return `
264
+ <details class="mt-2" open>
265
+ <summary class="text-xs text-sky-400 cursor-pointer hover:text-sky-300 select-none">
266
+ waterfall &middot; ${scenes.length} scene${scenes.length !== 1 ? "s" : ""}
267
+ </summary>
268
+ <div class="mt-3 mb-2 pl-3 border-l border-zinc-800 space-y-5">
269
+ <div class="flex gap-4 text-[10px] text-zinc-500">
270
+ <span><span style="color:${WF_MODEL}">◆</span> model</span>
271
+ <span><span style="color:${WF_TOOL}">▸</span> tool</span>
272
+ </div>
273
+ ${sceneBlocks}
274
+ </div>
275
+ </details>`;
276
+ }
192
277
  function renderRunRow(entry, idx) {
193
278
  const { report, delta, diffLines } = entry;
194
279
  const pct = report.successRate * 100;
@@ -233,6 +318,7 @@ function renderRunRow(entry, idx) {
233
318
  </div>
234
319
  <div class="ml-10 mt-0.5 flex gap-3 flex-wrap">${dimTags}</div>
235
320
  ${diffHtml}
321
+ <div class="ml-10">${renderWaterfallHtml(report)}</div>
236
322
  </div>`;
237
323
  }
238
324
  // ---------------------------------------------------------------------------
@@ -990,6 +1076,12 @@ function renderSingleRun(report) {
990
1076
  <p class="text-zinc-300">${Math.round(report.averageOutputTokensPerCase)}</p>
991
1077
  </div>`
992
1078
  : ""}
1079
+ ${report.totalCostUsd != null
1080
+ ? `<div>
1081
+ <span class="text-zinc-500">Total Cost</span>
1082
+ <p class="text-zinc-300">${fmtUsdHtml(report.totalCostUsd)}${report.totalInputTokens != null ? ` <span class="text-zinc-600">· ${report.totalInputTokens}→${report.totalOutputTokens} tok</span>` : ""}</p>
1083
+ </div>`
1084
+ : ""}
993
1085
  ${report.tools && report.tools.length > 0
994
1086
  ? `<div>
995
1087
  <span class="text-zinc-500">Tools</span>
@@ -997,6 +1089,7 @@ function renderSingleRun(report) {
997
1089
  </div>`
998
1090
  : ""}
999
1091
  </div>
1092
+ ${renderWaterfallHtml(report)}
1000
1093
  ${failedSection}
1001
1094
  </div>`;
1002
1095
  }
@@ -0,0 +1,32 @@
1
+ export interface ModelPrice {
2
+ /** USD per 1M input tokens */
3
+ input: number;
4
+ /** USD per 1M output tokens */
5
+ output: number;
6
+ /**
7
+ * USD per 1M cached (prompt-cache-hit) input tokens. When omitted, cached
8
+ * tokens are billed at `DEFAULT_CACHE_MULTIPLIER` × the input rate.
9
+ */
10
+ cachedInput?: number;
11
+ }
12
+ /** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
13
+ export declare const DEFAULT_CACHE_MULTIPLIER = 0.1;
14
+ export type CostSource = "provider" | "table" | "unavailable";
15
+ export interface CostBreakdown {
16
+ inputUsd?: number;
17
+ outputUsd?: number;
18
+ totalUsd?: number;
19
+ source: CostSource;
20
+ }
21
+ export declare function setPricingOverrides(table?: Record<string, ModelPrice>): void;
22
+ export declare function lookupPrice(model: string | undefined): ModelPrice | undefined;
23
+ export interface ComputeCostInput {
24
+ model?: string;
25
+ inputTokens?: number;
26
+ outputTokens?: number;
27
+ /** Cache-hit input tokens (subset of inputTokens), billed at the cached rate. */
28
+ cachedInputTokens?: number;
29
+ /** USD cost the provider already reported (takes precedence) */
30
+ providerCost?: number;
31
+ }
32
+ export declare function computeCost(input: ComputeCostInput): CostBreakdown;
@@ -0,0 +1,48 @@
1
+ import { readFileSync } from "fs";
2
+ import { fileURLToPath } from "url";
3
+ import { dirname, join } from "path";
4
+ /** Fraction of the input rate charged for cache-hit tokens when no explicit rate is set. */
5
+ export const DEFAULT_CACHE_MULTIPLIER = 0.1;
6
+ const here = dirname(fileURLToPath(import.meta.url));
7
+ const builtIn = JSON.parse(readFileSync(join(here, "models.json"), "utf-8"));
8
+ let overrides = {};
9
+ export function setPricingOverrides(table) {
10
+ overrides = table ?? {};
11
+ }
12
+ export function lookupPrice(model) {
13
+ if (!model)
14
+ return undefined;
15
+ if (overrides[model])
16
+ return overrides[model];
17
+ if (builtIn[model])
18
+ return builtIn[model];
19
+ // Loose suffix/prefix match — pick the longest matching key
20
+ const lowered = model.toLowerCase();
21
+ const keys = Object.keys({ ...builtIn, ...overrides })
22
+ .filter((k) => lowered.includes(k.toLowerCase()) || k.toLowerCase().includes(lowered))
23
+ .sort((a, b) => b.length - a.length);
24
+ if (keys.length > 0) {
25
+ return overrides[keys[0]] ?? builtIn[keys[0]];
26
+ }
27
+ return undefined;
28
+ }
29
+ export function computeCost(input) {
30
+ if (input.providerCost != null && Number.isFinite(input.providerCost)) {
31
+ return { totalUsd: input.providerCost, source: "provider" };
32
+ }
33
+ const price = lookupPrice(input.model);
34
+ if (!price)
35
+ return { source: "unavailable" };
36
+ const totalInput = input.inputTokens ?? 0;
37
+ const cached = Math.min(input.cachedInputTokens ?? 0, totalInput);
38
+ const uncached = totalInput - cached;
39
+ const cachedRate = price.cachedInput ?? price.input * DEFAULT_CACHE_MULTIPLIER;
40
+ const inputUsd = (uncached / 1_000_000) * price.input + (cached / 1_000_000) * cachedRate;
41
+ const outputUsd = ((input.outputTokens ?? 0) / 1_000_000) * price.output;
42
+ return {
43
+ inputUsd,
44
+ outputUsd,
45
+ totalUsd: inputUsd + outputUsd,
46
+ source: "table",
47
+ };
48
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "claude-opus-4-7": { "input": 15, "output": 75 },
3
+ "claude-opus-4-6": { "input": 15, "output": 75 },
4
+ "claude-opus-4-5": { "input": 15, "output": 75 },
5
+ "claude-sonnet-4-6": { "input": 3, "output": 15 },
6
+ "claude-sonnet-4-5": { "input": 3, "output": 15 },
7
+ "claude-haiku-4-5": { "input": 1, "output": 5 },
8
+ "claude-3-5-sonnet-20241022": { "input": 3, "output": 15 },
9
+ "claude-3-5-haiku-20241022": { "input": 0.8, "output": 4 },
10
+ "claude-3-opus-20240229": { "input": 15, "output": 75 },
11
+ "gpt-4o": { "input": 2.5, "output": 10 },
12
+ "gpt-4o-mini": { "input": 0.15, "output": 0.6 },
13
+ "gpt-4.1": { "input": 2, "output": 8 },
14
+ "gpt-4.1-mini": { "input": 0.4, "output": 1.6 },
15
+ "gpt-4.1-nano": { "input": 0.1, "output": 0.4 },
16
+ "gpt-5": { "input": 1.25, "output": 10 },
17
+ "gpt-5-mini": { "input": 0.25, "output": 2 },
18
+ "o1": { "input": 15, "output": 60 },
19
+ "o1-mini": { "input": 1.1, "output": 4.4 },
20
+ "o3-mini": { "input": 1.1, "output": 4.4 }
21
+ }
@@ -1,4 +1,4 @@
1
1
  import type { AgentReport } from "./types";
2
- export declare function formatReport(report: AgentReport): string;
2
+ export declare function formatReport(report: AgentReport<unknown>): string;
3
3
  export declare function writeReport(content: string, timestamp: string, name?: string, dimensions?: Record<string, string>): Promise<string>;
4
4
  export declare function writeDiffEntry(hash: string, systemPrompt: string, tools: string[], model?: string): Promise<void>;
package/dist/reporter.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { access, mkdir, writeFile } from "fs/promises";
2
2
  import { createHash } from "crypto";
3
3
  import { join } from "path";
4
+ import { resolveText } from "./resolve";
4
5
  export function formatReport(report) {
5
6
  const lines = ["agent:"];
6
7
  if (report.name)
@@ -24,8 +25,9 @@ export function formatReport(report) {
24
25
  lines.push(` reason: "${reason}"`);
25
26
  }
26
27
  const result = report.results.find((r) => r.prompt === c);
27
- if (result?.response.text) {
28
- const escaped = result.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
28
+ const responseText = result ? resolveText(result.response) : "";
29
+ if (responseText) {
30
+ const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
29
31
  lines.push(` response: "${escaped}"`);
30
32
  }
31
33
  }
@@ -51,8 +53,9 @@ export function formatReport(report) {
51
53
  if (r.error) {
52
54
  lines.push(` reason: "${r.error}"`);
53
55
  }
54
- if (r.response.text) {
55
- const escaped = r.response.text.replace(/"/g, '\\"').replace(/\n/g, '\\n');
56
+ const responseText = resolveText(r.response);
57
+ if (responseText) {
58
+ const escaped = responseText.replace(/"/g, '\\"').replace(/\n/g, '\\n');
56
59
  lines.push(` response: "${escaped}"`);
57
60
  }
58
61
  }
@@ -78,8 +81,78 @@ export function formatReport(report) {
78
81
  if (report.averageOutputTokensPerCase != null) {
79
82
  lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
80
83
  }
84
+ if (report.totalInputTokens != null) {
85
+ lines.push(` total_input_tokens: ${report.totalInputTokens}`);
86
+ }
87
+ if (report.totalOutputTokens != null) {
88
+ lines.push(` total_output_tokens: ${report.totalOutputTokens}`);
89
+ }
90
+ if (report.totalCostUsd != null) {
91
+ lines.push(` total_cost_usd: ${formatUsd(report.totalCostUsd)}`);
92
+ }
93
+ const observedScenes = report.results.filter((r) => r.tokens || r.costUsd != null || (r.events && r.events.length));
94
+ if (observedScenes.length > 0) {
95
+ lines.push(` scenes:`);
96
+ for (const r of observedScenes) {
97
+ lines.push(...renderSceneObservability(r));
98
+ }
99
+ }
81
100
  return lines.join("\n");
82
101
  }
102
+ function renderSceneObservability(r) {
103
+ const out = [];
104
+ const promptLabel = r.prompt.length > 80 ? r.prompt.slice(0, 77) + "..." : r.prompt;
105
+ out.push(` - prompt: "${escapeYaml(promptLabel)}"`);
106
+ out.push(` duration_ms: ${Math.round(r.duration)}`);
107
+ if (r.tokens) {
108
+ out.push(` tokens: { input: ${r.tokens.input}, output: ${r.tokens.output} }`);
109
+ }
110
+ if (r.costUsd != null) {
111
+ const source = r.costSource ?? "table";
112
+ out.push(` cost_usd: ${formatUsd(r.costUsd)}`);
113
+ out.push(` cost_source: ${source}`);
114
+ }
115
+ if (r.events && r.events.length) {
116
+ out.push(` timeline:`);
117
+ for (const e of r.events) {
118
+ out.push(...renderTimelineEvent(e));
119
+ }
120
+ }
121
+ return out;
122
+ }
123
+ function renderTimelineEvent(e) {
124
+ const out = [];
125
+ out.push(` - kind: ${e.kind}`);
126
+ out.push(` name: "${escapeYaml(e.name)}"`);
127
+ out.push(` start_ms: ${Math.round(e.startMs)}`);
128
+ out.push(` duration_ms: ${Math.round(e.durationMs)}`);
129
+ if (e.tokens) {
130
+ out.push(` tokens: { input: ${e.tokens.input}, output: ${e.tokens.output} }`);
131
+ }
132
+ if (e.cachedInputTokens != null && e.cachedInputTokens > 0) {
133
+ out.push(` cached_input_tokens: ${e.cachedInputTokens}`);
134
+ }
135
+ if (e.cost?.totalUsd != null) {
136
+ out.push(` cost_usd: ${formatUsd(e.cost.totalUsd)}`);
137
+ out.push(` cost_source: ${e.cost.source}`);
138
+ }
139
+ if (e.runIndex != null) {
140
+ out.push(` run_index: ${e.runIndex}`);
141
+ }
142
+ if (e.error) {
143
+ out.push(` error: "${escapeYaml(e.error)}"`);
144
+ }
145
+ return out;
146
+ }
147
+ function formatUsd(n) {
148
+ if (n === 0)
149
+ return "0";
150
+ // Up to 6 decimal places, but trim trailing zeros for compactness
151
+ return Number(n.toFixed(6)).toString();
152
+ }
153
/**
 * Escape a string so it can sit inside a double-quoted scalar on a single
 * report line.
 *
 * Backslashes are escaped first so the escapes added afterwards are not
 * doubled. Carriage returns are escaped as well as newlines: a raw CR left in
 * the output (e.g. from a CRLF prompt) is a line break inside the quoted
 * scalar and stops line-based readers from matching the line.
 *
 * @param s Raw text.
 * @returns Text with `\`, `"`, LF and CR replaced by `\\`, `\"`, `\n`, `\r`.
 */
function escapeYaml(s) {
    return s
        .replace(/\\/g, "\\\\")
        .replace(/"/g, '\\"')
        .replace(/\n/g, "\\n")
        .replace(/\r/g, "\\r");
}
83
156
  export async function writeReport(content, timestamp, name, dimensions) {
84
157
  const reportsDir = join(process.cwd(), ".reports");
85
158
  await mkdir(reportsDir, { recursive: true });
package/dist/reports.d.ts CHANGED
@@ -9,6 +9,32 @@ export interface ParsedSuiteResult {
9
9
  response?: string;
10
10
  }>;
11
11
  }
12
+ export interface ParsedTimelineEvent {
13
+ kind: "model" | "tool";
14
+ name: string;
15
+ startMs: number;
16
+ durationMs: number;
17
+ tokens?: {
18
+ input: number;
19
+ output: number;
20
+ };
21
+ cachedInputTokens?: number;
22
+ costUsd?: number;
23
+ costSource?: string;
24
+ runIndex?: number;
25
+ error?: string;
26
+ }
27
+ export interface ParsedScene {
28
+ prompt: string;
29
+ durationMs?: number;
30
+ tokens?: {
31
+ input: number;
32
+ output: number;
33
+ };
34
+ costUsd?: number;
35
+ costSource?: string;
36
+ timeline?: ParsedTimelineEvent[];
37
+ }
12
38
  export interface ParsedReport {
13
39
  name?: string;
14
40
  systemPromptHash?: string;
@@ -28,6 +54,10 @@ export interface ParsedReport {
28
54
  timestamp: string;
29
55
  averageInputTokensPerCase?: number;
30
56
  averageOutputTokensPerCase?: number;
57
+ totalInputTokens?: number;
58
+ totalOutputTokens?: number;
59
+ totalCostUsd?: number;
60
+ scenes?: ParsedScene[];
31
61
  suites?: ParsedSuiteResult[];
32
62
  source: string;
33
63
  }
@@ -44,6 +74,13 @@ export declare function parseFailedCases(content: string): Array<{
44
74
  }>;
45
75
  export declare function parseDimensions(content: string): Record<string, string> | undefined;
46
76
  export declare function parseSuites(content: string): ParsedSuiteResult[] | undefined;
77
+ /**
78
+ * Parse the `scenes:` block (per-scene tokens/cost + timeline waterfall) from a
79
+ * report. The emitted format is fixed (see reporter.ts `renderSceneObservability`),
80
+ * so this hand-parses by indentation: scenes start at 8 spaces, scene fields at
81
+ * 10, timeline events at 14, event fields at 16.
82
+ */
83
+ export declare function parseScenes(content: string): ParsedScene[] | undefined;
47
84
  export declare function parseReport(content: string, source: string): ParsedReport;
48
85
  export declare function findReports(dir: string, depth?: number): Promise<string[]>;
49
86
  export declare function loadDiffEntry(hash: string): Promise<DiffEntry | null>;
package/dist/reports.js CHANGED
@@ -124,6 +124,124 @@ export function parseSuites(content) {
124
124
  suites.push(current);
125
125
  return suites.length > 0 ? suites : undefined;
126
126
  }
127
+ function parseTokens(raw) {
128
+ const m = raw.match(/input:\s*(\d+),\s*output:\s*(\d+)/);
129
+ if (!m)
130
+ return undefined;
131
+ return { input: parseInt(m[1], 10), output: parseInt(m[2], 10) };
132
+ }
133
+ /**
134
+ * Parse the `scenes:` block (per-scene tokens/cost + timeline waterfall) from a
135
+ * report. The emitted format is fixed (see reporter.ts `renderSceneObservability`),
136
+ * so this hand-parses by indentation: scenes start at 8 spaces, scene fields at
137
+ * 10, timeline events at 14, event fields at 16.
138
+ */
139
+ export function parseScenes(content) {
140
+ const lines = content.split("\n");
141
+ const startIdx = lines.findIndex((l) => l === " scenes:");
142
+ if (startIdx === -1)
143
+ return undefined;
144
+ const scenes = [];
145
+ let scene;
146
+ let inTimeline = false;
147
+ let event;
148
+ const pushEvent = () => {
149
+ if (event && scene) {
150
+ (scene.timeline ??= []).push(event);
151
+ event = undefined;
152
+ }
153
+ };
154
+ const pushScene = () => {
155
+ pushEvent();
156
+ if (scene)
157
+ scenes.push(scene);
158
+ scene = undefined;
159
+ inTimeline = false;
160
+ };
161
+ for (let i = startIdx + 1; i < lines.length; i++) {
162
+ const line = lines[i];
163
+ if (line.trim() === "")
164
+ continue;
165
+ const indent = line.length - line.trimStart().length;
166
+ // A new top-level agent field (<= 4 spaces, not part of scenes) ends the block.
167
+ if (indent <= 4)
168
+ break;
169
+ const sceneStart = line.match(/^ - prompt: "(.*)"$/);
170
+ if (sceneStart) {
171
+ pushScene();
172
+ scene = { prompt: sceneStart[1].replace(/\\"/g, '"').replace(/\\n/g, "\n") };
173
+ continue;
174
+ }
175
+ if (!scene)
176
+ continue;
177
+ const eventStart = line.match(/^ - kind: (model|tool)$/);
178
+ if (eventStart) {
179
+ pushEvent();
180
+ event = { kind: eventStart[1], name: "", startMs: 0, durationMs: 0 };
181
+ inTimeline = true;
182
+ continue;
183
+ }
184
+ if (line.match(/^ timeline:$/)) {
185
+ inTimeline = true;
186
+ continue;
187
+ }
188
+ const trimmed = line.trim();
189
+ const target = inTimeline && event ? "event" : "scene";
190
+ const kv = trimmed.match(/^([a-z_]+):\s*(.*)$/);
191
+ if (!kv)
192
+ continue;
193
+ const [, key, value] = kv;
194
+ if (target === "event" && event) {
195
+ switch (key) {
196
+ case "name":
197
+ event.name = value.replace(/^"|"$/g, "").replace(/\\"/g, '"');
198
+ break;
199
+ case "start_ms":
200
+ event.startMs = parseFloat(value);
201
+ break;
202
+ case "duration_ms":
203
+ event.durationMs = parseFloat(value);
204
+ break;
205
+ case "tokens":
206
+ event.tokens = parseTokens(value);
207
+ break;
208
+ case "cached_input_tokens":
209
+ event.cachedInputTokens = parseInt(value, 10);
210
+ break;
211
+ case "cost_usd":
212
+ event.costUsd = parseFloat(value);
213
+ break;
214
+ case "cost_source":
215
+ event.costSource = value;
216
+ break;
217
+ case "run_index":
218
+ event.runIndex = parseInt(value, 10);
219
+ break;
220
+ case "error":
221
+ event.error = value.replace(/^"|"$/g, "").replace(/\\"/g, '"');
222
+ break;
223
+ }
224
+ }
225
+ else if (scene) {
226
+ switch (key) {
227
+ case "duration_ms":
228
+ scene.durationMs = parseFloat(value);
229
+ break;
230
+ case "tokens":
231
+ scene.tokens = parseTokens(value);
232
+ break;
233
+ case "cost_usd":
234
+ scene.costUsd = parseFloat(value);
235
+ break;
236
+ case "cost_source":
237
+ scene.costSource = value;
238
+ break;
239
+ }
240
+ }
241
+ }
242
+ pushScene();
243
+ return scenes.length > 0 ? scenes : undefined;
244
+ }
127
245
  export function parseReport(content, source) {
128
246
  const num = (key, fallback = 0) => parseFloat(extractField(content, key) ?? String(fallback));
129
247
  const avgIn = extractField(content, "average_input_tokens_per_case");
@@ -158,9 +276,17 @@ export function parseReport(content, source) {
158
276
  timestamp: extractField(content, "timestamp") ?? "",
159
277
  averageInputTokensPerCase: avgIn != null ? parseFloat(avgIn) : undefined,
160
278
  averageOutputTokensPerCase: avgOut != null ? parseFloat(avgOut) : undefined,
279
+ totalInputTokens: optNum("total_input_tokens"),
280
+ totalOutputTokens: optNum("total_output_tokens"),
281
+ totalCostUsd: optNum("total_cost_usd"),
282
+ scenes: parseScenes(content),
161
283
  suites: parseSuites(content),
162
284
  source,
163
285
  };
286
+ function optNum(key) {
287
+ const v = extractField(content, key);
288
+ return v != null ? parseFloat(v) : undefined;
289
+ }
164
290
  }
165
291
  export async function findReports(dir, depth = 0) {
166
292
  if (depth > 6)
@@ -0,0 +1,25 @@
1
+ import type { AgentResponse } from "./types";
2
+ /**
3
+ * Serialize an arbitrary agent value to the string view the judge model and
4
+ * the text matchers consume. Strings pass through untouched; everything else
5
+ * is JSON. This is the ONLY place a structured value is forced to a string,
6
+ * and it happens lazily — never before a matcher actually needs text.
7
+ */
8
+ export declare function serializeValue(value: unknown): string;
9
+ /**
10
+ * The agent's native output — the source of truth for deterministic,
11
+ * structural assertions. Tolerates a legacy `{ text }`-only response (no
12
+ * `value`) so executors can migrate incrementally.
13
+ */
14
+ export declare function resolveValue<T>(response: AgentResponse<T>): T | string | undefined;
15
+ /**
16
+ * The string view for the judge and text matchers. An explicit `text` wins
17
+ * (it's the enriched projection the executor chose to expose); otherwise we
18
+ * serialize `value` on demand.
19
+ */
20
+ export declare function resolveText<T>(response: AgentResponse<T>): string;
21
+ /**
22
+ * Walk a dot-path (with numeric array indices) into an arbitrary object.
23
+ * Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
24
+ */
25
+ export declare function navigatePath(root: unknown, path: string): unknown;
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Serialize an arbitrary agent value to the string view the judge model and
3
+ * the text matchers consume. Strings pass through untouched; everything else
4
+ * is JSON. This is the ONLY place a structured value is forced to a string,
5
+ * and it happens lazily — never before a matcher actually needs text.
6
+ */
7
+ export function serializeValue(value) {
8
+ if (typeof value === "string")
9
+ return value;
10
+ if (value === null || value === undefined)
11
+ return "";
12
+ try {
13
+ return JSON.stringify(value, null, 2);
14
+ }
15
+ catch {
16
+ return String(value);
17
+ }
18
+ }
19
+ /**
20
+ * The agent's native output — the source of truth for deterministic,
21
+ * structural assertions. Tolerates a legacy `{ text }`-only response (no
22
+ * `value`) so executors can migrate incrementally.
23
+ */
24
+ export function resolveValue(response) {
25
+ if (response.value !== undefined)
26
+ return response.value;
27
+ return response.text;
28
+ }
29
+ /**
30
+ * The string view for the judge and text matchers. An explicit `text` wins
31
+ * (it's the enriched projection the executor chose to expose); otherwise we
32
+ * serialize `value` on demand.
33
+ */
34
+ export function resolveText(response) {
35
+ if (typeof response.text === "string")
36
+ return response.text;
37
+ return serializeValue(response.value);
38
+ }
39
+ /**
40
+ * Walk a dot-path (with numeric array indices) into an arbitrary object.
41
+ * Returns `undefined` if any segment is missing. e.g. "plan_items.0.options".
42
+ */
43
/**
 * Walk a dot-path (with numeric array indices) into an arbitrary value,
 * e.g. "plan_items.0.options".
 *
 * - Inside an array, a segment must be a plain run of decimal digits. Empty
 *   segments ("items."), exponent or hex forms and padded numbers are treated
 *   as missing rather than being coerced to an index.
 * - Inside an object, a segment must name a property of the value. Members
 *   that are only inherited from `Object.prototype` ("toString",
 *   "constructor", ...) are treated as missing.
 *
 * @param root Value to navigate.
 * @param path Dot-separated path.
 * @returns The value at `path`, or `undefined` if any segment is missing.
 */
export function navigatePath(root, path) {
    let cur = root;
    for (const seg of path.split(".")) {
        if (cur == null)
            return undefined;
        if (Array.isArray(cur)) {
            // Number("") is 0 and Number("1e0") is 1, so validate the text itself.
            if (!/^\d+$/.test(seg))
                return undefined;
            cur = cur[Number(seg)];
        }
        else if (typeof cur === "object" &&
            seg in cur &&
            (Object.prototype.hasOwnProperty.call(cur, seg) || !(seg in Object.prototype))) {
            cur = cur[seg];
        }
        else {
            return undefined;
        }
    }
    return cur;
}
package/dist/runner.d.ts CHANGED
@@ -1,4 +1,13 @@
1
1
  import type { AgentExecutor, AgentResponse, SceneDefinition, SceneResult } from "./types";
2
2
  import type { JudgeConfig } from "./config";
3
- export declare function extractField(response: AgentResponse, field: string): unknown;
4
- export declare function executeScene(executor: AgentExecutor, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult>;
3
+ /**
4
+ * Extract a named field from an agent response for assertion.
5
+ * - "response" / "value" → the native structured value (deterministic matchers)
6
+ * - "text" → the serialized/judge view (lazy; text matchers)
7
+ * - "metadata"/"refusal" → the corresponding response property
8
+ * - dot-path → navigated into the structured value first
9
+ * (e.g. "plan_items.0.options"), falling back to
10
+ * metadata so existing metadata paths keep resolving.
11
+ */
12
+ export declare function extractField<T>(response: AgentResponse<T>, field: string): unknown;
13
+ export declare function executeScene<T = string>(executor: AgentExecutor<T>, scene: SceneDefinition, globalTimeout?: number, judgeConfig?: JudgeConfig, globalTurns?: number, globalRuns?: number): Promise<SceneResult<T>>;