@sebastiantuyu/agest 0.3.3-next.1 → 0.3.3-next.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  import type { AgentExecutor } from "../types";
2
2
  type Runnable = {
3
- invoke: (input: any) => Promise<any>;
3
+ invoke: (input: any, options?: any) => Promise<any>;
4
4
  };
5
5
  type LangGraphGraph = Runnable & {
6
6
  lg_is_pregel: true;
@@ -1,3 +1,5 @@
1
+ import { computeCost } from "../pricing";
2
+ import { createTracingHandle } from "./tracing";
1
3
  /**
2
4
  * Adapter for LangChain runnables and agents.
3
5
  *
@@ -19,23 +21,43 @@ export function langchain(runnable) {
19
21
  function langGraphAdapter(graph) {
20
22
  const staticTools = extractGraphTools(graph);
21
23
  return async (input) => {
24
+ const baseline = performance.now();
25
+ const tracing = await createTracingHandle(baseline);
22
26
  let result;
23
27
  try {
24
28
  const { HumanMessage } = await import("@langchain/core/messages");
25
- result = await graph.invoke({ messages: [new HumanMessage(input)] });
29
+ result = await graph.invoke({ messages: [new HumanMessage(input)] }, { callbacks: tracing.callbacks });
26
30
  }
27
31
  catch (err) {
28
- return { text: "", executionError: err.message, metadata: { tools: staticTools } };
32
+ const { events } = tracing.drain();
33
+ return {
34
+ text: "",
35
+ executionError: err.message,
36
+ metadata: { tools: staticTools, events: events.length ? events : undefined },
37
+ };
29
38
  }
30
39
  const messages = result.messages;
31
40
  const last = messages[messages.length - 1];
32
41
  const text = typeof last?.content === "string"
33
42
  ? last.content
34
43
  : JSON.stringify(last?.content ?? result);
35
- const model = last?.response_metadata?.model_name;
44
+ const drained = tracing.drain();
45
+ const model = last?.response_metadata?.model_name ??
46
+ drained.modelName;
47
+ const { tokens, cost } = summarizeRun({
48
+ events: drained.events,
49
+ fallbackTokens: extractTokensFromMessage(last),
50
+ model,
51
+ });
36
52
  return {
37
53
  text,
38
- metadata: { model, tools: staticTools, tokens: extractTokensFromMessage(last) },
54
+ metadata: {
55
+ model,
56
+ tools: staticTools,
57
+ tokens,
58
+ cost,
59
+ events: drained.events.length ? drained.events : undefined,
60
+ },
39
61
  };
40
62
  };
41
63
  }
@@ -48,33 +70,69 @@ function reactAgentAdapter(agent) {
48
70
  ?.map((t) => t.name ?? t.getName?.())
49
71
  .filter(Boolean);
50
72
  return async (input) => {
73
+ const baseline = performance.now();
74
+ const tracing = await createTracingHandle(baseline);
51
75
  let result;
52
76
  try {
53
- result = await agent.invoke({ messages: [{ role: "human", content: input }] });
77
+ result = await agent.invoke({ messages: [{ role: "human", content: input }] }, { callbacks: tracing.callbacks });
54
78
  }
55
79
  catch (err) {
56
- return { text: "", executionError: err.message, metadata: { model, systemPrompt, tools } };
80
+ const { events } = tracing.drain();
81
+ return {
82
+ text: "",
83
+ executionError: err.message,
84
+ metadata: {
85
+ model,
86
+ systemPrompt,
87
+ tools,
88
+ events: events.length ? events : undefined,
89
+ },
90
+ };
57
91
  }
58
92
  const messages = result.messages;
59
93
  const last = messages[messages.length - 1];
60
94
  const text = typeof last?.content === "string"
61
95
  ? last.content
62
96
  : JSON.stringify(last?.content ?? result);
97
+ const drained = tracing.drain();
98
+ const { tokens, cost } = summarizeRun({
99
+ events: drained.events,
100
+ fallbackTokens: extractTokensFromMessage(last),
101
+ model,
102
+ });
63
103
  return {
64
104
  text,
65
- metadata: { model, systemPrompt, tools, tokens: extractTokensFromMessage(last) },
105
+ metadata: {
106
+ model,
107
+ systemPrompt,
108
+ tools,
109
+ tokens,
110
+ cost,
111
+ events: drained.events.length ? drained.events : undefined,
112
+ },
66
113
  };
67
114
  };
68
115
  }
69
116
  function chainAdapter(chain) {
70
117
  const { model, systemPrompt } = extractChainMeta(chain);
71
118
  return async (input) => {
119
+ const baseline = performance.now();
120
+ const tracing = await createTracingHandle(baseline);
72
121
  let result;
73
122
  try {
74
- result = await chain.invoke({ input });
123
+ result = await chain.invoke({ input }, { callbacks: tracing.callbacks });
75
124
  }
76
125
  catch (err) {
77
- return { text: "", executionError: err.message, metadata: { model, systemPrompt } };
126
+ const { events } = tracing.drain();
127
+ return {
128
+ text: "",
129
+ executionError: err.message,
130
+ metadata: {
131
+ model,
132
+ systemPrompt,
133
+ events: events.length ? events : undefined,
134
+ },
135
+ };
78
136
  }
79
137
  const text = typeof result === "string"
80
138
  ? result
@@ -83,12 +141,21 @@ function chainAdapter(chain) {
83
141
  : typeof result.content === "string"
84
142
  ? result.content
85
143
  : JSON.stringify(result);
144
+ const drained = tracing.drain();
145
+ const effectiveModel = model ?? drained.modelName ?? result.metadata?.model;
146
+ const { tokens, cost } = summarizeRun({
147
+ events: drained.events,
148
+ fallbackTokens: extractTokens(result),
149
+ model: effectiveModel,
150
+ });
86
151
  return {
87
152
  text,
88
153
  metadata: {
89
- model: model ?? result.metadata?.model,
154
+ model: effectiveModel,
90
155
  systemPrompt,
91
- tokens: extractTokens(result),
156
+ tokens,
157
+ cost,
158
+ events: drained.events.length ? drained.events : undefined,
92
159
  },
93
160
  };
94
161
  };
@@ -153,3 +220,49 @@ function extractTokensFromMessage(msg) {
153
220
  output: usage.output_tokens ?? usage.completion_tokens ?? 0,
154
221
  };
155
222
  }
223
/**
 * Collapses the timeline of a single run into total token usage and cost.
 *
 * Only events with `kind === "model"` are considered.
 *
 * @param input.events          Timeline events captured for the run.
 * @param input.fallbackTokens  Token usage to use when no model event carried any.
 * @param input.model           Model name used to price the tokens when no
 *                              per-event cost is available.
 * @returns `{ tokens, cost }`; either may be `undefined` when unknown.
 */
function summarizeRun(input) {
    let inputTokens = 0;
    let outputTokens = 0;
    let hasTokens = false;
    let providerCost = 0;
    let hasProviderCost = false;
    let tableCost = 0;
    let hasTableCost = false;
    for (const e of input.events) {
        if (e.kind !== "model")
            continue;
        if (e.tokens) {
            hasTokens = true;
            inputTokens += e.tokens.input ?? 0;
            outputTokens += e.tokens.output ?? 0;
        }
        const totalUsd = e.cost?.totalUsd;
        if (totalUsd == null)
            continue;
        if (e.cost.source === "provider") {
            hasProviderCost = true;
            providerCost += totalUsd;
        }
        else if (e.cost.source === "table" && e.tokens) {
            // A table price is derived from token counts. An event without
            // tokens can only carry a placeholder $0, which must not suppress
            // the recompute-from-fallback step below.
            hasTableCost = true;
            tableCost += totalUsd;
        }
    }
    let tokens = hasTokens ? { input: inputTokens, output: outputTokens } : undefined;
    if (!tokens && input.fallbackTokens)
        tokens = input.fallbackTokens;
    // Pick cost: provider > table > recompute from (possibly fallback) tokens
    let cost;
    if (hasProviderCost) {
        cost = { totalUsd: providerCost, source: "provider" };
    }
    else if (hasTableCost) {
        cost = { totalUsd: tableCost, source: "table" };
    }
    else if (tokens && input.model) {
        const computed = computeCost({
            model: input.model,
            inputTokens: tokens.input,
            outputTokens: tokens.output,
        });
        if (computed.source !== "unavailable")
            cost = computed;
    }
    return { tokens, cost };
}
@@ -0,0 +1,19 @@
1
+ import type { TimelineEvent } from "../types";
2
+ export interface TracingHandle {
3
+ /** Pass this into `runnable.invoke(..., { callbacks: handle.callbacks })` — it is already an array of handlers. */
4
+ callbacks: any[];
5
+ drain(): {
6
+ events: TimelineEvent[];
7
+ modelName?: string;
8
+ };
9
+ }
10
+ /**
11
+ * Creates a LangChain callback handler that records every LLM and tool
12
+ * invocation as a `TimelineEvent`. Returns a handle whose `drain()` method
13
+ * yields the captured events with `startMs` / `endMs` relative to the
14
+ * provided baseline.
15
+ *
16
+ * Designed to fail open: any unexpected callback shape is ignored rather
17
+ * than throwing — the underlying agent run must not be broken by tracing.
18
+ */
19
+ export declare function createTracingHandle(baselineMs: number): Promise<TracingHandle>;
@@ -0,0 +1,200 @@
1
+ import { computeCost } from "../pricing";
2
/**
 * Creates a LangChain callback handler that records every LLM and tool
 * invocation as a timeline event, and returns a handle around it.
 *
 * - `callbacks` is the array to pass as `{ callbacks }` to `invoke`.
 * - `drain()` returns the events captured so far sorted by `startMs`, clears
 *   the internal buffer, and reports the last model name that was seen.
 *
 * `startMs` / `endMs` are measured with `now()` relative to `baselineMs`.
 *
 * If `@langchain/core/callbacks/base` cannot be imported, a no-op handle
 * (no callbacks, no events) is returned so the adapter keeps working.
 *
 * @param baselineMs Timestamp that event times are expressed relative to.
 */
export async function createTracingHandle(baselineMs) {
    // Import lazily so the adapter still works when @langchain/core is not present.
    // BaseCallbackHandler is the runtime contract LangChain checks for.
    let BaseCallbackHandler;
    try {
        ({ BaseCallbackHandler } = await import("@langchain/core/callbacks/base"));
    }
    catch {
        return { callbacks: [], drain: () => ({ events: [] }) };
    }
    const events = [];
    const openLLMs = new Map();
    const openTools = new Map();
    let lastModelName;
    // Milliseconds elapsed since the caller-provided baseline.
    const elapsed = () => now() - baselineMs;
    // Removes the pending span for `runId` and returns it with its end timing;
    // returns undefined when no matching start was recorded.
    const closeSpan = (pending, runId) => {
        const open = pending.get(runId);
        if (!open)
            return undefined;
        pending.delete(runId);
        const endMs = elapsed();
        return { open, endMs, durationMs: Math.max(0, endMs - open.startMs) };
    };
    const openModelSpan = (llm, runId, extraParams) => {
        openLLMs.set(runId, {
            startMs: elapsed(),
            name: extractModelName(llm, extraParams),
        });
    };
    // Shared by handleToolEnd / handleToolError; `error` is only set on failure.
    const closeToolSpan = (runId, err) => {
        const span = closeSpan(openTools, runId);
        if (!span)
            return;
        const event = {
            kind: "tool",
            name: span.open.name,
            startMs: span.open.startMs,
            endMs: span.endMs,
            durationMs: span.durationMs,
        };
        if (err !== undefined)
            event.error = err?.message ?? String(err);
        events.push(event);
    };
    class AgestTracer extends BaseCallbackHandler {
        name = "AgestTracer";
        awaitHandlers = true;
        handleLLMStart(llm, _prompts, runId, _parentRunId, extraParams) {
            openModelSpan(llm, runId, extraParams);
        }
        handleChatModelStart(llm, _messages, runId, _parentRunId, extraParams) {
            openModelSpan(llm, runId, extraParams);
        }
        handleLLMEnd(output, runId) {
            const span = closeSpan(openLLMs, runId);
            if (!span)
                return;
            const tokens = extractTokensFromLLMOutput(output);
            const providerCost = extractProviderCost(output);
            const name = span.open.name ?? extractModelNameFromOutput(output) ?? "model";
            if (name && name !== "model")
                lastModelName = name;
            // Only price the call when there is something to price. Without
            // usage or a provider-reported cost, a table lookup would yield a
            // misleading $0 entry.
            const cost = tokens || providerCost != null
                ? stripCostIfEmpty(computeCost({
                    model: name,
                    inputTokens: tokens?.input,
                    outputTokens: tokens?.output,
                    providerCost,
                }))
                : undefined;
            events.push({
                kind: "model",
                name,
                startMs: span.open.startMs,
                endMs: span.endMs,
                durationMs: span.durationMs,
                tokens,
                cost,
            });
        }
        handleLLMError(err, runId) {
            const span = closeSpan(openLLMs, runId);
            if (!span)
                return;
            events.push({
                kind: "model",
                name: span.open.name ?? "model",
                startMs: span.open.startMs,
                endMs: span.endMs,
                durationMs: span.durationMs,
                error: err?.message ?? String(err),
            });
        }
        handleToolStart(tool, _input, runId) {
            openTools.set(runId, {
                startMs: elapsed(),
                name: extractToolName(tool) ?? "tool",
            });
        }
        handleToolEnd(_output, runId) {
            closeToolSpan(runId, undefined);
        }
        handleToolError(err, runId) {
            // Normalise a missing error value so the event is still marked as failed.
            closeToolSpan(runId, err === undefined ? null : err);
        }
    }
    const handler = new AgestTracer();
    return {
        callbacks: [handler],
        drain: () => {
            const ordered = [...events].sort((a, b) => a.startMs - b.startMs);
            events.length = 0;
            return { events: ordered, modelName: lastModelName };
        },
    };
}
128
/** Reads the current high-resolution timestamp from `performance.now()`. */
function now() {
    const timestamp = performance.now();
    return timestamp;
}
131
/**
 * Best-effort model name for an LLM start callback.
 *
 * Lookup order: `extraParams.invocation_params` (`model`, `model_name`), then
 * `llm.kwargs` (`model`, `model_name`, `modelName`), then the last entry of
 * `llm.id` when it is a string.
 *
 * @returns The first string found, or `undefined`.
 */
function extractModelName(llm, extraParams) {
    // Returns the first string-valued property among `keys`, if any.
    const firstString = (bag, keys) => {
        for (const key of keys) {
            const value = bag[key];
            if (typeof value === "string")
                return value;
        }
        return undefined;
    };
    const fromInvocation = firstString(extraParams?.invocation_params ?? {}, ["model", "model_name"]);
    if (fromInvocation !== undefined)
        return fromInvocation;
    const kwargs = llm?.kwargs;
    if (kwargs) {
        const fromKwargs = firstString(kwargs, ["model", "model_name", "modelName"]);
        if (fromKwargs !== undefined)
            return fromKwargs;
    }
    const id = llm?.id;
    if (!Array.isArray(id) || id.length === 0)
        return undefined;
    const tail = id[id.length - 1];
    return typeof tail === "string" ? tail : undefined;
}
152
/**
 * Reads the model name from an LLM result: first from the first generation's
 * `message.response_metadata` (`model_name`, then `model`), then from
 * `llmOutput` (`modelName`, then `model`).
 */
function extractModelNameFromOutput(output) {
    const responseMeta = output?.generations?.[0]?.[0]?.message?.response_metadata;
    const llmOutput = output?.llmOutput;
    const fromMessage = responseMeta?.model_name ?? responseMeta?.model;
    return fromMessage ?? llmOutput?.modelName ?? llmOutput?.model;
}
159
/**
 * Extracts `{ input, output }` token counts from an LLM result.
 *
 * Usage is looked for, in order, in `llmOutput.tokenUsage`, `llmOutput.usage`,
 * and the first generation's `message.usage_metadata` and
 * `message.response_metadata.usage`. Snake-case, OpenAI-style and camel-case
 * field names are all accepted.
 *
 * A source that is missing, empty or reports zero for both counts is skipped,
 * so an empty `tokenUsage` object cannot hide real counts in a later source.
 *
 * @returns The first non-zero usage found, or `undefined` when none exists.
 */
function extractTokensFromLLMOutput(output) {
    const message = output?.generations?.[0]?.[0]?.message;
    const sources = [
        output?.llmOutput?.tokenUsage,
        output?.llmOutput?.usage,
        message?.usage_metadata,
        message?.response_metadata?.usage,
    ];
    for (const usage of sources) {
        if (!usage)
            continue;
        const input = usage.input_tokens ?? usage.prompt_tokens ?? usage.promptTokens ?? 0;
        const out = usage.output_tokens ?? usage.completion_tokens ?? usage.completionTokens ?? 0;
        if (input || out)
            return { input, output: out };
    }
    return undefined;
}
172
/**
 * Looks for a cost figure reported in the LLM result itself.
 *
 * Checks `llmOutput.usage.cost`, `llmOutput.cost`, and on the first
 * generation's message `usage_metadata.total_cost`,
 * `response_metadata.usage.cost` and `response_metadata.cost`, in that order.
 *
 * @returns The first finite number found, or `undefined`.
 */
function extractProviderCost(output) {
    const llmOutput = output?.llmOutput;
    const message = output?.generations?.[0]?.[0]?.message;
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    return [
        llmOutput?.usage?.cost,
        llmOutput?.cost,
        message?.usage_metadata?.total_cost,
        message?.response_metadata?.usage?.cost,
        message?.response_metadata?.cost,
    ].find(isFiniteNumber);
}
186
/**
 * Resolves a display name for a tool: its string `name`, otherwise the last
 * entry of its `id` array converted to a string.
 *
 * @returns The name, or `undefined` when the tool is falsy or has neither.
 */
function extractToolName(tool) {
    if (!tool)
        return undefined;
    const { name, id } = tool;
    if (typeof name === "string")
        return name;
    const hasIdPath = Array.isArray(id) && id.length > 0;
    return hasIdPath ? String(id[id.length - 1]) : undefined;
}
196
/**
 * Drops a cost breakdown that carries no information: source "unavailable"
 * and no `totalUsd`. Any other breakdown is returned unchanged.
 */
function stripCostIfEmpty(cost) {
    const carriesNothing = cost.source === "unavailable" && cost.totalUsd == null;
    return carriesNothing ? undefined : cost;
}
package/dist/config.d.ts CHANGED
@@ -15,6 +15,15 @@ export interface AgestConfig {
15
15
  turns?: number;
16
16
  runs?: number;
17
17
  judge?: JudgeConfig;
18
+ /**
19
+ * Per-model pricing override (USD per 1M tokens). Merged on top of the
20
+ * built-in `src/pricing/models.json` table. Provide entries for any model
21
+ * you use that isn't already in the table, or to override a default.
22
+ */
23
+ pricing?: Record<string, {
24
+ input: number;
25
+ output: number;
26
+ }>;
18
27
  }
19
28
  export declare function defineConfig(config: AgestConfig): AgestConfig;
20
29
  export declare function loadConfig(): Promise<AgestConfig>;
package/dist/context.js CHANGED
@@ -3,6 +3,7 @@ import { executeScene } from "./runner";
3
3
  import { formatReport, writeReport, writeDiffEntry } from "./reporter";
4
4
  import { logger, c } from "./logger";
5
5
  import { loadConfig } from "./config";
6
+ import { setPricingOverrides } from "./pricing";
6
7
  import { PromisePool } from "@supercharge/promise-pool";
7
8
  export class SceneBuilder {
8
9
  _prompt;
@@ -77,6 +78,7 @@ export class AgentContext {
77
78
  }
78
79
  async execute() {
79
80
  const config = await loadConfig();
81
+ setPricingOverrides(config.pricing);
80
82
  const parallelism = Math.max(1, config.parallelism ?? 1);
81
83
  const definitions = this._scenes.map((s) => s.toDefinition());
82
84
  const orderedResults = new Array(definitions.length);
@@ -170,14 +172,25 @@ export class AgentContext {
170
172
  const successRate = results.length > 0
171
173
  ? Number((results.filter((r) => r.passed).length / results.length).toFixed(2))
172
174
  : 0;
173
- const tokensAvailable = results.some((r) => r.response.metadata?.tokens != null);
175
+ const sceneTokens = results
176
+ .map((r) => r.tokens ?? r.response.metadata?.tokens)
177
+ .filter((t) => t != null);
174
178
  let averageInputTokensPerCase;
175
179
  let averageOutputTokensPerCase;
176
- if (tokensAvailable) {
177
- const withTokens = results.filter((r) => r.response.metadata?.tokens != null);
178
- averageInputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.input ?? 0), 0) / withTokens.length);
179
- averageOutputTokensPerCase = Math.round(withTokens.reduce((sum, r) => sum + (r.response.metadata.tokens.output ?? 0), 0) / withTokens.length);
180
+ let totalInputTokens;
181
+ let totalOutputTokens;
182
+ if (sceneTokens.length > 0) {
183
+ totalInputTokens = sceneTokens.reduce((s, t) => s + (t.input ?? 0), 0);
184
+ totalOutputTokens = sceneTokens.reduce((s, t) => s + (t.output ?? 0), 0);
185
+ averageInputTokensPerCase = Math.round(totalInputTokens / sceneTokens.length);
186
+ averageOutputTokensPerCase = Math.round(totalOutputTokens / sceneTokens.length);
180
187
  }
188
+ const sceneCosts = results
189
+ .map((r) => r.costUsd)
190
+ .filter((c) => typeof c === "number");
191
+ const totalCostUsd = sceneCosts.length > 0
192
+ ? sceneCosts.reduce((s, c) => s + c, 0)
193
+ : undefined;
181
194
  const firstMeta = results.find((r) => r.response.metadata)?.response
182
195
  .metadata;
183
196
  const dimensions = {};
@@ -208,6 +221,9 @@ export class AgentContext {
208
221
  totalCases: results.length,
209
222
  averageInputTokensPerCase,
210
223
  averageOutputTokensPerCase,
224
+ totalInputTokens,
225
+ totalOutputTokens,
226
+ totalCostUsd,
211
227
  results,
212
228
  };
213
229
  if (report.systemPromptHash && firstMeta?.systemPrompt) {
@@ -0,0 +1,23 @@
1
+ export interface ModelPrice {
2
+ /** USD per 1M input tokens */
3
+ input: number;
4
+ /** USD per 1M output tokens */
5
+ output: number;
6
+ }
7
+ export type CostSource = "provider" | "table" | "unavailable";
8
+ export interface CostBreakdown {
9
+ inputUsd?: number;
10
+ outputUsd?: number;
11
+ totalUsd?: number;
12
+ source: CostSource;
13
+ }
14
+ export declare function setPricingOverrides(table?: Record<string, ModelPrice>): void;
15
+ export declare function lookupPrice(model: string | undefined): ModelPrice | undefined;
16
+ export interface ComputeCostInput {
17
+ model?: string;
18
+ inputTokens?: number;
19
+ outputTokens?: number;
20
+ /** USD cost the provider already reported (takes precedence) */
21
+ providerCost?: number;
22
+ }
23
+ export declare function computeCost(input: ComputeCostInput): CostBreakdown;
@@ -0,0 +1,42 @@
1
+ import { readFileSync } from "fs";
2
+ import { fileURLToPath } from "url";
3
+ import { dirname, join } from "path";
4
+ const here = dirname(fileURLToPath(import.meta.url));
5
+ const builtIn = JSON.parse(readFileSync(join(here, "models.json"), "utf-8"));
6
+ let overrides = {};
7
/**
 * Replaces the module-level pricing override table.
 *
 * @param table Per-model prices; `null`/`undefined` resets the overrides to
 *              an empty table.
 */
export function setPricingOverrides(table) {
    if (table == null) {
        overrides = {};
        return;
    }
    overrides = table;
}
10
/**
 * Finds the price entry for a model name.
 *
 * Resolution order:
 *  1. Exact key in the overrides, then in the built-in table.
 *  2. Case-insensitive: the longest key that is contained in the model name
 *     (e.g. a dated variant such as `gpt-4o-mini-2024-07-18` → `gpt-4o-mini`).
 *  3. Case-insensitive: the shortest key that contains the model name, i.e.
 *     the closest guess when the name is less specific than the table.
 *
 * Steps 2 and 3 are kept separate so a case variant of an exact key
 * (`GPT-4o`) resolves to that key and not to a longer key that merely
 * contains it (`gpt-4o-mini`). Only own properties of the tables are read,
 * so names like `constructor` never resolve to an inherited member.
 *
 * @param model Model name; falsy values yield `undefined`.
 * @returns The matching price entry, or `undefined` when nothing matches.
 */
export function lookupPrice(model) {
    if (!model)
        return undefined;
    const own = (table, key) => Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
    // Overrides take precedence over the built-in table for the same key.
    const priceFor = (key) => own(overrides, key) ?? own(builtIn, key);
    const exact = priceFor(model);
    if (exact)
        return exact;
    const lowered = model.toLowerCase();
    const keys = Object.keys({ ...builtIn, ...overrides });
    const containedInModel = keys
        .filter((k) => lowered.includes(k.toLowerCase()))
        .sort((a, b) => b.length - a.length);
    if (containedInModel.length > 0)
        return priceFor(containedInModel[0]);
    const containingModel = keys
        .filter((k) => k.toLowerCase().includes(lowered))
        .sort((a, b) => a.length - b.length);
    if (containingModel.length > 0)
        return priceFor(containingModel[0]);
    return undefined;
}
27
/**
 * Computes the USD cost of a model call.
 *
 * - A finite `providerCost` is returned as-is with source "provider".
 * - Otherwise the token counts are priced with `lookupPrice(model)`
 *   (USD per 1M tokens) and returned with source "table".
 * - When the model has no price entry, or no token counts were supplied at
 *   all, the result is `{ source: "unavailable" }`, so that "unknown" is not
 *   reported as a real $0 cost.
 *
 * @param input.model         Model name used for the price lookup.
 * @param input.inputTokens   Prompt tokens; treated as 0 when only this one is missing.
 * @param input.outputTokens  Completion tokens; treated as 0 when only this one is missing.
 * @param input.providerCost  Cost already reported by the provider (takes precedence).
 */
export function computeCost(input) {
    const { providerCost, inputTokens, outputTokens } = input;
    if (providerCost != null && Number.isFinite(providerCost)) {
        return { totalUsd: providerCost, source: "provider" };
    }
    // Without any usage figures there is nothing to price.
    if (inputTokens == null && outputTokens == null)
        return { source: "unavailable" };
    const price = lookupPrice(input.model);
    if (!price)
        return { source: "unavailable" };
    const inputUsd = ((inputTokens ?? 0) / 1e6) * price.input;
    const outputUsd = ((outputTokens ?? 0) / 1e6) * price.output;
    return {
        inputUsd,
        outputUsd,
        totalUsd: inputUsd + outputUsd,
        source: "table",
    };
}
@@ -0,0 +1,21 @@
1
+ {
2
+ "claude-opus-4-7": { "input": 15, "output": 75 },
3
+ "claude-opus-4-6": { "input": 15, "output": 75 },
4
+ "claude-opus-4-5": { "input": 15, "output": 75 },
5
+ "claude-sonnet-4-6": { "input": 3, "output": 15 },
6
+ "claude-sonnet-4-5": { "input": 3, "output": 15 },
7
+ "claude-haiku-4-5": { "input": 1, "output": 5 },
8
+ "claude-3-5-sonnet-20241022": { "input": 3, "output": 15 },
9
+ "claude-3-5-haiku-20241022": { "input": 0.8, "output": 4 },
10
+ "claude-3-opus-20240229": { "input": 15, "output": 75 },
11
+ "gpt-4o": { "input": 2.5, "output": 10 },
12
+ "gpt-4o-mini": { "input": 0.15, "output": 0.6 },
13
+ "gpt-4.1": { "input": 2, "output": 8 },
14
+ "gpt-4.1-mini": { "input": 0.4, "output": 1.6 },
15
+ "gpt-4.1-nano": { "input": 0.1, "output": 0.4 },
16
+ "gpt-5": { "input": 1.25, "output": 10 },
17
+ "gpt-5-mini": { "input": 0.25, "output": 2 },
18
+ "o1": { "input": 15, "output": 60 },
19
+ "o1-mini": { "input": 1.1, "output": 4.4 },
20
+ "o3-mini": { "input": 1.1, "output": 4.4 }
21
+ }
package/dist/reporter.js CHANGED
@@ -78,8 +78,75 @@ export function formatReport(report) {
78
78
  if (report.averageOutputTokensPerCase != null) {
79
79
  lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
80
80
  }
81
+ if (report.totalInputTokens != null) {
82
+ lines.push(` total_input_tokens: ${report.totalInputTokens}`);
83
+ }
84
+ if (report.totalOutputTokens != null) {
85
+ lines.push(` total_output_tokens: ${report.totalOutputTokens}`);
86
+ }
87
+ if (report.totalCostUsd != null) {
88
+ lines.push(` total_cost_usd: ${formatUsd(report.totalCostUsd)}`);
89
+ }
90
+ const observedScenes = report.results.filter((r) => r.tokens || r.costUsd != null || (r.events && r.events.length));
91
+ if (observedScenes.length > 0) {
92
+ lines.push(` scenes:`);
93
+ for (const r of observedScenes) {
94
+ lines.push(...renderSceneObservability(r));
95
+ }
96
+ }
81
97
  return lines.join("\n");
82
98
  }
99
/**
 * Renders the per-scene report entry: prompt (truncated to 80 characters),
 * rounded duration, and, when present, tokens, cost and the event timeline.
 *
 * NOTE(review): the leading whitespace inside these string literals is
 * reproduced exactly as it appears in this view, which may have collapsed
 * indentation; confirm against the original source before relying on it.
 *
 * @param r Scene result to render.
 * @returns The report lines for this scene.
 */
function renderSceneObservability(r) {
    const label = r.prompt.length > 80 ? `${r.prompt.slice(0, 77)}...` : r.prompt;
    const lines = [
        ` - prompt: "${escapeYaml(label)}"`,
        ` duration_ms: ${Math.round(r.duration)}`,
    ];
    if (r.tokens) {
        lines.push(` tokens: { input: ${r.tokens.input}, output: ${r.tokens.output} }`);
    }
    if (r.costUsd != null) {
        lines.push(` cost_usd: ${formatUsd(r.costUsd)}`, ` cost_source: ${r.costSource ?? "table"}`);
    }
    const events = r.events;
    if (events && events.length) {
        lines.push(` timeline:`);
        for (const event of events) {
            lines.push(...renderTimelineEvent(event));
        }
    }
    return lines;
}
120
/**
 * Renders one timeline event as report lines: kind, name, rounded start and
 * duration, then tokens, cost, run index and error when they are present.
 *
 * NOTE(review): the leading whitespace inside these string literals is
 * reproduced exactly as it appears in this view, which may have collapsed
 * indentation; confirm against the original source before relying on it.
 *
 * @param e Timeline event to render.
 * @returns The report lines for this event.
 */
function renderTimelineEvent(e) {
    const lines = [
        ` - kind: ${e.kind}`,
        ` name: "${escapeYaml(e.name)}"`,
        ` start_ms: ${Math.round(e.startMs)}`,
        ` duration_ms: ${Math.round(e.durationMs)}`,
    ];
    if (e.tokens) {
        lines.push(` tokens: { input: ${e.tokens.input}, output: ${e.tokens.output} }`);
    }
    if (e.cost?.totalUsd != null) {
        lines.push(` cost_usd: ${formatUsd(e.cost.totalUsd)}`, ` cost_source: ${e.cost.source}`);
    }
    if (e.runIndex != null) {
        lines.push(` run_index: ${e.runIndex}`);
    }
    if (e.error) {
        lines.push(` error: "${escapeYaml(e.error)}"`);
    }
    return lines;
}
141
/**
 * Formats a USD amount with at most six decimal places and no trailing
 * zeros; zero is rendered as "0".
 */
function formatUsd(n) {
    // Round-tripping through Number drops the zero padding added by toFixed.
    return n === 0 ? "0" : String(Number(n.toFixed(6)));
}
147
/**
 * Escapes a string for use inside a double-quoted YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped; newline, carriage
 * return and tab become `\n`, `\r` and `\t`; any other C0 control character
 * or DEL is written as a `\xNN` escape. Prompts and error messages can carry
 * such characters (e.g. CRLF line endings or ANSI colour codes), which are
 * not allowed unescaped in a quoted scalar.
 *
 * @param s Text to escape.
 * @returns The escaped text, without the surrounding quotes.
 */
function escapeYaml(s) {
    const named = {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    };
    return s.replace(/[\\"\u0000-\u001f\u007f]/g, (ch) => named[ch] ?? "\\x" + ch.charCodeAt(0).toString(16).padStart(2, "0"));
}
83
150
  export async function writeReport(content, timestamp, name, dimensions) {
84
151
  const reportsDir = join(process.cwd(), ".reports");
85
152
  await mkdir(reportsDir, { recursive: true });
package/dist/runner.js CHANGED
@@ -116,6 +116,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
116
116
  // Single run — original fast path
117
117
  if (numRuns <= 1) {
118
118
  const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
119
+ const tokens = run.response.metadata?.tokens;
120
+ const cost = run.response.metadata?.cost;
121
+ const events = run.response.metadata?.events;
119
122
  return {
120
123
  prompt: scene.prompt,
121
124
  response: run.response,
@@ -124,6 +127,10 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
124
127
  error: run.error,
125
128
  judgement: run.judgement,
126
129
  suite: scene.suite,
130
+ tokens: tokens ? { input: tokens.input, output: tokens.output } : undefined,
131
+ costUsd: cost?.totalUsd,
132
+ costSource: cost?.source,
133
+ events: events && events.length ? events : undefined,
127
134
  };
128
135
  }
129
136
  // Multiple runs — execute N times and aggregate
@@ -143,6 +150,37 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
143
150
  const error = overallPassed
144
151
  ? undefined
145
152
  : failedRuns[0]?.error ?? "Majority of runs failed";
153
+ // Aggregate tokens, cost, events across runs
154
+ let inputTokens = 0;
155
+ let outputTokens = 0;
156
+ let hasTokens = false;
157
+ let costTotal = 0;
158
+ let hasCost = false;
159
+ let costSource;
160
+ const allEvents = [];
161
+ runs.forEach((r, runIndex) => {
162
+ const meta = r.response.metadata;
163
+ if (meta?.tokens) {
164
+ hasTokens = true;
165
+ inputTokens += meta.tokens.input;
166
+ outputTokens += meta.tokens.output;
167
+ }
168
+ if (meta?.cost?.totalUsd != null) {
169
+ hasCost = true;
170
+ costTotal += meta.cost.totalUsd;
171
+ // Promote weakest source: provider > table > unavailable
172
+ if (costSource !== "table")
173
+ costSource = meta.cost.source;
174
+ if (meta.cost.source === "table" && costSource !== "table") {
175
+ costSource = "table";
176
+ }
177
+ }
178
+ if (meta?.events?.length) {
179
+ for (const e of meta.events) {
180
+ allEvents.push({ ...e, runIndex });
181
+ }
182
+ }
183
+ });
146
184
  return {
147
185
  prompt: scene.prompt,
148
186
  response: lastRun.response,
@@ -154,5 +192,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
154
192
  runs,
155
193
  passRate,
156
194
  statisticalSignificance,
195
+ tokens: hasTokens ? { input: inputTokens, output: outputTokens } : undefined,
196
+ costUsd: hasCost ? costTotal : undefined,
197
+ costSource,
198
+ events: allEvents.length ? allEvents : undefined,
157
199
  };
158
200
  }
package/dist/types.d.ts CHANGED
@@ -2,6 +2,30 @@ export interface ExecutorOptions {
2
2
  signal?: AbortSignal;
3
3
  }
4
4
  export type AgentExecutor = (input: string, options?: ExecutorOptions) => Promise<AgentResponse>;
5
+ export type CostSource = "provider" | "table" | "unavailable";
6
+ export interface CostBreakdown {
7
+ inputUsd?: number;
8
+ outputUsd?: number;
9
+ totalUsd?: number;
10
+ source: CostSource;
11
+ }
12
+ export type TimelineEventKind = "model" | "tool";
13
+ export interface TimelineEvent {
14
+ kind: TimelineEventKind;
15
+ name: string;
16
+ /** ms relative to the scene start */
17
+ startMs: number;
18
+ endMs: number;
19
+ durationMs: number;
20
+ tokens?: {
21
+ input: number;
22
+ output: number;
23
+ };
24
+ cost?: CostBreakdown;
25
+ /** Index of the run this event belongs to (only set when aggregating across multi-run scenes) */
26
+ runIndex?: number;
27
+ error?: string;
28
+ }
5
29
  export interface AgentResponse {
6
30
  text: string;
7
31
  refusal?: boolean;
@@ -14,6 +38,8 @@ export interface AgentResponse {
14
38
  };
15
39
  tools?: string[];
16
40
  systemPrompt?: string;
41
+ events?: TimelineEvent[];
42
+ cost?: CostBreakdown;
17
43
  [key: string]: unknown;
18
44
  };
19
45
  }
@@ -53,6 +79,16 @@ export interface SceneResult {
53
79
  runs?: RunResult[];
54
80
  passRate?: number;
55
81
  statisticalSignificance?: number;
82
+ /** Aggregate tokens across all runs of this scene */
83
+ tokens?: {
84
+ input: number;
85
+ output: number;
86
+ };
87
+ /** Aggregate USD cost across all runs of this scene */
88
+ costUsd?: number;
89
+ costSource?: CostSource;
90
+ /** Ordered timeline events from every run of the scene */
91
+ events?: TimelineEvent[];
56
92
  }
57
93
  export interface AgentReport {
58
94
  name?: string;
@@ -69,5 +105,8 @@ export interface AgentReport {
69
105
  totalCases: number;
70
106
  averageInputTokensPerCase?: number;
71
107
  averageOutputTokensPerCase?: number;
108
+ totalInputTokens?: number;
109
+ totalOutputTokens?: number;
110
+ totalCostUsd?: number;
72
111
  results: SceneResult[];
73
112
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sebastiantuyu/agest",
3
- "version": "0.3.3-next.1",
3
+ "version": "0.3.3-next.3",
4
4
  "description": "A testing library for agents",
5
5
  "repository": {
6
6
  "type": "git",
@@ -26,7 +26,7 @@
26
26
  }
27
27
  },
28
28
  "scripts": {
29
- "build": "tsc -p tsconfig.build.json",
29
+ "build": "tsc -p tsconfig.build.json && mkdir -p dist/pricing && cp src/pricing/models.json dist/pricing/models.json",
30
30
  "test": "vitest run",
31
31
  "test:watch": "vitest",
32
32
  "test:coverage": "vitest run --coverage",