@sebastiantuyu/agest 0.3.2 → 0.3.3-next.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +124 -11
- package/dist/adapters/remote.js +2 -1
- package/dist/adapters/tracing.d.ts +19 -0
- package/dist/adapters/tracing.js +200 -0
- package/dist/cli.js +30 -3
- package/dist/config.d.ts +9 -0
- package/dist/context.js +21 -5
- package/dist/discover.d.ts +16 -0
- package/dist/discover.js +62 -0
- package/dist/index.d.ts +1 -1
- package/dist/pricing/models.json +21 -0
- package/dist/pricing.d.ts +23 -0
- package/dist/pricing.js +42 -0
- package/dist/reporter.js +67 -0
- package/dist/runner.js +56 -7
- package/dist/types.d.ts +43 -1
- package/package.json +16 -15
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { computeCost } from "../pricing";
|
|
2
|
+
import { createTracingHandle } from "./tracing";
|
|
1
3
|
/**
|
|
2
4
|
* Adapter for LangChain runnables and agents.
|
|
3
5
|
*
|
|
@@ -19,23 +21,43 @@ export function langchain(runnable) {
|
|
|
19
21
|
function langGraphAdapter(graph) {
|
|
20
22
|
const staticTools = extractGraphTools(graph);
|
|
21
23
|
return async (input) => {
|
|
24
|
+
const baseline = performance.now();
|
|
25
|
+
const tracing = await createTracingHandle(baseline);
|
|
22
26
|
let result;
|
|
23
27
|
try {
|
|
24
28
|
const { HumanMessage } = await import("@langchain/core/messages");
|
|
25
|
-
result = await graph.invoke({ messages: [new HumanMessage(input)] });
|
|
29
|
+
result = await graph.invoke({ messages: [new HumanMessage(input)] }, { callbacks: tracing.callbacks });
|
|
26
30
|
}
|
|
27
31
|
catch (err) {
|
|
28
|
-
|
|
32
|
+
const { events } = tracing.drain();
|
|
33
|
+
return {
|
|
34
|
+
text: "",
|
|
35
|
+
executionError: err.message,
|
|
36
|
+
metadata: { tools: staticTools, events: events.length ? events : undefined },
|
|
37
|
+
};
|
|
29
38
|
}
|
|
30
39
|
const messages = result.messages;
|
|
31
40
|
const last = messages[messages.length - 1];
|
|
32
41
|
const text = typeof last?.content === "string"
|
|
33
42
|
? last.content
|
|
34
43
|
: JSON.stringify(last?.content ?? result);
|
|
35
|
-
const
|
|
44
|
+
const drained = tracing.drain();
|
|
45
|
+
const model = last?.response_metadata?.model_name ??
|
|
46
|
+
drained.modelName;
|
|
47
|
+
const { tokens, cost } = summarizeRun({
|
|
48
|
+
events: drained.events,
|
|
49
|
+
fallbackTokens: extractTokensFromMessage(last),
|
|
50
|
+
model,
|
|
51
|
+
});
|
|
36
52
|
return {
|
|
37
53
|
text,
|
|
38
|
-
metadata: {
|
|
54
|
+
metadata: {
|
|
55
|
+
model,
|
|
56
|
+
tools: staticTools,
|
|
57
|
+
tokens,
|
|
58
|
+
cost,
|
|
59
|
+
events: drained.events.length ? drained.events : undefined,
|
|
60
|
+
},
|
|
39
61
|
};
|
|
40
62
|
};
|
|
41
63
|
}
|
|
@@ -48,33 +70,69 @@ function reactAgentAdapter(agent) {
|
|
|
48
70
|
?.map((t) => t.name ?? t.getName?.())
|
|
49
71
|
.filter(Boolean);
|
|
50
72
|
return async (input) => {
|
|
73
|
+
const baseline = performance.now();
|
|
74
|
+
const tracing = await createTracingHandle(baseline);
|
|
51
75
|
let result;
|
|
52
76
|
try {
|
|
53
|
-
result = await agent.invoke({ messages: [{ role: "human", content: input }] });
|
|
77
|
+
result = await agent.invoke({ messages: [{ role: "human", content: input }] }, { callbacks: tracing.callbacks });
|
|
54
78
|
}
|
|
55
79
|
catch (err) {
|
|
56
|
-
|
|
80
|
+
const { events } = tracing.drain();
|
|
81
|
+
return {
|
|
82
|
+
text: "",
|
|
83
|
+
executionError: err.message,
|
|
84
|
+
metadata: {
|
|
85
|
+
model,
|
|
86
|
+
systemPrompt,
|
|
87
|
+
tools,
|
|
88
|
+
events: events.length ? events : undefined,
|
|
89
|
+
},
|
|
90
|
+
};
|
|
57
91
|
}
|
|
58
92
|
const messages = result.messages;
|
|
59
93
|
const last = messages[messages.length - 1];
|
|
60
94
|
const text = typeof last?.content === "string"
|
|
61
95
|
? last.content
|
|
62
96
|
: JSON.stringify(last?.content ?? result);
|
|
97
|
+
const drained = tracing.drain();
|
|
98
|
+
const { tokens, cost } = summarizeRun({
|
|
99
|
+
events: drained.events,
|
|
100
|
+
fallbackTokens: extractTokensFromMessage(last),
|
|
101
|
+
model,
|
|
102
|
+
});
|
|
63
103
|
return {
|
|
64
104
|
text,
|
|
65
|
-
metadata: {
|
|
105
|
+
metadata: {
|
|
106
|
+
model,
|
|
107
|
+
systemPrompt,
|
|
108
|
+
tools,
|
|
109
|
+
tokens,
|
|
110
|
+
cost,
|
|
111
|
+
events: drained.events.length ? drained.events : undefined,
|
|
112
|
+
},
|
|
66
113
|
};
|
|
67
114
|
};
|
|
68
115
|
}
|
|
69
116
|
function chainAdapter(chain) {
|
|
70
117
|
const { model, systemPrompt } = extractChainMeta(chain);
|
|
71
118
|
return async (input) => {
|
|
119
|
+
const baseline = performance.now();
|
|
120
|
+
const tracing = await createTracingHandle(baseline);
|
|
72
121
|
let result;
|
|
73
122
|
try {
|
|
74
|
-
result = await chain.invoke({ input });
|
|
123
|
+
result = await chain.invoke({ input }, { callbacks: tracing.callbacks });
|
|
75
124
|
}
|
|
76
125
|
catch (err) {
|
|
77
|
-
|
|
126
|
+
const { events } = tracing.drain();
|
|
127
|
+
return {
|
|
128
|
+
text: "",
|
|
129
|
+
executionError: err.message,
|
|
130
|
+
metadata: {
|
|
131
|
+
model,
|
|
132
|
+
systemPrompt,
|
|
133
|
+
events: events.length ? events : undefined,
|
|
134
|
+
},
|
|
135
|
+
};
|
|
78
136
|
}
|
|
79
137
|
const text = typeof result === "string"
|
|
80
138
|
? result
|
|
@@ -83,12 +141,21 @@ function chainAdapter(chain) {
|
|
|
83
141
|
: typeof result.content === "string"
|
|
84
142
|
? result.content
|
|
85
143
|
: JSON.stringify(result);
|
|
144
|
+
const drained = tracing.drain();
|
|
145
|
+
const effectiveModel = model ?? drained.modelName ?? result.metadata?.model;
|
|
146
|
+
const { tokens, cost } = summarizeRun({
|
|
147
|
+
events: drained.events,
|
|
148
|
+
fallbackTokens: extractTokens(result),
|
|
149
|
+
model: effectiveModel,
|
|
150
|
+
});
|
|
86
151
|
return {
|
|
87
152
|
text,
|
|
88
153
|
metadata: {
|
|
89
|
-
model:
|
|
154
|
+
model: effectiveModel,
|
|
90
155
|
systemPrompt,
|
|
91
|
-
tokens
|
|
156
|
+
tokens,
|
|
157
|
+
cost,
|
|
158
|
+
events: drained.events.length ? drained.events : undefined,
|
|
92
159
|
},
|
|
93
160
|
};
|
|
94
161
|
};
|
|
@@ -153,3 +220,49 @@ function extractTokensFromMessage(msg) {
|
|
|
153
220
|
output: usage.output_tokens ?? usage.completion_tokens ?? 0,
|
|
154
221
|
};
|
|
155
222
|
}
|
|
223
|
+
/**
 * Aggregates token usage and cost over the "model" events of one run.
 *
 * Tokens are summed across every model event that carries `tokens`; when no
 * event does, `input.fallbackTokens` is used if it was provided.
 *
 * Cost is the sum of every per-event cost whose source is "provider" or
 * "table". The result is labelled "provider" only when all of it was reported
 * by the provider; as soon as any part was priced from the table the total is
 * labelled "table", because it is then partly an estimate. When no event
 * carries a cost, the cost is recomputed from the aggregated tokens and
 * `input.model` via `computeCost`, and left undefined if that is unavailable.
 *
 * @param {{ events: Array<object>, fallbackTokens?: { input: number, output: number }, model?: string }} input
 * @returns {{ tokens: ({ input: number, output: number } | undefined), cost: (object | undefined) }}
 */
function summarizeRun(input) {
    let inputTokens = 0;
    let outputTokens = 0;
    let hasTokens = false;
    let providerCost = 0;
    let hasProviderCost = false;
    let tableCost = 0;
    let hasTableCost = false;
    for (const e of input.events) {
        if (e.kind !== "model")
            continue;
        if (e.tokens) {
            hasTokens = true;
            inputTokens += e.tokens.input ?? 0;
            outputTokens += e.tokens.output ?? 0;
        }
        if (e.cost?.totalUsd == null)
            continue;
        if (e.cost.source === "provider") {
            hasProviderCost = true;
            providerCost += e.cost.totalUsd;
        }
        else if (e.cost.source === "table") {
            hasTableCost = true;
            tableCost += e.cost.totalUsd;
        }
    }
    let tokens = hasTokens ? { input: inputTokens, output: outputTokens } : undefined;
    if (!tokens && input.fallbackTokens)
        tokens = input.fallbackTokens;
    let cost;
    if (hasProviderCost || hasTableCost) {
        // Both subtotals count towards the run: returning only the provider
        // subtotal would silently drop every call that was priced from the table.
        cost = {
            totalUsd: providerCost + tableCost,
            source: hasTableCost ? "table" : "provider",
        };
    }
    else if (tokens && input.model) {
        const computed = computeCost({
            model: input.model,
            inputTokens: tokens.input,
            outputTokens: tokens.output,
        });
        if (computed.source !== "unavailable")
            cost = computed;
    }
    return { tokens, cost };
}
|
package/dist/adapters/remote.js
CHANGED
|
@@ -24,12 +24,13 @@
|
|
|
24
24
|
*/
|
|
25
25
|
export function remote(endpoint, options = {}) {
|
|
26
26
|
const { headers = {}, method = "POST", body: extraBody, buildRequest = defaultBuildRequest, parseResponse, metadata: staticMetadata, } = options;
|
|
27
|
-
return async (input) => {
|
|
27
|
+
return async (input, execOptions) => {
|
|
28
28
|
let res;
|
|
29
29
|
try {
|
|
30
30
|
const fetchOptions = {
|
|
31
31
|
method,
|
|
32
32
|
headers: { "Content-Type": "application/json", ...headers },
|
|
33
|
+
signal: execOptions?.signal,
|
|
33
34
|
};
|
|
34
35
|
if (method !== "GET") {
|
|
35
36
|
const built = buildRequest(input);
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { TimelineEvent } from "../types";
|
|
2
|
+
export interface TracingHandle {
|
|
3
|
+
/** Pass this into `runnable.invoke(..., { callbacks: handle.callbacks })` — it is already an array of handlers. */
|
|
4
|
+
callbacks: any[];
|
|
5
|
+
drain(): {
|
|
6
|
+
events: TimelineEvent[];
|
|
7
|
+
modelName?: string;
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* Creates a LangChain callback handler that records every LLM and tool
|
|
12
|
+
* invocation as a `TimelineEvent`. Returns a handle whose `drain()` method
|
|
13
|
+
* yields the captured events with `startMs` / `endMs` relative to the
|
|
14
|
+
* provided baseline.
|
|
15
|
+
*
|
|
16
|
+
* Designed to fail open: any unexpected callback shape is ignored rather
|
|
17
|
+
* than throwing — the underlying agent run must not be broken by tracing.
|
|
18
|
+
*/
|
|
19
|
+
export declare function createTracingHandle(baselineMs: number): Promise<TracingHandle>;
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import { computeCost } from "../pricing";
|
|
2
|
+
/**
 * Builds a tracing handle for one LangChain invocation.
 *
 * The handle exposes `callbacks` (pass it straight to
 * `invoke(..., { callbacks })`) and `drain()`. `drain()` returns the recorded
 * model and tool events sorted by `startMs`, together with the last real model
 * name resolved on a completed LLM call, and empties the recorded list.
 * `startMs` / `endMs` are `performance.now()` readings minus `baselineMs`.
 *
 * If `@langchain/core/callbacks/base` cannot be imported, the handle is a
 * no-op: no callbacks and an always-empty `drain()`.
 */
export async function createTracingHandle(baselineMs) {
    // Lazy import: the adapter has to keep working without @langchain/core.
    let BaseCallbackHandler;
    try {
        ({ BaseCallbackHandler } = await import("@langchain/core/callbacks/base"));
    }
    catch {
        return { callbacks: [], drain: () => ({ events: [] }) };
    }
    const recorded = [];
    const pendingModels = new Map();
    const pendingTools = new Map();
    let lastModelName;
    // Milliseconds elapsed since the caller's baseline.
    const elapsed = () => now() - baselineMs;
    // Remembers when an LLM / chat-model run started and under which name.
    const beginModel = (llm, runId, extraParams) => {
        pendingModels.set(runId, {
            startMs: elapsed(),
            name: extractModelName(llm, extraParams),
        });
    };
    // Removes and returns the pending entry for `runId` (undefined when the start was never seen).
    const take = (pending, runId) => {
        const entry = pending.get(runId);
        if (entry)
            pending.delete(runId);
        return entry;
    };
    // Fields shared by every timeline event.
    const timing = (kind, name, startMs, endMs) => ({
        kind,
        name,
        startMs,
        endMs,
        durationMs: Math.max(0, endMs - startMs),
    });
    // Closes a tool run, optionally attaching an error message.
    const finishTool = (runId, failure) => {
        const started = take(pendingTools, runId);
        if (!started)
            return;
        const event = timing("tool", started.name, started.startMs, elapsed());
        recorded.push(failure === undefined ? event : { ...event, error: failure });
    };
    const describe = (err) => err?.message ?? String(err);
    class AgestTracer extends BaseCallbackHandler {
        name = "AgestTracer";
        awaitHandlers = true;
        handleLLMStart(llm, _prompts, runId, _parentRunId, extraParams) {
            beginModel(llm, runId, extraParams);
        }
        handleChatModelStart(llm, _messages, runId, _parentRunId, extraParams) {
            beginModel(llm, runId, extraParams);
        }
        handleLLMEnd(output, runId) {
            const started = take(pendingModels, runId);
            if (!started)
                return;
            // Read the clock before any extraction work so it is not counted.
            const endMs = elapsed();
            const tokens = extractTokensFromLLMOutput(output);
            const providerCost = extractProviderCost(output);
            const name = started.name ?? extractModelNameFromOutput(output) ?? "model";
            if (name && name !== "model")
                lastModelName = name;
            const cost = computeCost({
                model: name,
                inputTokens: tokens?.input,
                outputTokens: tokens?.output,
                providerCost,
            });
            recorded.push({
                ...timing("model", name, started.startMs, endMs),
                tokens,
                cost: stripCostIfEmpty(cost),
            });
        }
        handleLLMError(err, runId) {
            const started = take(pendingModels, runId);
            if (!started)
                return;
            recorded.push({
                ...timing("model", started.name ?? "model", started.startMs, elapsed()),
                error: describe(err),
            });
        }
        handleToolStart(tool, _input, runId) {
            pendingTools.set(runId, {
                startMs: elapsed(),
                name: extractToolName(tool) ?? "tool",
            });
        }
        handleToolEnd(_output, runId) {
            finishTool(runId, undefined);
        }
        handleToolError(err, runId) {
            // Resolve the message only when the run is known, as the start may have been missed.
            if (pendingTools.get(runId))
                finishTool(runId, describe(err));
        }
    }
    return {
        callbacks: [new AgestTracer()],
        drain: () => {
            const ordered = recorded.slice().sort((a, b) => a.startMs - b.startMs);
            recorded.length = 0;
            return { events: ordered, modelName: lastModelName };
        },
    };
}
|
|
128
|
+
/** Current high-resolution timestamp in milliseconds, as given by `performance.now()`. */
function now() {
    const timestamp = performance.now();
    return timestamp;
}
|
|
131
|
+
/**
 * Resolves a model name from the data LangChain passes to an LLM-start callback.
 *
 * Lookup order, first string wins:
 *   1. `extraParams.invocation_params.model`, then `.model_name`
 *   2. `llm.kwargs.model`, `.model_name`, `.modelName`
 *   3. the last element of `llm.id` when it is a string
 *
 * @returns {string | undefined} the resolved name, or undefined when none is found
 */
function extractModelName(llm, extraParams) {
    // Returns the first property of `source` (in `keys` order) that holds a string.
    const firstString = (source, keys) => {
        for (const key of keys) {
            const value = source[key];
            if (typeof value === "string")
                return value;
        }
        return undefined;
    };
    const fromInvocation = firstString(extraParams?.invocation_params ?? {}, ["model", "model_name"]);
    if (fromInvocation !== undefined)
        return fromInvocation;
    const kwargs = llm?.kwargs;
    if (kwargs) {
        const fromKwargs = firstString(kwargs, ["model", "model_name", "modelName"]);
        if (fromKwargs !== undefined)
            return fromKwargs;
    }
    const idPath = llm?.id;
    const tail = Array.isArray(idPath) && idPath.length > 0 ? idPath[idPath.length - 1] : undefined;
    return typeof tail === "string" ? tail : undefined;
}
|
|
152
|
+
/**
 * Reads the model name from an LLM result.
 *
 * Checks, in order, the first generation's
 * `message.response_metadata.model_name` and `.model`, then
 * `llmOutput.modelName` and `llmOutput.model`. The first value that is
 * neither null nor undefined is returned.
 */
function extractModelNameFromOutput(output) {
    const responseMeta = output?.generations?.[0]?.[0]?.message?.response_metadata;
    const llmOutput = output?.llmOutput;
    if (responseMeta?.model_name != null)
        return responseMeta.model_name;
    if (responseMeta?.model != null)
        return responseMeta.model;
    if (llmOutput?.modelName != null)
        return llmOutput.modelName;
    return llmOutput?.model;
}
|
|
159
|
+
/**
 * Extracts `{ input, output }` token counts from an LLM result.
 *
 * Usage objects are checked in this order: `llmOutput.tokenUsage`,
 * `llmOutput.usage`, then the first generation's `message.usage_metadata` and
 * `message.response_metadata.usage`. Within each, the snake_case and camelCase
 * spellings (`input_tokens` / `prompt_tokens` / `promptTokens`, and the
 * matching output fields) are all accepted.
 *
 * The first usage object that actually yields a non-zero count is returned.
 * A usage object that is present but has no recognised fields (for example
 * `tokenUsage: {}`) is skipped, so it no longer hides real counts carried on
 * the message.
 *
 * @returns {{ input: number, output: number } | undefined} undefined when no
 *   usage object reports any tokens
 */
function extractTokensFromLLMOutput(output) {
    const message = output?.generations?.[0]?.[0]?.message;
    const candidates = [
        output?.llmOutput?.tokenUsage,
        output?.llmOutput?.usage,
        message?.usage_metadata,
        message?.response_metadata?.usage,
    ];
    for (const usage of candidates) {
        if (usage == null)
            continue;
        const input = usage.input_tokens ?? usage.prompt_tokens ?? usage.promptTokens ?? 0;
        const out = usage.output_tokens ?? usage.completion_tokens ?? usage.completionTokens ?? 0;
        if (input || out)
            return { input, output: out };
    }
    return undefined;
}
|
|
172
|
+
/**
 * Finds a cost that the provider itself reported on an LLM result.
 *
 * Candidate locations, in priority order: `llmOutput.usage.cost`,
 * `llmOutput.cost`, then on the first generation's message
 * `usage_metadata.total_cost`, `response_metadata.usage.cost` and
 * `response_metadata.cost`.
 *
 * @returns {number | undefined} the first candidate that is a finite number
 */
function extractProviderCost(output) {
    const llmOutput = output?.llmOutput;
    const message = output?.generations?.[0]?.[0]?.message;
    const isFiniteNumber = (value) => typeof value === "number" && Number.isFinite(value);
    return [
        llmOutput?.usage?.cost,
        llmOutput?.cost,
        message?.usage_metadata?.total_cost,
        message?.response_metadata?.usage?.cost,
        message?.response_metadata?.cost,
    ].find(isFiniteNumber);
}
|
|
186
|
+
/**
 * Resolves a display name for a tool passed to a tool-start callback.
 *
 * Uses `tool.name` when it is a string; otherwise the last element of a
 * non-empty `tool.id` array, converted to a string.
 *
 * @returns {string | undefined} undefined for a falsy tool or when neither field is usable
 */
function extractToolName(tool) {
    if (tool && typeof tool.name === "string")
        return tool.name;
    const idPath = tool ? tool.id : undefined;
    const hasPath = Array.isArray(idPath) && idPath.length > 0;
    return hasPath ? String(idPath[idPath.length - 1]) : undefined;
}
|
|
196
|
+
/**
 * Drops a cost breakdown that carries no information.
 *
 * @returns the same `cost` object, or undefined when its source is
 *   "unavailable" and it has no `totalUsd`
 */
function stripCostIfEmpty(cost) {
    const isEmpty = cost.source === "unavailable" && cost.totalUsd == null;
    return isEmpty ? undefined : cost;
}
|
package/dist/cli.js
CHANGED
|
@@ -2,11 +2,35 @@
|
|
|
2
2
|
import { spawn } from "child_process";
|
|
3
3
|
import { main as stats } from "./stats.js";
|
|
4
4
|
import { main as preview } from "./preview.js";
|
|
5
|
+
import { DEFAULT_PATTERN, discoverTestFiles } from "./discover.js";
|
|
5
6
|
const command = process.argv[2];
|
|
7
|
+
/**
 * Splits the arguments of `agest run` into a glob pattern and a target list.
 *
 * `--pattern <value>`, `-p <value>` and `--pattern=<value>` set the pattern
 * (the last occurrence wins); every other argument is kept, in order, as a
 * target. When `--pattern` / `-p` is the final argument an error is printed
 * and the process exits with status 1.
 *
 * @param {string[]} args arguments following the `run` command
 * @returns {{ pattern: (string | undefined), targets: string[] }}
 */
function parseRunArgs(args) {
    const inlinePrefix = "--pattern=";
    const targets = [];
    let pattern;
    let index = 0;
    while (index < args.length) {
        const arg = args[index];
        index += 1;
        if (arg === "--pattern" || arg === "-p") {
            // The flag's value is the next argument; consume it as well.
            pattern = args[index];
            index += 1;
            if (pattern === undefined) {
                console.error(" Error: --pattern requires a value");
                process.exit(1);
            }
            continue;
        }
        if (arg.startsWith(inlinePrefix)) {
            pattern = arg.slice(inlinePrefix.length);
            continue;
        }
        targets.push(arg);
    }
    return { pattern, targets };
}
|
|
6
28
|
async function run() {
|
|
7
|
-
const
|
|
29
|
+
const { pattern, targets } = parseRunArgs(process.argv.slice(3));
|
|
30
|
+
const files = await discoverTestFiles(targets, { pattern });
|
|
8
31
|
if (files.length === 0) {
|
|
9
|
-
|
|
32
|
+
const effective = pattern ?? DEFAULT_PATTERN;
|
|
33
|
+
console.error(` No test files found (pattern: ${effective})`);
|
|
10
34
|
process.exit(1);
|
|
11
35
|
}
|
|
12
36
|
for (const file of files) {
|
|
@@ -29,7 +53,10 @@ if (!command || !commands[command]) {
|
|
|
29
53
|
Usage: agest <command>
|
|
30
54
|
|
|
31
55
|
Commands:
|
|
32
|
-
run Run test file(s)
|
|
56
|
+
run Run test file(s), directories, or glob patterns
|
|
57
|
+
agest run tests/ # walks for ${DEFAULT_PATTERN}
|
|
58
|
+
agest run src/agest --pattern "**/*.test.ts"
|
|
59
|
+
agest run "tests/**/*.agest.ts" path/to/file.agest.ts
|
|
33
60
|
stats Show aggregated test statistics
|
|
34
61
|
preview Generate an HTML report preview
|
|
35
62
|
`);
|
package/dist/config.d.ts
CHANGED
|
@@ -15,6 +15,15 @@ export interface AgestConfig {
|
|
|
15
15
|
turns?: number;
|
|
16
16
|
runs?: number;
|
|
17
17
|
judge?: JudgeConfig;
|
|
18
|
+
/**
|
|
19
|
+
* Per-model pricing override (USD per 1M tokens). Merged on top of the
|
|
20
|
+
* built-in `src/pricing/models.json` table. Provide entries for any model
|
|
21
|
+
* you use that isn't already in the table, or to override a default.
|
|
22
|
+
*/
|
|
23
|
+
pricing?: Record<string, {
|
|
24
|
+
input: number;
|
|
25
|
+
output: number;
|
|
26
|
+
}>;
|
|
18
27
|
}
|
|
19
28
|
export declare function defineConfig(config: AgestConfig): AgestConfig;
|
|
20
29
|
export declare function loadConfig(): Promise<AgestConfig>;
|
package/dist/context.js
CHANGED
|
@@ -3,6 +3,7 @@ import { executeScene } from "./runner";
|
|
|
3
3
|
import { formatReport, writeReport, writeDiffEntry } from "./reporter";
|
|
4
4
|
import { logger, c } from "./logger";
|
|
5
5
|
import { loadConfig } from "./config";
|
|
6
|
+
import { setPricingOverrides } from "./pricing";
|
|
6
7
|
import { PromisePool } from "@supercharge/promise-pool";
|
|
7
8
|
export class SceneBuilder {
|
|
8
9
|
_prompt;
|
|
@@ -77,6 +78,7 @@ export class AgentContext {
|
|
|
77
78
|
}
|
|
78
79
|
async execute() {
|
|
79
80
|
const config = await loadConfig();
|
|
81
|
+
setPricingOverrides(config.pricing);
|
|
80
82
|
const parallelism = Math.max(1, config.parallelism ?? 1);
|
|
81
83
|
const definitions = this._scenes.map((s) => s.toDefinition());
|
|
82
84
|
const orderedResults = new Array(definitions.length);
|
|
@@ -170,14 +172,25 @@ export class AgentContext {
|
|
|
170
172
|
const successRate = results.length > 0
|
|
171
173
|
? Number((results.filter((r) => r.passed).length / results.length).toFixed(2))
|
|
172
174
|
: 0;
|
|
173
|
-
const
|
|
175
|
+
const sceneTokens = results
|
|
176
|
+
.map((r) => r.tokens ?? r.response.metadata?.tokens)
|
|
177
|
+
.filter((t) => t != null);
|
|
174
178
|
let averageInputTokensPerCase;
|
|
175
179
|
let averageOutputTokensPerCase;
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
+
let totalInputTokens;
|
|
181
|
+
let totalOutputTokens;
|
|
182
|
+
if (sceneTokens.length > 0) {
|
|
183
|
+
totalInputTokens = sceneTokens.reduce((s, t) => s + (t.input ?? 0), 0);
|
|
184
|
+
totalOutputTokens = sceneTokens.reduce((s, t) => s + (t.output ?? 0), 0);
|
|
185
|
+
averageInputTokensPerCase = Math.round(totalInputTokens / sceneTokens.length);
|
|
186
|
+
averageOutputTokensPerCase = Math.round(totalOutputTokens / sceneTokens.length);
|
|
180
187
|
}
|
|
188
|
+
const sceneCosts = results
|
|
189
|
+
.map((r) => r.costUsd)
|
|
190
|
+
.filter((c) => typeof c === "number");
|
|
191
|
+
const totalCostUsd = sceneCosts.length > 0
|
|
192
|
+
? sceneCosts.reduce((s, c) => s + c, 0)
|
|
193
|
+
: undefined;
|
|
181
194
|
const firstMeta = results.find((r) => r.response.metadata)?.response
|
|
182
195
|
.metadata;
|
|
183
196
|
const dimensions = {};
|
|
@@ -208,6 +221,9 @@ export class AgentContext {
|
|
|
208
221
|
totalCases: results.length,
|
|
209
222
|
averageInputTokensPerCase,
|
|
210
223
|
averageOutputTokensPerCase,
|
|
224
|
+
totalInputTokens,
|
|
225
|
+
totalOutputTokens,
|
|
226
|
+
totalCostUsd,
|
|
211
227
|
results,
|
|
212
228
|
};
|
|
213
229
|
if (report.systemPromptHash && firstMeta?.systemPrompt) {
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
export declare const DEFAULT_PATTERN = "**/*.agest.ts";
|
|
2
|
+
export interface DiscoverOptions {
|
|
3
|
+
pattern?: string;
|
|
4
|
+
cwd?: string;
|
|
5
|
+
}
|
|
6
|
+
/**
|
|
7
|
+
* Resolve a mix of file paths, directories, and glob patterns into a
|
|
8
|
+
* deduplicated, sorted list of absolute file paths.
|
|
9
|
+
*
|
|
10
|
+
* Rules per target:
|
|
11
|
+
* - directory: search recursively for `pattern` (default `**\/*.agest.ts`)
|
|
12
|
+
* - glob (contains *, ?, [], {}): expand it
|
|
13
|
+
* - file: use as-is
|
|
14
|
+
* - anything else: try as glob (zero matches is fine)
|
|
15
|
+
*/
|
|
16
|
+
export declare function discoverTestFiles(targets: string[], options?: DiscoverOptions): Promise<string[]>;
|
package/dist/discover.js
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { promises as fs } from "node:fs";
|
|
2
|
+
import { isAbsolute, resolve } from "node:path";
|
|
3
|
+
export const DEFAULT_PATTERN = "**/*.agest.ts";
|
|
4
|
+
// Characters that mark a CLI target as a glob pattern: * ? [ ] { }
const GLOB_CHARS = /[*?[\]{}]/;
/** Reports whether `value` contains at least one glob metacharacter. */
function hasGlobChars(value) {
    const hit = GLOB_CHARS.exec(value);
    return hit !== null;
}
|
|
8
|
+
/**
 * Stats `path` without throwing.
 *
 * @returns {Promise<{ isFile: boolean, isDir: boolean }>} both flags are false
 *   when the stat call fails for any reason (for example the path is missing)
 */
async function statSafe(path) {
    let info;
    try {
        info = await fs.stat(path);
    }
    catch {
        return { isFile: false, isDir: false };
    }
    const isFile = info.isFile();
    const isDir = info.isDirectory();
    return { isFile, isDir };
}
|
|
17
|
+
async function expandGlob(pattern, cwd) {
|
|
18
|
+
const out = [];
|
|
19
|
+
// fs.promises.glob is available in Node >= 22 (the package's required engine).
|
|
20
|
+
for await (const match of fs.glob(pattern, { cwd })) {
|
|
21
|
+
out.push(isAbsolute(match) ? match : resolve(cwd, match));
|
|
22
|
+
}
|
|
23
|
+
return out;
|
|
24
|
+
}
|
|
25
|
+
/**
 * Resolve a mix of file paths, directories, and glob patterns into a
 * deduplicated, sorted list of absolute file paths.
 *
 * Each target is stat'ed first, so a path that really exists is always taken
 * literally, even when its name contains glob metacharacters (for example
 * `routes/[id]`):
 * - existing directory: search it recursively for `pattern`
 *   (default `**\/*.agest.ts`). The directory is used as the glob's working
 *   directory, so its own name is never interpreted as part of the pattern.
 * - existing file: use as-is
 * - anything else: expand it as a glob relative to `cwd` (zero matches is fine)
 *
 * @param {string[]} targets paths or globs; an empty list means "."
 * @param {{ pattern?: string, cwd?: string }} [options] `cwd` defaults to `process.cwd()`
 * @returns {Promise<string[]>} absolute paths, deduplicated and sorted
 */
export async function discoverTestFiles(targets, options = {}) {
    const cwd = options.cwd ?? process.cwd();
    const pattern = options.pattern ?? DEFAULT_PATTERN;
    const work = targets.length === 0 ? ["."] : targets;
    const found = new Set();
    for (const target of work) {
        const absolute = isAbsolute(target) ? target : resolve(cwd, target);
        const stat = await statSafe(absolute);
        let matches;
        if (stat.isDir) {
            matches = await expandGlob(pattern, absolute);
        }
        else if (stat.isFile) {
            matches = [absolute];
        }
        else {
            // Not an existing path: the only remaining reading is a glob.
            matches = await expandGlob(target, cwd);
        }
        for (const f of matches)
            found.add(f);
    }
    return [...found].sort();
}
|
package/dist/index.d.ts
CHANGED
|
@@ -7,7 +7,7 @@ export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
|
|
|
7
7
|
export type { LogLevel } from "./logger";
|
|
8
8
|
export type { AgentExpectation, AgentMatchers } from "./assertions";
|
|
9
9
|
export type { JudgeCriteria } from "./judge";
|
|
10
|
-
export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
|
|
10
|
+
export type { AgentExecutor, ExecutorOptions, AgentResponse, AgentReport, SceneResult, RunResult, JudgeVerdict, JudgeResult, HookFn, } from "./types";
|
|
11
11
|
export interface AgentOptions {
|
|
12
12
|
name?: string;
|
|
13
13
|
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"claude-opus-4-7": { "input": 15, "output": 75 },
|
|
3
|
+
"claude-opus-4-6": { "input": 15, "output": 75 },
|
|
4
|
+
"claude-opus-4-5": { "input": 15, "output": 75 },
|
|
5
|
+
"claude-sonnet-4-6": { "input": 3, "output": 15 },
|
|
6
|
+
"claude-sonnet-4-5": { "input": 3, "output": 15 },
|
|
7
|
+
"claude-haiku-4-5": { "input": 1, "output": 5 },
|
|
8
|
+
"claude-3-5-sonnet-20241022": { "input": 3, "output": 15 },
|
|
9
|
+
"claude-3-5-haiku-20241022": { "input": 0.8, "output": 4 },
|
|
10
|
+
"claude-3-opus-20240229": { "input": 15, "output": 75 },
|
|
11
|
+
"gpt-4o": { "input": 2.5, "output": 10 },
|
|
12
|
+
"gpt-4o-mini": { "input": 0.15, "output": 0.6 },
|
|
13
|
+
"gpt-4.1": { "input": 2, "output": 8 },
|
|
14
|
+
"gpt-4.1-mini": { "input": 0.4, "output": 1.6 },
|
|
15
|
+
"gpt-4.1-nano": { "input": 0.1, "output": 0.4 },
|
|
16
|
+
"gpt-5": { "input": 1.25, "output": 10 },
|
|
17
|
+
"gpt-5-mini": { "input": 0.25, "output": 2 },
|
|
18
|
+
"o1": { "input": 15, "output": 60 },
|
|
19
|
+
"o1-mini": { "input": 1.1, "output": 4.4 },
|
|
20
|
+
"o3-mini": { "input": 1.1, "output": 4.4 }
|
|
21
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export interface ModelPrice {
|
|
2
|
+
/** USD per 1M input tokens */
|
|
3
|
+
input: number;
|
|
4
|
+
/** USD per 1M output tokens */
|
|
5
|
+
output: number;
|
|
6
|
+
}
|
|
7
|
+
export type CostSource = "provider" | "table" | "unavailable";
|
|
8
|
+
export interface CostBreakdown {
|
|
9
|
+
inputUsd?: number;
|
|
10
|
+
outputUsd?: number;
|
|
11
|
+
totalUsd?: number;
|
|
12
|
+
source: CostSource;
|
|
13
|
+
}
|
|
14
|
+
export declare function setPricingOverrides(table?: Record<string, ModelPrice>): void;
|
|
15
|
+
export declare function lookupPrice(model: string | undefined): ModelPrice | undefined;
|
|
16
|
+
export interface ComputeCostInput {
|
|
17
|
+
model?: string;
|
|
18
|
+
inputTokens?: number;
|
|
19
|
+
outputTokens?: number;
|
|
20
|
+
/** USD cost the provider already reported (takes precedence) */
|
|
21
|
+
providerCost?: number;
|
|
22
|
+
}
|
|
23
|
+
export declare function computeCost(input: ComputeCostInput): CostBreakdown;
|
package/dist/pricing.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { readFileSync } from "fs";
|
|
2
|
+
import { fileURLToPath } from "url";
|
|
3
|
+
import { dirname, join } from "path";
|
|
4
|
+
const here = dirname(fileURLToPath(import.meta.url));
|
|
5
|
+
const builtIn = JSON.parse(readFileSync(join(here, "pricing", "models.json"), "utf-8"));
|
|
6
|
+
let overrides = {};
|
|
7
|
+
/**
 * Replaces the pricing override table consulted by `lookupPrice`.
 * Passing nothing (or null) clears the overrides.
 */
export function setPricingOverrides(table) {
    overrides = table == null ? {} : table;
}
|
|
10
|
+
/**
 * Looks up the per-1M-token price for `model`.
 *
 * Resolution order:
 *   1. exact key in the overrides table
 *   2. exact key in the built-in table
 *   3. loose match: any key that contains the model name, or is contained in
 *      it, compared case-insensitively; the longest such key wins
 *
 * Only own properties of the tables are consulted, so a model name that
 * collides with an `Object.prototype` member (`constructor`, `toString`, ...)
 * cannot resolve to an inherited function instead of a price.
 *
 * @param {string | undefined} model
 * @returns {{ input: number, output: number } | undefined}
 */
export function lookupPrice(model) {
    if (!model)
        return undefined;
    // Own-property read; inherited members are treated as "no entry".
    const own = (table, key) => (Object.hasOwn(table, key) ? table[key] : undefined);
    const exact = own(overrides, model) || own(builtIn, model);
    if (exact)
        return exact;
    // Loose substring match in either direction — pick the longest matching key
    const lowered = model.toLowerCase();
    const keys = Object.keys({ ...builtIn, ...overrides })
        .filter((k) => lowered.includes(k.toLowerCase()) || k.toLowerCase().includes(lowered))
        .sort((a, b) => b.length - a.length);
    if (keys.length > 0) {
        return own(overrides, keys[0]) ?? own(builtIn, keys[0]);
    }
    return undefined;
}
|
|
27
|
+
/**
 * Computes the USD cost of one model call.
 *
 * A finite `providerCost` takes precedence and is returned as-is with source
 * "provider". Otherwise the model's price is looked up and applied to the
 * token counts (prices are USD per 1M tokens; missing counts count as 0),
 * giving source "table". When no price is known the source is "unavailable"
 * and no amounts are returned.
 *
 * @param {{ model?: string, inputTokens?: number, outputTokens?: number, providerCost?: number }} input
 */
export function computeCost(input) {
    const { providerCost } = input;
    if (providerCost != null && Number.isFinite(providerCost)) {
        return { totalUsd: providerCost, source: "provider" };
    }
    const price = lookupPrice(input.model);
    if (!price) {
        return { source: "unavailable" };
    }
    // USD for `count` tokens at `rate` USD per million.
    const usdFor = (count, rate) => ((count ?? 0) / 1_000_000) * rate;
    const inputUsd = usdFor(input.inputTokens, price.input);
    const outputUsd = usdFor(input.outputTokens, price.output);
    const totalUsd = inputUsd + outputUsd;
    return { inputUsd, outputUsd, totalUsd, source: "table" };
}
|
package/dist/reporter.js
CHANGED
|
@@ -78,8 +78,75 @@ export function formatReport(report) {
|
|
|
78
78
|
if (report.averageOutputTokensPerCase != null) {
|
|
79
79
|
lines.push(` average_output_tokens_per_case: ${report.averageOutputTokensPerCase}`);
|
|
80
80
|
}
|
|
81
|
+
if (report.totalInputTokens != null) {
|
|
82
|
+
lines.push(` total_input_tokens: ${report.totalInputTokens}`);
|
|
83
|
+
}
|
|
84
|
+
if (report.totalOutputTokens != null) {
|
|
85
|
+
lines.push(` total_output_tokens: ${report.totalOutputTokens}`);
|
|
86
|
+
}
|
|
87
|
+
if (report.totalCostUsd != null) {
|
|
88
|
+
lines.push(` total_cost_usd: ${formatUsd(report.totalCostUsd)}`);
|
|
89
|
+
}
|
|
90
|
+
const observedScenes = report.results.filter((r) => r.tokens || r.costUsd != null || (r.events && r.events.length));
|
|
91
|
+
if (observedScenes.length > 0) {
|
|
92
|
+
lines.push(` scenes:`);
|
|
93
|
+
for (const r of observedScenes) {
|
|
94
|
+
lines.push(...renderSceneObservability(r));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
81
97
|
return lines.join("\n");
|
|
82
98
|
}
|
|
99
|
+
/**
 * Build the report lines describing one scene result.
 *
 * Always emits the (possibly truncated) prompt and the rounded duration;
 * token counts, cost and the event timeline are appended only when the
 * result carries them.
 *
 * @param {object} r - Scene result; reads `prompt`, `duration`, `tokens`,
 *   `costUsd`, `costSource` and `events`.
 * @returns {string[]} Report lines for this scene.
 */
function renderSceneObservability(r) {
    const { prompt, tokens, costUsd, events } = r;

    // Prompts longer than 80 characters are cut to 77 plus an ellipsis.
    let label = prompt;
    if (prompt.length > 80) {
        label = `${prompt.slice(0, 77)}...`;
    }

    const rows = [
        ` - prompt: "${escapeYaml(label)}"`,
        ` duration_ms: ${Math.round(r.duration)}`,
    ];
    if (tokens) {
        rows.push(` tokens: { input: ${tokens.input}, output: ${tokens.output} }`);
    }
    if (costUsd != null) {
        // A cost without an explicit source is reported as "table".
        const origin = r.costSource ?? "table";
        rows.push(` cost_usd: ${formatUsd(costUsd)}`);
        rows.push(` cost_source: ${origin}`);
    }
    if (events && events.length) {
        rows.push(` timeline:`);
        for (const event of events) {
            rows.push(...renderTimelineEvent(event));
        }
    }
    return rows;
}
|
|
120
|
+
/**
 * Build the report lines for a single timeline event.
 *
 * Kind, name, rounded start and rounded duration are always written;
 * tokens, cost, run index and error follow only when present on the event.
 *
 * @param {object} e - Timeline event; reads `kind`, `name`, `startMs`,
 *   `durationMs`, `tokens`, `cost`, `runIndex` and `error`.
 * @returns {string[]} Report lines for this event.
 */
function renderTimelineEvent(e) {
    const { tokens, cost, runIndex, error } = e;
    const rows = [
        ` - kind: ${e.kind}`,
        ` name: "${escapeYaml(e.name)}"`,
        ` start_ms: ${Math.round(e.startMs)}`,
        ` duration_ms: ${Math.round(e.durationMs)}`,
    ];
    if (tokens) {
        rows.push(` tokens: { input: ${tokens.input}, output: ${tokens.output} }`);
    }
    if (cost?.totalUsd != null) {
        rows.push(` cost_usd: ${formatUsd(cost.totalUsd)}`);
        rows.push(` cost_source: ${cost.source}`);
    }
    // `!= null` keeps run index 0 in the output.
    if (runIndex != null) {
        rows.push(` run_index: ${runIndex}`);
    }
    if (error) {
        rows.push(` error: "${escapeYaml(error)}"`);
    }
    return rows;
}
|
|
141
|
+
/**
 * Format a USD amount compactly.
 *
 * The value is rounded to at most six decimal places and trailing zeros are
 * dropped by round-tripping through `Number`; an exact zero prints as "0".
 *
 * @param {number} n - Amount in USD.
 * @returns {string} Compact decimal representation.
 */
function formatUsd(n) {
    if (n === 0) {
        return "0";
    }
    const rounded = n.toFixed(6);
    // Parsing the fixed string back to a number strips trailing zeros.
    return String(Number(rounded));
}
|
|
147
|
+
/**
 * Escape a string for use inside a double-quoted YAML scalar.
 *
 * Backslashes are escaped first so the escapes added afterwards are not
 * doubled. Besides quotes and newlines, carriage returns and tabs are
 * escaped as well: a raw CR inside a quoted scalar is treated by YAML as a
 * line break and folded, which would silently alter CRLF text such as error
 * messages.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text safe to place between double quotes.
 */
function escapeYaml(s) {
    return s
        .replace(/\\/g, "\\\\")
        .replace(/"/g, '\\"')
        .replace(/\n/g, "\\n")
        .replace(/\r/g, "\\r")
        .replace(/\t/g, "\\t");
}
|
|
83
150
|
export async function writeReport(content, timestamp, name, dimensions) {
|
|
84
151
|
const reportsDir = join(process.cwd(), ".reports");
|
|
85
152
|
await mkdir(reportsDir, { recursive: true });
|
package/dist/runner.js
CHANGED
|
@@ -37,13 +37,20 @@ async function executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig)
|
|
|
37
37
|
const start = performance.now();
|
|
38
38
|
const input = scene.prompt;
|
|
39
39
|
for (let t = 0; t < turns; t++) {
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
40
|
+
const controller = new AbortController();
|
|
41
|
+
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
42
|
+
try {
|
|
43
|
+
response = await executor(input, { signal: controller.signal });
|
|
44
|
+
}
|
|
45
|
+
catch (err) {
|
|
46
|
+
if (err.name === "AbortError" || controller.signal.aborted) {
|
|
47
|
+
throw new Error(`Scene timed out after ${timeoutMs}ms`);
|
|
48
|
+
}
|
|
49
|
+
throw err;
|
|
50
|
+
}
|
|
51
|
+
finally {
|
|
52
|
+
clearTimeout(timer);
|
|
53
|
+
}
|
|
47
54
|
if (response.executionError)
|
|
48
55
|
break;
|
|
49
56
|
}
|
|
@@ -109,6 +116,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
109
116
|
// Single run — original fast path
|
|
110
117
|
if (numRuns <= 1) {
|
|
111
118
|
const run = await executeSingleRun(executor, scene, timeoutMs, turns, judgeConfig);
|
|
119
|
+
const tokens = run.response.metadata?.tokens;
|
|
120
|
+
const cost = run.response.metadata?.cost;
|
|
121
|
+
const events = run.response.metadata?.events;
|
|
112
122
|
return {
|
|
113
123
|
prompt: scene.prompt,
|
|
114
124
|
response: run.response,
|
|
@@ -117,6 +127,10 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
117
127
|
error: run.error,
|
|
118
128
|
judgement: run.judgement,
|
|
119
129
|
suite: scene.suite,
|
|
130
|
+
tokens: tokens ? { input: tokens.input, output: tokens.output } : undefined,
|
|
131
|
+
costUsd: cost?.totalUsd,
|
|
132
|
+
costSource: cost?.source,
|
|
133
|
+
events: events && events.length ? events : undefined,
|
|
120
134
|
};
|
|
121
135
|
}
|
|
122
136
|
// Multiple runs — execute N times and aggregate
|
|
@@ -136,6 +150,37 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
136
150
|
const error = overallPassed
|
|
137
151
|
? undefined
|
|
138
152
|
: failedRuns[0]?.error ?? "Majority of runs failed";
|
|
153
|
+
// Aggregate tokens, cost, events across runs
|
|
154
|
+
let inputTokens = 0;
|
|
155
|
+
let outputTokens = 0;
|
|
156
|
+
let hasTokens = false;
|
|
157
|
+
let costTotal = 0;
|
|
158
|
+
let hasCost = false;
|
|
159
|
+
let costSource;
|
|
160
|
+
const allEvents = [];
|
|
161
|
+
runs.forEach((r, runIndex) => {
|
|
162
|
+
const meta = r.response.metadata;
|
|
163
|
+
if (meta?.tokens) {
|
|
164
|
+
hasTokens = true;
|
|
165
|
+
inputTokens += meta.tokens.input;
|
|
166
|
+
outputTokens += meta.tokens.output;
|
|
167
|
+
}
|
|
168
|
+
if (meta?.cost?.totalUsd != null) {
|
|
169
|
+
hasCost = true;
|
|
170
|
+
costTotal += meta.cost.totalUsd;
|
|
171
|
+
// Promote weakest source: provider > table > unavailable
|
|
172
|
+
if (costSource !== "table")
|
|
173
|
+
costSource = meta.cost.source;
|
|
174
|
+
if (meta.cost.source === "table" && costSource !== "table") {
|
|
175
|
+
costSource = "table";
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
if (meta?.events?.length) {
|
|
179
|
+
for (const e of meta.events) {
|
|
180
|
+
allEvents.push({ ...e, runIndex });
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
});
|
|
139
184
|
return {
|
|
140
185
|
prompt: scene.prompt,
|
|
141
186
|
response: lastRun.response,
|
|
@@ -147,5 +192,9 @@ export async function executeScene(executor, scene, globalTimeout, judgeConfig,
|
|
|
147
192
|
runs,
|
|
148
193
|
passRate,
|
|
149
194
|
statisticalSignificance,
|
|
195
|
+
tokens: hasTokens ? { input: inputTokens, output: outputTokens } : undefined,
|
|
196
|
+
costUsd: hasCost ? costTotal : undefined,
|
|
197
|
+
costSource,
|
|
198
|
+
events: allEvents.length ? allEvents : undefined,
|
|
150
199
|
};
|
|
151
200
|
}
|
package/dist/types.d.ts
CHANGED
|
@@ -1,4 +1,31 @@
|
|
|
1
|
-
export
|
|
1
|
+
/** Per-call options handed to an {@link AgentExecutor}. */
export interface ExecutorOptions {
    /** Abort signal for the call; the runner aborts it once the scene timeout elapses. */
    signal?: AbortSignal;
}

/**
 * The agent under test: receives the scene prompt plus optional call options
 * and resolves to the agent's response.
 */
export type AgentExecutor = (
    input: string,
    options?: ExecutorOptions,
) => Promise<AgentResponse>;
|
|
5
|
+
/**
 * Where a cost figure came from:
 * - "provider": a cost reported for the call was used as-is;
 * - "table": computed from token counts and a looked-up price;
 * - "unavailable": no price could be found.
 */
export type CostSource =
    | "provider"
    | "table"
    | "unavailable";

/** USD cost of a call; every amount is optional, only `source` is always set. */
export interface CostBreakdown {
    /** USD attributed to input tokens. */
    inputUsd?: number;
    /** USD attributed to output tokens. */
    outputUsd?: number;
    /** Total USD for the call. */
    totalUsd?: number;
    /** Origin of the figures above. */
    source: CostSource;
}
|
|
12
|
+
/** Category of a recorded timeline event. */
export type TimelineEventKind = "model" | "tool";

/** One recorded step in a scene's timeline. */
export interface TimelineEvent {
    kind: TimelineEventKind;
    name: string;
    /** Start time in ms, relative to the scene start. */
    startMs: number;
    /** End time in ms — presumably on the same scene-relative clock as `startMs`; confirm in the tracing adapter. */
    endMs: number;
    /** Length of the event in ms. */
    durationMs: number;
    /** Token counts attached to the event, when available. */
    tokens?: { input: number; output: number };
    /** Cost attached to the event, when available. */
    cost?: CostBreakdown;
    /** Index of the run this event came from; only set when events are aggregated across multi-run scenes. */
    runIndex?: number;
    /** Error text recorded for the event, if any. */
    error?: string;
}
|
|
2
29
|
export interface AgentResponse {
|
|
3
30
|
text: string;
|
|
4
31
|
refusal?: boolean;
|
|
@@ -11,6 +38,8 @@ export interface AgentResponse {
|
|
|
11
38
|
};
|
|
12
39
|
tools?: string[];
|
|
13
40
|
systemPrompt?: string;
|
|
41
|
+
events?: TimelineEvent[];
|
|
42
|
+
cost?: CostBreakdown;
|
|
14
43
|
[key: string]: unknown;
|
|
15
44
|
};
|
|
16
45
|
}
|
|
@@ -50,6 +79,16 @@ export interface SceneResult {
|
|
|
50
79
|
runs?: RunResult[];
|
|
51
80
|
passRate?: number;
|
|
52
81
|
statisticalSignificance?: number;
|
|
82
|
+
/** Aggregate tokens across all runs of this scene */
|
|
83
|
+
tokens?: {
|
|
84
|
+
input: number;
|
|
85
|
+
output: number;
|
|
86
|
+
};
|
|
87
|
+
/** Aggregate USD cost across all runs of this scene */
|
|
88
|
+
costUsd?: number;
|
|
89
|
+
costSource?: CostSource;
|
|
90
|
+
/** Ordered timeline events from every run of the scene */
|
|
91
|
+
events?: TimelineEvent[];
|
|
53
92
|
}
|
|
54
93
|
export interface AgentReport {
|
|
55
94
|
name?: string;
|
|
@@ -66,5 +105,8 @@ export interface AgentReport {
|
|
|
66
105
|
totalCases: number;
|
|
67
106
|
averageInputTokensPerCase?: number;
|
|
68
107
|
averageOutputTokensPerCase?: number;
|
|
108
|
+
totalInputTokens?: number;
|
|
109
|
+
totalOutputTokens?: number;
|
|
110
|
+
totalCostUsd?: number;
|
|
69
111
|
results: SceneResult[];
|
|
70
112
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sebastiantuyu/agest",
|
|
3
|
-
"version": "0.3.2",
|
|
3
|
+
"version": "0.3.3-next.2",
|
|
4
4
|
"description": "A testing library for agents",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
}
|
|
27
27
|
},
|
|
28
28
|
"scripts": {
|
|
29
|
-
"build": "tsc -p tsconfig.build.json",
|
|
29
|
+
"build": "tsc -p tsconfig.build.json && mkdir -p dist/pricing && cp src/pricing/models.json dist/pricing/models.json",
|
|
30
30
|
"test": "vitest run",
|
|
31
31
|
"test:watch": "vitest",
|
|
32
32
|
"test:coverage": "vitest run --coverage",
|
|
@@ -37,25 +37,26 @@
|
|
|
37
37
|
"site:preview": "npx serve site -p 3000",
|
|
38
38
|
"release:patch": "npm version patch && git push && git push --tags",
|
|
39
39
|
"release:minor": "npm version minor && git push && git push --tags",
|
|
40
|
-
"release:major": "npm version major && git push && git push --tags"
|
|
40
|
+
"release:major": "npm version major && git push && git push --tags",
|
|
41
|
+
"release:next": "npm version prerelease --preid=next && git push && git push --tags"
|
|
41
42
|
},
|
|
42
43
|
"engines": {
|
|
43
44
|
"node": ">=22.0.0"
|
|
44
45
|
},
|
|
45
46
|
"devDependencies": {
|
|
46
|
-
"@langchain/core": "
|
|
47
|
-
"@langchain/langgraph": "
|
|
48
|
-
"@langchain/openai": "
|
|
49
|
-
"@types/node": "
|
|
50
|
-
"@vitest/coverage-v8": "
|
|
51
|
-
"dotenv": "
|
|
52
|
-
"langchain": "
|
|
53
|
-
"tsx": "
|
|
54
|
-
"typescript": "
|
|
55
|
-
"vitest": "
|
|
56
|
-
"zod": "
|
|
47
|
+
"@langchain/core": "1.1.39",
|
|
48
|
+
"@langchain/langgraph": "1.2.8",
|
|
49
|
+
"@langchain/openai": "1.4.4",
|
|
50
|
+
"@types/node": "22.19.17",
|
|
51
|
+
"@vitest/coverage-v8": "3.2.4",
|
|
52
|
+
"dotenv": "17.4.1",
|
|
53
|
+
"langchain": "1.3.1",
|
|
54
|
+
"tsx": "4.21.0",
|
|
55
|
+
"typescript": "5.9.3",
|
|
56
|
+
"vitest": "3.2.4",
|
|
57
|
+
"zod": "4.3.6"
|
|
57
58
|
},
|
|
58
59
|
"dependencies": {
|
|
59
|
-
"@supercharge/promise-pool": "
|
|
60
|
+
"@supercharge/promise-pool": "3.3.0"
|
|
60
61
|
}
|
|
61
62
|
}
|