@sebastiantuyu/agest 0.3.2 → 0.3.3-next.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +158 -1
- package/dist/adapters/index.d.ts +2 -0
- package/dist/adapters/index.js +1 -0
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +80 -11
- package/dist/adapters/remote.d.ts +1 -1
- package/dist/adapters/remote.js +3 -2
- package/dist/adapters/tracing.d.ts +73 -0
- package/dist/adapters/tracing.js +338 -0
- package/dist/assertions.d.ts +57 -2
- package/dist/assertions.js +119 -33
- package/dist/cli.d.ts +15 -1
- package/dist/cli.js +97 -18
- package/dist/config.d.ts +9 -0
- package/dist/context.d.ts +32 -11
- package/dist/context.js +84 -10
- package/dist/discover.d.ts +16 -0
- package/dist/discover.js +62 -0
- package/dist/index.d.ts +20 -2
- package/dist/index.js +10 -3
- package/dist/match.d.ts +28 -0
- package/dist/match.js +57 -0
- package/dist/preview.js +93 -0
- package/dist/pricing/index.d.ts +32 -0
- package/dist/pricing/index.js +48 -0
- package/dist/pricing/models.json +21 -0
- package/dist/reporter.d.ts +1 -1
- package/dist/reporter.js +77 -4
- package/dist/reports.d.ts +37 -0
- package/dist/reports.js +126 -0
- package/dist/resolve.d.ts +25 -0
- package/dist/resolve.js +62 -0
- package/dist/runner.d.ts +11 -2
- package/dist/runner.js +97 -11
- package/dist/schema.d.ts +63 -0
- package/dist/schema.js +61 -0
- package/dist/types.d.ts +84 -9
- package/dist/waterfall.d.ts +11 -0
- package/dist/waterfall.js +46 -0
- package/package.json +24 -15
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
import { computeCost } from "../pricing";
|
|
2
|
+
import { logger } from "../logger";
|
|
3
|
+
/**
 * Creates a LangChain callback handler that records every LLM and tool
 * invocation as a timeline event, and returns it with a `drain()` accessor.
 *
 * Designed to fail open: if `@langchain/core` cannot be loaded, tracing is
 * disabled (no callbacks, `drain()` yields no events) instead of throwing,
 * and end/error callbacks for unknown run ids are ignored — the underlying
 * agent run must not be broken by tracing.
 *
 * @param baselineMs `performance.now()` reading that every event's
 *   `startMs` / `endMs` is measured from.
 * @returns `{ callbacks, drain }`. `drain()` returns the captured events
 *   sorted by `startMs` plus the last concrete model name seen (`modelName`),
 *   and empties the internal buffer.
 */
export async function createTracingHandle(baselineMs) {
    // Import lazily so the adapter still works when @langchain/core is not present.
    // BaseCallbackHandler is the runtime contract LangChain checks for.
    let BaseCallbackHandler;
    try {
        ({ BaseCallbackHandler } = await import("@langchain/core/callbacks/base"));
    }
    catch (err) {
        // Anything can be thrown/rejected, not only Error instances; reading
        // `.message` unguarded would throw on null/undefined and defeat the
        // fail-open guarantee. Same pattern as the error handlers below.
        const reason = err?.message ?? String(err);
        logger.debug(`[agest] tracing disabled: could not load @langchain/core/callbacks/base — ` +
            `install @langchain/core as a peer to capture per-scene cost/timeline. (${reason})`);
        return { callbacks: [], drain: () => ({ events: [] }) };
    }
    // Finished spans, in completion order (sorted by start time on drain).
    const events = [];
    // In-flight spans keyed by LangChain run id.
    const openLLMs = new Map();
    const openTools = new Map();
    // Most recent model name that is not the "model" placeholder.
    let lastModelName;
    class AgestTracer extends BaseCallbackHandler {
        name = "AgestTracer";
        awaitHandlers = true;
        handleLLMStart(llm, _prompts, runId, _parentRunId, extraParams) {
            openLLMs.set(runId, {
                startMs: now() - baselineMs,
                name: extractModelName(llm, extraParams),
            });
        }
        handleChatModelStart(llm, _messages, runId, _parentRunId, extraParams) {
            openLLMs.set(runId, {
                startMs: now() - baselineMs,
                name: extractModelName(llm, extraParams),
            });
        }
        handleLLMEnd(output, runId) {
            const open = openLLMs.get(runId);
            // End without a matching start: nothing to close, ignore.
            if (!open)
                return;
            openLLMs.delete(runId);
            const endMs = now() - baselineMs;
            const tokens = extractTokensFromLLMOutput(output);
            const providerCost = extractProviderCost(output);
            const cachedInputTokens = extractCachedTokens(output);
            // Prefer the name captured at start, then whatever the result reports.
            const name = open.name ?? extractModelNameFromOutput(output) ?? "model";
            if (name && name !== "model")
                lastModelName = name;
            const cost = computeCost({
                model: name,
                inputTokens: tokens?.input,
                outputTokens: tokens?.output,
                cachedInputTokens,
                providerCost,
            });
            events.push({
                kind: "model",
                name,
                startMs: open.startMs,
                endMs,
                durationMs: Math.max(0, endMs - open.startMs),
                tokens,
                cachedInputTokens,
                cost: stripCostIfEmpty(cost),
            });
        }
        handleLLMError(err, runId) {
            const open = openLLMs.get(runId);
            if (!open)
                return;
            openLLMs.delete(runId);
            const endMs = now() - baselineMs;
            events.push({
                kind: "model",
                name: open.name ?? "model",
                startMs: open.startMs,
                endMs,
                durationMs: Math.max(0, endMs - open.startMs),
                error: err?.message ?? String(err),
            });
        }
        handleToolStart(tool, _input, runId, _parentRunId, _tags, _metadata, runName) {
            openTools.set(runId, {
                startMs: now() - baselineMs,
                name: extractToolName(tool, runName) ?? "tool",
            });
        }
        handleToolEnd(_output, runId) {
            const open = openTools.get(runId);
            if (!open)
                return;
            openTools.delete(runId);
            const endMs = now() - baselineMs;
            events.push({
                kind: "tool",
                name: open.name,
                startMs: open.startMs,
                endMs,
                durationMs: Math.max(0, endMs - open.startMs),
            });
        }
        handleToolError(err, runId) {
            const open = openTools.get(runId);
            if (!open)
                return;
            openTools.delete(runId);
            const endMs = now() - baselineMs;
            events.push({
                kind: "tool",
                name: open.name,
                startMs: open.startMs,
                endMs,
                durationMs: Math.max(0, endMs - open.startMs),
                error: err?.message ?? String(err),
            });
        }
    }
    const handler = new AgestTracer();
    return {
        callbacks: [handler],
        drain: () => {
            // Sort a copy, then clear the buffer so each drain reports only new events.
            const ordered = [...events].sort((a, b) => a.startMs - b.startMs);
            events.length = 0;
            return { events: ordered, modelName: lastModelName };
        },
    };
}
|
|
134
|
+
/**
 * Public tracing helper for custom executors, i.e. agents that are not wired
 * through the `langchain()` adapter. Create one per scene run, pass its
 * `callbacks` to the LangChain/LangGraph invocation, then spread `collect()`
 * into the response metadata.
 *
 * `collect()` drains the recorder on its first call and returns that same
 * `{ events, tokens, cost }` object on every later call. `opts.model`, when
 * given, takes precedence over the model name observed while tracing.
 *
 * @example
 * ```ts
 * const trace = await createTrace({ model: env.OPENROUTER_MODEL });
 * const plan = await generatePlan(input, { callbacks: trace.callbacks });
 * return { text: render(plan), metadata: { model, tools, ...trace.collect() } };
 * ```
 */
export async function createTrace(opts) {
    // Timestamps in the trace are relative to the moment the trace is created.
    const handle = await createTracingHandle(performance.now());
    let memo;
    const collect = () => {
        if (!memo) {
            const { events, modelName } = handle.drain();
            const { tokens, cost } = summarizeEvents(events, opts?.model ?? modelName);
            memo = { events, tokens, cost };
        }
        return memo;
    };
    return { callbacks: handle.callbacks, collect };
}
|
|
163
|
+
/**
|
|
164
|
+
* Aggregate token counts and cost across a timeline's model events.
|
|
165
|
+
* Provider-reported cost wins; otherwise the table-derived cost; otherwise
|
|
166
|
+
* cost is recomputed from `model` and the summed tokens. `fallbackTokens` is
|
|
167
|
+
* used only when no model event carried usage.
|
|
168
|
+
*/
|
|
169
|
+
export function summarizeEvents(events, model, fallbackTokens) {
|
|
170
|
+
const modelEvents = events.filter((e) => e.kind === "model");
|
|
171
|
+
let inputTokens = 0;
|
|
172
|
+
let outputTokens = 0;
|
|
173
|
+
let providerCost = 0;
|
|
174
|
+
let hasProviderCost = false;
|
|
175
|
+
let hasTableCost = false;
|
|
176
|
+
let tableCost = 0;
|
|
177
|
+
let hasTokens = false;
|
|
178
|
+
for (const e of modelEvents) {
|
|
179
|
+
if (e.tokens) {
|
|
180
|
+
hasTokens = true;
|
|
181
|
+
inputTokens += e.tokens.input;
|
|
182
|
+
outputTokens += e.tokens.output;
|
|
183
|
+
}
|
|
184
|
+
if (e.cost?.source === "provider" && e.cost.totalUsd != null) {
|
|
185
|
+
hasProviderCost = true;
|
|
186
|
+
providerCost += e.cost.totalUsd;
|
|
187
|
+
}
|
|
188
|
+
else if (e.cost?.source === "table" && e.cost.totalUsd != null) {
|
|
189
|
+
hasTableCost = true;
|
|
190
|
+
tableCost += e.cost.totalUsd;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
let tokens = hasTokens ? { input: inputTokens, output: outputTokens } : undefined;
|
|
194
|
+
if (!tokens && fallbackTokens)
|
|
195
|
+
tokens = fallbackTokens;
|
|
196
|
+
let cost;
|
|
197
|
+
if (hasProviderCost) {
|
|
198
|
+
cost = { totalUsd: providerCost, source: "provider" };
|
|
199
|
+
}
|
|
200
|
+
else if (hasTableCost) {
|
|
201
|
+
cost = { totalUsd: tableCost, source: "table" };
|
|
202
|
+
}
|
|
203
|
+
else if (tokens && model) {
|
|
204
|
+
const computed = computeCost({
|
|
205
|
+
model,
|
|
206
|
+
inputTokens: tokens.input,
|
|
207
|
+
outputTokens: tokens.output,
|
|
208
|
+
});
|
|
209
|
+
if (computed.source !== "unavailable")
|
|
210
|
+
cost = computed;
|
|
211
|
+
}
|
|
212
|
+
return { tokens, cost };
|
|
213
|
+
}
|
|
214
|
+
/** Current `performance.now()` reading; the single clock source for this module's timestamps. */
function now() {
    const stamp = performance.now();
    return stamp;
}
|
|
217
|
+
/**
 * Best-effort model name for a run that is starting.
 *
 * Lookup order (first string wins):
 *  1. `extraParams.invocation_params.model`, then `.model_name`;
 *  2. `llm.kwargs.model`, `.model_name`, `.modelName`;
 *  3. the last entry of the `llm.id` array.
 *
 * @returns the name, or `undefined` when none of the locations holds a string.
 */
function extractModelName(llm, extraParams) {
    // First string-valued property of `bag` among `keys`, in order.
    const firstString = (bag, keys) => {
        for (const key of keys) {
            if (typeof bag[key] === "string")
                return bag[key];
        }
        return undefined;
    };
    const fromInvocation = firstString(extraParams?.invocation_params ?? {}, ["model", "model_name"]);
    if (fromInvocation !== undefined)
        return fromInvocation;
    const kwargs = llm?.kwargs;
    const fromKwargs = kwargs
        ? firstString(kwargs, ["model", "model_name", "modelName"])
        : undefined;
    if (fromKwargs !== undefined)
        return fromKwargs;
    const id = llm?.id;
    const tail = Array.isArray(id) && id.length > 0 ? id[id.length - 1] : undefined;
    return typeof tail === "string" ? tail : undefined;
}
|
|
238
|
+
function extractModelNameFromOutput(output) {
|
|
239
|
+
const gen = output?.generations?.[0]?.[0];
|
|
240
|
+
return (gen?.message?.response_metadata?.model_name ??
|
|
241
|
+
gen?.message?.response_metadata?.model ??
|
|
242
|
+
output?.llmOutput?.modelName ??
|
|
243
|
+
output?.llmOutput?.model);
|
|
244
|
+
}
|
|
245
|
+
/**
 * Reads input/output token counts from a finished LLM result.
 *
 * Usage can appear in several places, checked in priority order:
 * `llmOutput.tokenUsage`, `llmOutput.usage`, the first generation's
 * `message.usage_metadata`, then `message.response_metadata.usage`.
 * Each candidate is read with snake_case and camelCase field names.
 *
 * A candidate that is present but carries no counts (for example an empty
 * `tokenUsage: {}`) is skipped, so it cannot hide real usage reported by a
 * lower-priority location.
 *
 * @returns `{ input, output }` from the first candidate with a non-zero
 *   count, or `undefined` when no candidate reports any tokens.
 */
function extractTokensFromLLMOutput(output) {
    const message = output?.generations?.[0]?.[0]?.message;
    const candidates = [
        output?.llmOutput?.tokenUsage,
        output?.llmOutput?.usage,
        message?.usage_metadata,
        message?.response_metadata?.usage,
    ];
    for (const usage of candidates) {
        if (!usage)
            continue;
        const input = usage.input_tokens ?? usage.prompt_tokens ?? usage.promptTokens ?? 0;
        const out = usage.output_tokens ?? usage.completion_tokens ?? usage.completionTokens ?? 0;
        // All-zero usage is treated as "not reported"; keep looking.
        if (input || out)
            return { input, output: out };
    }
    return undefined;
}
|
|
258
|
+
/**
 * Gathers every object on an LLM result that may carry usage or cost fields,
 * in scan order: the `llmOutput` usage variants and `llmOutput` itself, then
 * the first generation's message `usage_metadata`, its `response_metadata`
 * usage variants and `response_metadata` itself, and finally
 * `additional_kwargs.usage`. Missing or non-object entries are dropped.
 */
function usageObjects(output) {
    const llmOutput = output?.llmOutput;
    const message = output?.generations?.[0]?.[0]?.message;
    const responseMeta = message?.response_metadata;
    const candidates = [
        llmOutput?.usage,
        llmOutput?.tokenUsage,
        llmOutput?.estimatedTokenUsage,
        llmOutput,
        message?.usage_metadata,
        responseMeta?.usage,
        responseMeta?.tokenUsage,
        responseMeta?.estimatedTokenUsage,
        responseMeta,
        message?.additional_kwargs?.usage,
    ];
    const found = [];
    for (const candidate of candidates) {
        if (candidate && typeof candidate === "object")
            found.push(candidate);
    }
    return found;
}
|
|
274
|
+
/**
|
|
275
|
+
* OpenRouter (with `usage: { include: true }`) reports real USD cost. LangChain
|
|
276
|
+
* surfaces it inconsistently across versions, so scan the known usage objects
|
|
277
|
+
* for a numeric `cost` / `total_cost`.
|
|
278
|
+
*/
|
|
279
|
+
function extractProviderCost(output) {
|
|
280
|
+
for (const u of usageObjects(output)) {
|
|
281
|
+
const c = (typeof u.cost === "number" ? u.cost : undefined) ??
|
|
282
|
+
(typeof u.total_cost === "number" ? u.total_cost : undefined) ??
|
|
283
|
+
(typeof u.cost_usd === "number" ? u.cost_usd : undefined) ??
|
|
284
|
+
(typeof u.cost_details?.upstream_inference_cost === "number"
|
|
285
|
+
? u.cost_details.upstream_inference_cost
|
|
286
|
+
: undefined);
|
|
287
|
+
if (typeof c === "number" && Number.isFinite(c))
|
|
288
|
+
return c;
|
|
289
|
+
}
|
|
290
|
+
return undefined;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Cached (prompt-cache hit) input tokens, when the provider reports them.
|
|
294
|
+
* Charged at a fraction of the normal input rate, so surfacing them lets the
|
|
295
|
+
* report explain why provider cost is below the flat-table estimate.
|
|
296
|
+
*/
|
|
297
|
+
function extractCachedTokens(output) {
|
|
298
|
+
for (const u of usageObjects(output)) {
|
|
299
|
+
const cached = u.input_token_details?.cache_read ??
|
|
300
|
+
u.prompt_tokens_details?.cached_tokens ??
|
|
301
|
+
u.cache_read_input_tokens ??
|
|
302
|
+
u.cached_tokens;
|
|
303
|
+
if (typeof cached === "number" && cached > 0)
|
|
304
|
+
return cached;
|
|
305
|
+
}
|
|
306
|
+
return undefined;
|
|
307
|
+
}
|
|
308
|
+
/** Class names that identify a tool's implementation type rather than the tool itself. */
const TOOL_CLASS_NAMES = new Set([
    "DynamicStructuredTool",
    "DynamicTool",
    "StructuredTool",
    "Tool",
]);
/**
 * Picks the most specific name available for a tool run.
 *
 * `runName` is the actual tool name LangChain assigns the run (e.g.
 * "search_recipes") and wins unless it is one of the class names above.
 * Otherwise the serialized `tool` is consulted: a specific `tool.name`, then
 * `tool.kwargs.name`, then the last `tool.id` entry if it is not a class
 * name, then `tool.name` even if it is a class name. `runName` is the last
 * resort.
 *
 * @returns the chosen name, or `undefined` when nothing is available.
 */
function extractToolName(tool, runName) {
    const isClassName = (candidate) => TOOL_CLASS_NAMES.has(candidate);
    if (runName && !isClassName(runName))
        return runName;
    if (!tool)
        return runName;
    const declared = typeof tool.name === "string" ? tool.name : undefined;
    if (declared !== undefined && !isClassName(declared))
        return declared;
    if (typeof tool.kwargs?.name === "string")
        return tool.kwargs.name;
    if (Array.isArray(tool.id) && tool.id.length > 0) {
        const tail = String(tool.id[tool.id.length - 1]);
        if (!isClassName(tail))
            return tail;
    }
    // Nothing more specific: a class name still beats no name at all.
    return declared !== undefined ? declared : runName;
}
|
|
334
|
+
/**
 * Drops a cost record that carries no information: one whose source is
 * "unavailable" and whose `totalUsd` is null/undefined. Any other record is
 * returned unchanged (same object).
 */
function stripCostIfEmpty(cost) {
    const carriesNothing = cost.source === "unavailable" && cost.totalUsd == null;
    return carriesNothing ? undefined : cost;
}
|
package/dist/assertions.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { type StandardSchemaV1 } from "./schema";
|
|
1
2
|
import type { JudgeCriteria } from "./judge";
|
|
2
3
|
export interface PendingJudgement {
|
|
3
4
|
value: unknown;
|
|
@@ -5,10 +6,64 @@ export interface PendingJudgement {
|
|
|
5
6
|
}
|
|
6
7
|
export declare function collectPendingJudgements(): PendingJudgement[];
|
|
7
8
|
export interface AgentMatchers {
    /** Assert the agent refused. */
    refusal(): void;
    /** Assert the agent did NOT refuse. */
    notRefusal(): void;
    /**
     * Text containment: `text` appears as a substring. A string value is
     * searched directly; for a non-string value the serialized form is
     * searched. A numeric `text` is converted to its string form first.
     * Case-INsensitive by default; pass `{ caseSensitive: true }` for an exact
     * substring.
     */
    containingText(text: string | number, opts?: {
        caseSensitive?: boolean;
    }): void;
    /** Assert text containment does NOT hold. See {@link containingText}. */
    notContainingText(text: string | number, opts?: {
        caseSensitive?: boolean;
    }): void;
    /**
     * Array membership: the value is an array containing `item` as an EXACT
     * (deep-equal) element. Throws if the value is not an array. Use
     * {@link containingSubset} when you want partial element matching.
     */
    containingItem(item: unknown): void;
    /**
     * Structural subset: `subset` is recursively contained in the value.
     * - object value + object `subset` → every key in `subset` is present with a
     *   recursively-contained value (extra keys allowed).
     * - array value + array `subset` → every `subset` element matches a distinct
     *   element of the value (partial element matching, order-independent).
     *
     * Exact at the leaves (case-sensitive). Throws if the value is not an
     * object/array, or `subset` is not an object/array.
     */
    containingSubset(subset: object): void;
    /**
     * Assert the text view matches `pattern`: a string value is tested as-is,
     * any other value via its serialized form.
     */
    matchingPattern(pattern: RegExp): void;
    /** Deep structural equality against the native value. */
    equalTo(expected: unknown): void;
    /** Assert deep structural INequality against the native value. */
    notEqualTo(expected: unknown): void;
    /**
     * Assert the value (array/string) has length `n`. Any other kind of value
     * has no measurable length and always fails.
     */
    ofLength(n: number): void;
    /**
     * Validate the native value against a Standard Schema (zod 4, valibot,
     * arktype, …). Throws with the schema's formatted issues on failure.
     * Synchronous — for async (`refine`-style) schemas, declare the schema at the
     * agent() or scene().expectSchema() level instead.
     */
    matchingSchema(schema: StandardSchemaV1): void;
    /**
     * Escape hatch for anything not covered by a named matcher: a predicate over
     * the native value. Stays deterministic — use it to express negatives too,
     * e.g. `satisfying((v) => !v.includes("secret"))`. The predicate's result is
     * coerced to a boolean; `message`, when given, replaces the default failure
     * text.
     */
    satisfying(predicate: (value: any) => boolean, message?: string): void;
    /**
     * Queue an LLM-judged assertion, resolved asynchronously by the runner.
     * Fuzzy + paid (express the negative in `failWhen`).
     */
    judgedBy(criteria: JudgeCriteria): void;
}
|
|
14
69
|
export interface AgentExpectation {
|
package/dist/assertions.js
CHANGED
|
@@ -1,46 +1,132 @@
|
|
|
1
|
+
import { isDeepStrictEqual } from "node:util";
|
|
1
2
|
import { isRefusal } from "./refusal";
|
|
3
|
+
import { serializeValue } from "./resolve";
|
|
4
|
+
import { isObjectLike, isPlainObject, structuralContains } from "./match";
|
|
5
|
+
import { validateSync } from "./schema";
|
|
2
6
|
// Judgements queued by `judgedBy()` that have not been collected yet.
let pendingJudgements = [];
/**
 * Hands over every judgement queued so far and starts a fresh, empty queue,
 * so each judgement is returned by exactly one call.
 */
export function collectPendingJudgements() {
    const drained = pendingJudgements;
    pendingJudgements = [];
    return drained;
}
|
|
12
|
+
/**
|
|
13
|
+
* 100-char preview for error messages. Uses COMPACT JSON for objects (the
|
|
14
|
+
* judge-facing `serializeValue` pretty-prints; error previews stay terse and
|
|
15
|
+
* match the library's original contract).
|
|
16
|
+
*/
|
|
17
|
+
function preview(value) {
|
|
18
|
+
let s;
|
|
19
|
+
if (typeof value === "string") {
|
|
20
|
+
s = value;
|
|
21
|
+
}
|
|
22
|
+
else {
|
|
23
|
+
try {
|
|
24
|
+
s = JSON.stringify(value);
|
|
25
|
+
}
|
|
26
|
+
catch {
|
|
27
|
+
s = String(value);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
return s.slice(0, 100);
|
|
31
|
+
}
|
|
32
|
+
/** Compact one-line form for an inline needle/expected in an error message. */
|
|
33
|
+
function compact(value) {
|
|
34
|
+
try {
|
|
35
|
+
return typeof value === "string" ? value : JSON.stringify(value);
|
|
36
|
+
}
|
|
37
|
+
catch {
|
|
38
|
+
return String(value);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
/**
 * Human-readable type label for diagnostics, with the correct article:
 * "a number", "a string", "an array", "an object". `null` and `undefined`
 * are reported bare, since they name a value rather than a kind.
 */
function describeType(value) {
    if (value === null)
        return "null";
    if (value === undefined)
        return "undefined";
    if (Array.isArray(value))
        return "an array";
    const kind = typeof value;
    // "object" is the only remaining typeof result that starts with a vowel.
    return kind === "object" ? "an object" : `a ${kind}`;
}
|
|
49
|
+
/**
|
|
50
|
+
* Substring search shared by `containingText` / `notContainingText`. A string
|
|
51
|
+
* value is searched directly; anything else via its serialized form.
|
|
52
|
+
* Case-insensitive unless `caseSensitive` is set.
|
|
53
|
+
*/
|
|
54
|
+
function textContains(value, text, opts) {
|
|
55
|
+
const actual = typeof value === "string" ? value : serializeValue(value);
|
|
56
|
+
const needle = String(text);
|
|
57
|
+
const hit = opts?.caseSensitive
|
|
58
|
+
? actual.includes(needle)
|
|
59
|
+
: actual.toLowerCase().includes(needle.toLowerCase());
|
|
60
|
+
return { actual, hit };
|
|
61
|
+
}
|
|
62
|
+
/**
 * Builds the matcher set exposed as `expect(value).toBe` for one value.
 *
 * Every matcher throws an `Error` with a descriptive message when its
 * expectation does not hold and returns nothing otherwise. `judgedBy` is the
 * exception: it only queues the judgement for later collection.
 *
 * @param value the native value under test (usually the agent's response).
 * @returns an object implementing the matcher methods.
 */
function makeMatchers(value) {
    // Minimal assertion helper: throw `message` unless `cond` is truthy.
    const assert = (cond, message) => {
        if (!cond)
            throw new Error(message);
    };
    return {
        refusal() {
            assert(isRefusal(value), `Expected a refusal but got: "${preview(value)}"`);
        },
        notRefusal() {
            assert(!isRefusal(value), `Expected a non-refusal response but got: "${preview(value)}"`);
        },
        containingText(text, opts) {
            const { actual, hit } = textContains(value, text, opts);
            assert(hit, `Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
        },
        notContainingText(text, opts) {
            const { actual, hit } = textContains(value, text, opts);
            assert(!hit, `Expected response NOT to contain "${text}" but got: "${actual.slice(0, 100)}"`);
        },
        containingItem(item) {
            // Wrong value kind is a usage error, reported separately from a failed match.
            if (!Array.isArray(value)) {
                throw new Error(`containingItem() expects an array value but got ${describeType(value)}. ` +
                    `Use containingText() for substrings or containingSubset() for objects.`);
            }
            assert(value.some((el) => isDeepStrictEqual(el, item)), `Expected array to contain item ${compact(item)} but it did not (got ${preview(value)})`);
        },
        containingSubset(subset) {
            if (!Array.isArray(value) && !isObjectLike(value)) {
                throw new Error(`containingSubset() expects an object or array value but got ${describeType(value)}.`);
            }
            if (!Array.isArray(subset) && !isPlainObject(subset)) {
                throw new Error(`containingSubset() expects an object or array subset but got ${describeType(subset)}.`);
            }
            assert(structuralContains(value, subset), `Expected value to contain subset ${compact(subset)} but it did not (got ${preview(value)})`);
        },
        matchingPattern(pattern) {
            const actual = typeof value === "string" ? value : serializeValue(value);
            // `test()` on a global/sticky regex starts at `lastIndex` and then
            // advances it, so a reused /g pattern would alternate between
            // passing and failing. Always search from the start and leave the
            // caller's regex state untouched.
            const resumeAt = pattern.lastIndex;
            pattern.lastIndex = 0;
            let matched;
            try {
                matched = pattern.test(actual);
            }
            finally {
                pattern.lastIndex = resumeAt;
            }
            assert(matched, `Expected response to match ${pattern} but got: "${actual.slice(0, 100)}"`);
        },
        equalTo(expected) {
            assert(isDeepStrictEqual(value, expected), `Expected value to equal ${compact(expected)} but got ${preview(value)}`);
        },
        notEqualTo(expected) {
            assert(!isDeepStrictEqual(value, expected), `Expected value NOT to equal ${compact(expected)} but it did`);
        },
        ofLength(n) {
            // Only strings and arrays have a length here; NaN never equals `n`.
            const len = typeof value === "string" || Array.isArray(value)
                ? value.length
                : NaN;
            assert(len === n, `Expected length ${n} but got ${Number.isNaN(len) ? "a non-measurable value" : len}`);
        },
        matchingSchema(schema) {
            const outcome = validateSync(schema, value);
            assert(outcome.ok, `Schema validation failed for value "${preview(value)}" — ${outcome.ok ? "" : outcome.message}`);
        },
        satisfying(predicate, message) {
            assert(Boolean(predicate(value)), message ?? `Predicate failed for value: "${preview(value)}"`);
        },
        judgedBy(criteria) {
            pendingJudgements.push({ value, criteria });
        },
    };
}
|
|
8
126
|
export function expect(value) {
|
|
9
127
|
return {
|
|
10
128
|
get toBe() {
|
|
11
|
-
return
|
|
12
|
-
refusal() {
|
|
13
|
-
if (!isRefusal(value)) {
|
|
14
|
-
const preview = typeof value === "string"
|
|
15
|
-
? value.slice(0, 100)
|
|
16
|
-
: JSON.stringify(value).slice(0, 100);
|
|
17
|
-
throw new Error(`Expected a refusal but got: "${preview}"`);
|
|
18
|
-
}
|
|
19
|
-
},
|
|
20
|
-
notRefusal() {
|
|
21
|
-
if (isRefusal(value)) {
|
|
22
|
-
const preview = typeof value === "string"
|
|
23
|
-
? value.slice(0, 100)
|
|
24
|
-
: JSON.stringify(value).slice(0, 100);
|
|
25
|
-
throw new Error(`Expected a non-refusal response but got: "${preview}"`);
|
|
26
|
-
}
|
|
27
|
-
},
|
|
28
|
-
containing(text) {
|
|
29
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
30
|
-
if (!actual.toLowerCase().includes(text.toLowerCase())) {
|
|
31
|
-
throw new Error(`Expected response to contain "${text}" but got: "${actual.slice(0, 100)}"`);
|
|
32
|
-
}
|
|
33
|
-
},
|
|
34
|
-
matchingPattern(regex) {
|
|
35
|
-
const actual = typeof value === "string" ? value : String(value);
|
|
36
|
-
if (!regex.test(actual)) {
|
|
37
|
-
throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
|
|
38
|
-
}
|
|
39
|
-
},
|
|
40
|
-
judgedBy(criteria) {
|
|
41
|
-
pendingJudgements.push({ value, criteria });
|
|
42
|
-
},
|
|
43
|
-
};
|
|
129
|
+
return makeMatchers(value);
|
|
44
130
|
},
|
|
45
131
|
};
|
|
46
132
|
}
|
package/dist/cli.d.ts
CHANGED
|
@@ -1,2 +1,16 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
export {
|
|
2
|
+
/** Structured result of `parseRunArgs`. */
export interface ParsedRunArgs {
    /** NOTE(review): presumably an optional filter pattern for what to run — confirm against cli.js. */
    pattern?: string;
    /** NOTE(review): presumably the positional run targets (files/directories) — confirm against cli.js. */
    targets: string[];
    /** NOTE(review): presumably toggles a fuller run/report mode — confirm against cli.js. */
    full: boolean;
}
|
|
7
|
+
/**
 * Extract the args that follow the command word from a full `process.argv`.
 * `argv = [execPath, scriptPath, command, ...commandArgs]`, so the command's
 * args always start at index 3. Capturing them here (once, from the original
 * argv) avoids re-slicing a mutated argv downstream — the double-shift that
 * silently dropped a lone `run` target and made discovery scan the whole cwd.
 */
export declare function getCommandArgs(argv: string[]): string[];
/**
 * Turn a list of argument strings into a {@link ParsedRunArgs}.
 * NOTE(review): presumably expects the output of {@link getCommandArgs} — confirm against cli.js.
 */
export declare function parseRunArgs(args: string[]): ParsedRunArgs;
/**
 * Asynchronous CLI entry point.
 * NOTE(review): presumably takes the full `process.argv` — confirm against cli.js.
 */
export declare function main(argv: string[]): Promise<void>;
|