@docshield/didactic 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +333 -228
- package/dist/index.cjs +1090 -550
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +134 -65
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +134 -65
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1085 -552
- package/dist/index.mjs.map +1 -1
- package/package.json +20 -3
package/dist/index.mjs
CHANGED
@@ -1,15 +1,21 @@
- import * as chrono from "chrono-node";
- import { differenceInDays } from "date-fns";
- import Levenshtein from "levenshtein";
  import munkres from "munkres-js";
  import Anthropic from "@anthropic-ai/sdk";
  import OpenAI from "openai";
- import * as
+ import * as chrono from "chrono-node";
+ import { differenceInDays } from "date-fns";
+ import Levenshtein from "levenshtein";
  import * as fs from "fs";
+ import * as path from "path";
+ import chalk from "chalk";
+ import ora from "ora";
+ import cliProgress from "cli-progress";
+ import figures from "figures";
+ import * as crypto from "crypto";

  //#region src/types.ts
  /**
  * Supported LLM providers.
+ * Used by both optimizer and LLM-based comparators.
  */
  let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
  LLMProviders$1["anthropic_claude_opus"] = "anthropic_claude_opus";
@@ -21,7 +27,7 @@ let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
  }({});

  //#endregion
- //#region src/constants.ts
+ //#region src/library/constants.ts
  const PROVIDER_SPECS = {
  [LLMProviders.anthropic_claude_opus]: {
  model: "claude-opus-4-5-20251101",
@@ -36,7 +42,7 @@ const PROVIDER_SPECS = {
  costPerMillionOutput: 15
  },
  [LLMProviders.anthropic_claude_haiku]: {
- model: "claude-haiku-4-5-
+ model: "claude-haiku-4-5-20251001",
  maxTokens: 64e3,
  costPerMillionInput: 1,
  costPerMillionOutput: 5
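For orientation, the `PROVIDER_SPECS` entries touched in this hunk follow a simple shape. The interface below is a hypothetical sketch reconstructed from the field names visible in the diff; it is not a type exported by the package.

```ts
// Hypothetical shape of a PROVIDER_SPECS entry, inferred from the diff above.
interface ProviderSpec {
  model: string;                // e.g. "claude-haiku-4-5-20251001"
  maxTokens: number;            // e.g. 64e3
  costPerMillionInput: number;  // USD per 1M input tokens
  costPerMillionOutput: number; // USD per 1M output tokens
}
```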
@@ -61,7 +67,154 @@ const DEFAULT_PER_TEST_THRESHOLD = 1;
  const NAME_SUFFIXES = /(?<=\S)\s*,?\s*(inc\.?|llc\.?|ltd\.?|l\.l\.c\.?|corp\.?|corporation|company|co\.?)$/i;

  //#endregion
- //#region src/
+ //#region src/library/llm/llm-client.ts
+ /**
+ * Call an LLM provider with the given messages.
+ * Returns raw text output - caller is responsible for parsing if structured output is needed.
+ */
+ async function callLLM(config) {
+ const { provider, apiKey, messages, useThinking = false } = config;
+ const spec = PROVIDER_SPECS[provider];
+ try {
+ if (provider.startsWith("anthropic")) {
+ const client = new Anthropic({ apiKey });
+ const streamOptions = {
+ model: spec.model,
+ max_tokens: spec.maxTokens,
+ system: messages.find((m) => m.role === "system")?.content,
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
+ role: m.role,
+ content: m.content
+ }))
+ };
+ if (useThinking) streamOptions.thinking = {
+ type: "enabled",
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
+ };
+ const finalMessage = await client.messages.stream(streamOptions).finalMessage();
+ const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
+ const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
+ const inputTokens = finalMessage.usage.input_tokens;
+ const outputTokens = finalMessage.usage.output_tokens;
+ return {
+ text,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ if (provider.startsWith("openai")) {
+ const client = new OpenAI({ apiKey });
+ const completionOptions = {
+ model: spec.model,
+ messages: messages.map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ max_completion_tokens: spec.maxTokens
+ };
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
+ const response = await client.chat.completions.create(completionOptions);
+ const text = response.choices[0].message.content ?? "";
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
+ const outputTokens = response.usage?.completion_tokens ?? 0;
+ return {
+ text,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ throw new Error(`Unsupported provider: ${provider}`);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`LLM call failed (${spec.model}): ${message}`);
+ }
+ }
+ /**
+ * Call an LLM provider with structured output.
+ * Returns parsed JSON data conforming to the provided schema.
+ */
+ async function callStructuredLLM(config) {
+ const { provider, apiKey, messages, schema, useThinking = false } = config;
+ const spec = PROVIDER_SPECS[provider];
+ try {
+ if (provider.startsWith("anthropic")) {
+ const client = new Anthropic({ apiKey });
+ const baseOptions = {
+ model: spec.model,
+ max_tokens: spec.maxTokens,
+ betas: ["structured-outputs-2025-11-13"],
+ system: messages.find((m) => m.role === "system")?.content,
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ output_format: {
+ type: "json_schema",
+ schema
+ }
+ };
+ const streamOptions = useThinking ? {
+ ...baseOptions,
+ thinking: {
+ type: "enabled",
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
+ }
+ } : baseOptions;
+ const finalMessage = await client.beta.messages.stream(streamOptions).finalMessage();
+ const content = finalMessage.content[0];
+ if (content.type !== "text") throw new Error("Unexpected response type from LLM");
+ const data = JSON.parse(content.text);
+ const inputTokens = finalMessage.usage.input_tokens;
+ const outputTokens = finalMessage.usage.output_tokens;
+ return {
+ data,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ if (provider.startsWith("openai")) {
+ const client = new OpenAI({ apiKey });
+ const completionOptions = {
+ model: spec.model,
+ messages: messages.map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ max_completion_tokens: spec.maxTokens,
+ response_format: {
+ type: "json_schema",
+ json_schema: {
+ name: "response",
+ strict: true,
+ schema
+ }
+ }
+ };
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
+ const response = await client.chat.completions.create(completionOptions);
+ const text = response.choices[0].message.content ?? "";
+ const data = JSON.parse(text);
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
+ const outputTokens = response.usage?.completion_tokens ?? 0;
+ return {
+ data,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ throw new Error(`Unsupported provider: ${provider}`);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Structured LLM call failed (${spec.model}): ${message}`);
+ }
+ }
+
+ //#endregion
+ //#region src/eval/comparators/comparators.ts
  /** Checks if actual string contains a substring. */
  function contains(substring) {
  return (_expected, actual) => {
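The hunk above prices each call from the provider spec's per-million-token rates. The snippet below is a hypothetical sketch of that arithmetic only; `TOKENS_PER_MILLION` is not shown in this diff excerpt, so its value of 1,000,000 is an assumption.

```ts
// Sketch of the cost arithmetic used by callLLM/callStructuredLLM above.
const TOKENS_PER_MILLION = 1_000_000; // assumed value of the bundle's constant

function computeCost(
  inputTokens: number,
  outputTokens: number,
  spec: { costPerMillionInput: number; costPerMillionOutput: number },
): number {
  // Token counts are priced per million tokens, input and output separately.
  return (
    (inputTokens * spec.costPerMillionInput +
      outputTokens * spec.costPerMillionOutput) /
    TOKENS_PER_MILLION
  );
}
```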
@@ -198,6 +351,103 @@ function within(config) {
  };
  };
  }
+ /** Schema for LLM comparison response. */
+ const LLM_COMPARE_SCHEMA = {
+ type: "object",
+ properties: {
+ passed: {
+ type: "boolean",
+ description: "Whether the actual value matches the expected value"
+ },
+ rationale: {
+ type: "string",
+ description: "Brief explanation of the comparison decision"
+ }
+ },
+ required: ["passed", "rationale"],
+ additionalProperties: false
+ };
+ const DEFAULT_LLM_COMPARE_SYSTEM_PROMPT = `Compare the following two values and determine if they are semantically equivalent.
+
+ Focus on whether they convey the same core meaning or information, even if expressed differently. Consider synonyms, paraphrasing, and stylistic variations as acceptable. Only mark as failed if there are substantial differences in the actual facts or meaning being conveyed.`;
+ const buildLLMCompareUserPrompt = (expected, actual) => `Expected value:
+ ${JSON.stringify(expected, null, 2)}
+
+ Actual value:
+ ${JSON.stringify(actual, null, 2)}`;
+ /**
+ * Uses an LLM to compare expected vs actual values.
+ * Returns a comparison result with rationale and cost tracking.
+ * Default provider: anthropic_claude_haiku (fastest, cheapest).
+ */
+ function llmCompare(config) {
+ const systemPrompt = config.systemPrompt ?? DEFAULT_LLM_COMPARE_SYSTEM_PROMPT;
+ return async (expected, actual, context) => {
+ try {
+ const apiKey = config.apiKey ?? context?.llmConfig?.apiKey;
+ if (!apiKey) throw new Error("llmCompare requires an apiKey. Either pass it directly to llmCompare() or set llmConfig.apiKey in eval config.");
+ const provider = config.provider ?? context?.llmConfig?.provider ?? LLMProviders.anthropic_claude_haiku;
+ const userPrompt = buildLLMCompareUserPrompt(expected, actual);
+ const result = await callStructuredLLM({
+ provider,
+ apiKey,
+ messages: [{
+ role: "system",
+ content: systemPrompt
+ }, {
+ role: "user",
+ content: userPrompt
+ }],
+ schema: LLM_COMPARE_SCHEMA
+ });
+ return {
+ passed: result.data.passed,
+ rationale: result.data.rationale,
+ cost: result.cost,
+ similarity: result.data.passed ? 1 : 0
+ };
+ } catch (error) {
+ return {
+ passed: false,
+ rationale: `LLM comparison failed: ${error instanceof Error ? error.message : String(error)}`,
+ cost: 0,
+ similarity: 0
+ };
+ }
+ };
+ }
+ /**
+ * Marks a comparator or comparator config as unordered.
+ * When applied to an array field, items will be matched by similarity
+ * rather than index position (using Hungarian algorithm).
+ *
+ * @example
+ * // Unordered array of objects
+ * lineItems: unordered({
+ *   description: name,
+ *   price: within({ tolerance: 5 })
+ * })
+ *
+ * @example
+ * // Unordered array of primitives
+ * tags: unordered(exact)
+ *
+ * @example
+ * // When entire output is an array
+ * comparators: unordered({
+ *   carrier: exact,
+ *   premium: within({ tolerance: 0.05 })
+ * })
+ */
+ function unordered(comparator) {
+ const baseFunction = typeof comparator === "function" ? comparator : () => {
+ throw new Error("unordered() base function should not be called when nested comparators exist. This is likely a bug in the evaluation logic.");
+ };
+ return Object.assign(baseFunction, {
+ _unordered: true,
+ _nestedComparators: typeof comparator === "object" ? comparator : void 0
+ });
+ }
  /**
  * Deep equality comparison with cycle detection.
  * Uses WeakSet to track visited object pairs to prevent stack overflow on circular references.
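Building on the comparator helpers added above, here is a hypothetical usage sketch combining `unordered` and `llmCompare` with the existing `exact` and `within` comparators. It follows the JSDoc examples in the hunk; the import path and export names are assumptions, not confirmed by this diff.

```ts
// Hypothetical comparator config; assumes these helpers are exported as named exports.
import { exact, within, unordered, llmCompare, LLMProviders } from "@docshield/didactic";

const comparators = {
  carrier: exact,
  premium: within({ tolerance: 0.05 }),
  // Array field matched by similarity (Hungarian algorithm) instead of index position:
  lineItems: unordered({
    description: llmCompare({ provider: LLMProviders.anthropic_claude_haiku }),
    price: within({ tolerance: 5 }),
  }),
};
```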
@@ -235,198 +485,74 @@ function normalizeNumeric(value) {
  if (value == null || value === "") return null;
  const str = String(value);
  const isNegativeParens = /^\(.*\)$/.test(str.trim());
- let cleaned = str.replace(/[^0-9
+ let cleaned = str.replace(/[^0-9.-]/g, "");
  if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
  const num = parseFloat(cleaned);
  return isNaN(num) ? null : num;
  }

  //#endregion
- //#region src/
+ //#region src/eval/comparators/matching.ts
+ function isObject$1(value) {
+ return value !== null && typeof value === "object" && !Array.isArray(value);
+ }
  /**
- *
- *
- *
- *
- * const executor = endpoint('https://api.example.com/workflow', {
- * headers: { Authorization: 'Bearer token' },
- * });
- * ```
+ * Calculate similarity score between two values (0.0 to 1.0).
+ * For arrays: recursively match and average similarity of paired elements.
+ * For objects: average similarity across all fields using comparator results.
+ * For primitives: uses exact comparison's similarity score.
  */
- function
-
-
-
-
-
-
-
-
-
-
- const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- const cost = mapCost?.(data) ?? 0;
- if (mapResponse) return {
- output: mapResponse(data),
- additionalContext,
- cost
- };
- return {
- output: data,
- additionalContext,
- cost
- };
- } catch (error) {
- clearTimeout(timeoutId);
- throw error;
- }
- };
+ async function getSimilarity(expected, actual, comparators) {
+ if (Array.isArray(expected) && Array.isArray(actual)) {
+ if (expected.length === 0 && actual.length === 0) return 1;
+ if (expected.length === 0 || actual.length === 0) return 0;
+ const result = await matchArrays(expected, actual, comparators);
+ let total$1 = 0;
+ for (const [expIdx, actIdx] of result.assignments) total$1 += await getSimilarity(expected[expIdx], actual[actIdx], comparators);
+ const maxLen = Math.max(expected.length, actual.length);
+ return total$1 / maxLen;
+ }
+ if (!isObject$1(expected) || !isObject$1(actual)) {
+ const result = exact(expected, actual);
+ return result.similarity ?? (result.passed ? 1 : 0);
+ }
+ const fields = Object.keys(expected).filter((key) => {
+ const comp = comparators[key];
+ return comp !== void 0 && typeof comp === "function";
+ });
+ if (fields.length === 0) return 1;
+ let total = 0;
+ for (const key of fields) {
+ const comparatorConfig = comparators[key];
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected[key], actual[key], {
+ expectedParent: expected,
+ actualParent: actual
+ });
+ total += result.similarity ?? (result.passed ? 1 : 0);
+ }
+ return total / fields.length;
  }
  /**
- *
- *
- * @example
- * ```ts
- * const executor = fn({
- * fn: async (input, systemPrompt) => {
- * const result = await myLLMCall(input, systemPrompt);
- * return result;
- * },
- * });
- * ```
+ * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
+ * Pure matching - no pass/fail determination.
  *
- * @
- *
- *
- *
- * mapResponse: (result) => ({ documentType: result.documentType }),
- * mapCost: (result) => result.cost,
- * mapAdditionalContext: (result) => result.metadata,
- * });
- * ```
+ * @param expected - Array of expected items
+ * @param actual - Array of actual items
+ * @param comparators - Nested comparator configuration for array items
+ * @returns Matching result with assignments and unmatched indices
  */
- function
-
-
-
-
- additionalContext: config.mapAdditionalContext?.(raw),
- cost: config.mapCost?.(raw) ?? 0
- };
- };
- }
- /**
- * Creates a mock executor for testing.
- * Can accept either:
- * - An array of outputs (returned in sequence, cycling if more calls than outputs)
- * - A function that maps input to output
- *
- * @example Array-based:
- * ```ts
- * const executor = mock([
- * { premium: 12500, policyType: 'claims-made' },
- * { premium: 8200, policyType: 'entity' },
- * ]);
- * ```
- *
- * @example Function-based:
- * ```ts
- * const executor = mock((input) => ({
- * id: input.id,
- * processed: true,
- * }));
- * ```
- */
- function mock(outputsOrFn) {
- if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
- return { output: outputsOrFn(input, systemPrompt) };
- };
- const outputs = outputsOrFn;
- if (outputs.length === 0) throw new Error("mock() requires at least one output");
- let callIndex = 0;
- return async () => {
- const output = outputs[callIndex % outputs.length];
- callIndex++;
- return { output };
- };
- }
-
- //#endregion
- //#region src/matching.ts
- function isObject$1(value) {
- return value !== null && typeof value === "object" && !Array.isArray(value);
- }
- /**
- * Calculate similarity score between two values (0.0 to 1.0).
- * For arrays: recursively match and average similarity of paired elements.
- * For objects: average similarity across all fields using comparator results.
- * For primitives: uses exact comparison's similarity score.
- */
- function getSimilarity(expected, actual, comparators) {
- if (Array.isArray(expected) && Array.isArray(actual)) {
- if (expected.length === 0 && actual.length === 0) return 1;
- if (expected.length === 0 || actual.length === 0) return 0;
- const result = matchArrays(expected, actual, comparators);
- let total$1 = 0;
- for (const [expIdx, actIdx] of result.assignments) total$1 += getSimilarity(expected[expIdx], actual[actIdx], comparators);
- const maxLen = Math.max(expected.length, actual.length);
- return total$1 / maxLen;
- }
- if (!isObject$1(expected) || !isObject$1(actual)) {
- const result = exact(expected, actual);
- return result.similarity ?? (result.passed ? 1 : 0);
- }
- const fields = Object.keys(expected).filter((key) => comparators[key]);
- if (fields.length === 0) return 1;
- let total = 0;
- for (const key of fields) {
- const comparator = comparators[key];
- const result = comparator(expected[key], actual[key], {
- expectedParent: expected,
- actualParent: actual
- });
- total += result.similarity ?? (result.passed ? 1 : 0);
- }
- return total / fields.length;
- }
- /**
- * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
- * Pure matching - no pass/fail determination.
- *
- * @param expected - Array of expected items
- * @param actual - Array of actual items
- * @param comparators - Map of field names to comparator functions
- * @returns Matching result with assignments and unmatched indices
- */
- function matchArrays(expected, actual, comparators = {}) {
- if (expected.length === 0) return {
- assignments: [],
- unmatchedExpected: [],
- unmatchedActual: [...Array(actual.length).keys()]
+ async function matchArrays(expected, actual, comparators = {}) {
+ if (expected.length === 0) return {
+ assignments: [],
+ unmatchedExpected: [],
+ unmatchedActual: [...Array(actual.length).keys()]
  };
  if (actual.length === 0) return {
  assignments: [],
  unmatchedExpected: [...Array(expected.length).keys()],
  unmatchedActual: []
  };
- const rawAssignments = munkres(expected.map((exp) => actual.map((act) => 1 - getSimilarity(exp, act, comparators))));
+ const rawAssignments = munkres(await Promise.all(expected.map(async (exp) => Promise.all(actual.map(async (act) => 1 - await getSimilarity(exp, act, comparators))))));
  const assignments = [];
  const matchedExp = /* @__PURE__ */ new Set();
  const matchedAct = /* @__PURE__ */ new Set();
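The `matchArrays` change above keeps the same matching idea while making similarity scoring async: every expected/actual pair gets a similarity in [0, 1], the cost matrix is `1 - similarity`, and munkres-js returns the minimum-cost assignment. A minimal sketch of that step, with made-up similarity values for illustration:

```ts
// Sketch of the Hungarian-matching step performed by matchArrays above.
import munkres from "munkres-js";

const similarity = [
  [0.9, 0.1],
  [0.2, 0.8],
];
// Convert similarities to costs and find the minimum-cost pairing.
const costMatrix = similarity.map((row) => row.map((s) => 1 - s));
const assignments = munkres(costMatrix); // [[0, 0], [1, 1]]
```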
@@ -443,212 +569,126 @@ function matchArrays(expected, actual, comparators = {}) {
  }

  //#endregion
- //#region src/
+ //#region src/optimizer/ui.ts
  /**
- *
+ * UI utilities for beautiful console output
  */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- else fields = compareFields({
- expected,
- actual: result.output,
- comparators,
- unorderedList: config.unorderedList
- });
- const passedFields = Object.values(fields).filter((f) => f.passed).length;
- const totalFields$1 = Object.values(fields).length;
- const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
- const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
- return {
- input,
- expected,
- actual: result.output,
- additionalContext: result.additionalContext,
- cost: result.cost ?? 0,
- passed: passed$1,
- fields,
- passedFields,
- totalFields: totalFields$1,
- passRate
- };
- } catch (error) {
- return {
- input,
- expected,
- actual: void 0,
- cost: 0,
- passed: false,
- fields: {},
- passedFields: 0,
- totalFields: 0,
- passRate: 0,
- error: error instanceof Error ? error.message : String(error)
- };
+ const theme = {
+ success: chalk.green,
+ error: chalk.red,
+ warning: chalk.yellow,
+ bold: chalk.bold,
+ dim: chalk.dim,
+ check: chalk.green(figures.tick),
+ cross: chalk.red(figures.cross),
+ warn: chalk.yellow(figures.warning),
+ bullet: chalk.dim(figures.bullet),
+ pointer: chalk.yellow(figures.pointer),
+ separator: chalk.dim(" · "),
+ divider: (label, width = 60) => {
+ const prefix = `━━━ ${label} `;
+ const remaining = Math.max(0, width - prefix.length);
+ return chalk.cyan.dim(prefix + "━".repeat(remaining));
+ }
+ };
+ let activeSpinner = null;
+ const spinner = {
+ start(text) {
+ if (activeSpinner) activeSpinner.stop();
+ activeSpinner = ora({
+ text,
+ spinner: "dots",
+ indent: 4
+ }).start();
+ return activeSpinner;
+ },
+ succeed(text) {
+ if (activeSpinner) {
+ activeSpinner.succeed(text);
+ activeSpinner = null;
  }
- }
-
-
-
-
- for (let i = 0; i < testCases.length; i += rateLimitBatch) {
- const batch = testCases.slice(i, i + rateLimitBatch);
- const batchResults = await Promise.all(batch.map(executeTestCase));
- results.push(...batchResults);
- const rateLimitPause = config.rateLimitPause;
- if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
+ },
+ fail(text) {
+ if (activeSpinner) {
+ activeSpinner.fail(text);
+ activeSpinner = null;
  }
- }
-
- if (
-
-
-
-
-
-
-
-
-
- totalFields += fieldResults.length;
- correctFields += fieldResults.filter((f) => f.passed).length;
+ },
+ stop() {
+ if (activeSpinner) {
+ activeSpinner.stop();
+ activeSpinner = null;
+ }
+ },
+ clear() {
+ if (activeSpinner) activeSpinner.clear();
+ },
+ isActive() {
+ return activeSpinner !== null;
  }
-
-
+ };
+ function createProgressTracker(label) {
+ let bar = null;
+ let startTime = 0;
+ let lastUpdate = 0;
+ const MIN_UPDATE_INTERVAL = 100;
  return {
-
-
-
-
-
-
-
-
-
+ start(total) {
+ spinner.stop();
+ startTime = Date.now();
+ bar = new cliProgress.SingleBar({
+ format: ` {bar} {percentage}% {value}/{total} ${label} {duration_formatted}`,
+ barCompleteChar: "█",
+ barIncompleteChar: "░",
+ barsize: 20,
+ hideCursor: true,
+ clearOnComplete: false,
+ stopOnComplete: false,
+ forceRedraw: true,
+ fps: 10
+ });
+ bar.start(total, 0, { duration_formatted: "0s" });
+ },
+ update(current) {
+ const now = Date.now();
+ if (now - lastUpdate < MIN_UPDATE_INTERVAL && bar) {
+ if (current < bar.getTotal()) return;
+ }
+ lastUpdate = now;
+ if (bar) {
+ const elapsed = Math.round((now - startTime) / 1e3);
+ bar.update(current, { duration_formatted: `${elapsed}s` });
+ }
+ },
+ stop() {
+ if (bar) {
+ const elapsed = Math.round((Date.now() - startTime) / 1e3);
+ bar.update(bar.getTotal(), { duration_formatted: `${elapsed}s` });
+ bar.stop();
+ bar = null;
+ }
+ }
  };
  }
-
-
- * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
- */
- function compareFields(opts) {
- const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, unorderedList = false } = opts;
- const results = {};
- const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
- if (Array.isArray(expected)) {
- if (!Array.isArray(actual)) return { [path$1]: {
- passed: false,
- expected,
- actual
- } };
- if (expected.length === 0) return {};
- let matchedPairs;
- if (unorderedList) matchedPairs = matchArrays(expected, actual, comparators).assignments;
- else {
- matchedPairs = [];
- for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
- }
- const matchedIndices = new Set(matchedPairs.map(([i]) => i));
- for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, compareFields({
- expected: expected[expIdx],
- actual: actual[actIdx],
- comparators,
- path: indexPath(expIdx),
- expectedParent,
- actualParent,
- unorderedList
- }));
- const arrayFieldName = getFieldName(path$1);
- const hasArrayComparator = arrayFieldName in comparators || arrayFieldName === "";
- for (let i = 0; i < expected.length; i++) {
- if (matchedIndices.has(i)) continue;
- const item = expected[i];
- if (isObject(item)) {
- for (const [field, value] of Object.entries(item)) if (field in comparators) results[`${indexPath(i)}.${field}`] = {
- passed: false,
- expected: value,
- actual: void 0
- };
- } else if (hasArrayComparator) results[indexPath(i)] = {
- passed: false,
- expected: item,
- actual: void 0
- };
- }
- return results;
- }
- if (isObject(expected)) {
- if (!isObject(actual)) return { [path$1]: {
- passed: false,
- expected,
- actual
- } };
- for (const [field, expValue] of Object.entries(expected)) {
- const fieldPath = path$1 ? `${path$1}.${field}` : field;
- Object.assign(results, compareFields({
- expected: expValue,
- actual: actual[field],
- comparators,
- path: fieldPath,
- expectedParent: expected,
- actualParent: actual,
- unorderedList
- }));
- }
- return results;
- }
- const fieldName = getFieldName(path$1);
- const comparator = comparators[fieldName] ?? (fieldName === "" ? exact : void 0);
- if (!comparator) return {};
- const result = comparator(expected, actual, {
- expectedParent,
- actualParent
- });
- return { [path$1]: {
- ...result,
- expected,
- actual
- } };
+ function formatCost(cost) {
+ return theme.dim(`$${cost.toFixed(4)}`);
  }
- function
- return
+ function formatCostShort(cost) {
+ return theme.dim(`$${cost.toFixed(2)}`);
  }
- function
-
+ function formatDuration(ms) {
+ const totalSeconds = Math.round(ms / 1e3);
+ if (totalSeconds < 60) return `${totalSeconds}s`;
+ const minutes = Math.floor(totalSeconds / 60);
+ const seconds = totalSeconds % 60;
+ return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
+ }
+ function formatPercentage(rate) {
+ return `${(rate * 100).toFixed(1)}%`;
  }

  //#endregion
- //#region src/optimizer-logging.ts
+ //#region src/optimizer/optimizer-logging.ts
  function formatMsCompact(ms) {
  const totalSeconds = Math.round(ms / 1e3);
  if (totalSeconds < 60) return `${totalSeconds}s`;
@@ -666,12 +706,75 @@ function formatTokensCompact(tokens) {
  if (tokens >= 1e3) return `${Math.round(tokens / 1e3)}K`;
  return String(tokens);
  }
+ /**
+ * Clear any active progress line before logging
+ * Call this before all console.log statements
+ */
+ function clearProgressLine() {
+ const width = process.stdout.columns || 80;
+ process.stdout.write("\r" + " ".repeat(width) + "\r");
+ }
+ /**
+ * Create a progress updater using cli-progress for beautiful output
+ */
+ function createProgressUpdater(label) {
+ let tracker = null;
+ let total = 0;
+ return {
+ update(completed, newTotal) {
+ if (!tracker) {
+ total = newTotal;
+ tracker = createProgressTracker(label);
+ tracker.start(total);
+ }
+ tracker.update(completed);
+ },
+ finish() {
+ if (tracker) {
+ tracker.stop();
+ tracker = null;
+ }
+ },
+ clear() {
+ clearProgressLine();
+ }
+ };
+ }
+ /**
+ * Track progress of Promise.allSettled with real-time updates
+ *
+ * @param promises Array of promises to track
+ * @param onProgress Callback called when each promise settles
+ * @returns Promise.allSettled result
+ */
+ async function trackPromiseProgress(promises, onProgress) {
+ if (promises.length === 0) return [];
+ let completed = 0;
+ const total = promises.length;
+ onProgress(0, total);
+ const wrappedPromises = promises.map((promise) => promise.then((value) => {
+ completed++;
+ onProgress(completed, total);
+ return {
+ status: "fulfilled",
+ value
+ };
+ }).catch((reason) => {
+ completed++;
+ onProgress(completed, total);
+ return {
+ status: "rejected",
+ reason
+ };
+ }));
+ return Promise.all(wrappedPromises);
+ }
  function formatFailure(testCase) {
  const lines = [];
  lines.push(`Input: ${JSON.stringify(testCase.input, null, 2)}`);
  lines.push(`Expected: ${JSON.stringify(testCase.expected, null, 2)}`);
  lines.push(`Actual: ${JSON.stringify(testCase.actual, null, 2)}`);
- if (testCase.additionalContext) lines.push(`Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
+ if (testCase.additionalContext) lines.push(`Additional Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
  lines.push("");
  lines.push("Field-level failures:");
  for (const [fieldPath, result] of Object.entries(testCase.fields)) if (!result.passed) lines.push(`  ${fieldPath || "(root)"}: expected ${JSON.stringify(result.expected)}, got ${JSON.stringify(result.actual)}`);
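For context, the `trackPromiseProgress` helper added above wraps each promise so a progress callback fires as results settle, then resolves with allSettled-style records. A hypothetical usage sketch:

```ts
// Sketch only; assumes trackPromiseProgress is in scope as defined in the hunk above.
const tasks: Promise<number>[] = [Promise.resolve(1), Promise.reject(new Error("boom"))];

const settled = await trackPromiseProgress(tasks, (done, total) => {
  console.log(`${done}/${total} settled`); // fires once per settled promise
});
// settled: [{ status: "fulfilled", value: 1 }, { status: "rejected", reason: Error("boom") }]
```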
@@ -695,56 +798,98 @@ function computeTotals(iterations) {
  totalDuration
  };
  }
- function
-
-
-
+ function logOptimizerHeader(model, targetRate, testCount) {
+ spinner.stop();
+ console.log("");
+ console.log(theme.bold("Didactic Optimizer"));
+ console.log(` ${theme.dim("Model:")} ${model}${theme.separator}${theme.dim("Target:")} ${formatPercentage(targetRate)}${theme.separator}${theme.dim("Tests:")} ${testCount}`);
  }
  function logIterationStart(iterationLabel) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(theme.divider(`Iteration ${iterationLabel}`));
+ console.log("");
  }
  function logEvaluationStart() {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.bold("Evaluating prompt")}`);
+ spinner.start("Running evals...");
  }
  function logEvaluationResult(result, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ const successIcon = result.successRate >= .9 ? theme.check : result.successRate >= .5 ? theme.warn : theme.cross;
+ console.log(` ${successIcon} ${theme.bold(formatPercentage(result.successRate))} success rate ${theme.dim(`(${result.passed}/${result.total} passed)`)}`);
+ console.log(` ${theme.dim("Cost:")} ${formatCost(result.cost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logRegressionDetected(bestSuccessRate) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.pointer} ${theme.warning("Regression")} ${theme.dim(`(was ${formatPercentage(bestSuccessRate)})`)}`);
  }
  function logTargetReached(targetSuccessRate) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} ${theme.success("Target reached!")} ${theme.dim(`(${formatPercentage(targetSuccessRate)})`)}`);
  }
  function logTargetFailures(targetSuccessRate, failureCount) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.cross} ${theme.error(`${failureCount} failures`)} to address ${theme.dim(`(target: ${formatPercentage(targetSuccessRate)})`)}`);
  }
  function logCostLimitReached(cumulativeCost) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.warn} ${theme.warning("Cost limit reached")} ${theme.dim(`($${cumulativeCost.toFixed(2)})`)}`);
  }
  function logPatchGenerationStart(failureCount) {
-
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(` ${theme.bold("Generating patches")}`);
+ spinner.start(`Generating ${failureCount} patches in parallel...`);
  }
  function logPatchGenerationResult(patchCost, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} Patches generated${theme.separator}${theme.dim("Cost:")} ${formatCost(patchCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logMergeStart() {
-
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(` ${theme.bold("Merging patches")}`);
+ spinner.start("Merging patches...");
  }
  function logMergeResult(mergeCost, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} Merged${theme.separator}${theme.dim("Cost:")} ${formatCost(mergeCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logPatchGenerationFailures(failedCount, totalCount) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.warn} ${theme.warning(`${failedCount}/${totalCount} patch generations failed`)}`);
  }
  function logOptimizationComplete(bestSuccessRate, targetSuccessRate, cumulativeCost) {
-
-
- console.log(
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(theme.divider("Complete"));
+ console.log("");
+ const targetMet = bestSuccessRate >= targetSuccessRate;
+ const icon = targetMet ? theme.check : theme.cross;
+ const rateColor = targetMet ? theme.success : theme.error;
+ console.log(` ${icon} ${theme.bold("Best:")} ${rateColor(formatPercentage(bestSuccessRate))}`);
+ console.log(` ${theme.dim("Target:")} ${formatPercentage(targetSuccessRate)}${theme.separator}${theme.dim("Total Cost:")} ${formatCostShort(cumulativeCost)}`);
  }
  function logLogsWritten(logPath) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.dim("Logs written to:")} ${logPath}`);
+ console.log("");
  }
  function generateConfigSection(ctx, testCaseCount) {
  const lines = [];
@@ -911,6 +1056,7 @@ function writeRawDataJson(folderPath, iterations, ctx, success) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  fields: tc.fields
  });
  });
@@ -984,6 +1130,7 @@ function writeBestRunJson(folderPath, iterations, ctx) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  failedFields: extractFailedFields(tc.fields)
  });
  else if (tc.passRate < 1) partialFailures.push({
@@ -992,13 +1139,15 @@ function writeBestRunJson(folderPath, iterations, ctx) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  failedFields: extractFailedFields(tc.fields)
  });
  else successes.push({
  testIndex: testIdx,
  input: tc.input,
  expected: tc.expected,
- actual: tc.actual
+ actual: tc.actual,
+ additionalContext: tc.additionalContext
  });
  });
  const report = {
@@ -1035,29 +1184,402 @@ function writeBestRunJson(folderPath, iterations, ctx) {
|
|
|
1035
1184
|
};
|
|
1036
1185
|
fs.writeFileSync(bestRunPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1037
1186
|
}
|
|
1038
|
-
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1039
|
-
const folderPath = path.dirname(logPath);
|
|
1040
|
-
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1041
|
-
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1042
|
-
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1043
|
-
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1044
|
-
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1045
|
-
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1187
|
+
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1188
|
+
const folderPath = path.dirname(logPath);
|
|
1189
|
+
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1190
|
+
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1191
|
+
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1192
|
+
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1193
|
+
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1194
|
+
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
//#endregion
|
|
1198
|
+
//#region src/eval/eval-logging.ts
|
|
1199
|
+
/**
|
|
1200
|
+
* Write evaluation results to rawData.json
|
|
1201
|
+
*
|
|
1202
|
+
* Synchronous writes are intentional - logging runs after evaluation completes
|
|
1203
|
+
* and errors are caught. This avoids async complexity in the calling code.
|
|
1204
|
+
*/
|
|
1205
|
+
function writeEvalLogs(logPath, result, durationMs, perTestThreshold) {
|
|
1206
|
+
try {
|
|
1207
|
+
const dir = path.dirname(logPath);
|
|
1208
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
1209
|
+
const report = {
|
|
1210
|
+
metadata: {
|
|
1211
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1212
|
+
systemPrompt: result.systemPrompt,
|
|
1213
|
+
testCaseCount: result.total,
|
|
1214
|
+
perTestThreshold: perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD
|
|
1215
|
+
},
|
|
1216
|
+
summary: {
|
|
1217
|
+
passed: result.passed,
|
|
1218
|
+
total: result.total,
|
|
1219
|
+
successRate: result.successRate,
|
|
1220
|
+
correctFields: result.correctFields,
|
|
1221
|
+
totalFields: result.totalFields,
|
|
1222
|
+
accuracy: result.accuracy,
|
|
1223
|
+
executorCost: result.cost,
|
|
1224
|
+
comparatorCost: result.comparatorCost,
|
|
1225
|
+
totalCost: result.cost + result.comparatorCost,
|
|
1226
|
+
durationMs
|
|
1227
|
+
},
|
|
1228
|
+
testCases: result.testCases.map((tc, index) => ({
|
|
1229
|
+
index,
|
|
1230
|
+
passed: tc.passed,
|
|
1231
|
+
passRate: tc.passRate,
|
|
1232
|
+
input: tc.input,
|
|
1233
|
+
expected: tc.expected,
|
|
1234
|
+
actual: tc.actual,
|
|
1235
|
+
additionalContext: tc.additionalContext,
|
|
1236
|
+
executorCost: tc.cost ?? 0,
|
|
1237
|
+
comparatorCost: tc.comparatorCost ?? 0,
|
|
1238
|
+
error: tc.error,
|
|
1239
|
+
fields: tc.fields
|
|
1240
|
+
}))
|
|
1241
|
+
};
|
|
1242
|
+
fs.writeFileSync(logPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1243
|
+
} catch (error) {
|
|
1244
|
+
console.error(`Failed to write eval logs to ${logPath}:`, error instanceof Error ? error.message : String(error));
|
|
1245
|
+
}
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
//#endregion
|
|
1249
|
+
//#region src/eval/eval.ts
|
|
1250
|
+
/**
|
|
1251
|
+
* Run all test cases and return results.
|
|
1252
|
+
*/
|
|
1253
|
+
async function evaluate(config) {
|
|
1254
|
+
const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
|
|
1255
|
+
if (testCases.length === 0) throw new Error("testCases array cannot be empty");
|
|
1256
|
+
if (!executor) throw new Error("executor is required");
|
|
1257
|
+
const startTime = Date.now();
|
|
1258
|
+
const logPath = config.storeLogs ? typeof config.storeLogs === "string" ? config.storeLogs : `./didactic-logs/eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}/rawData.json` : void 0;
|
|
1259
|
+
const executeTestCase = async ({ input, expected }) => {
|
|
1260
|
+
try {
|
|
1261
|
+
const result = await executor(input, systemPrompt);
|
|
1262
|
+
let fields;
|
|
1263
|
+
if (comparatorOverride) {
|
|
1264
|
+
const compResult = await comparatorOverride(expected, result.output);
|
|
1265
|
+
fields = { "": {
|
|
1266
|
+
passed: compResult.passed,
|
|
1267
|
+
expected,
|
|
1268
|
+
actual: result.output
|
|
1269
|
+
} };
|
|
1270
|
+
} else {
|
|
1271
|
+
let comparatorConfig;
|
|
1272
|
+
if (!comparators) comparatorConfig = { "": exact };
|
|
1273
|
+
else if (typeof comparators === "function") comparatorConfig = { "": comparators };
|
|
1274
|
+
else comparatorConfig = comparators;
|
|
1275
|
+
fields = await compareFields({
|
|
1276
|
+
expected,
|
|
1277
|
+
actual: result.output,
|
|
1278
|
+
comparators: comparatorConfig,
|
|
1279
|
+
llmConfig: config.llmConfig
|
|
1280
|
+
});
|
|
1281
|
+
}
|
|
1282
|
+
const passedFields = Object.values(fields).filter((f) => f.passed).length;
|
|
1283
|
+
const totalFields$1 = Object.values(fields).length;
|
|
1284
|
+
const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
|
|
1285
|
+
const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
|
|
1286
|
+
const comparatorCost$1 = Object.values(fields).reduce((sum, field) => sum + (field.cost ?? 0), 0);
|
|
1287
|
+
return {
|
|
1288
|
+
input,
|
|
1289
|
+
expected,
|
|
1290
|
+
actual: result.output,
|
|
1291
|
+
additionalContext: result.additionalContext,
|
|
1292
|
+
cost: result.cost ?? 0,
|
|
1293
|
+
comparatorCost: comparatorCost$1,
|
|
1294
|
+
passed: passed$1,
|
|
1295
|
+
fields,
|
|
1296
|
+
passedFields,
|
|
1297
|
+
totalFields: totalFields$1,
|
|
1298
|
+
passRate
|
|
1299
|
+
};
|
|
1300
|
+
} catch (error) {
|
|
1301
|
+
return {
|
|
1302
|
+
input,
|
|
1303
|
+
expected,
|
|
1304
|
+
actual: void 0,
|
|
1305
|
+
cost: 0,
|
|
1306
|
+
comparatorCost: 0,
|
|
1307
|
+
passed: false,
|
|
1308
|
+
fields: {},
|
|
1309
|
+
passedFields: 0,
|
|
1310
|
+
totalFields: 0,
|
|
1311
|
+
passRate: 0,
|
|
1312
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1313
|
+
};
|
|
1314
|
+
}
|
|
1315
|
+
};
|
|
1316
|
+
const rateLimitBatch = config.rateLimitBatch;
|
|
1317
|
+
let results;
|
|
1318
|
+
if (rateLimitBatch && rateLimitBatch > 0) {
|
|
1319
|
+
results = [];
|
|
1320
|
+
const progress = createProgressUpdater("evals");
|
|
1321
|
+
for (let i = 0; i < testCases.length; i += rateLimitBatch) {
|
|
1322
|
+
const batch = testCases.slice(i, i + rateLimitBatch);
|
|
1323
|
+
const batchResults = await Promise.all(batch.map(executeTestCase));
|
|
1324
|
+
results.push(...batchResults);
|
|
1325
|
+
progress.update(results.length, testCases.length);
|
|
1326
|
+
const rateLimitPause = config.rateLimitPause;
|
|
1327
|
+
if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
|
|
1328
|
+
}
|
|
1329
|
+
progress.finish();
|
|
1330
|
+
} else {
|
|
1331
|
+
const progress = createProgressUpdater("evals");
|
|
1332
|
+
results = (await trackPromiseProgress(testCases.map((tc) => executeTestCase(tc)), (completed, total$1) => progress.update(completed, total$1))).map((r) => r.value);
|
|
1333
|
+
progress.finish();
|
|
1334
|
+
}
|
|
1335
|
+
results.sort((a, b) => {
|
|
1336
|
+
if (a.passed !== b.passed) return a.passed ? 1 : -1;
|
|
1337
|
+
return a.passRate - b.passRate;
|
|
1338
|
+
});
|
|
1339
|
+
const passed = results.filter((r) => r.passed).length;
|
|
1340
|
+
const total = results.length;
|
|
1341
|
+
const successRate = total > 0 ? passed / total : 0;
|
|
1342
|
+
let correctFields = 0;
|
|
1343
|
+
let totalFields = 0;
|
|
1344
|
+
for (const r of results) {
|
|
1345
|
+
const fieldResults = Object.values(r.fields);
|
|
1346
|
+
totalFields += fieldResults.length;
|
|
1347
|
+
correctFields += fieldResults.filter((f) => f.passed).length;
|
|
1348
|
+
}
|
|
1349
|
+
const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
|
|
1350
|
+
const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
|
|
1351
|
+
const comparatorCost = results.reduce((sum, r) => sum + (r.comparatorCost ?? 0), 0);
|
|
1352
|
+
const durationMs = Date.now() - startTime;
|
|
1353
|
+
const logFolder = logPath ? path.dirname(logPath) : void 0;
|
|
1354
|
+
const evalResult = {
|
|
1355
|
+
systemPrompt,
|
|
1356
|
+
testCases: results,
|
|
1357
|
+
passed,
|
|
1358
|
+
total,
|
|
1359
|
+
successRate,
|
|
1360
|
+
correctFields,
|
|
1361
|
+
totalFields,
|
|
1362
|
+
accuracy,
|
|
1363
|
+
cost,
|
|
1364
|
+
comparatorCost,
|
|
1365
|
+
...logFolder && { logFolder }
|
|
1366
|
+
};
|
|
1367
|
+
if (logPath) writeEvalLogs(logPath, evalResult, durationMs, config.perTestThreshold);
|
|
1368
|
+
return evalResult;
|
|
1369
|
+
}
|
|
1370
|
+
+/**
+* Recursively compare expected vs actual, returning field-level results.
+* Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
+*/
+async function compareFields(opts) {
+const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, llmConfig } = opts;
+const results = {};
+const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
+if (Array.isArray(expected)) {
+if (!Array.isArray(actual)) return { [path$1]: {
+passed: false,
+expected,
+actual
+} };
+if (expected.length === 0) return {};
+const fieldComparator = comparators[getFieldName(path$1)];
+const isUnordered = fieldComparator && typeof fieldComparator === "function" && "_unordered" in fieldComparator && fieldComparator._unordered === true;
+let itemComparators;
+if (isUnordered) itemComparators = fieldComparator._nestedComparators || comparators;
+else if (fieldComparator && typeof fieldComparator === "object" && !("_unordered" in fieldComparator)) itemComparators = fieldComparator;
+else itemComparators = comparators;
+let matchedPairs;
+if (isUnordered) matchedPairs = (await matchArrays(expected, actual, itemComparators)).assignments;
+else {
+matchedPairs = [];
+for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
+}
+const matchedIndices = new Set(matchedPairs.map(([i]) => i));
+for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, await compareFields({
+expected: expected[expIdx],
+actual: actual[actIdx],
+comparators: itemComparators,
+path: indexPath(expIdx),
+expectedParent,
+actualParent,
+llmConfig
+}));
+const hasArrayComparator = fieldComparator !== void 0;
+for (let i = 0; i < expected.length; i++) {
+if (matchedIndices.has(i)) continue;
+const item = expected[i];
+if (isObject(item)) {
+for (const [field, value] of Object.entries(item)) if (field in itemComparators) results[`${indexPath(i)}.${field}`] = {
+passed: false,
+expected: value,
+actual: void 0
+};
+} else if (hasArrayComparator) results[indexPath(i)] = {
+passed: false,
+expected: item,
+actual: void 0
+};
+}
+return results;
+}
+if (isObject(expected)) {
+if (!isObject(actual)) return { [path$1]: {
+passed: false,
+expected,
+actual
+} };
+for (const [field, expValue] of Object.entries(expected)) {
+const fieldPath = path$1 ? `${path$1}.${field}` : field;
+const fieldConfig = comparators[field];
+if (fieldConfig === void 0) continue;
+let fieldComparators;
+if (fieldConfig && typeof fieldConfig === "object" && !("_unordered" in fieldConfig)) fieldComparators = fieldConfig;
+else fieldComparators = comparators;
+Object.assign(results, await compareFields({
+expected: expValue,
+actual: actual[field],
+comparators: fieldComparators,
+path: fieldPath,
+expectedParent: expected,
+actualParent: actual,
+llmConfig
+}));
+}
+return results;
+}
+const fieldName = getFieldName(path$1);
+let comparatorConfig = comparators[fieldName];
+if (!comparatorConfig && fieldName === "") comparatorConfig = exact;
+if (!comparatorConfig) return {};
+const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected, actual, {
+expectedParent,
+actualParent,
+llmConfig
+});
+return { [path$1]: {
+...result,
+expected,
+actual
+} };
+}
+function isObject(value) {
+return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+function getFieldName(path$1) {
+return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
+}
+
+//#endregion
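As the doc comment above notes, field results are keyed by paths such as `quote.premium` and `quotes[0].carrier`, and the comparator looked up for an array or leaf is the one registered under the last path segment. A short sketch of how a nested comparator map produces those paths; `compareFields` itself is internal to the bundle, and the `unordered(...)` call shape shown here is an assumption based on the `_unordered`/`_nestedComparators` checks above:

```ts
import { exact, unordered } from "@docshield/didactic";

// The object under "quotes" is applied to each array item, so failures surface under
// keys like "quotes[0].carrier" or "quotes[1].premium".
const comparators = {
  carrier: exact,
  quotes: {
    carrier: exact,
    premium: exact,
  },
};

// Wrapping the per-item config with unordered() (assumed call shape) asks the library to
// match expected and actual array items by best assignment instead of by index.
const unorderedComparators = {
  quotes: unordered({ carrier: exact, premium: exact }),
};
```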
+//#region src/optimizer/prompts.ts
+/**
+* Default system prompt for patch generation.
+* Analyzes failures and suggests specific, focused changes to improve the prompt.
+*/
+const DEFAULT_PATCH_SYSTEM_PROMPT = `
+'You are optimizing a system prompt for an LLM workflow.
+Analyze the failure and suggest a specific, focused change to improve the prompt.
+Do NOT overfit. Be generalizable.
+
+<examples>
+VERY IMPORTANT, CRITICAL!!!
+Examples MUST be anonymized.
+NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
+- example: (for an invoice processor)
+- task: extract data from parsed invoices
+- failure context: (returned expected: true, actual: false)
+- prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
+
+- example: (for a calendar app)
+- task: extract cost from calendar event
+- failure context: (cost expected: 123.45, actual: 167.89)
+- prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to achieve the expected output.)
+</examples>
+`;
+/**
+* Default system prompt for merging patches.
+* Combines multiple patches into a coherent system prompt.
+*/
+const DEFAULT_MERGE_SYSTEM_PROMPT = `
+You are an expert LLM prompt editor.
+You are merging improvements into a system prompt.
+Incorporate the suggestions while keeping the prompt clear and coherent.
+`;
+/**
+* Builds the user prompt for patch generation.
+* Formats the failure context and current prompt for the LLM.
+*/
+function buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures) {
+let userContent = `
+Current system prompt:
+---
+${currentPrompt}
+---
+
+A test case failed:
+${formatFailure(failure)}
+`;
+if (previousBetterPrompt) {
+const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
+userContent += `
+Note: The current prompt is a REGRESSION from a better-performing version.
+Previous (better) prompt for reference:
+---
+${previousBetterPrompt}
+---
+
+The failures the better prompt had:
+${failuresContext}
+
+Your changes introduced new failures instead of fixing the above.
+Analyze what changed between the two prompts that might have caused this regression.
+Are there any new failures that were not present in the previous better prompt?
+Are there any failures that were present in the previous better prompt but not in the current prompt?
+Did any of our patches contradict any of the new failures?
+`;
+}
+userContent += `
+Suggest a specific change to the system prompt that would fix this failure.
+Be concise. Output ONLY the suggested patch/change, not the full prompt.
+DO NOT overfit the prompt to the test case.
+Generalize examples if you choose to use them.
+`;
+return userContent;
+}
+/**
+* Builds the user prompt for merging patches.
+* Formats the current prompt and suggested patches for the LLM.
+*/
+function buildMergeUserPrompt(patches, currentPrompt) {
+return `
+Current prompt:
+---
+${currentPrompt}
+---
+
+Suggested improvements:
+${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
+
+Create a single improved system prompt that incorporates these suggestions.
+Be mindful of the size of the new prompt.
+Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
+Output ONLY the new system prompt, nothing else.
+Respect enums.
+`;
 }
 
 //#endregion
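The prompt templates that used to be inlined in the optimizer now live in this module as `DEFAULT_PATCH_SYSTEM_PROMPT` and `DEFAULT_MERGE_SYSTEM_PROMPT`, and further down in this diff `generatePatch` and `mergePatches` fall back to them via `config.patchSystemPrompt ?? …` and `config.mergeSystemPrompt ?? …`. A hedged sketch of overriding them through the optimizer config; the `evalConfig` contents are abbreviated and only the option names visible in this diff are assumed:

```ts
import { optimize, LLMProviders } from "@docshield/didactic";

// evalConfig is whatever evaluation config you already use (testCases, etc.).
declare const evalConfig: Parameters<typeof optimize>[0];

const run = await optimize(evalConfig, {
  provider: LLMProviders.anthropic_claude_opus,
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  targetSuccessRate: 0.9,
  maxIterations: 5,
  // Optional overrides; omit these to use DEFAULT_PATCH_SYSTEM_PROMPT / DEFAULT_MERGE_SYSTEM_PROMPT.
  patchSystemPrompt: "You tune extraction prompts. Suggest one focused, general change.",
  mergeSystemPrompt: "Merge the suggested edits into one coherent system prompt.",
});
```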
-//#region src/optimizer.ts
+//#region src/optimizer/optimizer.ts
 async function optimize(evalConfig, config) {
 if (!config.apiKey) throw new Error("apiKey is required");
-if (!config.systemPrompt) throw new Error("systemPrompt is required");
 if (config.targetSuccessRate < 0 || config.targetSuccessRate > 1) throw new Error("targetSuccessRate must be between 0 and 1");
 const iterationLogs = [];
 const maxIterations = config.maxIterations ?? (config.maxCost !== void 0 ? Infinity : 5);
 const startTime = /* @__PURE__ */ new Date();
+const model = PROVIDER_SPECS[config.provider].model;
 const logContext = {
 config,
 startTime,
-model
+model,
 perTestThreshold: evalConfig.perTestThreshold,
 rateLimitBatch: evalConfig.rateLimitBatch,
 rateLimitPause: evalConfig.rateLimitPause
|
|
|
1114
1636
|
totalCost: cumulativeCost
|
|
1115
1637
|
};
|
|
1116
1638
|
};
|
|
1639
|
+
const testCount = evalConfig.testCases?.length ?? 0;
|
|
1640
|
+
logOptimizerHeader(model, config.targetSuccessRate, testCount);
|
|
1117
1641
|
for (let i = 1; i <= maxIterations; i++) {
|
|
1118
1642
|
const iterationStart = Date.now();
|
|
1119
1643
|
let iterInputTokens = 0;
|
|
@@ -1127,7 +1651,7 @@ async function optimize(evalConfig, config) {
 });
 cumulativeCost += result.cost;
 logEvaluationResult(result, cumulativeCost, Date.now() - evalStart);
-const regressed = i > 1 && result.successRate
+const regressed = i > 1 && result.successRate <= bestSuccessRate;
 if (regressed) logRegressionDetected(bestSuccessRate);
 if (result.successRate > bestSuccessRate) {
 bestSuccessRate = result.successRate;
@@ -1140,10 +1664,6 @@ async function optimize(evalConfig, config) {
 return finalizeOptimization(true, currentPrompt);
 }
 const failures = result.testCases.filter((tc) => !tc.passed);
-if (failures.length === 0) {
-recordIteration(i, currentPrompt, result, result.cost, Date.now() - iterationStart, iterInputTokens, iterOutputTokens);
-return finalizeOptimization(true, currentPrompt);
-}
 logTargetFailures(config.targetSuccessRate, failures.length);
 if (config.maxCost !== void 0 && cumulativeCost >= config.maxCost) {
 logCostLimitReached(cumulativeCost);
@@ -1152,7 +1672,9 @@ async function optimize(evalConfig, config) {
 }
 logPatchGenerationStart(failures.length);
 const patchStart = Date.now();
-const
+const patchProgress = createProgressUpdater("patches");
+const patchSettled = await trackPromiseProgress(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)), (completed, total) => patchProgress.update(completed, total));
+patchProgress.finish();
 const patchResults = patchSettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
 const failedPatchCount = patchSettled.filter((r) => r.status === "rejected").length;
 if (failedPatchCount > 0) logPatchGenerationFailures(failedPatchCount, failures.length);
@@ -1192,154 +1714,165 @@ async function optimize(evalConfig, config) {
 }
 return finalizeOptimization(false, bestPrompt);
 }
-async function callLLM(messages, config, useThinking = false) {
-const spec = PROVIDER_SPECS[config.provider];
-try {
-if (config.provider.startsWith("anthropic")) {
-const client = new Anthropic({ apiKey: config.apiKey });
-const streamOptions = {
-model: spec.model,
-max_tokens: spec.maxTokens,
-system: messages.find((m) => m.role === "system")?.content,
-messages: messages.filter((m) => m.role !== "system").map((m) => ({
-role: m.role,
-content: m.content
-}))
-};
-if (useThinking) streamOptions.thinking = {
-type: "enabled",
-budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
-};
-const finalMessage = await client.messages.stream(streamOptions).finalMessage();
-const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
-const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
-const inputTokens = finalMessage.usage.input_tokens;
-const outputTokens = finalMessage.usage.output_tokens;
-return {
-text,
-cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
-inputTokens,
-outputTokens
-};
-}
-if (config.provider.startsWith("openai")) {
-const client = new OpenAI({ apiKey: config.apiKey });
-const completionOptions = {
-model: spec.model,
-messages: messages.map((m) => ({
-role: m.role,
-content: m.content
-})),
-max_completion_tokens: spec.maxTokens
-};
-if (useThinking) completionOptions.reasoning_effort = "xhigh";
-const response = await client.chat.completions.create(completionOptions);
-const text = response.choices[0].message.content ?? "";
-const inputTokens = response.usage?.prompt_tokens ?? 0;
-const outputTokens = response.usage?.completion_tokens ?? 0;
-return {
-text,
-cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
-inputTokens,
-outputTokens
-};
-}
-throw new Error(`Unsupported provider: ${config.provider}`);
-} catch (error) {
-const message = error instanceof Error ? error.message : String(error);
-throw new Error(`LLM call failed (${spec.model}): ${message}`);
-}
-}
 async function generatePatch(failure, currentPrompt, config, previousBetterPrompt, previousBetterPromptFailures) {
-
-
----
-${currentPrompt}
----
-
-A test case failed:
-${formatFailure(failure)}
-`;
-if (previousBetterPrompt) {
-const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
-userContent += `
-Note: The current prompt is a REGRESSION from a better-performing version.
-Previous (better) prompt for reference:
----
-${previousBetterPrompt}
----
-
-The failures the better prompt had:
-${failuresContext}
-
-Your changes introduced new failures instead of fixing the above.
-Analyze what changed between the two prompts that might have caused this regression.
-Are there any new failures that were not present in the previous better prompt?
-Are there any failures that were present in the previous better prompt but not in the current prompt?
-Did any of our patches contradict any of the new failures?
-`;
-}
-userContent += `
-Suggest a specific change to the system prompt that would fix this failure.
-Be concise. Output ONLY the suggested patch/change, not the full prompt.
-DO NOT overfit the prompt to the test case.
-Generalize examples if you choose to use them.
-`;
-return callLLM([{
+const userContent = buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures);
+const messages = [{
 role: "system",
-content:
-'You are optimizing a system prompt for an LLM workflow.
-Analyze the failure and suggest a specific, focused change to improve the prompt.
-Do NOT overfit. Be generalizable.
-
-<examples>
-VERY IMPORTANT, CRITICAL!!!
-Examples MUST be anonymized.
-NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
-- example: (for an invoice processor)
-- task: extract data from parsed invoices
-- failure context: (returned expected: true, actual: false)
-- prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
-
-- example: (for a calendar app)
-- task: extract cost from calendar event
-- failure context: (cost expected: 123.45, actual: 167.89)
-- prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
-</examples>
-`
+content: config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT
 }, {
 role: "user",
 content: userContent
-}]
+}];
+return callLLM({
+provider: config.provider,
+apiKey: config.apiKey,
+messages,
+useThinking: config.thinking ?? false
+});
 }
 async function mergePatches(patches, currentPrompt, config) {
-const systemContent =
-
-
-Incorporate the suggestions while keeping the prompt clear and coherent.
-`;
-const userContent = `
-Current prompt:
----
-${currentPrompt}
----
-
-Suggested improvements:
-${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
-
-Create a single improved system prompt that incorporates these suggestions.
-Be mindful of the size of the new prompt.
-Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
-Output ONLY the new system prompt, nothing else.
-Respect enums.
-`;
-return callLLM([{
+const systemContent = config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT;
+const userContent = buildMergeUserPrompt(patches, currentPrompt);
+const messages = [{
 role: "system",
 content: systemContent
 }, {
 role: "user",
 content: userContent
-}]
+}];
+return callLLM({
+provider: config.provider,
+apiKey: config.apiKey,
+messages,
+useThinking: config.thinking ?? false
+});
+}
+
+//#endregion
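The optimizer's private positional `callLLM(messages, config, useThinking)` is gone; both `generatePatch` and `mergePatches` now delegate to the shared client, which takes a single config object and drives either cost- or iteration-bounded runs from `optimize`. A budget-bounded sketch, grounded in the `maxCost`/`maxIterations` handling visible above (the `evalConfig` contents are abbreviated):

```ts
import { optimize, LLMProviders } from "@docshield/didactic";

declare const evalConfig: Parameters<typeof optimize>[0]; // your evaluation config (testCases, …)

// When maxCost is set and maxIterations is omitted, the loop above defaults
// maxIterations to Infinity and stops once cumulative cost reaches the cap.
const run = await optimize(evalConfig, {
  provider: LLMProviders.anthropic_claude_haiku,
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  targetSuccessRate: 0.95,
  maxCost: 2.5,   // same units as the cost fields reported by the evaluator
  thinking: true, // forwarded to the shared client as useThinking
});
```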
+//#region src/eval/executors.ts
+/**
+* Creates an executor that calls an HTTP endpoint.
+*
+* @example
+* ```ts
+* const executor = endpoint('https://api.example.com/workflow', {
+* headers: { Authorization: 'Bearer token' },
+* });
+* ```
+*/
+function endpoint(url, config = {}) {
+const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
+return async (input, systemPrompt) => {
+const body = typeof input === "object" && input !== null ? {
+...input,
+systemPrompt
+} : {
+input,
+systemPrompt
+};
+const controller = new AbortController();
+const timeoutId = setTimeout(() => controller.abort(), timeout);
+try {
+const response = await fetch(url, {
+method,
+headers: {
+"Content-Type": "application/json",
+...headers
+},
+body: JSON.stringify(body),
+signal: controller.signal
+});
+clearTimeout(timeoutId);
+if (!response.ok) {
+const text = await response.text();
+throw new Error(`HTTP ${response.status}: ${text}`);
+}
+const data = await response.json();
+const additionalContext = mapAdditionalContext?.(data);
+const cost = mapCost?.(data) ?? 0;
+if (mapResponse) return {
+output: mapResponse(data),
+additionalContext,
+cost
+};
+return {
+output: data,
+additionalContext,
+cost
+};
+} catch (error) {
+clearTimeout(timeoutId);
+throw error;
+}
+};
+}
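`endpoint()` posts the test input merged with the system prompt as JSON, aborts the request via `AbortController` after `timeout` ms, and lets the mapper callbacks turn the raw response into the executor's output, cost, and additional context. A fuller sketch than the JSDoc example above; the response fields read inside the mappers (`data.result`, `data.usdCost`, `data.trace`) are hypothetical, not part of the library:

```ts
import { endpoint } from "@docshield/didactic";

const executor = endpoint("https://api.example.com/workflow", {
  method: "POST",                                   // default
  headers: { Authorization: `Bearer ${process.env.API_TOKEN}` },
  timeout: 60_000,                                  // ms before the request is aborted
  mapResponse: (data: any) => data.result,          // becomes the executor's output
  mapCost: (data: any) => data.usdCost ?? 0,        // reported as the executor's cost
  mapAdditionalContext: (data: any) => data.trace,  // surfaced alongside the result
});
```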
+/**
+* Creates an executor from a local function.
+*
+* @example
+* ```ts
+* const executor = fn({
+* fn: async (input, systemPrompt) => {
+* const result = await myLLMCall(input, systemPrompt);
+* return result;
+* },
+* });
+* ```
+*
+* @example With mapResponse to extract output from a richer response:
+* ```ts
+* const executor = fn({
+* fn: async (input, systemPrompt) => await startWorkflow({ ... }),
+* mapResponse: (result) => ({ documentType: result.documentType }),
+* mapCost: (result) => result.cost,
+* mapAdditionalContext: (result) => result.metadata,
+* });
+* ```
+*/
+function fn(config) {
+return async (input, systemPrompt) => {
+const raw = await config.fn(input, systemPrompt);
+return {
+output: config.mapResponse ? config.mapResponse(raw) : raw,
+additionalContext: config.mapAdditionalContext?.(raw),
+cost: config.mapCost?.(raw) ?? 0
+};
+};
+}
+/**
+* Creates a mock executor for testing.
+* Can accept either:
+* - An array of outputs (returned in sequence, cycling if more calls than outputs)
+* - A function that maps input to output
+*
+* @example Array-based:
+* ```ts
+* const executor = mock([
+* { premium: 12500, policyType: 'claims-made' },
+* { premium: 8200, policyType: 'entity' },
+* ]);
+* ```
+*
+* @example Function-based:
+* ```ts
+* const executor = mock((input) => ({
+* id: input.id,
+* processed: true,
+* }));
+* ```
+*/
+function mock(outputsOrFn) {
+if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
+return { output: outputsOrFn(input, systemPrompt) };
+};
+const outputs = outputsOrFn;
+if (outputs.length === 0) throw new Error("mock() requires at least one output");
+let callIndex = 0;
+return async () => {
+const output = outputs[callIndex % outputs.length];
+callIndex++;
+return { output };
+};
 }
 
 //#endregion
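The array form of `mock` cycles through its fixtures (`outputs[callIndex % outputs.length]`), so a fixture list can be shorter than the number of test cases, while the function form receives `(input, systemPrompt)` per call. A quick sketch of the cycling behaviour:

```ts
import { mock } from "@docshield/didactic";

// Two canned outputs, three calls: the third call wraps around to the first output.
const executor = mock([
  { carrier: "Carrier A" },
  { carrier: "Carrier B" },
]);

const first = await executor({ id: 1 }, "system prompt");  // { output: { carrier: "Carrier A" } }
const second = await executor({ id: 2 }, "system prompt"); // { output: { carrier: "Carrier B" } }
const third = await executor({ id: 3 }, "system prompt");  // { output: { carrier: "Carrier A" } } again
```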
@@ -1392,5 +1925,5 @@ const didactic = {
 var src_default = didactic;
 
 //#endregion
-export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, mock, name, numeric, oneOf, optimize, presence, within };
+export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, llmCompare, mock, name, numeric, oneOf, optimize, presence, unordered, within };
 //# sourceMappingURL=index.mjs.map
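The export list gains `llmCompare` and `unordered`, so both helpers are importable from the package root alongside the existing comparators. Their exact signatures are not shown in this bundle diff, so only the import is sketched here:

```ts
// New in this release; see the README in this version range for the argument shapes.
import { llmCompare, unordered } from "@docshield/didactic";
```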