@docshield/didactic 0.1.1 → 0.1.4
This diff shows the content of publicly released package versions as published to their respective public registries. It is provided for informational purposes only.
- package/README.md +333 -228
- package/dist/index.cjs +1090 -550
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +134 -65
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +134 -65
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1085 -552
- package/dist/index.mjs.map +1 -1
- package/package.json +20 -3
package/dist/index.cjs
CHANGED
|
@@ -26,25 +26,36 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
26
26
|
}) : target, mod));
|
|
27
27
|
|
|
28
28
|
//#endregion
|
|
29
|
-
let chrono_node = require("chrono-node");
|
|
30
|
-
chrono_node = __toESM(chrono_node);
|
|
31
|
-
let date_fns = require("date-fns");
|
|
32
|
-
let levenshtein = require("levenshtein");
|
|
33
|
-
levenshtein = __toESM(levenshtein);
|
|
34
29
|
let munkres_js = require("munkres-js");
|
|
35
30
|
munkres_js = __toESM(munkres_js);
|
|
36
31
|
let _anthropic_ai_sdk = require("@anthropic-ai/sdk");
|
|
37
32
|
_anthropic_ai_sdk = __toESM(_anthropic_ai_sdk);
|
|
38
33
|
let openai = require("openai");
|
|
39
34
|
openai = __toESM(openai);
|
|
40
|
-
let
|
|
41
|
-
|
|
35
|
+
let chrono_node = require("chrono-node");
|
|
36
|
+
chrono_node = __toESM(chrono_node);
|
|
37
|
+
let date_fns = require("date-fns");
|
|
38
|
+
let levenshtein = require("levenshtein");
|
|
39
|
+
levenshtein = __toESM(levenshtein);
|
|
42
40
|
let fs = require("fs");
|
|
43
41
|
fs = __toESM(fs);
|
|
42
|
+
let path = require("path");
|
|
43
|
+
path = __toESM(path);
|
|
44
|
+
let chalk = require("chalk");
|
|
45
|
+
chalk = __toESM(chalk);
|
|
46
|
+
let ora = require("ora");
|
|
47
|
+
ora = __toESM(ora);
|
|
48
|
+
let cli_progress = require("cli-progress");
|
|
49
|
+
cli_progress = __toESM(cli_progress);
|
|
50
|
+
let figures = require("figures");
|
|
51
|
+
figures = __toESM(figures);
|
|
52
|
+
let crypto = require("crypto");
|
|
53
|
+
crypto = __toESM(crypto);
|
|
44
54
|
|
|
45
55
|
//#region src/types.ts
|
|
46
56
|
/**
|
|
47
57
|
* Supported LLM providers.
|
|
58
|
+
* Used by both optimizer and LLM-based comparators.
|
|
48
59
|
*/
|
|
49
60
|
let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
|
|
50
61
|
LLMProviders$1["anthropic_claude_opus"] = "anthropic_claude_opus";
|
|
@@ -56,7 +67,7 @@ let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
|
|
|
56
67
|
}({});
|
|
57
68
|
|
|
58
69
|
//#endregion
|
|
59
|
-
//#region src/constants.ts
|
|
70
|
+
//#region src/library/constants.ts
|
|
60
71
|
const PROVIDER_SPECS = {
|
|
61
72
|
[LLMProviders.anthropic_claude_opus]: {
|
|
62
73
|
model: "claude-opus-4-5-20251101",
|
|
@@ -71,7 +82,7 @@ const PROVIDER_SPECS = {
|
|
|
71
82
|
costPerMillionOutput: 15
|
|
72
83
|
},
|
|
73
84
|
[LLMProviders.anthropic_claude_haiku]: {
|
|
74
|
-
model: "claude-haiku-4-5-
|
|
85
|
+
model: "claude-haiku-4-5-20251001",
|
|
75
86
|
maxTokens: 64e3,
|
|
76
87
|
costPerMillionInput: 1,
|
|
77
88
|
costPerMillionOutput: 5
|
|
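The per-call pricing used later in this file is derived directly from these spec fields. A minimal sketch of that arithmetic, assuming `TOKENS_PER_MILLION` is 1e6 (the constant itself is defined outside this hunk) and using the haiku figures shown above:

```ts
// Sketch only: mirrors the cost formula used by callLLM/callStructuredLLM below.
// The haiku spec values come from this hunk; TOKENS_PER_MILLION = 1e6 is an assumption.
const TOKENS_PER_MILLION = 1e6;

interface ProviderSpec {
  model: string;
  maxTokens: number;
  costPerMillionInput: number;
  costPerMillionOutput: number;
}

const haikuSpec: ProviderSpec = {
  model: "claude-haiku-4-5-20251001",
  maxTokens: 64e3,
  costPerMillionInput: 1,
  costPerMillionOutput: 5,
};

function callCost(spec: ProviderSpec, inputTokens: number, outputTokens: number): number {
  return (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION;
}

// e.g. 2,000 input tokens and 500 output tokens => $0.0045
console.log(callCost(haikuSpec, 2000, 500));
```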
@@ -96,7 +107,154 @@ const DEFAULT_PER_TEST_THRESHOLD = 1;
|
|
|
96
107
|
const NAME_SUFFIXES = /(?<=\S)\s*,?\s*(inc\.?|llc\.?|ltd\.?|l\.l\.c\.?|corp\.?|corporation|company|co\.?)$/i;
|
|
97
108
|
|
|
98
109
|
//#endregion
|
|
99
|
-
//#region src/
|
|
110
|
+
//#region src/library/llm/llm-client.ts
|
|
111
|
+
/**
|
|
112
|
+
* Call an LLM provider with the given messages.
|
|
113
|
+
* Returns raw text output - caller is responsible for parsing if structured output is needed.
|
|
114
|
+
*/
|
|
115
|
+
async function callLLM(config) {
|
|
116
|
+
const { provider, apiKey, messages, useThinking = false } = config;
|
|
117
|
+
const spec = PROVIDER_SPECS[provider];
|
|
118
|
+
try {
|
|
119
|
+
if (provider.startsWith("anthropic")) {
|
|
120
|
+
const client = new _anthropic_ai_sdk.default({ apiKey });
|
|
121
|
+
const streamOptions = {
|
|
122
|
+
model: spec.model,
|
|
123
|
+
max_tokens: spec.maxTokens,
|
|
124
|
+
system: messages.find((m) => m.role === "system")?.content,
|
|
125
|
+
messages: messages.filter((m) => m.role !== "system").map((m) => ({
|
|
126
|
+
role: m.role,
|
|
127
|
+
content: m.content
|
|
128
|
+
}))
|
|
129
|
+
};
|
|
130
|
+
if (useThinking) streamOptions.thinking = {
|
|
131
|
+
type: "enabled",
|
|
132
|
+
budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
|
|
133
|
+
};
|
|
134
|
+
const finalMessage = await client.messages.stream(streamOptions).finalMessage();
|
|
135
|
+
const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
|
|
136
|
+
const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
|
|
137
|
+
const inputTokens = finalMessage.usage.input_tokens;
|
|
138
|
+
const outputTokens = finalMessage.usage.output_tokens;
|
|
139
|
+
return {
|
|
140
|
+
text,
|
|
141
|
+
cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
|
|
142
|
+
inputTokens,
|
|
143
|
+
outputTokens
|
|
144
|
+
};
|
|
145
|
+
}
|
|
146
|
+
if (provider.startsWith("openai")) {
|
|
147
|
+
const client = new openai.default({ apiKey });
|
|
148
|
+
const completionOptions = {
|
|
149
|
+
model: spec.model,
|
|
150
|
+
messages: messages.map((m) => ({
|
|
151
|
+
role: m.role,
|
|
152
|
+
content: m.content
|
|
153
|
+
})),
|
|
154
|
+
max_completion_tokens: spec.maxTokens
|
|
155
|
+
};
|
|
156
|
+
if (useThinking) completionOptions.reasoning_effort = "xhigh";
|
|
157
|
+
const response = await client.chat.completions.create(completionOptions);
|
|
158
|
+
const text = response.choices[0].message.content ?? "";
|
|
159
|
+
const inputTokens = response.usage?.prompt_tokens ?? 0;
|
|
160
|
+
const outputTokens = response.usage?.completion_tokens ?? 0;
|
|
161
|
+
return {
|
|
162
|
+
text,
|
|
163
|
+
cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
|
|
164
|
+
inputTokens,
|
|
165
|
+
outputTokens
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
throw new Error(`Unsupported provider: ${provider}`);
|
|
169
|
+
} catch (error) {
|
|
170
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
171
|
+
throw new Error(`LLM call failed (${spec.model}): ${message}`);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
/**
|
|
175
|
+
* Call an LLM provider with structured output.
|
|
176
|
+
* Returns parsed JSON data conforming to the provided schema.
|
|
177
|
+
*/
|
|
178
|
+
async function callStructuredLLM(config) {
|
|
179
|
+
const { provider, apiKey, messages, schema, useThinking = false } = config;
|
|
180
|
+
const spec = PROVIDER_SPECS[provider];
|
|
181
|
+
try {
|
|
182
|
+
if (provider.startsWith("anthropic")) {
|
|
183
|
+
const client = new _anthropic_ai_sdk.default({ apiKey });
|
|
184
|
+
const baseOptions = {
|
|
185
|
+
model: spec.model,
|
|
186
|
+
max_tokens: spec.maxTokens,
|
|
187
|
+
betas: ["structured-outputs-2025-11-13"],
|
|
188
|
+
system: messages.find((m) => m.role === "system")?.content,
|
|
189
|
+
messages: messages.filter((m) => m.role !== "system").map((m) => ({
|
|
190
|
+
role: m.role,
|
|
191
|
+
content: m.content
|
|
192
|
+
})),
|
|
193
|
+
output_format: {
|
|
194
|
+
type: "json_schema",
|
|
195
|
+
schema
|
|
196
|
+
}
|
|
197
|
+
};
|
|
198
|
+
const streamOptions = useThinking ? {
|
|
199
|
+
...baseOptions,
|
|
200
|
+
thinking: {
|
|
201
|
+
type: "enabled",
|
|
202
|
+
budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
|
|
203
|
+
}
|
|
204
|
+
} : baseOptions;
|
|
205
|
+
const finalMessage = await client.beta.messages.stream(streamOptions).finalMessage();
|
|
206
|
+
const content = finalMessage.content[0];
|
|
207
|
+
if (content.type !== "text") throw new Error("Unexpected response type from LLM");
|
|
208
|
+
const data = JSON.parse(content.text);
|
|
209
|
+
const inputTokens = finalMessage.usage.input_tokens;
|
|
210
|
+
const outputTokens = finalMessage.usage.output_tokens;
|
|
211
|
+
return {
|
|
212
|
+
data,
|
|
213
|
+
cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
|
|
214
|
+
inputTokens,
|
|
215
|
+
outputTokens
|
|
216
|
+
};
|
|
217
|
+
}
|
|
218
|
+
if (provider.startsWith("openai")) {
|
|
219
|
+
const client = new openai.default({ apiKey });
|
|
220
|
+
const completionOptions = {
|
|
221
|
+
model: spec.model,
|
|
222
|
+
messages: messages.map((m) => ({
|
|
223
|
+
role: m.role,
|
|
224
|
+
content: m.content
|
|
225
|
+
})),
|
|
226
|
+
max_completion_tokens: spec.maxTokens,
|
|
227
|
+
response_format: {
|
|
228
|
+
type: "json_schema",
|
|
229
|
+
json_schema: {
|
|
230
|
+
name: "response",
|
|
231
|
+
strict: true,
|
|
232
|
+
schema
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
};
|
|
236
|
+
if (useThinking) completionOptions.reasoning_effort = "xhigh";
|
|
237
|
+
const response = await client.chat.completions.create(completionOptions);
|
|
238
|
+
const text = response.choices[0].message.content ?? "";
|
|
239
|
+
const data = JSON.parse(text);
|
|
240
|
+
const inputTokens = response.usage?.prompt_tokens ?? 0;
|
|
241
|
+
const outputTokens = response.usage?.completion_tokens ?? 0;
|
|
242
|
+
return {
|
|
243
|
+
data,
|
|
244
|
+
cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
|
|
245
|
+
inputTokens,
|
|
246
|
+
outputTokens
|
|
247
|
+
};
|
|
248
|
+
}
|
|
249
|
+
throw new Error(`Unsupported provider: ${provider}`);
|
|
250
|
+
} catch (error) {
|
|
251
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
252
|
+
throw new Error(`Structured LLM call failed (${spec.model}): ${message}`);
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
//#endregion
|
|
257
|
+
//#region src/eval/comparators/comparators.ts
|
|
100
258
|
/** Checks if actual string contains a substring. */
|
|
101
259
|
function contains(substring) {
|
|
102
260
|
return (_expected, actual) => {
|
|
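The hunk above adds `callLLM` and `callStructuredLLM`. A hedged sketch of a structured call follows; it assumes `callStructuredLLM` and `LLMProviders` are importable from the package (the bundle's export list is not part of this hunk), and the config shape simply mirrors the destructuring inside the function:

```ts
// Illustrative only: assumes these identifiers are exported by the package.
import { callStructuredLLM, LLMProviders } from "@docshield/didactic";

const result = await callStructuredLLM({
  provider: LLMProviders.anthropic_claude_haiku,
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  messages: [
    { role: "system", content: "Extract the invoice total as JSON." },
    { role: "user", content: "Invoice #123. Total due: $1,250.00" },
  ],
  schema: {
    type: "object",
    properties: { total: { type: "number" } },
    required: ["total"],
    additionalProperties: false,
  },
});

// result.data is the parsed JSON object; cost/inputTokens/outputTokens track usage.
console.log(result.data, result.cost);
```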
@@ -233,6 +391,103 @@ function within(config) {
|
|
|
233
391
|
};
|
|
234
392
|
};
|
|
235
393
|
}
|
|
394
|
+
/** Schema for LLM comparison response. */
|
|
395
|
+
const LLM_COMPARE_SCHEMA = {
|
|
396
|
+
type: "object",
|
|
397
|
+
properties: {
|
|
398
|
+
passed: {
|
|
399
|
+
type: "boolean",
|
|
400
|
+
description: "Whether the actual value matches the expected value"
|
|
401
|
+
},
|
|
402
|
+
rationale: {
|
|
403
|
+
type: "string",
|
|
404
|
+
description: "Brief explanation of the comparison decision"
|
|
405
|
+
}
|
|
406
|
+
},
|
|
407
|
+
required: ["passed", "rationale"],
|
|
408
|
+
additionalProperties: false
|
|
409
|
+
};
|
|
410
|
+
const DEFAULT_LLM_COMPARE_SYSTEM_PROMPT = `Compare the following two values and determine if they are semantically equivalent.
|
|
411
|
+
|
|
412
|
+
Focus on whether they convey the same core meaning or information, even if expressed differently. Consider synonyms, paraphrasing, and stylistic variations as acceptable. Only mark as failed if there are substantial differences in the actual facts or meaning being conveyed.`;
|
|
413
|
+
const buildLLMCompareUserPrompt = (expected, actual) => `Expected value:
|
|
414
|
+
${JSON.stringify(expected, null, 2)}
|
|
415
|
+
|
|
416
|
+
Actual value:
|
|
417
|
+
${JSON.stringify(actual, null, 2)}`;
|
|
418
|
+
/**
|
|
419
|
+
* Uses an LLM to compare expected vs actual values.
|
|
420
|
+
* Returns a comparison result with rationale and cost tracking.
|
|
421
|
+
* Default provider: anthropic_claude_haiku (fastest, cheapest).
|
|
422
|
+
*/
|
|
423
|
+
function llmCompare(config) {
|
|
424
|
+
const systemPrompt = config.systemPrompt ?? DEFAULT_LLM_COMPARE_SYSTEM_PROMPT;
|
|
425
|
+
return async (expected, actual, context) => {
|
|
426
|
+
try {
|
|
427
|
+
const apiKey = config.apiKey ?? context?.llmConfig?.apiKey;
|
|
428
|
+
if (!apiKey) throw new Error("llmCompare requires an apiKey. Either pass it directly to llmCompare() or set llmConfig.apiKey in eval config.");
|
|
429
|
+
const provider = config.provider ?? context?.llmConfig?.provider ?? LLMProviders.anthropic_claude_haiku;
|
|
430
|
+
const userPrompt = buildLLMCompareUserPrompt(expected, actual);
|
|
431
|
+
const result = await callStructuredLLM({
|
|
432
|
+
provider,
|
|
433
|
+
apiKey,
|
|
434
|
+
messages: [{
|
|
435
|
+
role: "system",
|
|
436
|
+
content: systemPrompt
|
|
437
|
+
}, {
|
|
438
|
+
role: "user",
|
|
439
|
+
content: userPrompt
|
|
440
|
+
}],
|
|
441
|
+
schema: LLM_COMPARE_SCHEMA
|
|
442
|
+
});
|
|
443
|
+
return {
|
|
444
|
+
passed: result.data.passed,
|
|
445
|
+
rationale: result.data.rationale,
|
|
446
|
+
cost: result.cost,
|
|
447
|
+
similarity: result.data.passed ? 1 : 0
|
|
448
|
+
};
|
|
449
|
+
} catch (error) {
|
|
450
|
+
return {
|
|
451
|
+
passed: false,
|
|
452
|
+
rationale: `LLM comparison failed: ${error instanceof Error ? error.message : String(error)}`,
|
|
453
|
+
cost: 0,
|
|
454
|
+
similarity: 0
|
|
455
|
+
};
|
|
456
|
+
}
|
|
457
|
+
};
|
|
458
|
+
}
|
|
459
|
+
/**
|
|
460
|
+
* Marks a comparator or comparator config as unordered.
|
|
461
|
+
* When applied to an array field, items will be matched by similarity
|
|
462
|
+
* rather than index position (using Hungarian algorithm).
|
|
463
|
+
*
|
|
464
|
+
* @example
|
|
465
|
+
* // Unordered array of objects
|
|
466
|
+
* lineItems: unordered({
|
|
467
|
+
* description: name,
|
|
468
|
+
* price: within({ tolerance: 5 })
|
|
469
|
+
* })
|
|
470
|
+
*
|
|
471
|
+
* @example
|
|
472
|
+
* // Unordered array of primitives
|
|
473
|
+
* tags: unordered(exact)
|
|
474
|
+
*
|
|
475
|
+
* @example
|
|
476
|
+
* // When entire output is an array
|
|
477
|
+
* comparators: unordered({
|
|
478
|
+
* carrier: exact,
|
|
479
|
+
* premium: within({ tolerance: 0.05 })
|
|
480
|
+
* })
|
|
481
|
+
*/
|
|
482
|
+
function unordered(comparator) {
|
|
483
|
+
const baseFunction = typeof comparator === "function" ? comparator : () => {
|
|
484
|
+
throw new Error("unordered() base function should not be called when nested comparators exist. This is likely a bug in the evaluation logic.");
|
|
485
|
+
};
|
|
486
|
+
return Object.assign(baseFunction, {
|
|
487
|
+
_unordered: true,
|
|
488
|
+
_nestedComparators: typeof comparator === "object" ? comparator : void 0
|
|
489
|
+
});
|
|
490
|
+
}
|
|
236
491
|
/**
|
|
237
492
|
* Deep equality comparison with cycle detection.
|
|
238
493
|
* Uses WeakSet to track visited object pairs to prevent stack overflow on circular references.
|
|
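Pulling the JSDoc examples above into one hypothetical comparator config (the field names come from those examples; the import assumes `unordered`, `llmCompare`, `within`, and `exact` are exported, which this hunk does not confirm):

```ts
// Assumed exports; this hunk only shows the implementations, not the export list.
import { unordered, llmCompare, within, exact } from "@docshield/didactic";

// Hypothetical comparator config combining the helpers added in this hunk.
const comparators = {
  // Array matched by similarity (Hungarian assignment) instead of index order.
  lineItems: unordered({
    description: exact,
    price: within({ tolerance: 5 }),
  }),
  // Unordered array of primitives.
  tags: unordered(exact),
  // Semantic comparison; with no apiKey here it falls back to llmConfig.apiKey
  // from the surrounding eval config, per the error message above.
  summary: llmCompare({}),
};
```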
@@ -270,187 +525,63 @@ function normalizeNumeric(value) {
|
|
|
270
525
|
if (value == null || value === "") return null;
|
|
271
526
|
const str = String(value);
|
|
272
527
|
const isNegativeParens = /^\(.*\)$/.test(str.trim());
|
|
273
|
-
let cleaned = str.replace(/[^0-9
|
|
528
|
+
let cleaned = str.replace(/[^0-9.-]/g, "");
|
|
274
529
|
if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
|
|
275
530
|
const num = parseFloat(cleaned);
|
|
276
531
|
return isNaN(num) ? null : num;
|
|
277
532
|
}
|
|
278
533
|
|
|
279
534
|
//#endregion
|
|
280
|
-
//#region src/
|
|
535
|
+
//#region src/eval/comparators/matching.ts
|
|
536
|
+
function isObject$1(value) {
|
|
537
|
+
return value !== null && typeof value === "object" && !Array.isArray(value);
|
|
538
|
+
}
|
|
281
539
|
/**
|
|
282
|
-
*
|
|
283
|
-
*
|
|
284
|
-
*
|
|
285
|
-
*
|
|
286
|
-
* const executor = endpoint('https://api.example.com/workflow', {
|
|
287
|
-
* headers: { Authorization: 'Bearer token' },
|
|
288
|
-
* });
|
|
289
|
-
* ```
|
|
540
|
+
* Calculate similarity score between two values (0.0 to 1.0).
|
|
541
|
+
* For arrays: recursively match and average similarity of paired elements.
|
|
542
|
+
* For objects: average similarity across all fields using comparator results.
|
|
543
|
+
* For primitives: uses exact comparison's similarity score.
|
|
290
544
|
*/
|
|
291
|
-
function
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
const
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
const cost = mapCost?.(data) ?? 0;
|
|
321
|
-
if (mapResponse) return {
|
|
322
|
-
output: mapResponse(data),
|
|
323
|
-
additionalContext,
|
|
324
|
-
cost
|
|
325
|
-
};
|
|
326
|
-
return {
|
|
327
|
-
output: data,
|
|
328
|
-
additionalContext,
|
|
329
|
-
cost
|
|
330
|
-
};
|
|
331
|
-
} catch (error) {
|
|
332
|
-
clearTimeout(timeoutId);
|
|
333
|
-
throw error;
|
|
334
|
-
}
|
|
335
|
-
};
|
|
545
|
+
async function getSimilarity(expected, actual, comparators) {
|
|
546
|
+
if (Array.isArray(expected) && Array.isArray(actual)) {
|
|
547
|
+
if (expected.length === 0 && actual.length === 0) return 1;
|
|
548
|
+
if (expected.length === 0 || actual.length === 0) return 0;
|
|
549
|
+
const result = await matchArrays(expected, actual, comparators);
|
|
550
|
+
let total$1 = 0;
|
|
551
|
+
for (const [expIdx, actIdx] of result.assignments) total$1 += await getSimilarity(expected[expIdx], actual[actIdx], comparators);
|
|
552
|
+
const maxLen = Math.max(expected.length, actual.length);
|
|
553
|
+
return total$1 / maxLen;
|
|
554
|
+
}
|
|
555
|
+
if (!isObject$1(expected) || !isObject$1(actual)) {
|
|
556
|
+
const result = exact(expected, actual);
|
|
557
|
+
return result.similarity ?? (result.passed ? 1 : 0);
|
|
558
|
+
}
|
|
559
|
+
const fields = Object.keys(expected).filter((key) => {
|
|
560
|
+
const comp = comparators[key];
|
|
561
|
+
return comp !== void 0 && typeof comp === "function";
|
|
562
|
+
});
|
|
563
|
+
if (fields.length === 0) return 1;
|
|
564
|
+
let total = 0;
|
|
565
|
+
for (const key of fields) {
|
|
566
|
+
const comparatorConfig = comparators[key];
|
|
567
|
+
const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected[key], actual[key], {
|
|
568
|
+
expectedParent: expected,
|
|
569
|
+
actualParent: actual
|
|
570
|
+
});
|
|
571
|
+
total += result.similarity ?? (result.passed ? 1 : 0);
|
|
572
|
+
}
|
|
573
|
+
return total / fields.length;
|
|
336
574
|
}
|
|
337
575
|
/**
|
|
338
|
-
*
|
|
339
|
-
*
|
|
340
|
-
* @example
|
|
341
|
-
* ```ts
|
|
342
|
-
* const executor = fn({
|
|
343
|
-
* fn: async (input, systemPrompt) => {
|
|
344
|
-
* const result = await myLLMCall(input, systemPrompt);
|
|
345
|
-
* return result;
|
|
346
|
-
* },
|
|
347
|
-
* });
|
|
348
|
-
* ```
|
|
576
|
+
* Find optimal pairing between expected and actual arrays using Hungarian algorithm.
|
|
577
|
+
* Pure matching - no pass/fail determination.
|
|
349
578
|
*
|
|
350
|
-
* @
|
|
351
|
-
*
|
|
352
|
-
*
|
|
353
|
-
*
|
|
354
|
-
* mapResponse: (result) => ({ documentType: result.documentType }),
|
|
355
|
-
* mapCost: (result) => result.cost,
|
|
356
|
-
* mapAdditionalContext: (result) => result.metadata,
|
|
357
|
-
* });
|
|
358
|
-
* ```
|
|
579
|
+
* @param expected - Array of expected items
|
|
580
|
+
* @param actual - Array of actual items
|
|
581
|
+
* @param comparators - Nested comparator configuration for array items
|
|
582
|
+
* @returns Matching result with assignments and unmatched indices
|
|
359
583
|
*/
|
|
360
|
-
function
|
|
361
|
-
return async (input, systemPrompt) => {
|
|
362
|
-
const raw = await config.fn(input, systemPrompt);
|
|
363
|
-
return {
|
|
364
|
-
output: config.mapResponse ? config.mapResponse(raw) : raw,
|
|
365
|
-
additionalContext: config.mapAdditionalContext?.(raw),
|
|
366
|
-
cost: config.mapCost?.(raw) ?? 0
|
|
367
|
-
};
|
|
368
|
-
};
|
|
369
|
-
}
|
|
370
|
-
/**
|
|
371
|
-
* Creates a mock executor for testing.
|
|
372
|
-
* Can accept either:
|
|
373
|
-
* - An array of outputs (returned in sequence, cycling if more calls than outputs)
|
|
374
|
-
* - A function that maps input to output
|
|
375
|
-
*
|
|
376
|
-
* @example Array-based:
|
|
377
|
-
* ```ts
|
|
378
|
-
* const executor = mock([
|
|
379
|
-
* { premium: 12500, policyType: 'claims-made' },
|
|
380
|
-
* { premium: 8200, policyType: 'entity' },
|
|
381
|
-
* ]);
|
|
382
|
-
* ```
|
|
383
|
-
*
|
|
384
|
-
* @example Function-based:
|
|
385
|
-
* ```ts
|
|
386
|
-
* const executor = mock((input) => ({
|
|
387
|
-
* id: input.id,
|
|
388
|
-
* processed: true,
|
|
389
|
-
* }));
|
|
390
|
-
* ```
|
|
391
|
-
*/
|
|
392
|
-
function mock(outputsOrFn) {
|
|
393
|
-
if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
|
|
394
|
-
return { output: outputsOrFn(input, systemPrompt) };
|
|
395
|
-
};
|
|
396
|
-
const outputs = outputsOrFn;
|
|
397
|
-
if (outputs.length === 0) throw new Error("mock() requires at least one output");
|
|
398
|
-
let callIndex = 0;
|
|
399
|
-
return async () => {
|
|
400
|
-
const output = outputs[callIndex % outputs.length];
|
|
401
|
-
callIndex++;
|
|
402
|
-
return { output };
|
|
403
|
-
};
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
//#endregion
|
|
407
|
-
//#region src/matching.ts
|
|
408
|
-
function isObject$1(value) {
|
|
409
|
-
return value !== null && typeof value === "object" && !Array.isArray(value);
|
|
410
|
-
}
|
|
411
|
-
/**
|
|
412
|
-
* Calculate similarity score between two values (0.0 to 1.0).
|
|
413
|
-
* For arrays: recursively match and average similarity of paired elements.
|
|
414
|
-
* For objects: average similarity across all fields using comparator results.
|
|
415
|
-
* For primitives: uses exact comparison's similarity score.
|
|
416
|
-
*/
|
|
417
|
-
function getSimilarity(expected, actual, comparators) {
|
|
418
|
-
if (Array.isArray(expected) && Array.isArray(actual)) {
|
|
419
|
-
if (expected.length === 0 && actual.length === 0) return 1;
|
|
420
|
-
if (expected.length === 0 || actual.length === 0) return 0;
|
|
421
|
-
const result = matchArrays(expected, actual, comparators);
|
|
422
|
-
let total$1 = 0;
|
|
423
|
-
for (const [expIdx, actIdx] of result.assignments) total$1 += getSimilarity(expected[expIdx], actual[actIdx], comparators);
|
|
424
|
-
const maxLen = Math.max(expected.length, actual.length);
|
|
425
|
-
return total$1 / maxLen;
|
|
426
|
-
}
|
|
427
|
-
if (!isObject$1(expected) || !isObject$1(actual)) {
|
|
428
|
-
const result = exact(expected, actual);
|
|
429
|
-
return result.similarity ?? (result.passed ? 1 : 0);
|
|
430
|
-
}
|
|
431
|
-
const fields = Object.keys(expected).filter((key) => comparators[key]);
|
|
432
|
-
if (fields.length === 0) return 1;
|
|
433
|
-
let total = 0;
|
|
434
|
-
for (const key of fields) {
|
|
435
|
-
const comparator = comparators[key];
|
|
436
|
-
const result = comparator(expected[key], actual[key], {
|
|
437
|
-
expectedParent: expected,
|
|
438
|
-
actualParent: actual
|
|
439
|
-
});
|
|
440
|
-
total += result.similarity ?? (result.passed ? 1 : 0);
|
|
441
|
-
}
|
|
442
|
-
return total / fields.length;
|
|
443
|
-
}
|
|
444
|
-
/**
|
|
445
|
-
* Find optimal pairing between expected and actual arrays using Hungarian algorithm.
|
|
446
|
-
* Pure matching - no pass/fail determination.
|
|
447
|
-
*
|
|
448
|
-
* @param expected - Array of expected items
|
|
449
|
-
* @param actual - Array of actual items
|
|
450
|
-
* @param comparators - Map of field names to comparator functions
|
|
451
|
-
* @returns Matching result with assignments and unmatched indices
|
|
452
|
-
*/
|
|
453
|
-
function matchArrays(expected, actual, comparators = {}) {
|
|
584
|
+
async function matchArrays(expected, actual, comparators = {}) {
|
|
454
585
|
if (expected.length === 0) return {
|
|
455
586
|
assignments: [],
|
|
456
587
|
unmatchedExpected: [],
|
|
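For intuition, the pairing step in `matchArrays` reduces to handing `munkres-js` a cost matrix of 1 - similarity and reading back index pairs. A standalone sketch with made-up similarity scores:

```ts
// Sketch of the assignment step inside matchArrays: lower cost = more similar.
// The similarity numbers are invented; the real code derives them via getSimilarity().
import munkres from "munkres-js";

const similarity = [
  [0.9, 0.1, 0.2], // expected[0] vs actual[0..2]
  [0.2, 0.8, 0.1],
  [0.1, 0.3, 0.7],
];
const cost = similarity.map((row) => row.map((s) => 1 - s));

// munkres returns [expectedIndex, actualIndex] pairs minimising total cost.
const assignments = munkres(cost); // => [[0, 0], [1, 1], [2, 2]]
console.log(assignments);
```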
@@ -461,7 +592,7 @@ function matchArrays(expected, actual, comparators = {}) {
|
|
|
461
592
|
unmatchedExpected: [...Array(expected.length).keys()],
|
|
462
593
|
unmatchedActual: []
|
|
463
594
|
};
|
|
464
|
-
const rawAssignments = (0, munkres_js.default)(expected.map((exp) => actual.map((act) => 1 - getSimilarity(exp, act, comparators))));
|
|
595
|
+
const rawAssignments = (0, munkres_js.default)(await Promise.all(expected.map(async (exp) => Promise.all(actual.map(async (act) => 1 - await getSimilarity(exp, act, comparators))))));
|
|
465
596
|
const assignments = [];
|
|
466
597
|
const matchedExp = /* @__PURE__ */ new Set();
|
|
467
598
|
const matchedAct = /* @__PURE__ */ new Set();
|
|
@@ -478,212 +609,126 @@ function matchArrays(expected, actual, comparators = {}) {
|
|
|
478
609
|
}
|
|
479
610
|
|
|
480
611
|
//#endregion
|
|
481
|
-
//#region src/
|
|
612
|
+
//#region src/optimizer/ui.ts
|
|
482
613
|
/**
|
|
483
|
-
*
|
|
614
|
+
* UI utilities for beautiful console output
|
|
484
615
|
*/
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
else fields = compareFields({
|
|
519
|
-
expected,
|
|
520
|
-
actual: result.output,
|
|
521
|
-
comparators,
|
|
522
|
-
unorderedList: config.unorderedList
|
|
523
|
-
});
|
|
524
|
-
const passedFields = Object.values(fields).filter((f) => f.passed).length;
|
|
525
|
-
const totalFields$1 = Object.values(fields).length;
|
|
526
|
-
const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
|
|
527
|
-
const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
|
|
528
|
-
return {
|
|
529
|
-
input,
|
|
530
|
-
expected,
|
|
531
|
-
actual: result.output,
|
|
532
|
-
additionalContext: result.additionalContext,
|
|
533
|
-
cost: result.cost ?? 0,
|
|
534
|
-
passed: passed$1,
|
|
535
|
-
fields,
|
|
536
|
-
passedFields,
|
|
537
|
-
totalFields: totalFields$1,
|
|
538
|
-
passRate
|
|
539
|
-
};
|
|
540
|
-
} catch (error) {
|
|
541
|
-
return {
|
|
542
|
-
input,
|
|
543
|
-
expected,
|
|
544
|
-
actual: void 0,
|
|
545
|
-
cost: 0,
|
|
546
|
-
passed: false,
|
|
547
|
-
fields: {},
|
|
548
|
-
passedFields: 0,
|
|
549
|
-
totalFields: 0,
|
|
550
|
-
passRate: 0,
|
|
551
|
-
error: error instanceof Error ? error.message : String(error)
|
|
552
|
-
};
|
|
616
|
+
const theme = {
|
|
617
|
+
success: chalk.default.green,
|
|
618
|
+
error: chalk.default.red,
|
|
619
|
+
warning: chalk.default.yellow,
|
|
620
|
+
bold: chalk.default.bold,
|
|
621
|
+
dim: chalk.default.dim,
|
|
622
|
+
check: chalk.default.green(figures.default.tick),
|
|
623
|
+
cross: chalk.default.red(figures.default.cross),
|
|
624
|
+
warn: chalk.default.yellow(figures.default.warning),
|
|
625
|
+
bullet: chalk.default.dim(figures.default.bullet),
|
|
626
|
+
pointer: chalk.default.yellow(figures.default.pointer),
|
|
627
|
+
separator: chalk.default.dim(" · "),
|
|
628
|
+
divider: (label, width = 60) => {
|
|
629
|
+
const prefix = `━━━ ${label} `;
|
|
630
|
+
const remaining = Math.max(0, width - prefix.length);
|
|
631
|
+
return chalk.default.cyan.dim(prefix + "━".repeat(remaining));
|
|
632
|
+
}
|
|
633
|
+
};
|
|
634
|
+
let activeSpinner = null;
|
|
635
|
+
const spinner = {
|
|
636
|
+
start(text) {
|
|
637
|
+
if (activeSpinner) activeSpinner.stop();
|
|
638
|
+
activeSpinner = (0, ora.default)({
|
|
639
|
+
text,
|
|
640
|
+
spinner: "dots",
|
|
641
|
+
indent: 4
|
|
642
|
+
}).start();
|
|
643
|
+
return activeSpinner;
|
|
644
|
+
},
|
|
645
|
+
succeed(text) {
|
|
646
|
+
if (activeSpinner) {
|
|
647
|
+
activeSpinner.succeed(text);
|
|
648
|
+
activeSpinner = null;
|
|
553
649
|
}
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
for (let i = 0; i < testCases.length; i += rateLimitBatch) {
|
|
560
|
-
const batch = testCases.slice(i, i + rateLimitBatch);
|
|
561
|
-
const batchResults = await Promise.all(batch.map(executeTestCase));
|
|
562
|
-
results.push(...batchResults);
|
|
563
|
-
const rateLimitPause = config.rateLimitPause;
|
|
564
|
-
if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
|
|
650
|
+
},
|
|
651
|
+
fail(text) {
|
|
652
|
+
if (activeSpinner) {
|
|
653
|
+
activeSpinner.fail(text);
|
|
654
|
+
activeSpinner = null;
|
|
565
655
|
}
|
|
566
|
-
}
|
|
567
|
-
|
|
568
|
-
if (
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
totalFields += fieldResults.length;
|
|
579
|
-
correctFields += fieldResults.filter((f) => f.passed).length;
|
|
656
|
+
},
|
|
657
|
+
stop() {
|
|
658
|
+
if (activeSpinner) {
|
|
659
|
+
activeSpinner.stop();
|
|
660
|
+
activeSpinner = null;
|
|
661
|
+
}
|
|
662
|
+
},
|
|
663
|
+
clear() {
|
|
664
|
+
if (activeSpinner) activeSpinner.clear();
|
|
665
|
+
},
|
|
666
|
+
isActive() {
|
|
667
|
+
return activeSpinner !== null;
|
|
580
668
|
}
|
|
581
|
-
|
|
582
|
-
|
|
669
|
+
};
|
|
670
|
+
function createProgressTracker(label) {
|
|
671
|
+
let bar = null;
|
|
672
|
+
let startTime = 0;
|
|
673
|
+
let lastUpdate = 0;
|
|
674
|
+
const MIN_UPDATE_INTERVAL = 100;
|
|
583
675
|
return {
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
676
|
+
start(total) {
|
|
677
|
+
spinner.stop();
|
|
678
|
+
startTime = Date.now();
|
|
679
|
+
bar = new cli_progress.default.SingleBar({
|
|
680
|
+
format: ` {bar} {percentage}% {value}/{total} ${label} {duration_formatted}`,
|
|
681
|
+
barCompleteChar: "█",
|
|
682
|
+
barIncompleteChar: "░",
|
|
683
|
+
barsize: 20,
|
|
684
|
+
hideCursor: true,
|
|
685
|
+
clearOnComplete: false,
|
|
686
|
+
stopOnComplete: false,
|
|
687
|
+
forceRedraw: true,
|
|
688
|
+
fps: 10
|
|
689
|
+
});
|
|
690
|
+
bar.start(total, 0, { duration_formatted: "0s" });
|
|
691
|
+
},
|
|
692
|
+
update(current) {
|
|
693
|
+
const now = Date.now();
|
|
694
|
+
if (now - lastUpdate < MIN_UPDATE_INTERVAL && bar) {
|
|
695
|
+
if (current < bar.getTotal()) return;
|
|
696
|
+
}
|
|
697
|
+
lastUpdate = now;
|
|
698
|
+
if (bar) {
|
|
699
|
+
const elapsed = Math.round((now - startTime) / 1e3);
|
|
700
|
+
bar.update(current, { duration_formatted: `${elapsed}s` });
|
|
701
|
+
}
|
|
702
|
+
},
|
|
703
|
+
stop() {
|
|
704
|
+
if (bar) {
|
|
705
|
+
const elapsed = Math.round((Date.now() - startTime) / 1e3);
|
|
706
|
+
bar.update(bar.getTotal(), { duration_formatted: `${elapsed}s` });
|
|
707
|
+
bar.stop();
|
|
708
|
+
bar = null;
|
|
709
|
+
}
|
|
710
|
+
}
|
|
593
711
|
};
|
|
594
712
|
}
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
* Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
|
|
598
|
-
*/
|
|
599
|
-
function compareFields(opts) {
|
|
600
|
-
const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, unorderedList = false } = opts;
|
|
601
|
-
const results = {};
|
|
602
|
-
const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
|
|
603
|
-
if (Array.isArray(expected)) {
|
|
604
|
-
if (!Array.isArray(actual)) return { [path$1]: {
|
|
605
|
-
passed: false,
|
|
606
|
-
expected,
|
|
607
|
-
actual
|
|
608
|
-
} };
|
|
609
|
-
if (expected.length === 0) return {};
|
|
610
|
-
let matchedPairs;
|
|
611
|
-
if (unorderedList) matchedPairs = matchArrays(expected, actual, comparators).assignments;
|
|
612
|
-
else {
|
|
613
|
-
matchedPairs = [];
|
|
614
|
-
for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
|
|
615
|
-
}
|
|
616
|
-
const matchedIndices = new Set(matchedPairs.map(([i]) => i));
|
|
617
|
-
for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, compareFields({
|
|
618
|
-
expected: expected[expIdx],
|
|
619
|
-
actual: actual[actIdx],
|
|
620
|
-
comparators,
|
|
621
|
-
path: indexPath(expIdx),
|
|
622
|
-
expectedParent,
|
|
623
|
-
actualParent,
|
|
624
|
-
unorderedList
|
|
625
|
-
}));
|
|
626
|
-
const arrayFieldName = getFieldName(path$1);
|
|
627
|
-
const hasArrayComparator = arrayFieldName in comparators || arrayFieldName === "";
|
|
628
|
-
for (let i = 0; i < expected.length; i++) {
|
|
629
|
-
if (matchedIndices.has(i)) continue;
|
|
630
|
-
const item = expected[i];
|
|
631
|
-
if (isObject(item)) {
|
|
632
|
-
for (const [field, value] of Object.entries(item)) if (field in comparators) results[`${indexPath(i)}.${field}`] = {
|
|
633
|
-
passed: false,
|
|
634
|
-
expected: value,
|
|
635
|
-
actual: void 0
|
|
636
|
-
};
|
|
637
|
-
} else if (hasArrayComparator) results[indexPath(i)] = {
|
|
638
|
-
passed: false,
|
|
639
|
-
expected: item,
|
|
640
|
-
actual: void 0
|
|
641
|
-
};
|
|
642
|
-
}
|
|
643
|
-
return results;
|
|
644
|
-
}
|
|
645
|
-
if (isObject(expected)) {
|
|
646
|
-
if (!isObject(actual)) return { [path$1]: {
|
|
647
|
-
passed: false,
|
|
648
|
-
expected,
|
|
649
|
-
actual
|
|
650
|
-
} };
|
|
651
|
-
for (const [field, expValue] of Object.entries(expected)) {
|
|
652
|
-
const fieldPath = path$1 ? `${path$1}.${field}` : field;
|
|
653
|
-
Object.assign(results, compareFields({
|
|
654
|
-
expected: expValue,
|
|
655
|
-
actual: actual[field],
|
|
656
|
-
comparators,
|
|
657
|
-
path: fieldPath,
|
|
658
|
-
expectedParent: expected,
|
|
659
|
-
actualParent: actual,
|
|
660
|
-
unorderedList
|
|
661
|
-
}));
|
|
662
|
-
}
|
|
663
|
-
return results;
|
|
664
|
-
}
|
|
665
|
-
const fieldName = getFieldName(path$1);
|
|
666
|
-
const comparator = comparators[fieldName] ?? (fieldName === "" ? exact : void 0);
|
|
667
|
-
if (!comparator) return {};
|
|
668
|
-
const result = comparator(expected, actual, {
|
|
669
|
-
expectedParent,
|
|
670
|
-
actualParent
|
|
671
|
-
});
|
|
672
|
-
return { [path$1]: {
|
|
673
|
-
...result,
|
|
674
|
-
expected,
|
|
675
|
-
actual
|
|
676
|
-
} };
|
|
713
|
+
function formatCost(cost) {
|
|
714
|
+
return theme.dim(`$${cost.toFixed(4)}`);
|
|
677
715
|
}
|
|
678
|
-
function
|
|
679
|
-
return
|
|
716
|
+
function formatCostShort(cost) {
|
|
717
|
+
return theme.dim(`$${cost.toFixed(2)}`);
|
|
680
718
|
}
|
|
681
|
-
function
|
|
682
|
-
|
|
719
|
+
function formatDuration(ms) {
|
|
720
|
+
const totalSeconds = Math.round(ms / 1e3);
|
|
721
|
+
if (totalSeconds < 60) return `${totalSeconds}s`;
|
|
722
|
+
const minutes = Math.floor(totalSeconds / 60);
|
|
723
|
+
const seconds = totalSeconds % 60;
|
|
724
|
+
return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
|
|
725
|
+
}
|
|
726
|
+
function formatPercentage(rate) {
|
|
727
|
+
return `${(rate * 100).toFixed(1)}%`;
|
|
683
728
|
}
|
|
684
729
|
|
|
685
730
|
//#endregion
|
|
686
|
-
//#region src/optimizer-logging.ts
|
|
731
|
+
//#region src/optimizer/optimizer-logging.ts
|
|
687
732
|
function formatMsCompact(ms) {
|
|
688
733
|
const totalSeconds = Math.round(ms / 1e3);
|
|
689
734
|
if (totalSeconds < 60) return `${totalSeconds}s`;
|
|
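The spinner and progress helpers above are internal console utilities, and whether they are exported is not shown here. A sketch of the call pattern the optimizer uses, with a sleep standing in for real work:

```ts
// Call pattern only: createProgressTracker is an internal helper in this bundle.
// start(total) opens the cli-progress bar, update(current) redraws at most every
// ~100ms, and stop() fills the bar to its total before releasing it.
async function demo(): Promise<void> {
  const tracker = createProgressTracker("evals");
  tracker.start(50);
  for (let done = 1; done <= 50; done++) {
    await new Promise((r) => setTimeout(r, 20)); // stand-in for one unit of work
    tracker.update(done);
  }
  tracker.stop();
}
void demo();
```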
@@ -701,12 +746,75 @@ function formatTokensCompact(tokens) {
|
|
|
701
746
|
if (tokens >= 1e3) return `${Math.round(tokens / 1e3)}K`;
|
|
702
747
|
return String(tokens);
|
|
703
748
|
}
|
|
749
|
+
/**
|
|
750
|
+
* Clear any active progress line before logging
|
|
751
|
+
* Call this before all console.log statements
|
|
752
|
+
*/
|
|
753
|
+
function clearProgressLine() {
|
|
754
|
+
const width = process.stdout.columns || 80;
|
|
755
|
+
process.stdout.write("\r" + " ".repeat(width) + "\r");
|
|
756
|
+
}
|
|
757
|
+
/**
|
|
758
|
+
* Create a progress updater using cli-progress for beautiful output
|
|
759
|
+
*/
|
|
760
|
+
function createProgressUpdater(label) {
|
|
761
|
+
let tracker = null;
|
|
762
|
+
let total = 0;
|
|
763
|
+
return {
|
|
764
|
+
update(completed, newTotal) {
|
|
765
|
+
if (!tracker) {
|
|
766
|
+
total = newTotal;
|
|
767
|
+
tracker = createProgressTracker(label);
|
|
768
|
+
tracker.start(total);
|
|
769
|
+
}
|
|
770
|
+
tracker.update(completed);
|
|
771
|
+
},
|
|
772
|
+
finish() {
|
|
773
|
+
if (tracker) {
|
|
774
|
+
tracker.stop();
|
|
775
|
+
tracker = null;
|
|
776
|
+
}
|
|
777
|
+
},
|
|
778
|
+
clear() {
|
|
779
|
+
clearProgressLine();
|
|
780
|
+
}
|
|
781
|
+
};
|
|
782
|
+
}
|
|
783
|
+
/**
|
|
784
|
+
* Track progress of Promise.allSettled with real-time updates
|
|
785
|
+
*
|
|
786
|
+
* @param promises Array of promises to track
|
|
787
|
+
* @param onProgress Callback called when each promise settles
|
|
788
|
+
* @returns Promise.allSettled result
|
|
789
|
+
*/
|
|
790
|
+
async function trackPromiseProgress(promises, onProgress) {
|
|
791
|
+
if (promises.length === 0) return [];
|
|
792
|
+
let completed = 0;
|
|
793
|
+
const total = promises.length;
|
|
794
|
+
onProgress(0, total);
|
|
795
|
+
const wrappedPromises = promises.map((promise) => promise.then((value) => {
|
|
796
|
+
completed++;
|
|
797
|
+
onProgress(completed, total);
|
|
798
|
+
return {
|
|
799
|
+
status: "fulfilled",
|
|
800
|
+
value
|
|
801
|
+
};
|
|
802
|
+
}).catch((reason) => {
|
|
803
|
+
completed++;
|
|
804
|
+
onProgress(completed, total);
|
|
805
|
+
return {
|
|
806
|
+
status: "rejected",
|
|
807
|
+
reason
|
|
808
|
+
};
|
|
809
|
+
}));
|
|
810
|
+
return Promise.all(wrappedPromises);
|
|
811
|
+
}
|
|
704
812
|
function formatFailure(testCase) {
|
|
705
813
|
const lines = [];
|
|
706
814
|
lines.push(`Input: ${JSON.stringify(testCase.input, null, 2)}`);
|
|
707
815
|
lines.push(`Expected: ${JSON.stringify(testCase.expected, null, 2)}`);
|
|
708
816
|
lines.push(`Actual: ${JSON.stringify(testCase.actual, null, 2)}`);
|
|
709
|
-
if (testCase.additionalContext) lines.push(`Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
|
|
817
|
+
if (testCase.additionalContext) lines.push(`Additional Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
|
|
710
818
|
lines.push("");
|
|
711
819
|
lines.push("Field-level failures:");
|
|
712
820
|
for (const [fieldPath, result] of Object.entries(testCase.fields)) if (!result.passed) lines.push(` ${fieldPath || "(root)"}: expected ${JSON.stringify(result.expected)}, got ${JSON.stringify(result.actual)}`);
|
|
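A sketch of how `trackPromiseProgress` and `createProgressUpdater` fit together, mirroring the wiring in `evaluate()` further down; `jobs` here is a hypothetical array of promises:

```ts
// Hypothetical work items; in evaluate() these are executeTestCase(tc) calls.
const jobs = [1, 2, 3].map(
  (n) => new Promise<string>((resolve) => setTimeout(() => resolve(`result ${n}`), n * 100)),
);

const progress = createProgressUpdater("evals");
const settled = await trackPromiseProgress(jobs, (completed, total) =>
  progress.update(completed, total),
);
progress.finish();

// Settled results carry either a value or a rejection reason, like Promise.allSettled.
for (const r of settled) {
  if (r.status === "fulfilled") console.log(r.value);
}
```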
@@ -730,56 +838,98 @@ function computeTotals(iterations) {
|
|
|
730
838
|
totalDuration
|
|
731
839
|
};
|
|
732
840
|
}
|
|
733
|
-
function
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
841
|
+
function logOptimizerHeader(model, targetRate, testCount) {
|
|
842
|
+
spinner.stop();
|
|
843
|
+
console.log("");
|
|
844
|
+
console.log(theme.bold("Didactic Optimizer"));
|
|
845
|
+
console.log(` ${theme.dim("Model:")} ${model}${theme.separator}${theme.dim("Target:")} ${formatPercentage(targetRate)}${theme.separator}${theme.dim("Tests:")} ${testCount}`);
|
|
737
846
|
}
|
|
738
847
|
function logIterationStart(iterationLabel) {
|
|
739
|
-
|
|
848
|
+
spinner.stop();
|
|
849
|
+
clearProgressLine();
|
|
850
|
+
console.log("");
|
|
851
|
+
console.log(theme.divider(`Iteration ${iterationLabel}`));
|
|
852
|
+
console.log("");
|
|
740
853
|
}
|
|
741
854
|
function logEvaluationStart() {
|
|
742
|
-
|
|
855
|
+
spinner.stop();
|
|
856
|
+
clearProgressLine();
|
|
857
|
+
console.log(` ${theme.bold("Evaluating prompt")}`);
|
|
858
|
+
spinner.start("Running evals...");
|
|
743
859
|
}
|
|
744
860
|
function logEvaluationResult(result, cumulativeCost, durationMs) {
|
|
745
|
-
|
|
861
|
+
spinner.stop();
|
|
862
|
+
clearProgressLine();
|
|
863
|
+
const successIcon = result.successRate >= .9 ? theme.check : result.successRate >= .5 ? theme.warn : theme.cross;
|
|
864
|
+
console.log(` ${successIcon} ${theme.bold(formatPercentage(result.successRate))} success rate ${theme.dim(`(${result.passed}/${result.total} passed)`)}`);
|
|
865
|
+
console.log(` ${theme.dim("Cost:")} ${formatCost(result.cost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
|
|
746
866
|
}
|
|
747
867
|
function logRegressionDetected(bestSuccessRate) {
|
|
748
|
-
|
|
868
|
+
spinner.stop();
|
|
869
|
+
clearProgressLine();
|
|
870
|
+
console.log(` ${theme.pointer} ${theme.warning("Regression")} ${theme.dim(`(was ${formatPercentage(bestSuccessRate)})`)}`);
|
|
749
871
|
}
|
|
750
872
|
function logTargetReached(targetSuccessRate) {
|
|
751
|
-
|
|
873
|
+
spinner.stop();
|
|
874
|
+
clearProgressLine();
|
|
875
|
+
console.log(` ${theme.check} ${theme.success("Target reached!")} ${theme.dim(`(${formatPercentage(targetSuccessRate)})`)}`);
|
|
752
876
|
}
|
|
753
877
|
function logTargetFailures(targetSuccessRate, failureCount) {
|
|
754
|
-
|
|
878
|
+
spinner.stop();
|
|
879
|
+
clearProgressLine();
|
|
880
|
+
console.log(` ${theme.cross} ${theme.error(`${failureCount} failures`)} to address ${theme.dim(`(target: ${formatPercentage(targetSuccessRate)})`)}`);
|
|
755
881
|
}
|
|
756
882
|
function logCostLimitReached(cumulativeCost) {
|
|
757
|
-
|
|
883
|
+
spinner.stop();
|
|
884
|
+
clearProgressLine();
|
|
885
|
+
console.log(` ${theme.warn} ${theme.warning("Cost limit reached")} ${theme.dim(`($${cumulativeCost.toFixed(2)})`)}`);
|
|
758
886
|
}
|
|
759
887
|
function logPatchGenerationStart(failureCount) {
|
|
760
|
-
|
|
761
|
-
|
|
888
|
+
spinner.stop();
|
|
889
|
+
clearProgressLine();
|
|
890
|
+
console.log("");
|
|
891
|
+
console.log(` ${theme.bold("Generating patches")}`);
|
|
892
|
+
spinner.start(`Generating ${failureCount} patches in parallel...`);
|
|
762
893
|
}
|
|
763
894
|
function logPatchGenerationResult(patchCost, cumulativeCost, durationMs) {
|
|
764
|
-
|
|
895
|
+
spinner.stop();
|
|
896
|
+
clearProgressLine();
|
|
897
|
+
console.log(` ${theme.check} Patches generated${theme.separator}${theme.dim("Cost:")} ${formatCost(patchCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
|
|
765
898
|
}
|
|
766
899
|
function logMergeStart() {
|
|
767
|
-
|
|
768
|
-
|
|
900
|
+
spinner.stop();
|
|
901
|
+
clearProgressLine();
|
|
902
|
+
console.log("");
|
|
903
|
+
console.log(` ${theme.bold("Merging patches")}`);
|
|
904
|
+
spinner.start("Merging patches...");
|
|
769
905
|
}
|
|
770
906
|
function logMergeResult(mergeCost, cumulativeCost, durationMs) {
|
|
771
|
-
|
|
907
|
+
spinner.stop();
|
|
908
|
+
clearProgressLine();
|
|
909
|
+
console.log(` ${theme.check} Merged${theme.separator}${theme.dim("Cost:")} ${formatCost(mergeCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
|
|
772
910
|
}
|
|
773
911
|
function logPatchGenerationFailures(failedCount, totalCount) {
|
|
774
|
-
|
|
912
|
+
spinner.stop();
|
|
913
|
+
clearProgressLine();
|
|
914
|
+
console.log(` ${theme.warn} ${theme.warning(`${failedCount}/${totalCount} patch generations failed`)}`);
|
|
775
915
|
}
|
|
776
916
|
function logOptimizationComplete(bestSuccessRate, targetSuccessRate, cumulativeCost) {
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
console.log(
|
|
917
|
+
spinner.stop();
|
|
918
|
+
clearProgressLine();
|
|
919
|
+
console.log("");
|
|
920
|
+
console.log(theme.divider("Complete"));
|
|
921
|
+
console.log("");
|
|
922
|
+
const targetMet = bestSuccessRate >= targetSuccessRate;
|
|
923
|
+
const icon = targetMet ? theme.check : theme.cross;
|
|
924
|
+
const rateColor = targetMet ? theme.success : theme.error;
|
|
925
|
+
console.log(` ${icon} ${theme.bold("Best:")} ${rateColor(formatPercentage(bestSuccessRate))}`);
|
|
926
|
+
console.log(` ${theme.dim("Target:")} ${formatPercentage(targetSuccessRate)}${theme.separator}${theme.dim("Total Cost:")} ${formatCostShort(cumulativeCost)}`);
|
|
780
927
|
}
|
|
781
928
|
function logLogsWritten(logPath) {
|
|
782
|
-
|
|
929
|
+
spinner.stop();
|
|
930
|
+
clearProgressLine();
|
|
931
|
+
console.log(` ${theme.dim("Logs written to:")} ${logPath}`);
|
|
932
|
+
console.log("");
|
|
783
933
|
}
|
|
784
934
|
function generateConfigSection(ctx, testCaseCount) {
|
|
785
935
|
const lines = [];
|
|
@@ -946,6 +1096,7 @@ function writeRawDataJson(folderPath, iterations, ctx, success) {
|
|
|
946
1096
|
input: tc.input,
|
|
947
1097
|
expected: tc.expected,
|
|
948
1098
|
actual: tc.actual,
|
|
1099
|
+
additionalContext: tc.additionalContext,
|
|
949
1100
|
fields: tc.fields
|
|
950
1101
|
});
|
|
951
1102
|
});
|
|
@@ -1019,6 +1170,7 @@ function writeBestRunJson(folderPath, iterations, ctx) {
|
|
|
1019
1170
|
input: tc.input,
|
|
1020
1171
|
expected: tc.expected,
|
|
1021
1172
|
actual: tc.actual,
|
|
1173
|
+
additionalContext: tc.additionalContext,
|
|
1022
1174
|
failedFields: extractFailedFields(tc.fields)
|
|
1023
1175
|
});
|
|
1024
1176
|
else if (tc.passRate < 1) partialFailures.push({
|
|
@@ -1027,13 +1179,15 @@ function writeBestRunJson(folderPath, iterations, ctx) {
|
|
|
1027
1179
|
input: tc.input,
|
|
1028
1180
|
expected: tc.expected,
|
|
1029
1181
|
actual: tc.actual,
|
|
1182
|
+
additionalContext: tc.additionalContext,
|
|
1030
1183
|
failedFields: extractFailedFields(tc.fields)
|
|
1031
1184
|
});
|
|
1032
1185
|
else successes.push({
|
|
1033
1186
|
testIndex: testIdx,
|
|
1034
1187
|
input: tc.input,
|
|
1035
1188
|
expected: tc.expected,
|
|
1036
|
-
actual: tc.actual
|
|
1189
|
+
actual: tc.actual,
|
|
1190
|
+
additionalContext: tc.additionalContext
|
|
1037
1191
|
});
|
|
1038
1192
|
});
|
|
1039
1193
|
const report = {
|
|
@@ -1070,29 +1224,402 @@ function writeBestRunJson(folderPath, iterations, ctx) {
|
|
|
1070
1224
|
};
|
|
1071
1225
|
fs.writeFileSync(bestRunPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1072
1226
|
}
|
|
1073
|
-
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1074
|
-
const folderPath = path.dirname(logPath);
|
|
1075
|
-
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1076
|
-
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1077
|
-
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1078
|
-
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1079
|
-
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1080
|
-
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1227
|
+
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1228
|
+
const folderPath = path.dirname(logPath);
|
|
1229
|
+
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1230
|
+
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1231
|
+
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1232
|
+
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1233
|
+
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1234
|
+
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
//#endregion
|
|
1238
|
+
//#region src/eval/eval-logging.ts
|
|
1239
|
+
/**
|
|
1240
|
+
* Write evaluation results to rawData.json
|
|
1241
|
+
*
|
|
1242
|
+
* Synchronous writes are intentional - logging runs after evaluation completes
|
|
1243
|
+
* and errors are caught. This avoids async complexity in the calling code.
|
|
1244
|
+
*/
|
|
1245
|
+
function writeEvalLogs(logPath, result, durationMs, perTestThreshold) {
|
|
1246
|
+
try {
|
|
1247
|
+
const dir = path.dirname(logPath);
|
|
1248
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
1249
|
+
const report = {
|
|
1250
|
+
metadata: {
|
|
1251
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1252
|
+
systemPrompt: result.systemPrompt,
|
|
1253
|
+
testCaseCount: result.total,
|
|
1254
|
+
perTestThreshold: perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD
|
|
1255
|
+
},
|
|
1256
|
+
summary: {
|
|
1257
|
+
passed: result.passed,
|
|
1258
|
+
total: result.total,
|
|
1259
|
+
successRate: result.successRate,
|
|
1260
|
+
correctFields: result.correctFields,
|
|
1261
|
+
totalFields: result.totalFields,
|
|
1262
|
+
accuracy: result.accuracy,
|
|
1263
|
+
executorCost: result.cost,
|
|
1264
|
+
comparatorCost: result.comparatorCost,
|
|
1265
|
+
totalCost: result.cost + result.comparatorCost,
|
|
1266
|
+
durationMs
|
|
1267
|
+
},
|
|
1268
|
+
testCases: result.testCases.map((tc, index) => ({
|
|
1269
|
+
index,
|
|
1270
|
+
passed: tc.passed,
|
|
1271
|
+
passRate: tc.passRate,
|
|
1272
|
+
input: tc.input,
|
|
1273
|
+
expected: tc.expected,
|
|
1274
|
+
actual: tc.actual,
|
|
1275
|
+
additionalContext: tc.additionalContext,
|
|
1276
|
+
executorCost: tc.cost ?? 0,
|
|
1277
|
+
comparatorCost: tc.comparatorCost ?? 0,
|
|
1278
|
+
error: tc.error,
|
|
1279
|
+
fields: tc.fields
|
|
1280
|
+
}))
|
|
1281
|
+
};
|
|
1282
|
+
fs.writeFileSync(logPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1283
|
+
} catch (error) {
|
|
1284
|
+
console.error(`Failed to write eval logs to ${logPath}:`, error instanceof Error ? error.message : String(error));
|
|
1285
|
+
}
|
|
1286
|
+
}
|
|
1287
|
+
|
|
1288
|
+
//#endregion
|
|
1289
|
+
//#region src/eval/eval.ts
|
|
1290
|
+
/**
|
|
1291
|
+
* Run all test cases and return results.
|
|
1292
|
+
*/
|
|
1293
|
+
async function evaluate(config) {
|
|
1294
|
+
const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
|
|
1295
|
+
if (testCases.length === 0) throw new Error("testCases array cannot be empty");
|
|
1296
|
+
if (!executor) throw new Error("executor is required");
|
|
1297
|
+
const startTime = Date.now();
|
|
1298
|
+
const logPath = config.storeLogs ? typeof config.storeLogs === "string" ? config.storeLogs : `./didactic-logs/eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}/rawData.json` : void 0;
|
|
1299
|
+
const executeTestCase = async ({ input, expected }) => {
|
|
1300
|
+
try {
|
|
1301
|
+
const result = await executor(input, systemPrompt);
|
|
1302
|
+
let fields;
|
|
1303
|
+
if (comparatorOverride) {
|
|
1304
|
+
const compResult = await comparatorOverride(expected, result.output);
|
|
1305
|
+
fields = { "": {
|
|
1306
|
+
passed: compResult.passed,
|
|
1307
|
+
expected,
|
|
1308
|
+
actual: result.output
|
|
1309
|
+
} };
|
|
1310
|
+
} else {
|
|
1311
|
+
let comparatorConfig;
|
|
1312
|
+
if (!comparators) comparatorConfig = { "": exact };
|
|
1313
|
+
else if (typeof comparators === "function") comparatorConfig = { "": comparators };
|
|
1314
|
+
else comparatorConfig = comparators;
|
|
1315
|
+
fields = await compareFields({
|
|
1316
|
+
expected,
|
|
1317
|
+
actual: result.output,
|
|
1318
|
+
comparators: comparatorConfig,
|
|
1319
|
+
llmConfig: config.llmConfig
|
|
1320
|
+
});
|
|
1321
|
+
}
|
|
1322
|
+
const passedFields = Object.values(fields).filter((f) => f.passed).length;
|
|
1323
|
+
const totalFields$1 = Object.values(fields).length;
|
|
1324
|
+
const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
|
|
1325
|
+
const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
|
|
1326
|
+
const comparatorCost$1 = Object.values(fields).reduce((sum, field) => sum + (field.cost ?? 0), 0);
|
|
1327
|
+
return {
|
|
1328
|
+
input,
|
|
1329
|
+
expected,
|
|
1330
|
+
actual: result.output,
|
|
1331
|
+
additionalContext: result.additionalContext,
|
|
1332
|
+
cost: result.cost ?? 0,
|
|
1333
|
+
comparatorCost: comparatorCost$1,
|
|
1334
|
+
passed: passed$1,
|
|
1335
|
+
fields,
|
|
1336
|
+
passedFields,
|
|
1337
|
+
totalFields: totalFields$1,
|
|
1338
|
+
passRate
|
|
1339
|
+
};
|
|
1340
|
+
} catch (error) {
|
|
1341
|
+
return {
|
|
1342
|
+
input,
|
|
1343
|
+
expected,
|
|
1344
|
+
actual: void 0,
|
|
1345
|
+
cost: 0,
|
|
1346
|
+
comparatorCost: 0,
|
|
1347
|
+
passed: false,
|
|
1348
|
+
fields: {},
|
|
1349
|
+
passedFields: 0,
|
|
1350
|
+
totalFields: 0,
|
|
1351
|
+
passRate: 0,
|
|
1352
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1353
|
+
};
|
|
1354
|
+
}
|
|
1355
|
+
};
|
|
1356
|
+
const rateLimitBatch = config.rateLimitBatch;
|
|
1357
|
+
let results;
|
|
1358
|
+
if (rateLimitBatch && rateLimitBatch > 0) {
|
|
1359
|
+
results = [];
|
|
1360
|
+
const progress = createProgressUpdater("evals");
|
|
1361
|
+
for (let i = 0; i < testCases.length; i += rateLimitBatch) {
|
|
1362
|
+
const batch = testCases.slice(i, i + rateLimitBatch);
|
|
1363
|
+
const batchResults = await Promise.all(batch.map(executeTestCase));
|
|
1364
|
+
results.push(...batchResults);
|
|
1365
|
+
progress.update(results.length, testCases.length);
|
|
1366
|
+
const rateLimitPause = config.rateLimitPause;
|
|
1367
|
+
if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
|
|
1368
|
+
}
|
|
1369
|
+
progress.finish();
|
|
1370
|
+
} else {
|
|
1371
|
+
const progress = createProgressUpdater("evals");
|
|
1372
|
+
results = (await trackPromiseProgress(testCases.map((tc) => executeTestCase(tc)), (completed, total$1) => progress.update(completed, total$1))).map((r) => r.value);
|
|
1373
|
+
progress.finish();
|
|
1374
|
+
}
|
|
1375
|
+
results.sort((a, b) => {
|
|
1376
|
+
+ if (a.passed !== b.passed) return a.passed ? 1 : -1;
+ return a.passRate - b.passRate;
+ });
+ const passed = results.filter((r) => r.passed).length;
+ const total = results.length;
+ const successRate = total > 0 ? passed / total : 0;
+ let correctFields = 0;
+ let totalFields = 0;
+ for (const r of results) {
+ const fieldResults = Object.values(r.fields);
+ totalFields += fieldResults.length;
+ correctFields += fieldResults.filter((f) => f.passed).length;
+ }
+ const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
+ const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
+ const comparatorCost = results.reduce((sum, r) => sum + (r.comparatorCost ?? 0), 0);
+ const durationMs = Date.now() - startTime;
+ const logFolder = logPath ? path.dirname(logPath) : void 0;
+ const evalResult = {
+ systemPrompt,
+ testCases: results,
+ passed,
+ total,
+ successRate,
+ correctFields,
+ totalFields,
+ accuracy,
+ cost,
+ comparatorCost,
+ ...logFolder && { logFolder }
+ };
+ if (logPath) writeEvalLogs(logPath, evalResult, durationMs, config.perTestThreshold);
+ return evalResult;
+ }
+ /**
+ * Recursively compare expected vs actual, returning field-level results.
+ * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
+ */
+ async function compareFields(opts) {
+ const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, llmConfig } = opts;
+ const results = {};
+ const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
+ if (Array.isArray(expected)) {
+ if (!Array.isArray(actual)) return { [path$1]: {
+ passed: false,
+ expected,
+ actual
+ } };
+ if (expected.length === 0) return {};
+ const fieldComparator = comparators[getFieldName(path$1)];
+ const isUnordered = fieldComparator && typeof fieldComparator === "function" && "_unordered" in fieldComparator && fieldComparator._unordered === true;
+ let itemComparators;
+ if (isUnordered) itemComparators = fieldComparator._nestedComparators || comparators;
+ else if (fieldComparator && typeof fieldComparator === "object" && !("_unordered" in fieldComparator)) itemComparators = fieldComparator;
+ else itemComparators = comparators;
+ let matchedPairs;
+ if (isUnordered) matchedPairs = (await matchArrays(expected, actual, itemComparators)).assignments;
+ else {
+ matchedPairs = [];
+ for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
+ }
+ const matchedIndices = new Set(matchedPairs.map(([i]) => i));
+ for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, await compareFields({
+ expected: expected[expIdx],
+ actual: actual[actIdx],
+ comparators: itemComparators,
+ path: indexPath(expIdx),
+ expectedParent,
+ actualParent,
+ llmConfig
+ }));
+ const hasArrayComparator = fieldComparator !== void 0;
+ for (let i = 0; i < expected.length; i++) {
+ if (matchedIndices.has(i)) continue;
+ const item = expected[i];
+ if (isObject(item)) {
+ for (const [field, value] of Object.entries(item)) if (field in itemComparators) results[`${indexPath(i)}.${field}`] = {
+ passed: false,
+ expected: value,
+ actual: void 0
+ };
+ } else if (hasArrayComparator) results[indexPath(i)] = {
+ passed: false,
+ expected: item,
+ actual: void 0
+ };
+ }
+ return results;
+ }
+ if (isObject(expected)) {
+ if (!isObject(actual)) return { [path$1]: {
+ passed: false,
+ expected,
+ actual
+ } };
+ for (const [field, expValue] of Object.entries(expected)) {
+ const fieldPath = path$1 ? `${path$1}.${field}` : field;
+ const fieldConfig = comparators[field];
+ if (fieldConfig === void 0) continue;
+ let fieldComparators;
+ if (fieldConfig && typeof fieldConfig === "object" && !("_unordered" in fieldConfig)) fieldComparators = fieldConfig;
+ else fieldComparators = comparators;
+ Object.assign(results, await compareFields({
+ expected: expValue,
+ actual: actual[field],
+ comparators: fieldComparators,
+ path: fieldPath,
+ expectedParent: expected,
+ actualParent: actual,
+ llmConfig
+ }));
+ }
+ return results;
+ }
+ const fieldName = getFieldName(path$1);
+ let comparatorConfig = comparators[fieldName];
+ if (!comparatorConfig && fieldName === "") comparatorConfig = exact;
+ if (!comparatorConfig) return {};
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected, actual, {
+ expectedParent,
+ actualParent,
+ llmConfig
+ });
+ return { [path$1]: {
+ ...result,
+ expected,
+ actual
+ } };
+ }
+ function isObject(value) {
+ return value !== null && typeof value === "object" && !Array.isArray(value);
+ }
+ function getFieldName(path$1) {
+ return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
+ }
+
+ //#endregion
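The added compareFields keys its results by dotted and indexed paths ('carrier', 'quote.premium', 'quotes[0].carrier'), falls back to exact as the default comparator, and recurses into nested comparator objects. A minimal usage sketch, assuming the eval config takes executor, testCases, and comparators keys shaped as below (those key names, within(n), and the test data are assumptions, not confirmed by this diff):

// Sketch only — "documentId"/"carrier"/"premium" are hypothetical field names,
// and within(10) is assumed to return a numeric-tolerance comparator.
import { evaluate, exact, within, mock } from "@docshield/didactic";

const run = await evaluate({
  executor: mock([{ carrier: "ACME Insurance Co.", quote: { premium: 995 } }]),
  testCases: [{
    input: { documentId: "doc-1" },
    expected: { carrier: "ACME Insurance Co.", quote: { premium: 1000 } }
  }],
  comparators: {
    carrier: exact,                 // evaluated at path "carrier"
    quote: { premium: within(10) }  // nested config, evaluated at path "quote.premium"
  }
});
// run.testCases[0].fields["quote.premium"] => { passed, expected, actual, ... }
// run.accuracy is correct fields / total fields, per the code above.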
+ //#region src/optimizer/prompts.ts
+ /**
+ * Default system prompt for patch generation.
+ * Analyzes failures and suggests specific, focused changes to improve the prompt.
+ */
+ const DEFAULT_PATCH_SYSTEM_PROMPT = `
+ 'You are optimizing a system prompt for an LLM workflow.
+ Analyze the failure and suggest a specific, focused change to improve the prompt.
+ Do NOT overfit. Be generalizable.
+
+ <examples>
+ VERY IMPORTANT, CRITICAL!!!
+ Examples MUST be anonymized.
+ NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
+ - example: (for an invoice processor)
+ - task: extract data from parsed invoices
+ - failure context: (returned expected: true, actual: false)
+ - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
+
+ - example: (for a calendar app)
+ - task: extract cost from calendar event
+ - failure context: (cost expected: 123.45, actual: 167.89)
+ - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
+ </examples>
+ `;
+ /**
+ * Default system prompt for merging patches.
+ * Combines multiple patches into a coherent system prompt.
+ */
+ const DEFAULT_MERGE_SYSTEM_PROMPT = `
+ You are an expert LLM prompt editor.
+ You are merging improvements into a system prompt.
+ Incorporate the suggestions while keeping the prompt clear and coherent.
+ `;
+ /**
+ * Builds the user prompt for patch generation.
+ * Formats the failure context and current prompt for the LLM.
+ */
+ function buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures) {
+ let userContent = `
+ Current system prompt:
+ ---
+ ${currentPrompt}
+ ---
+
+ A test case failed:
+ ${formatFailure(failure)}
+ `;
+ if (previousBetterPrompt) {
+ const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
+ userContent += `
+ Note: The current prompt is a REGRESSION from a better-performing version.
+ Previous (better) prompt for reference:
+ ---
+ ${previousBetterPrompt}
+ ---
+
+ The failures the better prompt had:
+ ${failuresContext}
+
+ Your changes introduced new failures instead of fixing the above.
+ Analyze what changed between the two prompts that might have caused this regression.
+ Are there any new failures that were not present in the previous better prompt?
+ Are there any failures that were present in the previous better prompt but not in the current prompt?
+ Did any of our patches contradict any of the new failures?
+ `;
+ }
+ userContent += `
+ Suggest a specific change to the system prompt that would fix this failure.
+ Be concise. Output ONLY the suggested patch/change, not the full prompt.
+ DO NOT overfit the prompt to the test case.
+ Generalize examples if you choose to use them.
+ `;
+ return userContent;
+ }
+ /**
+ * Builds the user prompt for merging patches.
+ * Formats the current prompt and suggested patches for the LLM.
+ */
+ function buildMergeUserPrompt(patches, currentPrompt) {
+ return `
+ Current prompt:
+ ---
+ ${currentPrompt}
+ ---
+
+ Suggested improvements:
+ ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
+
+ Create a single improved system prompt that incorporates these suggestions.
+ Be mindful of the size of the new prompt.
+ Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
+ Output ONLY the new system prompt, nothing else.
+ Respect enums.
+ `;
}

//#endregion
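The patch and merge system prompts are now module-level constants in src/optimizer/prompts.ts, and generatePatch/mergePatches below fall back to them via config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT and config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT, so both can be overridden through the optimizer config. A minimal sketch, assuming evalConfig is the same object passed to evaluate(); the prompt strings are placeholders and the cost unit is assumed:

// Sketch only — field names mirror the checks in optimize() below
// (apiKey, targetSuccessRate, maxIterations, maxCost, thinking).
import { optimize } from "@docshield/didactic";

const result = await optimize(evalConfig, {
  provider: "anthropic_claude_haiku",
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  targetSuccessRate: 0.9,   // validated to be between 0 and 1
  maxIterations: 10,        // defaults to 5 unless maxCost is set
  maxCost: 5,               // stop once cumulative spend reaches this (USD, assumed)
  thinking: true,           // forwarded to callLLM as useThinking
  patchSystemPrompt: "Your own patch-generation instructions...",
  mergeSystemPrompt: "Your own merge instructions..."
});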
- //#region src/optimizer.ts
+ //#region src/optimizer/optimizer.ts
async function optimize(evalConfig, config) {
if (!config.apiKey) throw new Error("apiKey is required");
- if (!config.systemPrompt) throw new Error("systemPrompt is required");
if (config.targetSuccessRate < 0 || config.targetSuccessRate > 1) throw new Error("targetSuccessRate must be between 0 and 1");
const iterationLogs = [];
const maxIterations = config.maxIterations ?? (config.maxCost !== void 0 ? Infinity : 5);
const startTime = /* @__PURE__ */ new Date();
+ const model = PROVIDER_SPECS[config.provider].model;
const logContext = {
config,
startTime,
- model
+ model,
perTestThreshold: evalConfig.perTestThreshold,
rateLimitBatch: evalConfig.rateLimitBatch,
rateLimitPause: evalConfig.rateLimitPause
@@ -1149,6 +1676,8 @@ async function optimize(evalConfig, config) {
totalCost: cumulativeCost
};
};
+ const testCount = evalConfig.testCases?.length ?? 0;
+ logOptimizerHeader(model, config.targetSuccessRate, testCount);
for (let i = 1; i <= maxIterations; i++) {
const iterationStart = Date.now();
let iterInputTokens = 0;
@@ -1162,7 +1691,7 @@ async function optimize(evalConfig, config) {
});
cumulativeCost += result.cost;
logEvaluationResult(result, cumulativeCost, Date.now() - evalStart);
- const regressed = i > 1 && result.successRate
+ const regressed = i > 1 && result.successRate <= bestSuccessRate;
if (regressed) logRegressionDetected(bestSuccessRate);
if (result.successRate > bestSuccessRate) {
bestSuccessRate = result.successRate;
@@ -1175,10 +1704,6 @@ async function optimize(evalConfig, config) {
return finalizeOptimization(true, currentPrompt);
}
const failures = result.testCases.filter((tc) => !tc.passed);
- if (failures.length === 0) {
- recordIteration(i, currentPrompt, result, result.cost, Date.now() - iterationStart, iterInputTokens, iterOutputTokens);
- return finalizeOptimization(true, currentPrompt);
- }
logTargetFailures(config.targetSuccessRate, failures.length);
if (config.maxCost !== void 0 && cumulativeCost >= config.maxCost) {
logCostLimitReached(cumulativeCost);
@@ -1187,7 +1712,9 @@ async function optimize(evalConfig, config) {
}
logPatchGenerationStart(failures.length);
const patchStart = Date.now();
- const
+ const patchProgress = createProgressUpdater("patches");
+ const patchSettled = await trackPromiseProgress(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)), (completed, total) => patchProgress.update(completed, total));
+ patchProgress.finish();
const patchResults = patchSettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
const failedPatchCount = patchSettled.filter((r) => r.status === "rejected").length;
if (failedPatchCount > 0) logPatchGenerationFailures(failedPatchCount, failures.length);
@@ -1227,154 +1754,165 @@ async function optimize(evalConfig, config) {
}
return finalizeOptimization(false, bestPrompt);
}
- async function callLLM(messages, config, useThinking = false) {
- const spec = PROVIDER_SPECS[config.provider];
- try {
- if (config.provider.startsWith("anthropic")) {
- const client = new _anthropic_ai_sdk.default({ apiKey: config.apiKey });
- const streamOptions = {
- model: spec.model,
- max_tokens: spec.maxTokens,
- system: messages.find((m) => m.role === "system")?.content,
- messages: messages.filter((m) => m.role !== "system").map((m) => ({
- role: m.role,
- content: m.content
- }))
- };
- if (useThinking) streamOptions.thinking = {
- type: "enabled",
- budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
- };
- const finalMessage = await client.messages.stream(streamOptions).finalMessage();
- const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
- const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
- const inputTokens = finalMessage.usage.input_tokens;
- const outputTokens = finalMessage.usage.output_tokens;
- return {
- text,
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
- inputTokens,
- outputTokens
- };
- }
- if (config.provider.startsWith("openai")) {
- const client = new openai.default({ apiKey: config.apiKey });
- const completionOptions = {
- model: spec.model,
- messages: messages.map((m) => ({
- role: m.role,
- content: m.content
- })),
- max_completion_tokens: spec.maxTokens
- };
- if (useThinking) completionOptions.reasoning_effort = "xhigh";
- const response = await client.chat.completions.create(completionOptions);
- const text = response.choices[0].message.content ?? "";
- const inputTokens = response.usage?.prompt_tokens ?? 0;
- const outputTokens = response.usage?.completion_tokens ?? 0;
- return {
- text,
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
- inputTokens,
- outputTokens
- };
- }
- throw new Error(`Unsupported provider: ${config.provider}`);
- } catch (error) {
- const message = error instanceof Error ? error.message : String(error);
- throw new Error(`LLM call failed (${spec.model}): ${message}`);
- }
- }
async function generatePatch(failure, currentPrompt, config, previousBetterPrompt, previousBetterPromptFailures) {
-
-
- ---
- ${currentPrompt}
- ---
-
- A test case failed:
- ${formatFailure(failure)}
- `;
- if (previousBetterPrompt) {
- const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
- userContent += `
- Note: The current prompt is a REGRESSION from a better-performing version.
- Previous (better) prompt for reference:
- ---
- ${previousBetterPrompt}
- ---
-
- The failures the better prompt had:
- ${failuresContext}
-
- Your changes introduced new failures instead of fixing the above.
- Analyze what changed between the two prompts that might have caused this regression.
- Are there any new failures that were not present in the previous better prompt?
- Are there any failures that were present in the previous better prompt but not in the current prompt?
- Did any of our patches contradict any of the new failures?
- `;
- }
- userContent += `
- Suggest a specific change to the system prompt that would fix this failure.
- Be concise. Output ONLY the suggested patch/change, not the full prompt.
- DO NOT overfit the prompt to the test case.
- Generalize examples if you choose to use them.
- `;
- return callLLM([{
+ const userContent = buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures);
+ const messages = [{
role: "system",
- content:
- 'You are optimizing a system prompt for an LLM workflow.
- Analyze the failure and suggest a specific, focused change to improve the prompt.
- Do NOT overfit. Be generalizable.
-
- <examples>
- VERY IMPORTANT, CRITICAL!!!
- Examples MUST be anonymized.
- NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
- - example: (for an invoice processor)
- - task: extract data from parsed invoices
- - failure context: (returned expected: true, actual: false)
- - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
-
- - example: (for a calendar app)
- - task: extract cost from calendar event
- - failure context: (cost expected: 123.45, actual: 167.89)
- - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
- </examples>
- `
+ content: config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT
}, {
role: "user",
content: userContent
- }]
+ }];
+ return callLLM({
+ provider: config.provider,
+ apiKey: config.apiKey,
+ messages,
+ useThinking: config.thinking ?? false
+ });
}
async function mergePatches(patches, currentPrompt, config) {
- const systemContent =
-
-
- Incorporate the suggestions while keeping the prompt clear and coherent.
- `;
- const userContent = `
- Current prompt:
- ---
- ${currentPrompt}
- ---
-
- Suggested improvements:
- ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
-
- Create a single improved system prompt that incorporates these suggestions.
- Be mindful of the size of the new prompt.
- Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
- Output ONLY the new system prompt, nothing else.
- Respect enums.
- `;
- return callLLM([{
+ const systemContent = config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT;
+ const userContent = buildMergeUserPrompt(patches, currentPrompt);
+ const messages = [{
role: "system",
content: systemContent
}, {
role: "user",
content: userContent
- }]
+ }];
+ return callLLM({
+ provider: config.provider,
+ apiKey: config.apiKey,
+ messages,
+ useThinking: config.thinking ?? false
+ });
+ }
+
+ //#endregion
+ //#region src/eval/executors.ts
+ /**
+ * Creates an executor that calls an HTTP endpoint.
+ *
+ * @example
+ * ```ts
+ * const executor = endpoint('https://api.example.com/workflow', {
+ * headers: { Authorization: 'Bearer token' },
+ * });
+ * ```
+ */
+ function endpoint(url, config = {}) {
+ const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
+ return async (input, systemPrompt) => {
+ const body = typeof input === "object" && input !== null ? {
+ ...input,
+ systemPrompt
+ } : {
+ input,
+ systemPrompt
+ };
+ const controller = new AbortController();
+ const timeoutId = setTimeout(() => controller.abort(), timeout);
+ try {
+ const response = await fetch(url, {
+ method,
+ headers: {
+ "Content-Type": "application/json",
+ ...headers
+ },
+ body: JSON.stringify(body),
+ signal: controller.signal
+ });
+ clearTimeout(timeoutId);
+ if (!response.ok) {
+ const text = await response.text();
+ throw new Error(`HTTP ${response.status}: ${text}`);
+ }
+ const data = await response.json();
+ const additionalContext = mapAdditionalContext?.(data);
+ const cost = mapCost?.(data) ?? 0;
+ if (mapResponse) return {
+ output: mapResponse(data),
+ additionalContext,
+ cost
+ };
+ return {
+ output: data,
+ additionalContext,
+ cost
+ };
+ } catch (error) {
+ clearTimeout(timeoutId);
+ throw error;
+ }
+ };
+ }
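Besides the headers shown in its doc comment, endpoint() also accepts mapResponse, mapAdditionalContext, mapCost, and timeout (all destructured above). A hedged sketch of a fuller configuration; the /extract route and its { data, usage, trace } response shape are hypothetical:

// Sketch only — the response shape below is made up for illustration.
const executor = endpoint("https://api.example.com/extract", {
  method: "POST",                            // default
  headers: { Authorization: `Bearer ${process.env.API_TOKEN}` },
  timeout: 30_000,                           // ms before the request is aborted
  mapResponse: (res) => res.data,            // becomes the executor's `output`
  mapCost: (res) => res.usage?.cost ?? 0,    // rolled into the evaluation's cost total
  mapAdditionalContext: (res) => res.trace   // surfaced alongside the output
});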
+ /**
+ * Creates an executor from a local function.
+ *
+ * @example
+ * ```ts
+ * const executor = fn({
+ * fn: async (input, systemPrompt) => {
+ * const result = await myLLMCall(input, systemPrompt);
+ * return result;
+ * },
+ * });
+ * ```
+ *
+ * @example With mapResponse to extract output from a richer response:
+ * ```ts
+ * const executor = fn({
+ * fn: async (input, systemPrompt) => await startWorkflow({ ... }),
+ * mapResponse: (result) => ({ documentType: result.documentType }),
+ * mapCost: (result) => result.cost,
+ * mapAdditionalContext: (result) => result.metadata,
+ * });
+ * ```
+ */
+ function fn(config) {
+ return async (input, systemPrompt) => {
+ const raw = await config.fn(input, systemPrompt);
+ return {
+ output: config.mapResponse ? config.mapResponse(raw) : raw,
+ additionalContext: config.mapAdditionalContext?.(raw),
+ cost: config.mapCost?.(raw) ?? 0
+ };
+ };
+ }
+ /**
+ * Creates a mock executor for testing.
+ * Can accept either:
+ * - An array of outputs (returned in sequence, cycling if more calls than outputs)
+ * - A function that maps input to output
+ *
+ * @example Array-based:
+ * ```ts
+ * const executor = mock([
+ * { premium: 12500, policyType: 'claims-made' },
+ * { premium: 8200, policyType: 'entity' },
+ * ]);
+ * ```
+ *
+ * @example Function-based:
+ * ```ts
+ * const executor = mock((input) => ({
+ * id: input.id,
+ * processed: true,
+ * }));
+ * ```
+ */
+ function mock(outputsOrFn) {
+ if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
+ return { output: outputsOrFn(input, systemPrompt) };
+ };
+ const outputs = outputsOrFn;
+ if (outputs.length === 0) throw new Error("mock() requires at least one output");
+ let callIndex = 0;
+ return async () => {
+ const output = outputs[callIndex % outputs.length];
+ callIndex++;
+ return { output };
+ };
}

//#endregion
@@ -1438,11 +1976,13 @@ exports.endpoint = endpoint;
exports.evaluate = evaluate;
exports.exact = exact;
exports.fn = fn;
+ exports.llmCompare = llmCompare;
exports.mock = mock;
exports.name = name;
exports.numeric = numeric;
exports.oneOf = oneOf;
exports.optimize = optimize;
exports.presence = presence;
+ exports.unordered = unordered;
exports.within = within;
//# sourceMappingURL=index.cjs.map
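Two of the new exports above, llmCompare and unordered, correspond to the LLM-based comparator support (the llmConfig threaded through compareFields) and to the _unordered/_nestedComparators flags that trigger assignment-based array matching via munkres-js. Their exact signatures are not visible in this diff, so the arguments below are assumptions; a hedged sketch of how they would plug into a comparators map:

// Assumptions: unordered(nested) marks an array field for order-insensitive matching,
// and llmCompare(instruction) builds a comparator that asks the configured LLM
// whether expected and actual are equivalent. Neither signature is confirmed here.
import { unordered, llmCompare, exact } from "@docshield/didactic";

const comparators = {
  quotes: unordered({
    carrier: exact,
    coverageSummary: llmCompare("Do these summaries describe the same coverage?")
  })
};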