@docshield/didactic 0.1.1 → 0.1.4

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
package/dist/index.cjs CHANGED
@@ -26,25 +26,36 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
26
26
  }) : target, mod));
27
27
 
28
28
  //#endregion
29
- let chrono_node = require("chrono-node");
30
- chrono_node = __toESM(chrono_node);
31
- let date_fns = require("date-fns");
32
- let levenshtein = require("levenshtein");
33
- levenshtein = __toESM(levenshtein);
34
29
  let munkres_js = require("munkres-js");
35
30
  munkres_js = __toESM(munkres_js);
36
31
  let _anthropic_ai_sdk = require("@anthropic-ai/sdk");
37
32
  _anthropic_ai_sdk = __toESM(_anthropic_ai_sdk);
38
33
  let openai = require("openai");
39
34
  openai = __toESM(openai);
40
- let path = require("path");
41
- path = __toESM(path);
35
+ let chrono_node = require("chrono-node");
36
+ chrono_node = __toESM(chrono_node);
37
+ let date_fns = require("date-fns");
38
+ let levenshtein = require("levenshtein");
39
+ levenshtein = __toESM(levenshtein);
42
40
  let fs = require("fs");
43
41
  fs = __toESM(fs);
42
+ let path = require("path");
43
+ path = __toESM(path);
44
+ let chalk = require("chalk");
45
+ chalk = __toESM(chalk);
46
+ let ora = require("ora");
47
+ ora = __toESM(ora);
48
+ let cli_progress = require("cli-progress");
49
+ cli_progress = __toESM(cli_progress);
50
+ let figures = require("figures");
51
+ figures = __toESM(figures);
52
+ let crypto = require("crypto");
53
+ crypto = __toESM(crypto);
44
54
 
45
55
  //#region src/types.ts
46
56
  /**
47
57
  * Supported LLM providers.
58
+ * Used by both optimizer and LLM-based comparators.
48
59
  */
49
60
  let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
50
61
  LLMProviders$1["anthropic_claude_opus"] = "anthropic_claude_opus";
@@ -56,7 +67,7 @@ let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
56
67
  }({});
57
68
 
58
69
  //#endregion
59
- //#region src/constants.ts
70
+ //#region src/library/constants.ts
60
71
  const PROVIDER_SPECS = {
61
72
  [LLMProviders.anthropic_claude_opus]: {
62
73
  model: "claude-opus-4-5-20251101",
@@ -71,7 +82,7 @@ const PROVIDER_SPECS = {
71
82
  costPerMillionOutput: 15
72
83
  },
73
84
  [LLMProviders.anthropic_claude_haiku]: {
74
- model: "claude-haiku-4-5-20251101",
85
+ model: "claude-haiku-4-5-20251001",
75
86
  maxTokens: 64e3,
76
87
  costPerMillionInput: 1,
77
88
  costPerMillionOutput: 5
@@ -96,7 +107,154 @@ const DEFAULT_PER_TEST_THRESHOLD = 1;
96
107
  const NAME_SUFFIXES = /(?<=\S)\s*,?\s*(inc\.?|llc\.?|ltd\.?|l\.l\.c\.?|corp\.?|corporation|company|co\.?)$/i;
97
108
 
98
109
  //#endregion
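The PROVIDER_SPECS entries earlier in this region pair each provider with a model ID, a max-token limit, and per-million-token input/output prices, and the LLM client below computes call cost as `(inputTokens * costPerMillionInput + outputTokens * costPerMillionOutput) / TOKENS_PER_MILLION`. A minimal sketch of that arithmetic, reusing the claude-haiku prices visible in this diff and assuming `TOKENS_PER_MILLION` is 1,000,000 (its definition sits outside these hunks):

```ts
// Restates the cost formula used by callLLM/callStructuredLLM (illustrative only).
// Prices mirror the anthropic_claude_haiku spec shown in this diff; the value of
// TOKENS_PER_MILLION is an assumption, as it is defined outside these hunks.
const spec = { costPerMillionInput: 1, costPerMillionOutput: 5 };
const TOKENS_PER_MILLION = 1_000_000;

function estimateCost(inputTokens: number, outputTokens: number): number {
  return (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION;
}

console.log(estimateCost(12_000, 800)); // 12,000 in + 800 out ≈ $0.016
```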
99
- //#region src/comparators.ts
110
+ //#region src/library/llm/llm-client.ts
111
+ /**
112
+ * Call an LLM provider with the given messages.
113
+ * Returns raw text output - caller is responsible for parsing if structured output is needed.
114
+ */
115
+ async function callLLM(config) {
116
+ const { provider, apiKey, messages, useThinking = false } = config;
117
+ const spec = PROVIDER_SPECS[provider];
118
+ try {
119
+ if (provider.startsWith("anthropic")) {
120
+ const client = new _anthropic_ai_sdk.default({ apiKey });
121
+ const streamOptions = {
122
+ model: spec.model,
123
+ max_tokens: spec.maxTokens,
124
+ system: messages.find((m) => m.role === "system")?.content,
125
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
126
+ role: m.role,
127
+ content: m.content
128
+ }))
129
+ };
130
+ if (useThinking) streamOptions.thinking = {
131
+ type: "enabled",
132
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
133
+ };
134
+ const finalMessage = await client.messages.stream(streamOptions).finalMessage();
135
+ const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
136
+ const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
137
+ const inputTokens = finalMessage.usage.input_tokens;
138
+ const outputTokens = finalMessage.usage.output_tokens;
139
+ return {
140
+ text,
141
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
142
+ inputTokens,
143
+ outputTokens
144
+ };
145
+ }
146
+ if (provider.startsWith("openai")) {
147
+ const client = new openai.default({ apiKey });
148
+ const completionOptions = {
149
+ model: spec.model,
150
+ messages: messages.map((m) => ({
151
+ role: m.role,
152
+ content: m.content
153
+ })),
154
+ max_completion_tokens: spec.maxTokens
155
+ };
156
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
157
+ const response = await client.chat.completions.create(completionOptions);
158
+ const text = response.choices[0].message.content ?? "";
159
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
160
+ const outputTokens = response.usage?.completion_tokens ?? 0;
161
+ return {
162
+ text,
163
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
164
+ inputTokens,
165
+ outputTokens
166
+ };
167
+ }
168
+ throw new Error(`Unsupported provider: ${provider}`);
169
+ } catch (error) {
170
+ const message = error instanceof Error ? error.message : String(error);
171
+ throw new Error(`LLM call failed (${spec.model}): ${message}`);
172
+ }
173
+ }
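callLLM, added in this version, takes a provider, an API key, and an array of `{ role, content }` messages (the "system" message is split out for Anthropic) and returns the raw text along with token counts and cost. A hedged usage sketch, assuming the function and the LLMProviders enum are exported from the package entry point (the export surface is not visible in this dist diff):

```ts
import { callLLM, LLMProviders } from "@docshield/didactic"; // assumed exports

const result = await callLLM({
  provider: LLMProviders.anthropic_claude_haiku,
  apiKey: process.env.ANTHROPIC_API_KEY!,
  useThinking: false,
  messages: [
    { role: "system", content: "You are a terse assistant." },
    { role: "user", content: "Summarize this invoice in one sentence." },
  ],
});

// Raw text plus usage accounting, per the return shape above.
console.log(result.text, result.inputTokens, result.outputTokens, result.cost);
```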
174
+ /**
175
+ * Call an LLM provider with structured output.
176
+ * Returns parsed JSON data conforming to the provided schema.
177
+ */
178
+ async function callStructuredLLM(config) {
179
+ const { provider, apiKey, messages, schema, useThinking = false } = config;
180
+ const spec = PROVIDER_SPECS[provider];
181
+ try {
182
+ if (provider.startsWith("anthropic")) {
183
+ const client = new _anthropic_ai_sdk.default({ apiKey });
184
+ const baseOptions = {
185
+ model: spec.model,
186
+ max_tokens: spec.maxTokens,
187
+ betas: ["structured-outputs-2025-11-13"],
188
+ system: messages.find((m) => m.role === "system")?.content,
189
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
190
+ role: m.role,
191
+ content: m.content
192
+ })),
193
+ output_format: {
194
+ type: "json_schema",
195
+ schema
196
+ }
197
+ };
198
+ const streamOptions = useThinking ? {
199
+ ...baseOptions,
200
+ thinking: {
201
+ type: "enabled",
202
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
203
+ }
204
+ } : baseOptions;
205
+ const finalMessage = await client.beta.messages.stream(streamOptions).finalMessage();
206
+ const content = finalMessage.content[0];
207
+ if (content.type !== "text") throw new Error("Unexpected response type from LLM");
208
+ const data = JSON.parse(content.text);
209
+ const inputTokens = finalMessage.usage.input_tokens;
210
+ const outputTokens = finalMessage.usage.output_tokens;
211
+ return {
212
+ data,
213
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
214
+ inputTokens,
215
+ outputTokens
216
+ };
217
+ }
218
+ if (provider.startsWith("openai")) {
219
+ const client = new openai.default({ apiKey });
220
+ const completionOptions = {
221
+ model: spec.model,
222
+ messages: messages.map((m) => ({
223
+ role: m.role,
224
+ content: m.content
225
+ })),
226
+ max_completion_tokens: spec.maxTokens,
227
+ response_format: {
228
+ type: "json_schema",
229
+ json_schema: {
230
+ name: "response",
231
+ strict: true,
232
+ schema
233
+ }
234
+ }
235
+ };
236
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
237
+ const response = await client.chat.completions.create(completionOptions);
238
+ const text = response.choices[0].message.content ?? "";
239
+ const data = JSON.parse(text);
240
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
241
+ const outputTokens = response.usage?.completion_tokens ?? 0;
242
+ return {
243
+ data,
244
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
245
+ inputTokens,
246
+ outputTokens
247
+ };
248
+ }
249
+ throw new Error(`Unsupported provider: ${provider}`);
250
+ } catch (error) {
251
+ const message = error instanceof Error ? error.message : String(error);
252
+ throw new Error(`Structured LLM call failed (${spec.model}): ${message}`);
253
+ }
254
+ }
255
+
256
+ //#endregion
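callStructuredLLM wraps the same providers but requests JSON conforming to a caller-supplied schema (Anthropic's structured-outputs beta or OpenAI's json_schema response format) and returns the parsed object as `data`. A sketch under the same assumption that the helper is exported; the schema and input text are illustrative:

```ts
import { callStructuredLLM, LLMProviders } from "@docshield/didactic"; // assumed exports

// Illustrative JSON schema for the structured response.
const schema = {
  type: "object",
  properties: {
    vendor: { type: "string" },
    total: { type: "number" },
  },
  required: ["vendor", "total"],
  additionalProperties: false,
};

const { data, cost } = await callStructuredLLM({
  provider: LLMProviders.anthropic_claude_opus,
  apiKey: process.env.ANTHROPIC_API_KEY!,
  schema,
  messages: [{ role: "user", content: "Extract vendor and total from: ACME Corp, $41.20" }],
});

const parsed = data as { vendor: string; total: number };
console.log(parsed.vendor, parsed.total, cost);
```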
257
+ //#region src/eval/comparators/comparators.ts
100
258
  /** Checks if actual string contains a substring. */
101
259
  function contains(substring) {
102
260
  return (_expected, actual) => {
@@ -233,6 +391,103 @@ function within(config) {
233
391
  };
234
392
  };
235
393
  }
394
+ /** Schema for LLM comparison response. */
395
+ const LLM_COMPARE_SCHEMA = {
396
+ type: "object",
397
+ properties: {
398
+ passed: {
399
+ type: "boolean",
400
+ description: "Whether the actual value matches the expected value"
401
+ },
402
+ rationale: {
403
+ type: "string",
404
+ description: "Brief explanation of the comparison decision"
405
+ }
406
+ },
407
+ required: ["passed", "rationale"],
408
+ additionalProperties: false
409
+ };
410
+ const DEFAULT_LLM_COMPARE_SYSTEM_PROMPT = `Compare the following two values and determine if they are semantically equivalent.
411
+
412
+ Focus on whether they convey the same core meaning or information, even if expressed differently. Consider synonyms, paraphrasing, and stylistic variations as acceptable. Only mark as failed if there are substantial differences in the actual facts or meaning being conveyed.`;
413
+ const buildLLMCompareUserPrompt = (expected, actual) => `Expected value:
414
+ ${JSON.stringify(expected, null, 2)}
415
+
416
+ Actual value:
417
+ ${JSON.stringify(actual, null, 2)}`;
418
+ /**
419
+ * Uses an LLM to compare expected vs actual values.
420
+ * Returns a comparison result with rationale and cost tracking.
421
+ * Default provider: anthropic_claude_haiku (fastest, cheapest).
422
+ */
423
+ function llmCompare(config) {
424
+ const systemPrompt = config.systemPrompt ?? DEFAULT_LLM_COMPARE_SYSTEM_PROMPT;
425
+ return async (expected, actual, context) => {
426
+ try {
427
+ const apiKey = config.apiKey ?? context?.llmConfig?.apiKey;
428
+ if (!apiKey) throw new Error("llmCompare requires an apiKey. Either pass it directly to llmCompare() or set llmConfig.apiKey in eval config.");
429
+ const provider = config.provider ?? context?.llmConfig?.provider ?? LLMProviders.anthropic_claude_haiku;
430
+ const userPrompt = buildLLMCompareUserPrompt(expected, actual);
431
+ const result = await callStructuredLLM({
432
+ provider,
433
+ apiKey,
434
+ messages: [{
435
+ role: "system",
436
+ content: systemPrompt
437
+ }, {
438
+ role: "user",
439
+ content: userPrompt
440
+ }],
441
+ schema: LLM_COMPARE_SCHEMA
442
+ });
443
+ return {
444
+ passed: result.data.passed,
445
+ rationale: result.data.rationale,
446
+ cost: result.cost,
447
+ similarity: result.data.passed ? 1 : 0
448
+ };
449
+ } catch (error) {
450
+ return {
451
+ passed: false,
452
+ rationale: `LLM comparison failed: ${error instanceof Error ? error.message : String(error)}`,
453
+ cost: 0,
454
+ similarity: 0
455
+ };
456
+ }
457
+ };
458
+ }
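llmCompare turns a structured LLM call into a comparator: it asks the model whether expected and actual are semantically equivalent and reports `passed`, a rationale, and the call cost, defaulting to the Haiku provider. A sketch of plugging it into a comparators map (assuming llmCompare and exact are exported; field names are illustrative):

```ts
import { llmCompare, exact } from "@docshield/didactic"; // assumed exports

const comparators = {
  policyNumber: exact,
  // Uses the default Haiku provider unless overridden; the API key can also
  // come from llmConfig.apiKey on the eval config instead of being passed here.
  coverageSummary: llmCompare({ apiKey: process.env.ANTHROPIC_API_KEY! }),
};
```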
459
+ /**
460
+ * Marks a comparator or comparator config as unordered.
461
+ * When applied to an array field, items will be matched by similarity
462
+ * rather than index position (using Hungarian algorithm).
463
+ *
464
+ * @example
465
+ * // Unordered array of objects
466
+ * lineItems: unordered({
467
+ * description: name,
468
+ * price: within({ tolerance: 5 })
469
+ * })
470
+ *
471
+ * @example
472
+ * // Unordered array of primitives
473
+ * tags: unordered(exact)
474
+ *
475
+ * @example
476
+ * // When entire output is an array
477
+ * comparators: unordered({
478
+ * carrier: exact,
479
+ * premium: within({ tolerance: 0.05 })
480
+ * })
481
+ */
482
+ function unordered(comparator) {
483
+ const baseFunction = typeof comparator === "function" ? comparator : () => {
484
+ throw new Error("unordered() base function should not be called when nested comparators exist. This is likely a bug in the evaluation logic.");
485
+ };
486
+ return Object.assign(baseFunction, {
487
+ _unordered: true,
488
+ _nestedComparators: typeof comparator === "object" ? comparator : void 0
489
+ });
490
+ }
236
491
  /**
237
492
  * Deep equality comparison with cycle detection.
238
493
  * Uses WeakSet to track visited object pairs to prevent stack overflow on circular references.
@@ -270,187 +525,63 @@ function normalizeNumeric(value) {
270
525
  if (value == null || value === "") return null;
271
526
  const str = String(value);
272
527
  const isNegativeParens = /^\(.*\)$/.test(str.trim());
273
- let cleaned = str.replace(/[^0-9.\-]/g, "");
528
+ let cleaned = str.replace(/[^0-9.-]/g, "");
274
529
  if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
275
530
  const num = parseFloat(cleaned);
276
531
  return isNaN(num) ? null : num;
277
532
  }
278
533
 
279
534
  //#endregion
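normalizeNumeric strips everything except digits, dots, and minus signs, and treats a fully parenthesized string as an accounting-style negative; the change in this hunk only simplifies the regex character class, not the behavior. A self-contained restatement of those rules (the bundled function itself is internal, so this is illustrative):

```ts
// Minimal restatement of the normalization rules implemented above (sketch only).
function normalizeNumericSketch(value: unknown): number | null {
  if (value == null || value === "") return null;
  const str = String(value);
  const isNegativeParens = /^\(.*\)$/.test(str.trim());
  let cleaned = str.replace(/[^0-9.-]/g, "");
  if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
  const num = parseFloat(cleaned);
  return Number.isNaN(num) ? null : num;
}

normalizeNumericSketch("$1,234.50");  // 1234.5   (currency symbols and commas stripped)
normalizeNumericSketch("(1,234.50)"); // -1234.5  (parentheses read as a negative amount)
normalizeNumericSketch("n/a");        // null     (nothing numeric survives cleaning)
```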
280
- //#region src/executors.ts
535
+ //#region src/eval/comparators/matching.ts
536
+ function isObject$1(value) {
537
+ return value !== null && typeof value === "object" && !Array.isArray(value);
538
+ }
281
539
  /**
282
- * Creates an executor that calls an HTTP endpoint.
283
- *
284
- * @example
285
- * ```ts
286
- * const executor = endpoint('https://api.example.com/workflow', {
287
- * headers: { Authorization: 'Bearer token' },
288
- * });
289
- * ```
540
+ * Calculate similarity score between two values (0.0 to 1.0).
541
+ * For arrays: recursively match and average similarity of paired elements.
542
+ * For objects: average similarity across all fields using comparator results.
543
+ * For primitives: uses exact comparison's similarity score.
290
544
  */
291
- function endpoint(url, config = {}) {
292
- const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
293
- return async (input, systemPrompt) => {
294
- const body = typeof input === "object" && input !== null ? {
295
- ...input,
296
- systemPrompt
297
- } : {
298
- input,
299
- systemPrompt
300
- };
301
- const controller = new AbortController();
302
- const timeoutId = setTimeout(() => controller.abort(), timeout);
303
- try {
304
- const response = await fetch(url, {
305
- method,
306
- headers: {
307
- "Content-Type": "application/json",
308
- ...headers
309
- },
310
- body: JSON.stringify(body),
311
- signal: controller.signal
312
- });
313
- clearTimeout(timeoutId);
314
- if (!response.ok) {
315
- const text = await response.text();
316
- throw new Error(`HTTP ${response.status}: ${text}`);
317
- }
318
- const data = await response.json();
319
- const additionalContext = mapAdditionalContext?.(data);
320
- const cost = mapCost?.(data) ?? 0;
321
- if (mapResponse) return {
322
- output: mapResponse(data),
323
- additionalContext,
324
- cost
325
- };
326
- return {
327
- output: data,
328
- additionalContext,
329
- cost
330
- };
331
- } catch (error) {
332
- clearTimeout(timeoutId);
333
- throw error;
334
- }
335
- };
545
+ async function getSimilarity(expected, actual, comparators) {
546
+ if (Array.isArray(expected) && Array.isArray(actual)) {
547
+ if (expected.length === 0 && actual.length === 0) return 1;
548
+ if (expected.length === 0 || actual.length === 0) return 0;
549
+ const result = await matchArrays(expected, actual, comparators);
550
+ let total$1 = 0;
551
+ for (const [expIdx, actIdx] of result.assignments) total$1 += await getSimilarity(expected[expIdx], actual[actIdx], comparators);
552
+ const maxLen = Math.max(expected.length, actual.length);
553
+ return total$1 / maxLen;
554
+ }
555
+ if (!isObject$1(expected) || !isObject$1(actual)) {
556
+ const result = exact(expected, actual);
557
+ return result.similarity ?? (result.passed ? 1 : 0);
558
+ }
559
+ const fields = Object.keys(expected).filter((key) => {
560
+ const comp = comparators[key];
561
+ return comp !== void 0 && typeof comp === "function";
562
+ });
563
+ if (fields.length === 0) return 1;
564
+ let total = 0;
565
+ for (const key of fields) {
566
+ const comparatorConfig = comparators[key];
567
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected[key], actual[key], {
568
+ expectedParent: expected,
569
+ actualParent: actual
570
+ });
571
+ total += result.similarity ?? (result.passed ? 1 : 0);
572
+ }
573
+ return total / fields.length;
336
574
  }
337
575
  /**
338
- * Creates an executor from a local function.
339
- *
340
- * @example
341
- * ```ts
342
- * const executor = fn({
343
- * fn: async (input, systemPrompt) => {
344
- * const result = await myLLMCall(input, systemPrompt);
345
- * return result;
346
- * },
347
- * });
348
- * ```
576
+ * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
577
+ * Pure matching - no pass/fail determination.
349
578
  *
350
- * @example With mapResponse to extract output from a richer response:
351
- * ```ts
352
- * const executor = fn({
353
- * fn: async (input, systemPrompt) => await startWorkflow({ ... }),
354
- * mapResponse: (result) => ({ documentType: result.documentType }),
355
- * mapCost: (result) => result.cost,
356
- * mapAdditionalContext: (result) => result.metadata,
357
- * });
358
- * ```
579
+ * @param expected - Array of expected items
580
+ * @param actual - Array of actual items
581
+ * @param comparators - Nested comparator configuration for array items
582
+ * @returns Matching result with assignments and unmatched indices
359
583
  */
360
- function fn(config) {
361
- return async (input, systemPrompt) => {
362
- const raw = await config.fn(input, systemPrompt);
363
- return {
364
- output: config.mapResponse ? config.mapResponse(raw) : raw,
365
- additionalContext: config.mapAdditionalContext?.(raw),
366
- cost: config.mapCost?.(raw) ?? 0
367
- };
368
- };
369
- }
370
- /**
371
- * Creates a mock executor for testing.
372
- * Can accept either:
373
- * - An array of outputs (returned in sequence, cycling if more calls than outputs)
374
- * - A function that maps input to output
375
- *
376
- * @example Array-based:
377
- * ```ts
378
- * const executor = mock([
379
- * { premium: 12500, policyType: 'claims-made' },
380
- * { premium: 8200, policyType: 'entity' },
381
- * ]);
382
- * ```
383
- *
384
- * @example Function-based:
385
- * ```ts
386
- * const executor = mock((input) => ({
387
- * id: input.id,
388
- * processed: true,
389
- * }));
390
- * ```
391
- */
392
- function mock(outputsOrFn) {
393
- if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
394
- return { output: outputsOrFn(input, systemPrompt) };
395
- };
396
- const outputs = outputsOrFn;
397
- if (outputs.length === 0) throw new Error("mock() requires at least one output");
398
- let callIndex = 0;
399
- return async () => {
400
- const output = outputs[callIndex % outputs.length];
401
- callIndex++;
402
- return { output };
403
- };
404
- }
405
-
406
- //#endregion
407
- //#region src/matching.ts
408
- function isObject$1(value) {
409
- return value !== null && typeof value === "object" && !Array.isArray(value);
410
- }
411
- /**
412
- * Calculate similarity score between two values (0.0 to 1.0).
413
- * For arrays: recursively match and average similarity of paired elements.
414
- * For objects: average similarity across all fields using comparator results.
415
- * For primitives: uses exact comparison's similarity score.
416
- */
417
- function getSimilarity(expected, actual, comparators) {
418
- if (Array.isArray(expected) && Array.isArray(actual)) {
419
- if (expected.length === 0 && actual.length === 0) return 1;
420
- if (expected.length === 0 || actual.length === 0) return 0;
421
- const result = matchArrays(expected, actual, comparators);
422
- let total$1 = 0;
423
- for (const [expIdx, actIdx] of result.assignments) total$1 += getSimilarity(expected[expIdx], actual[actIdx], comparators);
424
- const maxLen = Math.max(expected.length, actual.length);
425
- return total$1 / maxLen;
426
- }
427
- if (!isObject$1(expected) || !isObject$1(actual)) {
428
- const result = exact(expected, actual);
429
- return result.similarity ?? (result.passed ? 1 : 0);
430
- }
431
- const fields = Object.keys(expected).filter((key) => comparators[key]);
432
- if (fields.length === 0) return 1;
433
- let total = 0;
434
- for (const key of fields) {
435
- const comparator = comparators[key];
436
- const result = comparator(expected[key], actual[key], {
437
- expectedParent: expected,
438
- actualParent: actual
439
- });
440
- total += result.similarity ?? (result.passed ? 1 : 0);
441
- }
442
- return total / fields.length;
443
- }
444
- /**
445
- * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
446
- * Pure matching - no pass/fail determination.
447
- *
448
- * @param expected - Array of expected items
449
- * @param actual - Array of actual items
450
- * @param comparators - Map of field names to comparator functions
451
- * @returns Matching result with assignments and unmatched indices
452
- */
453
- function matchArrays(expected, actual, comparators = {}) {
584
+ async function matchArrays(expected, actual, comparators = {}) {
454
585
  if (expected.length === 0) return {
455
586
  assignments: [],
456
587
  unmatchedExpected: [],
@@ -461,7 +592,7 @@ function matchArrays(expected, actual, comparators = {}) {
461
592
  unmatchedExpected: [...Array(expected.length).keys()],
462
593
  unmatchedActual: []
463
594
  };
464
- const rawAssignments = (0, munkres_js.default)(expected.map((exp) => actual.map((act) => 1 - getSimilarity(exp, act, comparators))));
595
+ const rawAssignments = (0, munkres_js.default)(await Promise.all(expected.map(async (exp) => Promise.all(actual.map(async (act) => 1 - await getSimilarity(exp, act, comparators))))));
465
596
  const assignments = [];
466
597
  const matchedExp = /* @__PURE__ */ new Set();
467
598
  const matchedAct = /* @__PURE__ */ new Set();
@@ -478,212 +609,126 @@ function matchArrays(expected, actual, comparators = {}) {
478
609
  }
479
610
 
480
611
  //#endregion
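matchArrays builds a cost matrix of (1 − similarity) for every expected/actual pair and hands it to munkres-js, so the returned assignments are the index pairs with the highest total similarity; getSimilarity is now async because nested comparators (such as llmCompare) can be. A hedged sketch of the result shape, using illustrative data (these helpers appear internal to the bundle, so this is not a supported public API):

```ts
// matchArrays and exact here refer to the functions defined in the bundle above.
const expected = [{ carrier: "Acme" }, { carrier: "Globex" }];
const actual = [{ carrier: "Globex" }, { carrier: "Acme" }, { carrier: "Initech" }];

const { assignments, unmatchedActual } = await matchArrays(expected, actual, {
  carrier: exact,
});

// assignments ≈ [[0, 1], [1, 0]]  -> expected[0] pairs with actual[1], and so on
// unmatchedActual ≈ [2]           -> the extra "Initech" item has no partner
```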
481
- //#region src/eval.ts
612
+ //#region src/optimizer/ui.ts
482
613
  /**
483
- * Run all test cases and return results.
614
+ * UI utilities for beautiful console output
484
615
  */
485
- async function evaluate(config) {
486
- const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
487
- if (testCases.length === 0) throw new Error("testCases array cannot be empty");
488
- if (!executor) throw new Error("executor is required");
489
- if (!comparators && !comparatorOverride) throw new Error("either \"comparators\" (field mapping or single function) or \"comparatorOverride\" (whole-object) is required");
490
- const executeTestCase = async ({ input, expected }) => {
491
- try {
492
- const result = await executor(input, systemPrompt);
493
- let fields;
494
- if (comparatorOverride) {
495
- const compResult = comparatorOverride(expected, result.output);
496
- fields = { "": {
497
- passed: compResult.passed,
498
- expected,
499
- actual: result.output
500
- } };
501
- } else if (typeof comparators === "function") if (Array.isArray(expected)) fields = compareFields({
502
- expected,
503
- actual: result.output,
504
- comparators: { "": comparators },
505
- unorderedList: config.unorderedList
506
- });
507
- else {
508
- const compResult = comparators(expected, result.output, {
509
- expectedParent: void 0,
510
- actualParent: void 0
511
- });
512
- fields = { "": {
513
- ...compResult,
514
- expected,
515
- actual: result.output
516
- } };
517
- }
518
- else fields = compareFields({
519
- expected,
520
- actual: result.output,
521
- comparators,
522
- unorderedList: config.unorderedList
523
- });
524
- const passedFields = Object.values(fields).filter((f) => f.passed).length;
525
- const totalFields$1 = Object.values(fields).length;
526
- const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
527
- const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
528
- return {
529
- input,
530
- expected,
531
- actual: result.output,
532
- additionalContext: result.additionalContext,
533
- cost: result.cost ?? 0,
534
- passed: passed$1,
535
- fields,
536
- passedFields,
537
- totalFields: totalFields$1,
538
- passRate
539
- };
540
- } catch (error) {
541
- return {
542
- input,
543
- expected,
544
- actual: void 0,
545
- cost: 0,
546
- passed: false,
547
- fields: {},
548
- passedFields: 0,
549
- totalFields: 0,
550
- passRate: 0,
551
- error: error instanceof Error ? error.message : String(error)
552
- };
616
+ const theme = {
617
+ success: chalk.default.green,
618
+ error: chalk.default.red,
619
+ warning: chalk.default.yellow,
620
+ bold: chalk.default.bold,
621
+ dim: chalk.default.dim,
622
+ check: chalk.default.green(figures.default.tick),
623
+ cross: chalk.default.red(figures.default.cross),
624
+ warn: chalk.default.yellow(figures.default.warning),
625
+ bullet: chalk.default.dim(figures.default.bullet),
626
+ pointer: chalk.default.yellow(figures.default.pointer),
627
+ separator: chalk.default.dim(" · "),
628
+ divider: (label, width = 60) => {
629
+ const prefix = `━━━ ${label} `;
630
+ const remaining = Math.max(0, width - prefix.length);
631
+ return chalk.default.cyan.dim(prefix + "━".repeat(remaining));
632
+ }
633
+ };
634
+ let activeSpinner = null;
635
+ const spinner = {
636
+ start(text) {
637
+ if (activeSpinner) activeSpinner.stop();
638
+ activeSpinner = (0, ora.default)({
639
+ text,
640
+ spinner: "dots",
641
+ indent: 4
642
+ }).start();
643
+ return activeSpinner;
644
+ },
645
+ succeed(text) {
646
+ if (activeSpinner) {
647
+ activeSpinner.succeed(text);
648
+ activeSpinner = null;
553
649
  }
554
- };
555
- const rateLimitBatch = config.rateLimitBatch;
556
- let results;
557
- if (rateLimitBatch && rateLimitBatch > 0) {
558
- results = [];
559
- for (let i = 0; i < testCases.length; i += rateLimitBatch) {
560
- const batch = testCases.slice(i, i + rateLimitBatch);
561
- const batchResults = await Promise.all(batch.map(executeTestCase));
562
- results.push(...batchResults);
563
- const rateLimitPause = config.rateLimitPause;
564
- if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
650
+ },
651
+ fail(text) {
652
+ if (activeSpinner) {
653
+ activeSpinner.fail(text);
654
+ activeSpinner = null;
565
655
  }
566
- } else results = await Promise.all(testCases.map(executeTestCase));
567
- results.sort((a, b) => {
568
- if (a.passed !== b.passed) return a.passed ? 1 : -1;
569
- return a.passRate - b.passRate;
570
- });
571
- const passed = results.filter((r) => r.passed).length;
572
- const total = results.length;
573
- const successRate = total > 0 ? passed / total : 0;
574
- let correctFields = 0;
575
- let totalFields = 0;
576
- for (const r of results) {
577
- const fieldResults = Object.values(r.fields);
578
- totalFields += fieldResults.length;
579
- correctFields += fieldResults.filter((f) => f.passed).length;
656
+ },
657
+ stop() {
658
+ if (activeSpinner) {
659
+ activeSpinner.stop();
660
+ activeSpinner = null;
661
+ }
662
+ },
663
+ clear() {
664
+ if (activeSpinner) activeSpinner.clear();
665
+ },
666
+ isActive() {
667
+ return activeSpinner !== null;
580
668
  }
581
- const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
582
- const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
669
+ };
670
+ function createProgressTracker(label) {
671
+ let bar = null;
672
+ let startTime = 0;
673
+ let lastUpdate = 0;
674
+ const MIN_UPDATE_INTERVAL = 100;
583
675
  return {
584
- systemPrompt,
585
- testCases: results,
586
- passed,
587
- total,
588
- successRate,
589
- correctFields,
590
- totalFields,
591
- accuracy,
592
- cost
676
+ start(total) {
677
+ spinner.stop();
678
+ startTime = Date.now();
679
+ bar = new cli_progress.default.SingleBar({
680
+ format: ` {bar} {percentage}% {value}/{total} ${label} {duration_formatted}`,
681
+ barCompleteChar: "█",
682
+ barIncompleteChar: "░",
683
+ barsize: 20,
684
+ hideCursor: true,
685
+ clearOnComplete: false,
686
+ stopOnComplete: false,
687
+ forceRedraw: true,
688
+ fps: 10
689
+ });
690
+ bar.start(total, 0, { duration_formatted: "0s" });
691
+ },
692
+ update(current) {
693
+ const now = Date.now();
694
+ if (now - lastUpdate < MIN_UPDATE_INTERVAL && bar) {
695
+ if (current < bar.getTotal()) return;
696
+ }
697
+ lastUpdate = now;
698
+ if (bar) {
699
+ const elapsed = Math.round((now - startTime) / 1e3);
700
+ bar.update(current, { duration_formatted: `${elapsed}s` });
701
+ }
702
+ },
703
+ stop() {
704
+ if (bar) {
705
+ const elapsed = Math.round((Date.now() - startTime) / 1e3);
706
+ bar.update(bar.getTotal(), { duration_formatted: `${elapsed}s` });
707
+ bar.stop();
708
+ bar = null;
709
+ }
710
+ }
593
711
  };
594
712
  }
595
- /**
596
- * Recursively compare expected vs actual, returning field-level results.
597
- * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
598
- */
599
- function compareFields(opts) {
600
- const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, unorderedList = false } = opts;
601
- const results = {};
602
- const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
603
- if (Array.isArray(expected)) {
604
- if (!Array.isArray(actual)) return { [path$1]: {
605
- passed: false,
606
- expected,
607
- actual
608
- } };
609
- if (expected.length === 0) return {};
610
- let matchedPairs;
611
- if (unorderedList) matchedPairs = matchArrays(expected, actual, comparators).assignments;
612
- else {
613
- matchedPairs = [];
614
- for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
615
- }
616
- const matchedIndices = new Set(matchedPairs.map(([i]) => i));
617
- for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, compareFields({
618
- expected: expected[expIdx],
619
- actual: actual[actIdx],
620
- comparators,
621
- path: indexPath(expIdx),
622
- expectedParent,
623
- actualParent,
624
- unorderedList
625
- }));
626
- const arrayFieldName = getFieldName(path$1);
627
- const hasArrayComparator = arrayFieldName in comparators || arrayFieldName === "";
628
- for (let i = 0; i < expected.length; i++) {
629
- if (matchedIndices.has(i)) continue;
630
- const item = expected[i];
631
- if (isObject(item)) {
632
- for (const [field, value] of Object.entries(item)) if (field in comparators) results[`${indexPath(i)}.${field}`] = {
633
- passed: false,
634
- expected: value,
635
- actual: void 0
636
- };
637
- } else if (hasArrayComparator) results[indexPath(i)] = {
638
- passed: false,
639
- expected: item,
640
- actual: void 0
641
- };
642
- }
643
- return results;
644
- }
645
- if (isObject(expected)) {
646
- if (!isObject(actual)) return { [path$1]: {
647
- passed: false,
648
- expected,
649
- actual
650
- } };
651
- for (const [field, expValue] of Object.entries(expected)) {
652
- const fieldPath = path$1 ? `${path$1}.${field}` : field;
653
- Object.assign(results, compareFields({
654
- expected: expValue,
655
- actual: actual[field],
656
- comparators,
657
- path: fieldPath,
658
- expectedParent: expected,
659
- actualParent: actual,
660
- unorderedList
661
- }));
662
- }
663
- return results;
664
- }
665
- const fieldName = getFieldName(path$1);
666
- const comparator = comparators[fieldName] ?? (fieldName === "" ? exact : void 0);
667
- if (!comparator) return {};
668
- const result = comparator(expected, actual, {
669
- expectedParent,
670
- actualParent
671
- });
672
- return { [path$1]: {
673
- ...result,
674
- expected,
675
- actual
676
- } };
713
+ function formatCost(cost) {
714
+ return theme.dim(`$${cost.toFixed(4)}`);
677
715
  }
678
- function isObject(value) {
679
- return value !== null && typeof value === "object" && !Array.isArray(value);
716
+ function formatCostShort(cost) {
717
+ return theme.dim(`$${cost.toFixed(2)}`);
680
718
  }
681
- function getFieldName(path$1) {
682
- return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
719
+ function formatDuration(ms) {
720
+ const totalSeconds = Math.round(ms / 1e3);
721
+ if (totalSeconds < 60) return `${totalSeconds}s`;
722
+ const minutes = Math.floor(totalSeconds / 60);
723
+ const seconds = totalSeconds % 60;
724
+ return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
725
+ }
726
+ function formatPercentage(rate) {
727
+ return `${(rate * 100).toFixed(1)}%`;
683
728
  }
684
729
 
685
730
  //#endregion
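The new optimizer UI layer wraps chalk, ora, figures, and cli-progress behind a small theme, a shared spinner, and createProgressTracker, which throttles bar redraws to roughly one per 100 ms. A sketch of the tracker's lifecycle (these helpers look internal to the bundle, so this is illustrative rather than a public API):

```ts
// Illustrative lifecycle of the internal progress tracker defined above.
const tracker = createProgressTracker("evals"); // label rendered next to the bar
tracker.start(25);              // total number of work items
for (let done = 1; done <= 25; done++) {
  // ... perform one unit of work here ...
  tracker.update(done);         // redraws at most every 100 ms
}
tracker.stop();                 // fills the bar, prints elapsed time, and releases it
```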
686
- //#region src/optimizer-logging.ts
731
+ //#region src/optimizer/optimizer-logging.ts
687
732
  function formatMsCompact(ms) {
688
733
  const totalSeconds = Math.round(ms / 1e3);
689
734
  if (totalSeconds < 60) return `${totalSeconds}s`;
@@ -701,12 +746,75 @@ function formatTokensCompact(tokens) {
701
746
  if (tokens >= 1e3) return `${Math.round(tokens / 1e3)}K`;
702
747
  return String(tokens);
703
748
  }
749
+ /**
750
+ * Clear any active progress line before logging
751
+ * Call this before all console.log statements
752
+ */
753
+ function clearProgressLine() {
754
+ const width = process.stdout.columns || 80;
755
+ process.stdout.write("\r" + " ".repeat(width) + "\r");
756
+ }
757
+ /**
758
+ * Create a progress updater using cli-progress for beautiful output
759
+ */
760
+ function createProgressUpdater(label) {
761
+ let tracker = null;
762
+ let total = 0;
763
+ return {
764
+ update(completed, newTotal) {
765
+ if (!tracker) {
766
+ total = newTotal;
767
+ tracker = createProgressTracker(label);
768
+ tracker.start(total);
769
+ }
770
+ tracker.update(completed);
771
+ },
772
+ finish() {
773
+ if (tracker) {
774
+ tracker.stop();
775
+ tracker = null;
776
+ }
777
+ },
778
+ clear() {
779
+ clearProgressLine();
780
+ }
781
+ };
782
+ }
783
+ /**
784
+ * Track progress of Promise.allSettled with real-time updates
785
+ *
786
+ * @param promises Array of promises to track
787
+ * @param onProgress Callback called when each promise settles
788
+ * @returns Promise.allSettled result
789
+ */
790
+ async function trackPromiseProgress(promises, onProgress) {
791
+ if (promises.length === 0) return [];
792
+ let completed = 0;
793
+ const total = promises.length;
794
+ onProgress(0, total);
795
+ const wrappedPromises = promises.map((promise) => promise.then((value) => {
796
+ completed++;
797
+ onProgress(completed, total);
798
+ return {
799
+ status: "fulfilled",
800
+ value
801
+ };
802
+ }).catch((reason) => {
803
+ completed++;
804
+ onProgress(completed, total);
805
+ return {
806
+ status: "rejected",
807
+ reason
808
+ };
809
+ }));
810
+ return Promise.all(wrappedPromises);
811
+ }
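trackPromiseProgress mirrors Promise.allSettled while reporting completion counts through a callback, which is how the evaluator drives the progress bar for unbatched runs. A short sketch (the helper itself appears internal to the bundle; the URLs are illustrative):

```ts
// Illustrative only: feeds settled counts into a console progress line.
const jobs = [fetch("https://example.com/a"), fetch("https://example.com/b")];

const settled = await trackPromiseProgress(jobs, (completed, total) => {
  console.log(`${completed}/${total} settled`);
});

for (const outcome of settled) {
  if (outcome.status === "fulfilled") console.log("ok:", outcome.value.status);
  else console.warn("failed:", outcome.reason);
}
```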
704
812
  function formatFailure(testCase) {
705
813
  const lines = [];
706
814
  lines.push(`Input: ${JSON.stringify(testCase.input, null, 2)}`);
707
815
  lines.push(`Expected: ${JSON.stringify(testCase.expected, null, 2)}`);
708
816
  lines.push(`Actual: ${JSON.stringify(testCase.actual, null, 2)}`);
709
- if (testCase.additionalContext) lines.push(`Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
817
+ if (testCase.additionalContext) lines.push(`Additional Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
710
818
  lines.push("");
711
819
  lines.push("Field-level failures:");
712
820
  for (const [fieldPath, result] of Object.entries(testCase.fields)) if (!result.passed) lines.push(` ${fieldPath || "(root)"}: expected ${JSON.stringify(result.expected)}, got ${JSON.stringify(result.actual)}`);
@@ -730,56 +838,98 @@ function computeTotals(iterations) {
730
838
  totalDuration
731
839
  };
732
840
  }
733
- function formatDurationForLog(ms) {
734
- const seconds = Math.round(ms / 1e3);
735
- if (seconds < 60) return `(${seconds}s)`;
736
- return `(${Math.floor(seconds / 60)}m ${seconds % 60}s)`;
841
+ function logOptimizerHeader(model, targetRate, testCount) {
842
+ spinner.stop();
843
+ console.log("");
844
+ console.log(theme.bold("Didactic Optimizer"));
845
+ console.log(` ${theme.dim("Model:")} ${model}${theme.separator}${theme.dim("Target:")} ${formatPercentage(targetRate)}${theme.separator}${theme.dim("Tests:")} ${testCount}`);
737
846
  }
738
847
  function logIterationStart(iterationLabel) {
739
- console.log(`\n=== Optimization Iteration ${iterationLabel} ===`);
848
+ spinner.stop();
849
+ clearProgressLine();
850
+ console.log("");
851
+ console.log(theme.divider(`Iteration ${iterationLabel}`));
852
+ console.log("");
740
853
  }
741
854
  function logEvaluationStart() {
742
- console.log(` Evaluating prompt...`);
855
+ spinner.stop();
856
+ clearProgressLine();
857
+ console.log(` ${theme.bold("Evaluating prompt")}`);
858
+ spinner.start("Running evals...");
743
859
  }
744
860
  function logEvaluationResult(result, cumulativeCost, durationMs) {
745
- console.log(` Result: ${result.passed}/${result.total} passed (${(result.successRate * 100).toFixed(1)}%) | Cost: $${result.cost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
861
+ spinner.stop();
862
+ clearProgressLine();
863
+ const successIcon = result.successRate >= .9 ? theme.check : result.successRate >= .5 ? theme.warn : theme.cross;
864
+ console.log(` ${successIcon} ${theme.bold(formatPercentage(result.successRate))} success rate ${theme.dim(`(${result.passed}/${result.total} passed)`)}`);
865
+ console.log(` ${theme.dim("Cost:")} ${formatCost(result.cost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
746
866
  }
747
867
  function logRegressionDetected(bestSuccessRate) {
748
- console.log(` → Regression detected (was ${(bestSuccessRate * 100).toFixed(1)}%)`);
868
+ spinner.stop();
869
+ clearProgressLine();
870
+ console.log(` ${theme.pointer} ${theme.warning("Regression")} ${theme.dim(`(was ${formatPercentage(bestSuccessRate)})`)}`);
749
871
  }
750
872
  function logTargetReached(targetSuccessRate) {
751
- console.log(` Target: ${(targetSuccessRate * 100).toFixed(0)}% | ✓ Target reached!`);
873
+ spinner.stop();
874
+ clearProgressLine();
875
+ console.log(` ${theme.check} ${theme.success("Target reached!")} ${theme.dim(`(${formatPercentage(targetSuccessRate)})`)}`);
752
876
  }
753
877
  function logTargetFailures(targetSuccessRate, failureCount) {
754
- console.log(` Target: ${(targetSuccessRate * 100).toFixed(0)}% | ${failureCount} failures to address`);
878
+ spinner.stop();
879
+ clearProgressLine();
880
+ console.log(` ${theme.cross} ${theme.error(`${failureCount} failures`)} to address ${theme.dim(`(target: ${formatPercentage(targetSuccessRate)})`)}`);
755
881
  }
756
882
  function logCostLimitReached(cumulativeCost) {
757
- console.log(` Cost limit reached ($${cumulativeCost.toFixed(2)})`);
883
+ spinner.stop();
884
+ clearProgressLine();
885
+ console.log(` ${theme.warn} ${theme.warning("Cost limit reached")} ${theme.dim(`($${cumulativeCost.toFixed(2)})`)}`);
758
886
  }
759
887
  function logPatchGenerationStart(failureCount) {
760
- console.log(``);
761
- console.log(` Generating ${failureCount} patches in parallel...`);
888
+ spinner.stop();
889
+ clearProgressLine();
890
+ console.log("");
891
+ console.log(` ${theme.bold("Generating patches")}`);
892
+ spinner.start(`Generating ${failureCount} patches in parallel...`);
762
893
  }
763
894
  function logPatchGenerationResult(patchCost, cumulativeCost, durationMs) {
764
- console.log(` Patches generated | Cost: $${patchCost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
895
+ spinner.stop();
896
+ clearProgressLine();
897
+ console.log(` ${theme.check} Patches generated${theme.separator}${theme.dim("Cost:")} ${formatCost(patchCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
765
898
  }
766
899
  function logMergeStart() {
767
- console.log(``);
768
- console.log(` Merging patches...`);
900
+ spinner.stop();
901
+ clearProgressLine();
902
+ console.log("");
903
+ console.log(` ${theme.bold("Merging patches")}`);
904
+ spinner.start("Merging patches...");
769
905
  }
770
906
  function logMergeResult(mergeCost, cumulativeCost, durationMs) {
771
- console.log(` Patches merged | Cost: $${mergeCost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
907
+ spinner.stop();
908
+ clearProgressLine();
909
+ console.log(` ${theme.check} Merged${theme.separator}${theme.dim("Cost:")} ${formatCost(mergeCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
772
910
  }
773
911
  function logPatchGenerationFailures(failedCount, totalCount) {
774
- console.log(` ⚠ ${failedCount}/${totalCount} patch generations failed`);
912
+ spinner.stop();
913
+ clearProgressLine();
914
+ console.log(` ${theme.warn} ${theme.warning(`${failedCount}/${totalCount} patch generations failed`)}`);
775
915
  }
776
916
  function logOptimizationComplete(bestSuccessRate, targetSuccessRate, cumulativeCost) {
777
- console.log(`\n=== Optimization Complete ===`);
778
- console.log(`Best result: ${(bestSuccessRate * 100).toFixed(1)}% (target was ${(targetSuccessRate * 100).toFixed(0)}%)`);
779
- console.log(`Total cost: $${cumulativeCost.toFixed(4)}`);
917
+ spinner.stop();
918
+ clearProgressLine();
919
+ console.log("");
920
+ console.log(theme.divider("Complete"));
921
+ console.log("");
922
+ const targetMet = bestSuccessRate >= targetSuccessRate;
923
+ const icon = targetMet ? theme.check : theme.cross;
924
+ const rateColor = targetMet ? theme.success : theme.error;
925
+ console.log(` ${icon} ${theme.bold("Best:")} ${rateColor(formatPercentage(bestSuccessRate))}`);
926
+ console.log(` ${theme.dim("Target:")} ${formatPercentage(targetSuccessRate)}${theme.separator}${theme.dim("Total Cost:")} ${formatCostShort(cumulativeCost)}`);
780
927
  }
781
928
  function logLogsWritten(logPath) {
782
- console.log(`Logs written to: ${logPath}`);
929
+ spinner.stop();
930
+ clearProgressLine();
931
+ console.log(` ${theme.dim("Logs written to:")} ${logPath}`);
932
+ console.log("");
783
933
  }
784
934
  function generateConfigSection(ctx, testCaseCount) {
785
935
  const lines = [];
@@ -946,6 +1096,7 @@ function writeRawDataJson(folderPath, iterations, ctx, success) {
946
1096
  input: tc.input,
947
1097
  expected: tc.expected,
948
1098
  actual: tc.actual,
1099
+ additionalContext: tc.additionalContext,
949
1100
  fields: tc.fields
950
1101
  });
951
1102
  });
@@ -1019,6 +1170,7 @@ function writeBestRunJson(folderPath, iterations, ctx) {
1019
1170
  input: tc.input,
1020
1171
  expected: tc.expected,
1021
1172
  actual: tc.actual,
1173
+ additionalContext: tc.additionalContext,
1022
1174
  failedFields: extractFailedFields(tc.fields)
1023
1175
  });
1024
1176
  else if (tc.passRate < 1) partialFailures.push({
@@ -1027,13 +1179,15 @@ function writeBestRunJson(folderPath, iterations, ctx) {
1027
1179
  input: tc.input,
1028
1180
  expected: tc.expected,
1029
1181
  actual: tc.actual,
1182
+ additionalContext: tc.additionalContext,
1030
1183
  failedFields: extractFailedFields(tc.fields)
1031
1184
  });
1032
1185
  else successes.push({
1033
1186
  testIndex: testIdx,
1034
1187
  input: tc.input,
1035
1188
  expected: tc.expected,
1036
- actual: tc.actual
1189
+ actual: tc.actual,
1190
+ additionalContext: tc.additionalContext
1037
1191
  });
1038
1192
  });
1039
1193
  const report = {
@@ -1070,29 +1224,402 @@ function writeBestRunJson(folderPath, iterations, ctx) {
1070
1224
  };
1071
1225
  fs.writeFileSync(bestRunPath, JSON.stringify(report, null, 2), "utf-8");
1072
1226
  }
1073
- function writeFinalLogs(logPath, iterationLogs, logContext, success) {
1074
- const folderPath = path.dirname(logPath);
1075
- if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
1076
- const content = generateLogContent(iterationLogs, logContext, success);
1077
- fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
1078
- writePromptsFile(folderPath, iterationLogs, logContext);
1079
- writeRawDataJson(folderPath, iterationLogs, logContext, success);
1080
- writeBestRunJson(folderPath, iterationLogs, logContext);
1227
+ function writeFinalLogs(logPath, iterationLogs, logContext, success) {
1228
+ const folderPath = path.dirname(logPath);
1229
+ if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
1230
+ const content = generateLogContent(iterationLogs, logContext, success);
1231
+ fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
1232
+ writePromptsFile(folderPath, iterationLogs, logContext);
1233
+ writeRawDataJson(folderPath, iterationLogs, logContext, success);
1234
+ writeBestRunJson(folderPath, iterationLogs, logContext);
1235
+ }
1236
+
1237
+ //#endregion
1238
+ //#region src/eval/eval-logging.ts
1239
+ /**
1240
+ * Write evaluation results to rawData.json
1241
+ *
1242
+ * Synchronous writes are intentional - logging runs after evaluation completes
1243
+ * and errors are caught. This avoids async complexity in the calling code.
1244
+ */
1245
+ function writeEvalLogs(logPath, result, durationMs, perTestThreshold) {
1246
+ try {
1247
+ const dir = path.dirname(logPath);
1248
+ if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
1249
+ const report = {
1250
+ metadata: {
1251
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1252
+ systemPrompt: result.systemPrompt,
1253
+ testCaseCount: result.total,
1254
+ perTestThreshold: perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD
1255
+ },
1256
+ summary: {
1257
+ passed: result.passed,
1258
+ total: result.total,
1259
+ successRate: result.successRate,
1260
+ correctFields: result.correctFields,
1261
+ totalFields: result.totalFields,
1262
+ accuracy: result.accuracy,
1263
+ executorCost: result.cost,
1264
+ comparatorCost: result.comparatorCost,
1265
+ totalCost: result.cost + result.comparatorCost,
1266
+ durationMs
1267
+ },
1268
+ testCases: result.testCases.map((tc, index) => ({
1269
+ index,
1270
+ passed: tc.passed,
1271
+ passRate: tc.passRate,
1272
+ input: tc.input,
1273
+ expected: tc.expected,
1274
+ actual: tc.actual,
1275
+ additionalContext: tc.additionalContext,
1276
+ executorCost: tc.cost ?? 0,
1277
+ comparatorCost: tc.comparatorCost ?? 0,
1278
+ error: tc.error,
1279
+ fields: tc.fields
1280
+ }))
1281
+ };
1282
+ fs.writeFileSync(logPath, JSON.stringify(report, null, 2), "utf-8");
1283
+ } catch (error) {
1284
+ console.error(`Failed to write eval logs to ${logPath}:`, error instanceof Error ? error.message : String(error));
1285
+ }
1286
+ }
1287
+
1288
+ //#endregion
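When storeLogs is enabled on an eval config, writeEvalLogs serializes a report with metadata, a summary (including the new executor/comparator cost split), and per-test-case details to rawData.json. A sketch of reading that file back; the path follows the default pattern used by evaluate() below, and the timestamp/suffix shown here are illustrative:

```ts
import { readFileSync } from "node:fs";

// The real folder name differs per run; this path only illustrates the default pattern.
const report = JSON.parse(
  readFileSync("./didactic-logs/eval_1700000000000_ab12cd34/rawData.json", "utf-8"),
);

console.log(report.summary.successRate, report.summary.totalCost);
console.log(report.testCases.filter((tc: { passed: boolean }) => !tc.passed).length);
```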
1289
+ //#region src/eval/eval.ts
1290
+ /**
1291
+ * Run all test cases and return results.
1292
+ */
1293
+ async function evaluate(config) {
1294
+ const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
1295
+ if (testCases.length === 0) throw new Error("testCases array cannot be empty");
1296
+ if (!executor) throw new Error("executor is required");
1297
+ const startTime = Date.now();
1298
+ const logPath = config.storeLogs ? typeof config.storeLogs === "string" ? config.storeLogs : `./didactic-logs/eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}/rawData.json` : void 0;
1299
+ const executeTestCase = async ({ input, expected }) => {
1300
+ try {
1301
+ const result = await executor(input, systemPrompt);
1302
+ let fields;
1303
+ if (comparatorOverride) {
1304
+ const compResult = await comparatorOverride(expected, result.output);
1305
+ fields = { "": {
1306
+ passed: compResult.passed,
1307
+ expected,
1308
+ actual: result.output
1309
+ } };
1310
+ } else {
1311
+ let comparatorConfig;
1312
+ if (!comparators) comparatorConfig = { "": exact };
1313
+ else if (typeof comparators === "function") comparatorConfig = { "": comparators };
1314
+ else comparatorConfig = comparators;
1315
+ fields = await compareFields({
1316
+ expected,
1317
+ actual: result.output,
1318
+ comparators: comparatorConfig,
1319
+ llmConfig: config.llmConfig
1320
+ });
1321
+ }
1322
+ const passedFields = Object.values(fields).filter((f) => f.passed).length;
1323
+ const totalFields$1 = Object.values(fields).length;
1324
+ const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
1325
+ const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
1326
+ const comparatorCost$1 = Object.values(fields).reduce((sum, field) => sum + (field.cost ?? 0), 0);
1327
+ return {
1328
+ input,
1329
+ expected,
1330
+ actual: result.output,
1331
+ additionalContext: result.additionalContext,
1332
+ cost: result.cost ?? 0,
1333
+ comparatorCost: comparatorCost$1,
1334
+ passed: passed$1,
1335
+ fields,
1336
+ passedFields,
1337
+ totalFields: totalFields$1,
1338
+ passRate
1339
+ };
1340
+ } catch (error) {
1341
+ return {
1342
+ input,
1343
+ expected,
1344
+ actual: void 0,
1345
+ cost: 0,
1346
+ comparatorCost: 0,
1347
+ passed: false,
1348
+ fields: {},
1349
+ passedFields: 0,
1350
+ totalFields: 0,
1351
+ passRate: 0,
1352
+ error: error instanceof Error ? error.message : String(error)
1353
+ };
1354
+ }
1355
+ };
1356
+ const rateLimitBatch = config.rateLimitBatch;
1357
+ let results;
1358
+ if (rateLimitBatch && rateLimitBatch > 0) {
1359
+ results = [];
1360
+ const progress = createProgressUpdater("evals");
1361
+ for (let i = 0; i < testCases.length; i += rateLimitBatch) {
1362
+ const batch = testCases.slice(i, i + rateLimitBatch);
1363
+ const batchResults = await Promise.all(batch.map(executeTestCase));
1364
+ results.push(...batchResults);
1365
+ progress.update(results.length, testCases.length);
1366
+ const rateLimitPause = config.rateLimitPause;
1367
+ if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
1368
+ }
1369
+ progress.finish();
1370
+ } else {
1371
+ const progress = createProgressUpdater("evals");
1372
+ results = (await trackPromiseProgress(testCases.map((tc) => executeTestCase(tc)), (completed, total$1) => progress.update(completed, total$1))).map((r) => r.value);
1373
+ progress.finish();
1374
+ }
1375
+ results.sort((a, b) => {
1376
+ if (a.passed !== b.passed) return a.passed ? 1 : -1;
1377
+ return a.passRate - b.passRate;
1378
+ });
1379
+ const passed = results.filter((r) => r.passed).length;
1380
+ const total = results.length;
1381
+ const successRate = total > 0 ? passed / total : 0;
1382
+ let correctFields = 0;
1383
+ let totalFields = 0;
1384
+ for (const r of results) {
1385
+ const fieldResults = Object.values(r.fields);
1386
+ totalFields += fieldResults.length;
1387
+ correctFields += fieldResults.filter((f) => f.passed).length;
1388
+ }
1389
+ const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
1390
+ const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
1391
+ const comparatorCost = results.reduce((sum, r) => sum + (r.comparatorCost ?? 0), 0);
1392
+ const durationMs = Date.now() - startTime;
1393
+ const logFolder = logPath ? path.dirname(logPath) : void 0;
1394
+ const evalResult = {
1395
+ systemPrompt,
1396
+ testCases: results,
1397
+ passed,
1398
+ total,
1399
+ successRate,
1400
+ correctFields,
1401
+ totalFields,
1402
+ accuracy,
1403
+ cost,
1404
+ comparatorCost,
1405
+ ...logFolder && { logFolder }
1406
+ };
1407
+ if (logPath) writeEvalLogs(logPath, evalResult, durationMs, config.perTestThreshold);
1408
+ return evalResult;
1409
+ }
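The reworked evaluate() now defaults missing comparators to exact, awaits comparators (so llmCompare works anywhere in the map), tracks comparator cost separately from executor cost, and can persist logs via storeLogs. A hedged end-to-end sketch, assuming evaluate and the comparator helpers are exported from the package entry point; the executor and test data are illustrative:

```ts
import { evaluate, exact, within, unordered, llmCompare, LLMProviders } from "@docshield/didactic"; // assumed exports

const result = await evaluate({
  systemPrompt: "Extract the carrier, a one-line summary, and all quoted premiums.",
  testCases: [{
    input: { text: "Acme Insurance quotes $12,500 annually." },
    expected: {
      carrier: "Acme Insurance",
      summary: "Acme Insurance quoted an annual premium of $12,500.",
      quotes: [{ premium: 12500 }],
    },
  }],
  // Any async function returning { output, cost?, additionalContext? } can act as the executor.
  executor: async (input, systemPrompt) => ({
    output: await myExtractor(input, systemPrompt), // hypothetical extraction call
    cost: 0,
  }),
  comparators: {
    carrier: exact,
    summary: llmCompare({}), // falls back to llmConfig.apiKey below
    quotes: unordered({ premium: within({ tolerance: 5 }) }),
  },
  llmConfig: {
    provider: LLMProviders.anthropic_claude_haiku,
    apiKey: process.env.ANTHROPIC_API_KEY!,
  },
  storeLogs: true, // or a string path to the JSON log file
});

console.log(result.successRate, result.cost + result.comparatorCost, result.logFolder);
```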
1410
+ /**
1411
+ * Recursively compare expected vs actual, returning field-level results.
1412
+ * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
1413
+ */
1414
+ async function compareFields(opts) {
1415
+ const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, llmConfig } = opts;
1416
+ const results = {};
1417
+ const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
1418
+ if (Array.isArray(expected)) {
1419
+ if (!Array.isArray(actual)) return { [path$1]: {
1420
+ passed: false,
1421
+ expected,
1422
+ actual
1423
+ } };
1424
+ if (expected.length === 0) return {};
1425
+ const fieldComparator = comparators[getFieldName(path$1)];
1426
+ const isUnordered = fieldComparator && typeof fieldComparator === "function" && "_unordered" in fieldComparator && fieldComparator._unordered === true;
1427
+ let itemComparators;
1428
+ if (isUnordered) itemComparators = fieldComparator._nestedComparators || comparators;
1429
+ else if (fieldComparator && typeof fieldComparator === "object" && !("_unordered" in fieldComparator)) itemComparators = fieldComparator;
1430
+ else itemComparators = comparators;
1431
+ let matchedPairs;
1432
+ if (isUnordered) matchedPairs = (await matchArrays(expected, actual, itemComparators)).assignments;
1433
+ else {
1434
+ matchedPairs = [];
1435
+ for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
1436
+ }
1437
+ const matchedIndices = new Set(matchedPairs.map(([i]) => i));
1438
+ for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, await compareFields({
1439
+ expected: expected[expIdx],
1440
+ actual: actual[actIdx],
1441
+ comparators: itemComparators,
1442
+ path: indexPath(expIdx),
1443
+ expectedParent,
1444
+ actualParent,
1445
+ llmConfig
1446
+ }));
1447
+ const hasArrayComparator = fieldComparator !== void 0;
1448
+ for (let i = 0; i < expected.length; i++) {
1449
+ if (matchedIndices.has(i)) continue;
1450
+ const item = expected[i];
1451
+ if (isObject(item)) {
1452
+ for (const [field, value] of Object.entries(item)) if (field in itemComparators) results[`${indexPath(i)}.${field}`] = {
1453
+ passed: false,
1454
+ expected: value,
1455
+ actual: void 0
1456
+ };
1457
+ } else if (hasArrayComparator) results[indexPath(i)] = {
1458
+ passed: false,
1459
+ expected: item,
1460
+ actual: void 0
1461
+ };
1462
+ }
1463
+ return results;
1464
+ }
1465
+ if (isObject(expected)) {
1466
+ if (!isObject(actual)) return { [path$1]: {
1467
+ passed: false,
1468
+ expected,
1469
+ actual
1470
+ } };
1471
+ for (const [field, expValue] of Object.entries(expected)) {
1472
+ const fieldPath = path$1 ? `${path$1}.${field}` : field;
1473
+ const fieldConfig = comparators[field];
1474
+ if (fieldConfig === void 0) continue;
1475
+ let fieldComparators;
1476
+ if (fieldConfig && typeof fieldConfig === "object" && !("_unordered" in fieldConfig)) fieldComparators = fieldConfig;
1477
+ else fieldComparators = comparators;
1478
+ Object.assign(results, await compareFields({
1479
+ expected: expValue,
1480
+ actual: actual[field],
1481
+ comparators: fieldComparators,
1482
+ path: fieldPath,
1483
+ expectedParent: expected,
1484
+ actualParent: actual,
1485
+ llmConfig
1486
+ }));
1487
+ }
1488
+ return results;
1489
+ }
1490
+ const fieldName = getFieldName(path$1);
1491
+ let comparatorConfig = comparators[fieldName];
1492
+ if (!comparatorConfig && fieldName === "") comparatorConfig = exact;
1493
+ if (!comparatorConfig) return {};
1494
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected, actual, {
1495
+ expectedParent,
1496
+ actualParent,
1497
+ llmConfig
1498
+ });
1499
+ return { [path$1]: {
1500
+ ...result,
1501
+ expected,
1502
+ actual
1503
+ } };
1504
+ }
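compareFields flattens its results into path-keyed entries ('carrier', 'quote.premium', '[0]', 'quotes[0].carrier'), and that map is what each test case exposes as `fields`. A sketch of inspecting those paths on the result of the evaluate() call sketched above (field names are illustrative):

```ts
// Each test case exposes a path-keyed field map; dots for nested objects, [index] for array items.
for (const testCase of result.testCases) {
  for (const [path, field] of Object.entries(testCase.fields)) {
    if (!field.passed) {
      console.log(`${path || "(root)"}: expected ${JSON.stringify(field.expected)}, got ${JSON.stringify(field.actual)}`);
    }
  }
}
// Typical keys: "carrier", "quotes[0].premium"
```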
1505
+ function isObject(value) {
1506
+ return value !== null && typeof value === "object" && !Array.isArray(value);
1507
+ }
1508
+ function getFieldName(path$1) {
1509
+ return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
1510
+ }
1511
+
1512
+ //#endregion
1513
+ //#region src/optimizer/prompts.ts
1514
+ /**
1515
+ * Default system prompt for patch generation.
1516
+ * Analyzes failures and suggests specific, focused changes to improve the prompt.
1517
+ */
1518
+ const DEFAULT_PATCH_SYSTEM_PROMPT = `
1519
+ 'You are optimizing a system prompt for an LLM workflow.
1520
+ Analyze the failure and suggest a specific, focused change to improve the prompt.
1521
+ Do NOT overfit. Be generalizable.
1522
+
1523
+ <examples>
1524
+ VERY IMPORTANT, CRITICAL!!!
1525
+ Examples MUST be anonymized.
1526
+ NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
1527
+ - example: (for an invoice processor)
1528
+ - task: extract data from parsed invoices
1529
+ - failure context: (returned expected: true, actual: false)
1530
+ - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
1531
+
1532
+ - example: (for a calendar app)
1533
+ - task: extract cost from calendar event
1534
+ - failure context: (cost expected: 123.45, actual: 167.89)
1535
+ - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to achieve the expected output.)
1536
+ </examples>
1537
+ `;
1538
+ /**
1539
+ * Default system prompt for merging patches.
1540
+ * Combines multiple patches into a coherent system prompt.
1541
+ */
1542
+ const DEFAULT_MERGE_SYSTEM_PROMPT = `
1543
+ You are an expert LLM prompt editor.
1544
+ You are merging improvements into a system prompt.
1545
+ Incorporate the suggestions while keeping the prompt clear and coherent.
1546
+ `;
1547
+ /**
1548
+ * Builds the user prompt for patch generation.
1549
+ * Formats the failure context and current prompt for the LLM.
1550
+ */
1551
+ function buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures) {
1552
+ let userContent = `
1553
+ Current system prompt:
1554
+ ---
1555
+ ${currentPrompt}
1556
+ ---
1557
+
1558
+ A test case failed:
1559
+ ${formatFailure(failure)}
1560
+ `;
1561
+ if (previousBetterPrompt) {
1562
+ const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
1563
+ userContent += `
1564
+ Note: The current prompt is a REGRESSION from a better-performing version.
1565
+ Previous (better) prompt for reference:
1566
+ ---
1567
+ ${previousBetterPrompt}
1568
+ ---
1569
+
1570
+ The failures the better prompt had:
1571
+ ${failuresContext}
1572
+
1573
+ Your changes introduced new failures instead of fixing the above.
1574
+ Analyze what changed between the two prompts that might have caused this regression.
1575
+ Are there any new failures that were not present in the previous better prompt?
1576
+ Are there any failures that were present in the previous better prompt but not in the current prompt?
1577
+ Did any of our patches contradict any of the new failures?
1578
+ `;
1579
+ }
1580
+ userContent += `
1581
+ Suggest a specific change to the system prompt that would fix this failure.
1582
+ Be concise. Output ONLY the suggested patch/change, not the full prompt.
1583
+ DO NOT overfit the prompt to the test case.
1584
+ Generalize examples if you choose to use them.
1585
+ `;
1586
+ return userContent;
1587
+ }
1588
+ /**
1589
+ * Builds the user prompt for merging patches.
1590
+ * Formats the current prompt and suggested patches for the LLM.
1591
+ */
1592
+ function buildMergeUserPrompt(patches, currentPrompt) {
1593
+ return `
1594
+ Current prompt:
1595
+ ---
1596
+ ${currentPrompt}
1597
+ ---
1598
+
1599
+ Suggested improvements:
1600
+ ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
1601
+
1602
+ Create a single improved system prompt that incorporates these suggestions.
1603
+ Be mindful of the size of the new prompt.
1604
+ Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
1605
+ Output ONLY the new system prompt, nothing else.
1606
+ Respect enums.
1607
+ `;
1081
1608
  }
1082
1609
 
1083
1610
  //#endregion
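These two constants are only fallbacks: `generatePatch` and `mergePatches` (later in this diff) read `config.patchSystemPrompt` and `config.mergeSystemPrompt` first. A minimal sketch of overriding them in the optimizer config; the rest of that config is omitted here and appears in the `optimize()` sketch further down.

```ts
// Sketch: overriding the default optimizer prompts. Only these two field names
// are taken from the generatePatch/mergePatches code below; the remaining
// optimizer config fields are omitted.
const promptOverrides = {
  patchSystemPrompt:
    "You are optimizing a system prompt for an invoice-extraction workflow. " +
    "Suggest one focused, generalizable change per failure.",
  mergeSystemPrompt:
    "Merge the suggested patches into one coherent system prompt without duplicating instructions.",
};
```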
1084
- //#region src/optimizer.ts
1611
+ //#region src/optimizer/optimizer.ts
1085
1612
  async function optimize(evalConfig, config) {
1086
1613
  if (!config.apiKey) throw new Error("apiKey is required");
1087
- if (!config.systemPrompt) throw new Error("systemPrompt is required");
1088
1614
  if (config.targetSuccessRate < 0 || config.targetSuccessRate > 1) throw new Error("targetSuccessRate must be between 0 and 1");
1089
1615
  const iterationLogs = [];
1090
1616
  const maxIterations = config.maxIterations ?? (config.maxCost !== void 0 ? Infinity : 5);
1091
1617
  const startTime = /* @__PURE__ */ new Date();
1618
+ const model = PROVIDER_SPECS[config.provider].model;
1092
1619
  const logContext = {
1093
1620
  config,
1094
1621
  startTime,
1095
- model: PROVIDER_SPECS[config.provider].model,
1622
+ model,
1096
1623
  perTestThreshold: evalConfig.perTestThreshold,
1097
1624
  rateLimitBatch: evalConfig.rateLimitBatch,
1098
1625
  rateLimitPause: evalConfig.rateLimitPause
@@ -1149,6 +1676,8 @@ async function optimize(evalConfig, config) {
1149
1676
  totalCost: cumulativeCost
1150
1677
  };
1151
1678
  };
1679
+ const testCount = evalConfig.testCases?.length ?? 0;
1680
+ logOptimizerHeader(model, config.targetSuccessRate, testCount);
1152
1681
  for (let i = 1; i <= maxIterations; i++) {
1153
1682
  const iterationStart = Date.now();
1154
1683
  let iterInputTokens = 0;
@@ -1162,7 +1691,7 @@ async function optimize(evalConfig, config) {
1162
1691
  });
1163
1692
  cumulativeCost += result.cost;
1164
1693
  logEvaluationResult(result, cumulativeCost, Date.now() - evalStart);
1165
- const regressed = i > 1 && result.successRate < bestSuccessRate;
1694
+ const regressed = i > 1 && result.successRate <= bestSuccessRate;
1166
1695
  if (regressed) logRegressionDetected(bestSuccessRate);
1167
1696
  if (result.successRate > bestSuccessRate) {
1168
1697
  bestSuccessRate = result.successRate;
@@ -1175,10 +1704,6 @@ async function optimize(evalConfig, config) {
1175
1704
  return finalizeOptimization(true, currentPrompt);
1176
1705
  }
1177
1706
  const failures = result.testCases.filter((tc) => !tc.passed);
1178
- if (failures.length === 0) {
1179
- recordIteration(i, currentPrompt, result, result.cost, Date.now() - iterationStart, iterInputTokens, iterOutputTokens);
1180
- return finalizeOptimization(true, currentPrompt);
1181
- }
1182
1707
  logTargetFailures(config.targetSuccessRate, failures.length);
1183
1708
  if (config.maxCost !== void 0 && cumulativeCost >= config.maxCost) {
1184
1709
  logCostLimitReached(cumulativeCost);
@@ -1187,7 +1712,9 @@ async function optimize(evalConfig, config) {
1187
1712
  }
1188
1713
  logPatchGenerationStart(failures.length);
1189
1714
  const patchStart = Date.now();
1190
- const patchSettled = await Promise.allSettled(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)));
1715
+ const patchProgress = createProgressUpdater("patches");
1716
+ const patchSettled = await trackPromiseProgress(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)), (completed, total) => patchProgress.update(completed, total));
1717
+ patchProgress.finish();
1191
1718
  const patchResults = patchSettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
1192
1719
  const failedPatchCount = patchSettled.filter((r) => r.status === "rejected").length;
1193
1720
  if (failedPatchCount > 0) logPatchGenerationFailures(failedPatchCount, failures.length);
@@ -1227,154 +1754,165 @@ async function optimize(evalConfig, config) {
1227
1754
  }
1228
1755
  return finalizeOptimization(false, bestPrompt);
1229
1756
  }
1230
- async function callLLM(messages, config, useThinking = false) {
1231
- const spec = PROVIDER_SPECS[config.provider];
1232
- try {
1233
- if (config.provider.startsWith("anthropic")) {
1234
- const client = new _anthropic_ai_sdk.default({ apiKey: config.apiKey });
1235
- const streamOptions = {
1236
- model: spec.model,
1237
- max_tokens: spec.maxTokens,
1238
- system: messages.find((m) => m.role === "system")?.content,
1239
- messages: messages.filter((m) => m.role !== "system").map((m) => ({
1240
- role: m.role,
1241
- content: m.content
1242
- }))
1243
- };
1244
- if (useThinking) streamOptions.thinking = {
1245
- type: "enabled",
1246
- budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
1247
- };
1248
- const finalMessage = await client.messages.stream(streamOptions).finalMessage();
1249
- const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
1250
- const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
1251
- const inputTokens = finalMessage.usage.input_tokens;
1252
- const outputTokens = finalMessage.usage.output_tokens;
1253
- return {
1254
- text,
1255
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
1256
- inputTokens,
1257
- outputTokens
1258
- };
1259
- }
1260
- if (config.provider.startsWith("openai")) {
1261
- const client = new openai.default({ apiKey: config.apiKey });
1262
- const completionOptions = {
1263
- model: spec.model,
1264
- messages: messages.map((m) => ({
1265
- role: m.role,
1266
- content: m.content
1267
- })),
1268
- max_completion_tokens: spec.maxTokens
1269
- };
1270
- if (useThinking) completionOptions.reasoning_effort = "xhigh";
1271
- const response = await client.chat.completions.create(completionOptions);
1272
- const text = response.choices[0].message.content ?? "";
1273
- const inputTokens = response.usage?.prompt_tokens ?? 0;
1274
- const outputTokens = response.usage?.completion_tokens ?? 0;
1275
- return {
1276
- text,
1277
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
1278
- inputTokens,
1279
- outputTokens
1280
- };
1281
- }
1282
- throw new Error(`Unsupported provider: ${config.provider}`);
1283
- } catch (error) {
1284
- const message = error instanceof Error ? error.message : String(error);
1285
- throw new Error(`LLM call failed (${spec.model}): ${message}`);
1286
- }
1287
- }
1288
1757
  async function generatePatch(failure, currentPrompt, config, previousBetterPrompt, previousBetterPromptFailures) {
1289
- let userContent = `
1290
- Current system prompt:
1291
- ---
1292
- ${currentPrompt}
1293
- ---
1294
-
1295
- A test case failed:
1296
- ${formatFailure(failure)}
1297
- `;
1298
- if (previousBetterPrompt) {
1299
- const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
1300
- userContent += `
1301
- Note: The current prompt is a REGRESSION from a better-performing version.
1302
- Previous (better) prompt for reference:
1303
- ---
1304
- ${previousBetterPrompt}
1305
- ---
1306
-
1307
- The failures the better prompt had:
1308
- ${failuresContext}
1309
-
1310
- Your changes introduced new failures instead of fixing the above.
1311
- Analyze what changed between the two prompts that might have caused this regression.
1312
- Are there any new failures that were not present in the previous better prompt?
1313
- Are there any failures that were present in the previous better prompt but not in the current prompt?
1314
- Did any of our patches contradict any of the new failures?
1315
- `;
1316
- }
1317
- userContent += `
1318
- Suggest a specific change to the system prompt that would fix this failure.
1319
- Be concise. Output ONLY the suggested patch/change, not the full prompt.
1320
- DO NOT overfit the prompt to the test case.
1321
- Generalize examples if you choose to use them.
1322
- `;
1323
- return callLLM([{
1758
+ const userContent = buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures);
1759
+ const messages = [{
1324
1760
  role: "system",
1325
- content: `
1326
- 'You are optimizing a system prompt for an LLM workflow.
1327
- Analyze the failure and suggest a specific, focused change to improve the prompt.
1328
- Do NOT overfit. Be generalizable.
1329
-
1330
- <examples>
1331
- VERY IMPORTANT, CRITICAL!!!
1332
- Examples MUST be anonymized.
1333
- NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
1334
- - example: (for an invoice processor)
1335
- - task: extract data from parsed invoices
1336
- - failure context: (returned expected: true, actual: false)
1337
- - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
1338
-
1339
- - example: (for a calendar app)
1340
- - task: extract cost from calendar event
1341
- - failure context: (cost expected: 123.45, actual: 167.89)
1342
- - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
1343
- </examples>
1344
- `
1761
+ content: config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT
1345
1762
  }, {
1346
1763
  role: "user",
1347
1764
  content: userContent
1348
- }], config, config.thinking ?? false);
1765
+ }];
1766
+ return callLLM({
1767
+ provider: config.provider,
1768
+ apiKey: config.apiKey,
1769
+ messages,
1770
+ useThinking: config.thinking ?? false
1771
+ });
1349
1772
  }
1350
1773
  async function mergePatches(patches, currentPrompt, config) {
1351
- const systemContent = `
1352
- You are an expert LLM prompt editor.
1353
- You are merging improvements into a system prompt.
1354
- Incorporate the suggestions while keeping the prompt clear and coherent.
1355
- `;
1356
- const userContent = `
1357
- Current prompt:
1358
- ---
1359
- ${currentPrompt}
1360
- ---
1361
-
1362
- Suggested improvements:
1363
- ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
1364
-
1365
- Create a single improved system prompt that incorporates these suggestions.
1366
- Be mindful of the size of the new prompt.
1367
- Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
1368
- Output ONLY the new system prompt, nothing else.
1369
- Respect enums.
1370
- `;
1371
- return callLLM([{
1774
+ const systemContent = config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT;
1775
+ const userContent = buildMergeUserPrompt(patches, currentPrompt);
1776
+ const messages = [{
1372
1777
  role: "system",
1373
1778
  content: systemContent
1374
1779
  }, {
1375
1780
  role: "user",
1376
1781
  content: userContent
1377
- }], config, config.thinking ?? false);
1782
+ }];
1783
+ return callLLM({
1784
+ provider: config.provider,
1785
+ apiKey: config.apiKey,
1786
+ messages,
1787
+ useThinking: config.thinking ?? false
1788
+ });
1789
+ }
1790
+
1791
+ //#endregion
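Putting the pieces together, a hedged sketch of a full `optimize()` call. The config fields used below (`apiKey`, `provider`, `targetSuccessRate`, `maxIterations`, `maxCost`, `thinking`) and the eval config's `testCases`/`perTestThreshold` are the properties actually read above; the `executor` and `comparators` field names, the test-case shape, the provider string, and the `systemPrompt` entry are assumptions (the hard requirement on `systemPrompt` was removed in this version).

```ts
// Sketch only. Fields marked "assumed" are not visible in this diff.
import { optimize, mock, exact, numeric } from "@docshield/didactic";

const result = await optimize(
  {
    executor: mock([{ vendor: "Acme Co", total: 42 }]),   // assumed field name; mock() shown below
    testCases: [                                          // read as evalConfig.testCases above
      {
        input: { documentId: "doc-1" },                   // test-case shape assumed
        expected: { vendor: "Acme Co", total: 42 },
      },
    ],
    comparators: { vendor: exact, total: numeric },       // assumed field name
    perTestThreshold: 1,
  },
  {
    provider: "anthropic_claude_opus",                    // assumed LLMProviders value
    apiKey: process.env.ANTHROPIC_API_KEY ?? "",
    systemPrompt: "Extract vendor and total from the parsed invoice.", // assumed; no longer strictly required
    targetSuccessRate: 0.95,                              // must be between 0 and 1
    maxIterations: 5,                                     // default when maxCost is unset
    maxCost: 2,                                           // optional budget; loop stops once cumulative cost reaches it
    thinking: false,
  },
);
```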
1792
+ //#region src/eval/executors.ts
1793
+ /**
1794
+ * Creates an executor that calls an HTTP endpoint.
1795
+ *
1796
+ * @example
1797
+ * ```ts
1798
+ * const executor = endpoint('https://api.example.com/workflow', {
1799
+ * headers: { Authorization: 'Bearer token' },
1800
+ * });
1801
+ * ```
1802
+ */
1803
+ function endpoint(url, config = {}) {
1804
+ const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
1805
+ return async (input, systemPrompt) => {
1806
+ const body = typeof input === "object" && input !== null ? {
1807
+ ...input,
1808
+ systemPrompt
1809
+ } : {
1810
+ input,
1811
+ systemPrompt
1812
+ };
1813
+ const controller = new AbortController();
1814
+ const timeoutId = setTimeout(() => controller.abort(), timeout);
1815
+ try {
1816
+ const response = await fetch(url, {
1817
+ method,
1818
+ headers: {
1819
+ "Content-Type": "application/json",
1820
+ ...headers
1821
+ },
1822
+ body: JSON.stringify(body),
1823
+ signal: controller.signal
1824
+ });
1825
+ clearTimeout(timeoutId);
1826
+ if (!response.ok) {
1827
+ const text = await response.text();
1828
+ throw new Error(`HTTP ${response.status}: ${text}`);
1829
+ }
1830
+ const data = await response.json();
1831
+ const additionalContext = mapAdditionalContext?.(data);
1832
+ const cost = mapCost?.(data) ?? 0;
1833
+ if (mapResponse) return {
1834
+ output: mapResponse(data),
1835
+ additionalContext,
1836
+ cost
1837
+ };
1838
+ return {
1839
+ output: data,
1840
+ additionalContext,
1841
+ cost
1842
+ };
1843
+ } catch (error) {
1844
+ clearTimeout(timeoutId);
1845
+ throw error;
1846
+ }
1847
+ };
1848
+ }
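The options destructured at the top of `endpoint()` let the caller reshape an arbitrary HTTP response into the `{ output, additionalContext, cost }` result consumed downstream. A hedged sketch; the remote payload fields (`extraction`, `trace`, `usage`) are hypothetical.

```ts
// Sketch: endpoint() with response mapping. Option names match the
// destructuring above; the payload shape is hypothetical.
import { endpoint } from "@docshield/didactic";

const executor = endpoint("https://api.example.com/workflow", {
  headers: { Authorization: `Bearer ${process.env.WORKFLOW_TOKEN}` },
  timeout: 30_000,                              // ms; replaces the default timeout
  mapResponse: (data) => data.extraction,       // becomes `output`
  mapAdditionalContext: (data) => data.trace,   // optional extra context
  mapCost: (data) => data.usage?.cost ?? 0,     // falls back to 0 when absent
});
```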
1849
+ /**
1850
+ * Creates an executor from a local function.
1851
+ *
1852
+ * @example
1853
+ * ```ts
1854
+ * const executor = fn({
1855
+ * fn: async (input, systemPrompt) => {
1856
+ * const result = await myLLMCall(input, systemPrompt);
1857
+ * return result;
1858
+ * },
1859
+ * });
1860
+ * ```
1861
+ *
1862
+ * @example With mapResponse to extract output from a richer response:
1863
+ * ```ts
1864
+ * const executor = fn({
1865
+ * fn: async (input, systemPrompt) => await startWorkflow({ ... }),
1866
+ * mapResponse: (result) => ({ documentType: result.documentType }),
1867
+ * mapCost: (result) => result.cost,
1868
+ * mapAdditionalContext: (result) => result.metadata,
1869
+ * });
1870
+ * ```
1871
+ */
1872
+ function fn(config) {
1873
+ return async (input, systemPrompt) => {
1874
+ const raw = await config.fn(input, systemPrompt);
1875
+ return {
1876
+ output: config.mapResponse ? config.mapResponse(raw) : raw,
1877
+ additionalContext: config.mapAdditionalContext?.(raw),
1878
+ cost: config.mapCost?.(raw) ?? 0
1879
+ };
1880
+ };
1881
+ }
1882
+ /**
1883
+ * Creates a mock executor for testing.
1884
+ * Can accept either:
1885
+ * - An array of outputs (returned in sequence, cycling if more calls than outputs)
1886
+ * - A function that maps input to output
1887
+ *
1888
+ * @example Array-based:
1889
+ * ```ts
1890
+ * const executor = mock([
1891
+ * { premium: 12500, policyType: 'claims-made' },
1892
+ * { premium: 8200, policyType: 'entity' },
1893
+ * ]);
1894
+ * ```
1895
+ *
1896
+ * @example Function-based:
1897
+ * ```ts
1898
+ * const executor = mock((input) => ({
1899
+ * id: input.id,
1900
+ * processed: true,
1901
+ * }));
1902
+ * ```
1903
+ */
1904
+ function mock(outputsOrFn) {
1905
+ if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
1906
+ return { output: outputsOrFn(input, systemPrompt) };
1907
+ };
1908
+ const outputs = outputsOrFn;
1909
+ if (outputs.length === 0) throw new Error("mock() requires at least one output");
1910
+ let callIndex = 0;
1911
+ return async () => {
1912
+ const output = outputs[callIndex % outputs.length];
1913
+ callIndex++;
1914
+ return { output };
1915
+ };
1378
1916
  }
1379
1917
 
1380
1918
  //#endregion
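All three factories above (`endpoint`, `fn`, `mock`) resolve to the same result shape. The type sketch below is inferred from their return statements in this diff; the names `ExecutorResult` and `Executor` are hypothetical, since the library's own type names are not visible here.

```ts
// Inferred from the objects returned by endpoint(), fn() and mock() above.
type ExecutorResult = {
  output: unknown;              // mapped response, fn() return value, or mock output
  additionalContext?: unknown;  // from mapAdditionalContext, when provided
  cost?: number;                // defaults to 0 for endpoint()/fn(); absent for mock()
};

type Executor = (input: unknown, systemPrompt?: string) => Promise<ExecutorResult>;
```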
@@ -1438,11 +1976,13 @@ exports.endpoint = endpoint;
1438
1976
  exports.evaluate = evaluate;
1439
1977
  exports.exact = exact;
1440
1978
  exports.fn = fn;
1979
+ exports.llmCompare = llmCompare;
1441
1980
  exports.mock = mock;
1442
1981
  exports.name = name;
1443
1982
  exports.numeric = numeric;
1444
1983
  exports.oneOf = oneOf;
1445
1984
  exports.optimize = optimize;
1446
1985
  exports.presence = presence;
1986
+ exports.unordered = unordered;
1447
1987
  exports.within = within;
1448
1988
  //# sourceMappingURL=index.cjs.map
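For reference, the export surface in this hunk now includes `llmCompare` and `unordered` alongside the existing comparators and executors; their call signatures are not visible in this diff, so only the import below is certain.

```ts
// Only names that appear in the exports list above are used here.
import {
  evaluate, optimize,
  endpoint, fn, mock,
  exact, numeric, oneOf, presence, within, name,
  llmCompare, unordered,
} from "@docshield/didactic";
```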