@docshield/didactic 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,15 +1,21 @@
1
- import * as chrono from "chrono-node";
2
- import { differenceInDays } from "date-fns";
3
- import Levenshtein from "levenshtein";
4
1
  import munkres from "munkres-js";
5
2
  import Anthropic from "@anthropic-ai/sdk";
6
3
  import OpenAI from "openai";
7
- import * as path from "path";
4
+ import * as chrono from "chrono-node";
5
+ import { differenceInDays } from "date-fns";
6
+ import Levenshtein from "levenshtein";
8
7
  import * as fs from "fs";
8
+ import * as path from "path";
9
+ import chalk from "chalk";
10
+ import ora from "ora";
11
+ import cliProgress from "cli-progress";
12
+ import figures from "figures";
13
+ import * as crypto from "crypto";
9
14
 
10
15
  //#region src/types.ts
11
16
  /**
12
17
  * Supported LLM providers.
18
+ * Used by both optimizer and LLM-based comparators.
13
19
  */
14
20
  let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
15
21
  LLMProviders$1["anthropic_claude_opus"] = "anthropic_claude_opus";
@@ -21,7 +27,7 @@ let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
21
27
  }({});
22
28
 
23
29
  //#endregion
24
- //#region src/constants.ts
30
+ //#region src/library/constants.ts
25
31
  const PROVIDER_SPECS = {
26
32
  [LLMProviders.anthropic_claude_opus]: {
27
33
  model: "claude-opus-4-5-20251101",
@@ -36,7 +42,7 @@ const PROVIDER_SPECS = {
36
42
  costPerMillionOutput: 15
37
43
  },
38
44
  [LLMProviders.anthropic_claude_haiku]: {
39
- model: "claude-haiku-4-5-20251101",
45
+ model: "claude-haiku-4-5-20251001",
40
46
  maxTokens: 64e3,
41
47
  costPerMillionInput: 1,
42
48
  costPerMillionOutput: 5
@@ -61,7 +67,154 @@ const DEFAULT_PER_TEST_THRESHOLD = 1;
61
67
  const NAME_SUFFIXES = /(?<=\S)\s*,?\s*(inc\.?|llc\.?|ltd\.?|l\.l\.c\.?|corp\.?|corporation|company|co\.?)$/i;
62
68
 
63
69
  //#endregion
64
- //#region src/comparators.ts
70
+ //#region src/library/llm/llm-client.ts
71
+ /**
72
+ * Call an LLM provider with the given messages.
73
+ * Returns raw text output - caller is responsible for parsing if structured output is needed.
74
+ */
75
+ async function callLLM(config) {
76
+ const { provider, apiKey, messages, useThinking = false } = config;
77
+ const spec = PROVIDER_SPECS[provider];
78
+ try {
79
+ if (provider.startsWith("anthropic")) {
80
+ const client = new Anthropic({ apiKey });
81
+ const streamOptions = {
82
+ model: spec.model,
83
+ max_tokens: spec.maxTokens,
84
+ system: messages.find((m) => m.role === "system")?.content,
85
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
86
+ role: m.role,
87
+ content: m.content
88
+ }))
89
+ };
90
+ if (useThinking) streamOptions.thinking = {
91
+ type: "enabled",
92
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
93
+ };
94
+ const finalMessage = await client.messages.stream(streamOptions).finalMessage();
95
+ const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
96
+ const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
97
+ const inputTokens = finalMessage.usage.input_tokens;
98
+ const outputTokens = finalMessage.usage.output_tokens;
99
+ return {
100
+ text,
101
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
102
+ inputTokens,
103
+ outputTokens
104
+ };
105
+ }
106
+ if (provider.startsWith("openai")) {
107
+ const client = new OpenAI({ apiKey });
108
+ const completionOptions = {
109
+ model: spec.model,
110
+ messages: messages.map((m) => ({
111
+ role: m.role,
112
+ content: m.content
113
+ })),
114
+ max_completion_tokens: spec.maxTokens
115
+ };
116
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
117
+ const response = await client.chat.completions.create(completionOptions);
118
+ const text = response.choices[0].message.content ?? "";
119
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
120
+ const outputTokens = response.usage?.completion_tokens ?? 0;
121
+ return {
122
+ text,
123
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
124
+ inputTokens,
125
+ outputTokens
126
+ };
127
+ }
128
+ throw new Error(`Unsupported provider: ${provider}`);
129
+ } catch (error) {
130
+ const message = error instanceof Error ? error.message : String(error);
131
+ throw new Error(`LLM call failed (${spec.model}): ${message}`);
132
+ }
133
+ }
134
+ /**
135
+ * Call an LLM provider with structured output.
136
+ * Returns parsed JSON data conforming to the provided schema.
137
+ */
138
+ async function callStructuredLLM(config) {
139
+ const { provider, apiKey, messages, schema, useThinking = false } = config;
140
+ const spec = PROVIDER_SPECS[provider];
141
+ try {
142
+ if (provider.startsWith("anthropic")) {
143
+ const client = new Anthropic({ apiKey });
144
+ const baseOptions = {
145
+ model: spec.model,
146
+ max_tokens: spec.maxTokens,
147
+ betas: ["structured-outputs-2025-11-13"],
148
+ system: messages.find((m) => m.role === "system")?.content,
149
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
150
+ role: m.role,
151
+ content: m.content
152
+ })),
153
+ output_format: {
154
+ type: "json_schema",
155
+ schema
156
+ }
157
+ };
158
+ const streamOptions = useThinking ? {
159
+ ...baseOptions,
160
+ thinking: {
161
+ type: "enabled",
162
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
163
+ }
164
+ } : baseOptions;
165
+ const finalMessage = await client.beta.messages.stream(streamOptions).finalMessage();
166
+ const content = finalMessage.content[0];
167
+ if (content.type !== "text") throw new Error("Unexpected response type from LLM");
168
+ const data = JSON.parse(content.text);
169
+ const inputTokens = finalMessage.usage.input_tokens;
170
+ const outputTokens = finalMessage.usage.output_tokens;
171
+ return {
172
+ data,
173
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
174
+ inputTokens,
175
+ outputTokens
176
+ };
177
+ }
178
+ if (provider.startsWith("openai")) {
179
+ const client = new OpenAI({ apiKey });
180
+ const completionOptions = {
181
+ model: spec.model,
182
+ messages: messages.map((m) => ({
183
+ role: m.role,
184
+ content: m.content
185
+ })),
186
+ max_completion_tokens: spec.maxTokens,
187
+ response_format: {
188
+ type: "json_schema",
189
+ json_schema: {
190
+ name: "response",
191
+ strict: true,
192
+ schema
193
+ }
194
+ }
195
+ };
196
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
197
+ const response = await client.chat.completions.create(completionOptions);
198
+ const text = response.choices[0].message.content ?? "";
199
+ const data = JSON.parse(text);
200
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
201
+ const outputTokens = response.usage?.completion_tokens ?? 0;
202
+ return {
203
+ data,
204
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
205
+ inputTokens,
206
+ outputTokens
207
+ };
208
+ }
209
+ throw new Error(`Unsupported provider: ${provider}`);
210
+ } catch (error) {
211
+ const message = error instanceof Error ? error.message : String(error);
212
+ throw new Error(`Structured LLM call failed (${spec.model}): ${message}`);
213
+ }
214
+ }
215
+
216
+ //#endregion
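A minimal usage sketch of callLLM based only on the shapes visible above; the provider choice, key source, and message text are assumptions, not package documentation. callStructuredLLM takes the same config plus a JSON schema and resolves to parsed data instead of text.

const reply = await callLLM({
  provider: LLMProviders.anthropic_claude_haiku,
  apiKey: process.env.ANTHROPIC_API_KEY,
  useThinking: false,
  messages: [
    { role: "system", content: "You are a terse assistant." },
    { role: "user", content: "Summarize: the invoice total is $1,204.50." }
  ]
});
// reply has the shape { text, cost, inputTokens, outputTokens }.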
217
+ //#region src/eval/comparators/comparators.ts
65
218
  /** Checks if actual string contains a substring. */
66
219
  function contains(substring) {
67
220
  return (_expected, actual) => {
@@ -198,6 +351,103 @@ function within(config) {
198
351
  };
199
352
  };
200
353
  }
354
+ /** Schema for LLM comparison response. */
355
+ const LLM_COMPARE_SCHEMA = {
356
+ type: "object",
357
+ properties: {
358
+ passed: {
359
+ type: "boolean",
360
+ description: "Whether the actual value matches the expected value"
361
+ },
362
+ rationale: {
363
+ type: "string",
364
+ description: "Brief explanation of the comparison decision"
365
+ }
366
+ },
367
+ required: ["passed", "rationale"],
368
+ additionalProperties: false
369
+ };
370
+ const DEFAULT_LLM_COMPARE_SYSTEM_PROMPT = `Compare the following two values and determine if they are semantically equivalent.
371
+
372
+ Focus on whether they convey the same core meaning or information, even if expressed differently. Consider synonyms, paraphrasing, and stylistic variations as acceptable. Only mark as failed if there are substantial differences in the actual facts or meaning being conveyed.`;
373
+ const buildLLMCompareUserPrompt = (expected, actual) => `Expected value:
374
+ ${JSON.stringify(expected, null, 2)}
375
+
376
+ Actual value:
377
+ ${JSON.stringify(actual, null, 2)}`;
378
+ /**
379
+ * Uses an LLM to compare expected vs actual values.
380
+ * Returns a comparison result with rationale and cost tracking.
381
+ * Default provider: anthropic_claude_haiku (fastest, cheapest).
382
+ */
383
+ function llmCompare(config) {
384
+ const systemPrompt = config.systemPrompt ?? DEFAULT_LLM_COMPARE_SYSTEM_PROMPT;
385
+ return async (expected, actual, context) => {
386
+ try {
387
+ const apiKey = config.apiKey ?? context?.llmConfig?.apiKey;
388
+ if (!apiKey) throw new Error("llmCompare requires an apiKey. Either pass it directly to llmCompare() or set llmConfig.apiKey in eval config.");
389
+ const provider = config.provider ?? context?.llmConfig?.provider ?? LLMProviders.anthropic_claude_haiku;
390
+ const userPrompt = buildLLMCompareUserPrompt(expected, actual);
391
+ const result = await callStructuredLLM({
392
+ provider,
393
+ apiKey,
394
+ messages: [{
395
+ role: "system",
396
+ content: systemPrompt
397
+ }, {
398
+ role: "user",
399
+ content: userPrompt
400
+ }],
401
+ schema: LLM_COMPARE_SCHEMA
402
+ });
403
+ return {
404
+ passed: result.data.passed,
405
+ rationale: result.data.rationale,
406
+ cost: result.cost,
407
+ similarity: result.data.passed ? 1 : 0
408
+ };
409
+ } catch (error) {
410
+ return {
411
+ passed: false,
412
+ rationale: `LLM comparison failed: ${error instanceof Error ? error.message : String(error)}`,
413
+ cost: 0,
414
+ similarity: 0
415
+ };
416
+ }
417
+ };
418
+ }
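A hedged sketch of llmCompare used inside a comparators map; the field name is illustrative, and the API key can instead come from llmConfig.apiKey on the eval config, as the error message above notes.

const comparators = {
  summary: llmCompare({
    apiKey: process.env.ANTHROPIC_API_KEY,
    provider: LLMProviders.anthropic_claude_haiku // the default when omitted
  })
};
// Each invocation resolves to { passed, rationale, cost, similarity }.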
419
+ /**
420
+ * Marks a comparator or comparator config as unordered.
421
+ * When applied to an array field, items will be matched by similarity
422
+ * rather than index position (using Hungarian algorithm).
423
+ *
424
+ * @example
425
+ * // Unordered array of objects
426
+ * lineItems: unordered({
427
+ * description: name,
428
+ * price: within({ tolerance: 5 })
429
+ * })
430
+ *
431
+ * @example
432
+ * // Unordered array of primitives
433
+ * tags: unordered(exact)
434
+ *
435
+ * @example
436
+ * // When entire output is an array
437
+ * comparators: unordered({
438
+ * carrier: exact,
439
+ * premium: within({ tolerance: 0.05 })
440
+ * })
441
+ */
442
+ function unordered(comparator) {
443
+ const baseFunction = typeof comparator === "function" ? comparator : () => {
444
+ throw new Error("unordered() base function should not be called when nested comparators exist. This is likely a bug in the evaluation logic.");
445
+ };
446
+ return Object.assign(baseFunction, {
447
+ _unordered: true,
448
+ _nestedComparators: typeof comparator === "object" ? comparator : void 0
449
+ });
450
+ }
201
451
  /**
202
452
  * Deep equality comparison with cycle detection.
203
453
  * Uses WeakSet to track visited object pairs to prevent stack overflow on circular references.
@@ -235,198 +485,74 @@ function normalizeNumeric(value) {
235
485
  if (value == null || value === "") return null;
236
486
  const str = String(value);
237
487
  const isNegativeParens = /^\(.*\)$/.test(str.trim());
238
- let cleaned = str.replace(/[^0-9.\-]/g, "");
488
+ let cleaned = str.replace(/[^0-9.-]/g, "");
239
489
  if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
240
490
  const num = parseFloat(cleaned);
241
491
  return isNaN(num) ? null : num;
242
492
  }
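Reading the implementation above, normalizeNumeric keeps only digits, dots, and minus signs, and treats a fully parenthesized value as negative; a quick behavior sketch (not taken from the package's tests):

normalizeNumeric("$1,234.50");  // 1234.5
normalizeNumeric("(1,204.00)"); // -1204, parentheses read as a negative amount
normalizeNumeric("n/a");        // null, parseFloat of the stripped string is NaN
normalizeNumeric("");           // null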
243
493
 
244
494
  //#endregion
245
- //#region src/executors.ts
495
+ //#region src/eval/comparators/matching.ts
496
+ function isObject$1(value) {
497
+ return value !== null && typeof value === "object" && !Array.isArray(value);
498
+ }
246
499
  /**
247
- * Creates an executor that calls an HTTP endpoint.
248
- *
249
- * @example
250
- * ```ts
251
- * const executor = endpoint('https://api.example.com/workflow', {
252
- * headers: { Authorization: 'Bearer token' },
253
- * });
254
- * ```
500
+ * Calculate similarity score between two values (0.0 to 1.0).
501
+ * For arrays: recursively match and average similarity of paired elements.
502
+ * For objects: average similarity across all fields using comparator results.
503
+ * For primitives: uses exact comparison's similarity score.
255
504
  */
256
- function endpoint(url, config = {}) {
257
- const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
258
- return async (input, systemPrompt) => {
259
- const body = typeof input === "object" && input !== null ? {
260
- ...input,
261
- systemPrompt
262
- } : {
263
- input,
264
- systemPrompt
265
- };
266
- const controller = new AbortController();
267
- const timeoutId = setTimeout(() => controller.abort(), timeout);
268
- try {
269
- const response = await fetch(url, {
270
- method,
271
- headers: {
272
- "Content-Type": "application/json",
273
- ...headers
274
- },
275
- body: JSON.stringify(body),
276
- signal: controller.signal
277
- });
278
- clearTimeout(timeoutId);
279
- if (!response.ok) {
280
- const text = await response.text();
281
- throw new Error(`HTTP ${response.status}: ${text}`);
282
- }
283
- const data = await response.json();
284
- const additionalContext = mapAdditionalContext?.(data);
285
- const cost = mapCost?.(data) ?? 0;
286
- if (mapResponse) return {
287
- output: mapResponse(data),
288
- additionalContext,
289
- cost
290
- };
291
- return {
292
- output: data,
293
- additionalContext,
294
- cost
295
- };
296
- } catch (error) {
297
- clearTimeout(timeoutId);
298
- throw error;
299
- }
300
- };
505
+ async function getSimilarity(expected, actual, comparators) {
506
+ if (Array.isArray(expected) && Array.isArray(actual)) {
507
+ if (expected.length === 0 && actual.length === 0) return 1;
508
+ if (expected.length === 0 || actual.length === 0) return 0;
509
+ const result = await matchArrays(expected, actual, comparators);
510
+ let total$1 = 0;
511
+ for (const [expIdx, actIdx] of result.assignments) total$1 += await getSimilarity(expected[expIdx], actual[actIdx], comparators);
512
+ const maxLen = Math.max(expected.length, actual.length);
513
+ return total$1 / maxLen;
514
+ }
515
+ if (!isObject$1(expected) || !isObject$1(actual)) {
516
+ const result = exact(expected, actual);
517
+ return result.similarity ?? (result.passed ? 1 : 0);
518
+ }
519
+ const fields = Object.keys(expected).filter((key) => {
520
+ const comp = comparators[key];
521
+ return comp !== void 0 && typeof comp === "function";
522
+ });
523
+ if (fields.length === 0) return 1;
524
+ let total = 0;
525
+ for (const key of fields) {
526
+ const comparatorConfig = comparators[key];
527
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected[key], actual[key], {
528
+ expectedParent: expected,
529
+ actualParent: actual
530
+ });
531
+ total += result.similarity ?? (result.passed ? 1 : 0);
532
+ }
533
+ return total / fields.length;
301
534
  }
302
535
  /**
303
- * Creates an executor from a local function.
304
- *
305
- * @example
306
- * ```ts
307
- * const executor = fn({
308
- * fn: async (input, systemPrompt) => {
309
- * const result = await myLLMCall(input, systemPrompt);
310
- * return result;
311
- * },
312
- * });
313
- * ```
536
+ * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
537
+ * Pure matching - no pass/fail determination.
314
538
  *
315
- * @example With mapResponse to extract output from a richer response:
316
- * ```ts
317
- * const executor = fn({
318
- * fn: async (input, systemPrompt) => await startWorkflow({ ... }),
319
- * mapResponse: (result) => ({ documentType: result.documentType }),
320
- * mapCost: (result) => result.cost,
321
- * mapAdditionalContext: (result) => result.metadata,
322
- * });
323
- * ```
539
+ * @param expected - Array of expected items
540
+ * @param actual - Array of actual items
541
+ * @param comparators - Nested comparator configuration for array items
542
+ * @returns Matching result with assignments and unmatched indices
324
543
  */
325
- function fn(config) {
326
- return async (input, systemPrompt) => {
327
- const raw = await config.fn(input, systemPrompt);
328
- return {
329
- output: config.mapResponse ? config.mapResponse(raw) : raw,
330
- additionalContext: config.mapAdditionalContext?.(raw),
331
- cost: config.mapCost?.(raw) ?? 0
332
- };
333
- };
334
- }
335
- /**
336
- * Creates a mock executor for testing.
337
- * Can accept either:
338
- * - An array of outputs (returned in sequence, cycling if more calls than outputs)
339
- * - A function that maps input to output
340
- *
341
- * @example Array-based:
342
- * ```ts
343
- * const executor = mock([
344
- * { premium: 12500, policyType: 'claims-made' },
345
- * { premium: 8200, policyType: 'entity' },
346
- * ]);
347
- * ```
348
- *
349
- * @example Function-based:
350
- * ```ts
351
- * const executor = mock((input) => ({
352
- * id: input.id,
353
- * processed: true,
354
- * }));
355
- * ```
356
- */
357
- function mock(outputsOrFn) {
358
- if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
359
- return { output: outputsOrFn(input, systemPrompt) };
360
- };
361
- const outputs = outputsOrFn;
362
- if (outputs.length === 0) throw new Error("mock() requires at least one output");
363
- let callIndex = 0;
364
- return async () => {
365
- const output = outputs[callIndex % outputs.length];
366
- callIndex++;
367
- return { output };
368
- };
369
- }
370
-
371
- //#endregion
372
- //#region src/matching.ts
373
- function isObject$1(value) {
374
- return value !== null && typeof value === "object" && !Array.isArray(value);
375
- }
376
- /**
377
- * Calculate similarity score between two values (0.0 to 1.0).
378
- * For arrays: recursively match and average similarity of paired elements.
379
- * For objects: average similarity across all fields using comparator results.
380
- * For primitives: uses exact comparison's similarity score.
381
- */
382
- function getSimilarity(expected, actual, comparators) {
383
- if (Array.isArray(expected) && Array.isArray(actual)) {
384
- if (expected.length === 0 && actual.length === 0) return 1;
385
- if (expected.length === 0 || actual.length === 0) return 0;
386
- const result = matchArrays(expected, actual, comparators);
387
- let total$1 = 0;
388
- for (const [expIdx, actIdx] of result.assignments) total$1 += getSimilarity(expected[expIdx], actual[actIdx], comparators);
389
- const maxLen = Math.max(expected.length, actual.length);
390
- return total$1 / maxLen;
391
- }
392
- if (!isObject$1(expected) || !isObject$1(actual)) {
393
- const result = exact(expected, actual);
394
- return result.similarity ?? (result.passed ? 1 : 0);
395
- }
396
- const fields = Object.keys(expected).filter((key) => comparators[key]);
397
- if (fields.length === 0) return 1;
398
- let total = 0;
399
- for (const key of fields) {
400
- const comparator = comparators[key];
401
- const result = comparator(expected[key], actual[key], {
402
- expectedParent: expected,
403
- actualParent: actual
404
- });
405
- total += result.similarity ?? (result.passed ? 1 : 0);
406
- }
407
- return total / fields.length;
408
- }
409
- /**
410
- * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
411
- * Pure matching - no pass/fail determination.
412
- *
413
- * @param expected - Array of expected items
414
- * @param actual - Array of actual items
415
- * @param comparators - Map of field names to comparator functions
416
- * @returns Matching result with assignments and unmatched indices
417
- */
418
- function matchArrays(expected, actual, comparators = {}) {
419
- if (expected.length === 0) return {
420
- assignments: [],
421
- unmatchedExpected: [],
422
- unmatchedActual: [...Array(actual.length).keys()]
544
+ async function matchArrays(expected, actual, comparators = {}) {
545
+ if (expected.length === 0) return {
546
+ assignments: [],
547
+ unmatchedExpected: [],
548
+ unmatchedActual: [...Array(actual.length).keys()]
423
549
  };
424
550
  if (actual.length === 0) return {
425
551
  assignments: [],
426
552
  unmatchedExpected: [...Array(expected.length).keys()],
427
553
  unmatchedActual: []
428
554
  };
429
- const rawAssignments = munkres(expected.map((exp) => actual.map((act) => 1 - getSimilarity(exp, act, comparators))));
555
+ const rawAssignments = munkres(await Promise.all(expected.map(async (exp) => Promise.all(actual.map(async (act) => 1 - await getSimilarity(exp, act, comparators))))));
430
556
  const assignments = [];
431
557
  const matchedExp = /* @__PURE__ */ new Set();
432
558
  const matchedAct = /* @__PURE__ */ new Set();
@@ -443,212 +569,126 @@ function matchArrays(expected, actual, comparators = {}) {
443
569
  }
444
570
 
445
571
  //#endregion
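A small illustration of the Hungarian matching above, with invented rows and an exact comparator on a single field; the pairing shown is what a cost matrix of 1 - similarity would be expected to produce here.

const expected = [{ name: "Acme" }, { name: "Globex" }];
const actual = [{ name: "Globex" }, { name: "Acme" }, { name: "Initech" }];
const { assignments, unmatchedActual } = await matchArrays(expected, actual, { name: exact });
// assignments pairs indices by best similarity, e.g. [[0, 1], [1, 0]];
// unmatchedActual is left with [2] for the extra "Initech" row.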
446
- //#region src/eval.ts
572
+ //#region src/optimizer/ui.ts
447
573
  /**
448
- * Run all test cases and return results.
574
+ * UI utilities for beautiful console output
449
575
  */
450
- async function evaluate(config) {
451
- const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
452
- if (testCases.length === 0) throw new Error("testCases array cannot be empty");
453
- if (!executor) throw new Error("executor is required");
454
- if (!comparators && !comparatorOverride) throw new Error("either \"comparators\" (field mapping or single function) or \"comparatorOverride\" (whole-object) is required");
455
- const executeTestCase = async ({ input, expected }) => {
456
- try {
457
- const result = await executor(input, systemPrompt);
458
- let fields;
459
- if (comparatorOverride) {
460
- const compResult = comparatorOverride(expected, result.output);
461
- fields = { "": {
462
- passed: compResult.passed,
463
- expected,
464
- actual: result.output
465
- } };
466
- } else if (typeof comparators === "function") if (Array.isArray(expected)) fields = compareFields({
467
- expected,
468
- actual: result.output,
469
- comparators: { "": comparators },
470
- unorderedList: config.unorderedList
471
- });
472
- else {
473
- const compResult = comparators(expected, result.output, {
474
- expectedParent: void 0,
475
- actualParent: void 0
476
- });
477
- fields = { "": {
478
- ...compResult,
479
- expected,
480
- actual: result.output
481
- } };
482
- }
483
- else fields = compareFields({
484
- expected,
485
- actual: result.output,
486
- comparators,
487
- unorderedList: config.unorderedList
488
- });
489
- const passedFields = Object.values(fields).filter((f) => f.passed).length;
490
- const totalFields$1 = Object.values(fields).length;
491
- const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
492
- const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
493
- return {
494
- input,
495
- expected,
496
- actual: result.output,
497
- additionalContext: result.additionalContext,
498
- cost: result.cost ?? 0,
499
- passed: passed$1,
500
- fields,
501
- passedFields,
502
- totalFields: totalFields$1,
503
- passRate
504
- };
505
- } catch (error) {
506
- return {
507
- input,
508
- expected,
509
- actual: void 0,
510
- cost: 0,
511
- passed: false,
512
- fields: {},
513
- passedFields: 0,
514
- totalFields: 0,
515
- passRate: 0,
516
- error: error instanceof Error ? error.message : String(error)
517
- };
576
+ const theme = {
577
+ success: chalk.green,
578
+ error: chalk.red,
579
+ warning: chalk.yellow,
580
+ bold: chalk.bold,
581
+ dim: chalk.dim,
582
+ check: chalk.green(figures.tick),
583
+ cross: chalk.red(figures.cross),
584
+ warn: chalk.yellow(figures.warning),
585
+ bullet: chalk.dim(figures.bullet),
586
+ pointer: chalk.yellow(figures.pointer),
587
+ separator: chalk.dim(" · "),
588
+ divider: (label, width = 60) => {
589
+ const prefix = `━━━ ${label} `;
590
+ const remaining = Math.max(0, width - prefix.length);
591
+ return chalk.cyan.dim(prefix + "━".repeat(remaining));
592
+ }
593
+ };
594
+ let activeSpinner = null;
595
+ const spinner = {
596
+ start(text) {
597
+ if (activeSpinner) activeSpinner.stop();
598
+ activeSpinner = ora({
599
+ text,
600
+ spinner: "dots",
601
+ indent: 4
602
+ }).start();
603
+ return activeSpinner;
604
+ },
605
+ succeed(text) {
606
+ if (activeSpinner) {
607
+ activeSpinner.succeed(text);
608
+ activeSpinner = null;
518
609
  }
519
- };
520
- const rateLimitBatch = config.rateLimitBatch;
521
- let results;
522
- if (rateLimitBatch && rateLimitBatch > 0) {
523
- results = [];
524
- for (let i = 0; i < testCases.length; i += rateLimitBatch) {
525
- const batch = testCases.slice(i, i + rateLimitBatch);
526
- const batchResults = await Promise.all(batch.map(executeTestCase));
527
- results.push(...batchResults);
528
- const rateLimitPause = config.rateLimitPause;
529
- if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
610
+ },
611
+ fail(text) {
612
+ if (activeSpinner) {
613
+ activeSpinner.fail(text);
614
+ activeSpinner = null;
530
615
  }
531
- } else results = await Promise.all(testCases.map(executeTestCase));
532
- results.sort((a, b) => {
533
- if (a.passed !== b.passed) return a.passed ? 1 : -1;
534
- return a.passRate - b.passRate;
535
- });
536
- const passed = results.filter((r) => r.passed).length;
537
- const total = results.length;
538
- const successRate = total > 0 ? passed / total : 0;
539
- let correctFields = 0;
540
- let totalFields = 0;
541
- for (const r of results) {
542
- const fieldResults = Object.values(r.fields);
543
- totalFields += fieldResults.length;
544
- correctFields += fieldResults.filter((f) => f.passed).length;
616
+ },
617
+ stop() {
618
+ if (activeSpinner) {
619
+ activeSpinner.stop();
620
+ activeSpinner = null;
621
+ }
622
+ },
623
+ clear() {
624
+ if (activeSpinner) activeSpinner.clear();
625
+ },
626
+ isActive() {
627
+ return activeSpinner !== null;
545
628
  }
546
- const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
547
- const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
629
+ };
630
+ function createProgressTracker(label) {
631
+ let bar = null;
632
+ let startTime = 0;
633
+ let lastUpdate = 0;
634
+ const MIN_UPDATE_INTERVAL = 100;
548
635
  return {
549
- systemPrompt,
550
- testCases: results,
551
- passed,
552
- total,
553
- successRate,
554
- correctFields,
555
- totalFields,
556
- accuracy,
557
- cost
636
+ start(total) {
637
+ spinner.stop();
638
+ startTime = Date.now();
639
+ bar = new cliProgress.SingleBar({
640
+ format: ` {bar} {percentage}% {value}/{total} ${label} {duration_formatted}`,
641
+ barCompleteChar: "█",
642
+ barIncompleteChar: "░",
643
+ barsize: 20,
644
+ hideCursor: true,
645
+ clearOnComplete: false,
646
+ stopOnComplete: false,
647
+ forceRedraw: true,
648
+ fps: 10
649
+ });
650
+ bar.start(total, 0, { duration_formatted: "0s" });
651
+ },
652
+ update(current) {
653
+ const now = Date.now();
654
+ if (now - lastUpdate < MIN_UPDATE_INTERVAL && bar) {
655
+ if (current < bar.getTotal()) return;
656
+ }
657
+ lastUpdate = now;
658
+ if (bar) {
659
+ const elapsed = Math.round((now - startTime) / 1e3);
660
+ bar.update(current, { duration_formatted: `${elapsed}s` });
661
+ }
662
+ },
663
+ stop() {
664
+ if (bar) {
665
+ const elapsed = Math.round((Date.now() - startTime) / 1e3);
666
+ bar.update(bar.getTotal(), { duration_formatted: `${elapsed}s` });
667
+ bar.stop();
668
+ bar = null;
669
+ }
670
+ }
558
671
  };
559
672
  }
560
- /**
561
- * Recursively compare expected vs actual, returning field-level results.
562
- * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
563
- */
564
- function compareFields(opts) {
565
- const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, unorderedList = false } = opts;
566
- const results = {};
567
- const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
568
- if (Array.isArray(expected)) {
569
- if (!Array.isArray(actual)) return { [path$1]: {
570
- passed: false,
571
- expected,
572
- actual
573
- } };
574
- if (expected.length === 0) return {};
575
- let matchedPairs;
576
- if (unorderedList) matchedPairs = matchArrays(expected, actual, comparators).assignments;
577
- else {
578
- matchedPairs = [];
579
- for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
580
- }
581
- const matchedIndices = new Set(matchedPairs.map(([i]) => i));
582
- for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, compareFields({
583
- expected: expected[expIdx],
584
- actual: actual[actIdx],
585
- comparators,
586
- path: indexPath(expIdx),
587
- expectedParent,
588
- actualParent,
589
- unorderedList
590
- }));
591
- const arrayFieldName = getFieldName(path$1);
592
- const hasArrayComparator = arrayFieldName in comparators || arrayFieldName === "";
593
- for (let i = 0; i < expected.length; i++) {
594
- if (matchedIndices.has(i)) continue;
595
- const item = expected[i];
596
- if (isObject(item)) {
597
- for (const [field, value] of Object.entries(item)) if (field in comparators) results[`${indexPath(i)}.${field}`] = {
598
- passed: false,
599
- expected: value,
600
- actual: void 0
601
- };
602
- } else if (hasArrayComparator) results[indexPath(i)] = {
603
- passed: false,
604
- expected: item,
605
- actual: void 0
606
- };
607
- }
608
- return results;
609
- }
610
- if (isObject(expected)) {
611
- if (!isObject(actual)) return { [path$1]: {
612
- passed: false,
613
- expected,
614
- actual
615
- } };
616
- for (const [field, expValue] of Object.entries(expected)) {
617
- const fieldPath = path$1 ? `${path$1}.${field}` : field;
618
- Object.assign(results, compareFields({
619
- expected: expValue,
620
- actual: actual[field],
621
- comparators,
622
- path: fieldPath,
623
- expectedParent: expected,
624
- actualParent: actual,
625
- unorderedList
626
- }));
627
- }
628
- return results;
629
- }
630
- const fieldName = getFieldName(path$1);
631
- const comparator = comparators[fieldName] ?? (fieldName === "" ? exact : void 0);
632
- if (!comparator) return {};
633
- const result = comparator(expected, actual, {
634
- expectedParent,
635
- actualParent
636
- });
637
- return { [path$1]: {
638
- ...result,
639
- expected,
640
- actual
641
- } };
673
+ function formatCost(cost) {
674
+ return theme.dim(`$${cost.toFixed(4)}`);
642
675
  }
643
- function isObject(value) {
644
- return value !== null && typeof value === "object" && !Array.isArray(value);
676
+ function formatCostShort(cost) {
677
+ return theme.dim(`$${cost.toFixed(2)}`);
645
678
  }
646
- function getFieldName(path$1) {
647
- return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
679
+ function formatDuration(ms) {
680
+ const totalSeconds = Math.round(ms / 1e3);
681
+ if (totalSeconds < 60) return `${totalSeconds}s`;
682
+ const minutes = Math.floor(totalSeconds / 60);
683
+ const seconds = totalSeconds % 60;
684
+ return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
685
+ }
686
+ function formatPercentage(rate) {
687
+ return `${(rate * 100).toFixed(1)}%`;
648
688
  }
649
689
 
650
690
  //#endregion
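A rough sketch of how these console helpers compose inside the optimizer; the label and numbers are illustrative, not real output.

console.log(theme.divider("Iteration 1"));
spinner.start("Running evals...");
// ... work happens here ...
spinner.succeed("Evals complete");
console.log(`  ${theme.check} ${formatPercentage(0.95)} at ${formatCost(0.0123)} in ${formatDuration(42000)}`);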
651
- //#region src/optimizer-logging.ts
691
+ //#region src/optimizer/optimizer-logging.ts
652
692
  function formatMsCompact(ms) {
653
693
  const totalSeconds = Math.round(ms / 1e3);
654
694
  if (totalSeconds < 60) return `${totalSeconds}s`;
@@ -666,12 +706,75 @@ function formatTokensCompact(tokens) {
666
706
  if (tokens >= 1e3) return `${Math.round(tokens / 1e3)}K`;
667
707
  return String(tokens);
668
708
  }
709
+ /**
710
+ * Clear any active progress line before logging
711
+ * Call this before all console.log statements
712
+ */
713
+ function clearProgressLine() {
714
+ const width = process.stdout.columns || 80;
715
+ process.stdout.write("\r" + " ".repeat(width) + "\r");
716
+ }
717
+ /**
718
+ * Create a progress updater using cli-progress for beautiful output
719
+ */
720
+ function createProgressUpdater(label) {
721
+ let tracker = null;
722
+ let total = 0;
723
+ return {
724
+ update(completed, newTotal) {
725
+ if (!tracker) {
726
+ total = newTotal;
727
+ tracker = createProgressTracker(label);
728
+ tracker.start(total);
729
+ }
730
+ tracker.update(completed);
731
+ },
732
+ finish() {
733
+ if (tracker) {
734
+ tracker.stop();
735
+ tracker = null;
736
+ }
737
+ },
738
+ clear() {
739
+ clearProgressLine();
740
+ }
741
+ };
742
+ }
743
+ /**
744
+ * Track progress of Promise.allSettled with real-time updates
745
+ *
746
+ * @param promises Array of promises to track
747
+ * @param onProgress Callback called when each promise settles
748
+ * @returns Promise.allSettled result
749
+ */
750
+ async function trackPromiseProgress(promises, onProgress) {
751
+ if (promises.length === 0) return [];
752
+ let completed = 0;
753
+ const total = promises.length;
754
+ onProgress(0, total);
755
+ const wrappedPromises = promises.map((promise) => promise.then((value) => {
756
+ completed++;
757
+ onProgress(completed, total);
758
+ return {
759
+ status: "fulfilled",
760
+ value
761
+ };
762
+ }).catch((reason) => {
763
+ completed++;
764
+ onProgress(completed, total);
765
+ return {
766
+ status: "rejected",
767
+ reason
768
+ };
769
+ }));
770
+ return Promise.all(wrappedPromises);
771
+ }
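A self-contained sketch of trackPromiseProgress feeding createProgressUpdater, mirroring how evaluate() wires them together further down; the promises here are stand-ins for real executor calls.

const progress = createProgressUpdater("items");
const jobs = [1, 2, 3].map((n) => Promise.resolve(n * 2)); // stand-in async work
const settled = await trackPromiseProgress(jobs, (done, total) => progress.update(done, total));
progress.finish();
const values = settled.filter((r) => r.status === "fulfilled").map((r) => r.value); // [2, 4, 6]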
669
772
  function formatFailure(testCase) {
670
773
  const lines = [];
671
774
  lines.push(`Input: ${JSON.stringify(testCase.input, null, 2)}`);
672
775
  lines.push(`Expected: ${JSON.stringify(testCase.expected, null, 2)}`);
673
776
  lines.push(`Actual: ${JSON.stringify(testCase.actual, null, 2)}`);
674
- if (testCase.additionalContext) lines.push(`Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
777
+ if (testCase.additionalContext) lines.push(`Additional Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
675
778
  lines.push("");
676
779
  lines.push("Field-level failures:");
677
780
  for (const [fieldPath, result] of Object.entries(testCase.fields)) if (!result.passed) lines.push(` ${fieldPath || "(root)"}: expected ${JSON.stringify(result.expected)}, got ${JSON.stringify(result.actual)}`);
@@ -695,56 +798,98 @@ function computeTotals(iterations) {
695
798
  totalDuration
696
799
  };
697
800
  }
698
- function formatDurationForLog(ms) {
699
- const seconds = Math.round(ms / 1e3);
700
- if (seconds < 60) return `(${seconds}s)`;
701
- return `(${Math.floor(seconds / 60)}m ${seconds % 60}s)`;
801
+ function logOptimizerHeader(model, targetRate, testCount) {
802
+ spinner.stop();
803
+ console.log("");
804
+ console.log(theme.bold("Didactic Optimizer"));
805
+ console.log(` ${theme.dim("Model:")} ${model}${theme.separator}${theme.dim("Target:")} ${formatPercentage(targetRate)}${theme.separator}${theme.dim("Tests:")} ${testCount}`);
702
806
  }
703
807
  function logIterationStart(iterationLabel) {
704
- console.log(`\n=== Optimization Iteration ${iterationLabel} ===`);
808
+ spinner.stop();
809
+ clearProgressLine();
810
+ console.log("");
811
+ console.log(theme.divider(`Iteration ${iterationLabel}`));
812
+ console.log("");
705
813
  }
706
814
  function logEvaluationStart() {
707
- console.log(` Evaluating prompt...`);
815
+ spinner.stop();
816
+ clearProgressLine();
817
+ console.log(` ${theme.bold("Evaluating prompt")}`);
818
+ spinner.start("Running evals...");
708
819
  }
709
820
  function logEvaluationResult(result, cumulativeCost, durationMs) {
710
- console.log(` Result: ${result.passed}/${result.total} passed (${(result.successRate * 100).toFixed(1)}%) | Cost: $${result.cost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
821
+ spinner.stop();
822
+ clearProgressLine();
823
+ const successIcon = result.successRate >= .9 ? theme.check : result.successRate >= .5 ? theme.warn : theme.cross;
824
+ console.log(` ${successIcon} ${theme.bold(formatPercentage(result.successRate))} success rate ${theme.dim(`(${result.passed}/${result.total} passed)`)}`);
825
+ console.log(` ${theme.dim("Cost:")} ${formatCost(result.cost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
711
826
  }
712
827
  function logRegressionDetected(bestSuccessRate) {
713
- console.log(` → Regression detected (was ${(bestSuccessRate * 100).toFixed(1)}%)`);
828
+ spinner.stop();
829
+ clearProgressLine();
830
+ console.log(` ${theme.pointer} ${theme.warning("Regression")} ${theme.dim(`(was ${formatPercentage(bestSuccessRate)})`)}`);
714
831
  }
715
832
  function logTargetReached(targetSuccessRate) {
716
- console.log(` Target: ${(targetSuccessRate * 100).toFixed(0)}% | ✓ Target reached!`);
833
+ spinner.stop();
834
+ clearProgressLine();
835
+ console.log(` ${theme.check} ${theme.success("Target reached!")} ${theme.dim(`(${formatPercentage(targetSuccessRate)})`)}`);
717
836
  }
718
837
  function logTargetFailures(targetSuccessRate, failureCount) {
719
- console.log(` Target: ${(targetSuccessRate * 100).toFixed(0)}% | ${failureCount} failures to address`);
838
+ spinner.stop();
839
+ clearProgressLine();
840
+ console.log(` ${theme.cross} ${theme.error(`${failureCount} failures`)} to address ${theme.dim(`(target: ${formatPercentage(targetSuccessRate)})`)}`);
720
841
  }
721
842
  function logCostLimitReached(cumulativeCost) {
722
- console.log(` Cost limit reached ($${cumulativeCost.toFixed(2)})`);
843
+ spinner.stop();
844
+ clearProgressLine();
845
+ console.log(` ${theme.warn} ${theme.warning("Cost limit reached")} ${theme.dim(`($${cumulativeCost.toFixed(2)})`)}`);
723
846
  }
724
847
  function logPatchGenerationStart(failureCount) {
725
- console.log(``);
726
- console.log(` Generating ${failureCount} patches in parallel...`);
848
+ spinner.stop();
849
+ clearProgressLine();
850
+ console.log("");
851
+ console.log(` ${theme.bold("Generating patches")}`);
852
+ spinner.start(`Generating ${failureCount} patches in parallel...`);
727
853
  }
728
854
  function logPatchGenerationResult(patchCost, cumulativeCost, durationMs) {
729
- console.log(` Patches generated | Cost: $${patchCost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
855
+ spinner.stop();
856
+ clearProgressLine();
857
+ console.log(` ${theme.check} Patches generated${theme.separator}${theme.dim("Cost:")} ${formatCost(patchCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
730
858
  }
731
859
  function logMergeStart() {
732
- console.log(``);
733
- console.log(` Merging patches...`);
860
+ spinner.stop();
861
+ clearProgressLine();
862
+ console.log("");
863
+ console.log(` ${theme.bold("Merging patches")}`);
864
+ spinner.start("Merging patches...");
734
865
  }
735
866
  function logMergeResult(mergeCost, cumulativeCost, durationMs) {
736
- console.log(` Patches merged | Cost: $${mergeCost.toFixed(4)} | Total: $${cumulativeCost.toFixed(4)} ${formatDurationForLog(durationMs)}`);
867
+ spinner.stop();
868
+ clearProgressLine();
869
+ console.log(` ${theme.check} Merged${theme.separator}${theme.dim("Cost:")} ${formatCost(mergeCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
737
870
  }
738
871
  function logPatchGenerationFailures(failedCount, totalCount) {
739
- console.log(` ⚠ ${failedCount}/${totalCount} patch generations failed`);
872
+ spinner.stop();
873
+ clearProgressLine();
874
+ console.log(` ${theme.warn} ${theme.warning(`${failedCount}/${totalCount} patch generations failed`)}`);
740
875
  }
741
876
  function logOptimizationComplete(bestSuccessRate, targetSuccessRate, cumulativeCost) {
742
- console.log(`\n=== Optimization Complete ===`);
743
- console.log(`Best result: ${(bestSuccessRate * 100).toFixed(1)}% (target was ${(targetSuccessRate * 100).toFixed(0)}%)`);
744
- console.log(`Total cost: $${cumulativeCost.toFixed(4)}`);
877
+ spinner.stop();
878
+ clearProgressLine();
879
+ console.log("");
880
+ console.log(theme.divider("Complete"));
881
+ console.log("");
882
+ const targetMet = bestSuccessRate >= targetSuccessRate;
883
+ const icon = targetMet ? theme.check : theme.cross;
884
+ const rateColor = targetMet ? theme.success : theme.error;
885
+ console.log(` ${icon} ${theme.bold("Best:")} ${rateColor(formatPercentage(bestSuccessRate))}`);
886
+ console.log(` ${theme.dim("Target:")} ${formatPercentage(targetSuccessRate)}${theme.separator}${theme.dim("Total Cost:")} ${formatCostShort(cumulativeCost)}`);
745
887
  }
746
888
  function logLogsWritten(logPath) {
747
- console.log(`Logs written to: ${logPath}`);
889
+ spinner.stop();
890
+ clearProgressLine();
891
+ console.log(` ${theme.dim("Logs written to:")} ${logPath}`);
892
+ console.log("");
748
893
  }
749
894
  function generateConfigSection(ctx, testCaseCount) {
750
895
  const lines = [];
@@ -911,6 +1056,7 @@ function writeRawDataJson(folderPath, iterations, ctx, success) {
911
1056
  input: tc.input,
912
1057
  expected: tc.expected,
913
1058
  actual: tc.actual,
1059
+ additionalContext: tc.additionalContext,
914
1060
  fields: tc.fields
915
1061
  });
916
1062
  });
@@ -984,6 +1130,7 @@ function writeBestRunJson(folderPath, iterations, ctx) {
984
1130
  input: tc.input,
985
1131
  expected: tc.expected,
986
1132
  actual: tc.actual,
1133
+ additionalContext: tc.additionalContext,
987
1134
  failedFields: extractFailedFields(tc.fields)
988
1135
  });
989
1136
  else if (tc.passRate < 1) partialFailures.push({
@@ -992,13 +1139,15 @@ function writeBestRunJson(folderPath, iterations, ctx) {
992
1139
  input: tc.input,
993
1140
  expected: tc.expected,
994
1141
  actual: tc.actual,
1142
+ additionalContext: tc.additionalContext,
995
1143
  failedFields: extractFailedFields(tc.fields)
996
1144
  });
997
1145
  else successes.push({
998
1146
  testIndex: testIdx,
999
1147
  input: tc.input,
1000
1148
  expected: tc.expected,
1001
- actual: tc.actual
1149
+ actual: tc.actual,
1150
+ additionalContext: tc.additionalContext
1002
1151
  });
1003
1152
  });
1004
1153
  const report = {
@@ -1035,29 +1184,402 @@ function writeBestRunJson(folderPath, iterations, ctx) {
1035
1184
  };
1036
1185
  fs.writeFileSync(bestRunPath, JSON.stringify(report, null, 2), "utf-8");
1037
1186
  }
1038
- function writeFinalLogs(logPath, iterationLogs, logContext, success) {
1039
- const folderPath = path.dirname(logPath);
1040
- if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
1041
- const content = generateLogContent(iterationLogs, logContext, success);
1042
- fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
1043
- writePromptsFile(folderPath, iterationLogs, logContext);
1044
- writeRawDataJson(folderPath, iterationLogs, logContext, success);
1045
- writeBestRunJson(folderPath, iterationLogs, logContext);
1187
+ function writeFinalLogs(logPath, iterationLogs, logContext, success) {
1188
+ const folderPath = path.dirname(logPath);
1189
+ if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
1190
+ const content = generateLogContent(iterationLogs, logContext, success);
1191
+ fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
1192
+ writePromptsFile(folderPath, iterationLogs, logContext);
1193
+ writeRawDataJson(folderPath, iterationLogs, logContext, success);
1194
+ writeBestRunJson(folderPath, iterationLogs, logContext);
1195
+ }
1196
+
1197
+ //#endregion
1198
+ //#region src/eval/eval-logging.ts
1199
+ /**
1200
+ * Write evaluation results to rawData.json
1201
+ *
1202
+ * Synchronous writes are intentional - logging runs after evaluation completes
1203
+ * and errors are caught. This avoids async complexity in the calling code.
1204
+ */
1205
+ function writeEvalLogs(logPath, result, durationMs, perTestThreshold) {
1206
+ try {
1207
+ const dir = path.dirname(logPath);
1208
+ if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
1209
+ const report = {
1210
+ metadata: {
1211
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1212
+ systemPrompt: result.systemPrompt,
1213
+ testCaseCount: result.total,
1214
+ perTestThreshold: perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD
1215
+ },
1216
+ summary: {
1217
+ passed: result.passed,
1218
+ total: result.total,
1219
+ successRate: result.successRate,
1220
+ correctFields: result.correctFields,
1221
+ totalFields: result.totalFields,
1222
+ accuracy: result.accuracy,
1223
+ executorCost: result.cost,
1224
+ comparatorCost: result.comparatorCost,
1225
+ totalCost: result.cost + result.comparatorCost,
1226
+ durationMs
1227
+ },
1228
+ testCases: result.testCases.map((tc, index) => ({
1229
+ index,
1230
+ passed: tc.passed,
1231
+ passRate: tc.passRate,
1232
+ input: tc.input,
1233
+ expected: tc.expected,
1234
+ actual: tc.actual,
1235
+ additionalContext: tc.additionalContext,
1236
+ executorCost: tc.cost ?? 0,
1237
+ comparatorCost: tc.comparatorCost ?? 0,
1238
+ error: tc.error,
1239
+ fields: tc.fields
1240
+ }))
1241
+ };
1242
+ fs.writeFileSync(logPath, JSON.stringify(report, null, 2), "utf-8");
1243
+ } catch (error) {
1244
+ console.error(`Failed to write eval logs to ${logPath}:`, error instanceof Error ? error.message : String(error));
1245
+ }
1246
+ }
1247
+
1248
+ //#endregion
1249
+ //#region src/eval/eval.ts
1250
+ /**
1251
+ * Run all test cases and return results.
1252
+ */
1253
+ async function evaluate(config) {
1254
+ const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
1255
+ if (testCases.length === 0) throw new Error("testCases array cannot be empty");
1256
+ if (!executor) throw new Error("executor is required");
1257
+ const startTime = Date.now();
1258
+ const logPath = config.storeLogs ? typeof config.storeLogs === "string" ? config.storeLogs : `./didactic-logs/eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}/rawData.json` : void 0;
1259
+ const executeTestCase = async ({ input, expected }) => {
1260
+ try {
1261
+ const result = await executor(input, systemPrompt);
1262
+ let fields;
1263
+ if (comparatorOverride) {
1264
+ const compResult = await comparatorOverride(expected, result.output);
1265
+ fields = { "": {
1266
+ passed: compResult.passed,
1267
+ expected,
1268
+ actual: result.output
1269
+ } };
1270
+ } else {
1271
+ let comparatorConfig;
1272
+ if (!comparators) comparatorConfig = { "": exact };
1273
+ else if (typeof comparators === "function") comparatorConfig = { "": comparators };
1274
+ else comparatorConfig = comparators;
1275
+ fields = await compareFields({
1276
+ expected,
1277
+ actual: result.output,
1278
+ comparators: comparatorConfig,
1279
+ llmConfig: config.llmConfig
1280
+ });
1281
+ }
1282
+ const passedFields = Object.values(fields).filter((f) => f.passed).length;
1283
+ const totalFields$1 = Object.values(fields).length;
1284
+ const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
1285
+ const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
1286
+ const comparatorCost$1 = Object.values(fields).reduce((sum, field) => sum + (field.cost ?? 0), 0);
1287
+ return {
1288
+ input,
1289
+ expected,
1290
+ actual: result.output,
1291
+ additionalContext: result.additionalContext,
1292
+ cost: result.cost ?? 0,
1293
+ comparatorCost: comparatorCost$1,
1294
+ passed: passed$1,
1295
+ fields,
1296
+ passedFields,
1297
+ totalFields: totalFields$1,
1298
+ passRate
1299
+ };
1300
+ } catch (error) {
1301
+ return {
1302
+ input,
1303
+ expected,
1304
+ actual: void 0,
1305
+ cost: 0,
1306
+ comparatorCost: 0,
1307
+ passed: false,
1308
+ fields: {},
1309
+ passedFields: 0,
1310
+ totalFields: 0,
1311
+ passRate: 0,
1312
+ error: error instanceof Error ? error.message : String(error)
1313
+ };
1314
+ }
1315
+ };
1316
+ const rateLimitBatch = config.rateLimitBatch;
1317
+ let results;
1318
+ if (rateLimitBatch && rateLimitBatch > 0) {
1319
+ results = [];
1320
+ const progress = createProgressUpdater("evals");
1321
+ for (let i = 0; i < testCases.length; i += rateLimitBatch) {
1322
+ const batch = testCases.slice(i, i + rateLimitBatch);
1323
+ const batchResults = await Promise.all(batch.map(executeTestCase));
1324
+ results.push(...batchResults);
1325
+ progress.update(results.length, testCases.length);
1326
+ const rateLimitPause = config.rateLimitPause;
1327
+ if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
1328
+ }
1329
+ progress.finish();
1330
+ } else {
1331
+ const progress = createProgressUpdater("evals");
1332
+ results = (await trackPromiseProgress(testCases.map((tc) => executeTestCase(tc)), (completed, total$1) => progress.update(completed, total$1))).map((r) => r.value);
1333
+ progress.finish();
1334
+ }
1335
+ results.sort((a, b) => {
1336
+ if (a.passed !== b.passed) return a.passed ? 1 : -1;
1337
+ return a.passRate - b.passRate;
1338
+ });
1339
+ const passed = results.filter((r) => r.passed).length;
1340
+ const total = results.length;
1341
+ const successRate = total > 0 ? passed / total : 0;
1342
+ let correctFields = 0;
1343
+ let totalFields = 0;
1344
+ for (const r of results) {
1345
+ const fieldResults = Object.values(r.fields);
1346
+ totalFields += fieldResults.length;
1347
+ correctFields += fieldResults.filter((f) => f.passed).length;
1348
+ }
1349
+ const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
1350
+ const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
1351
+ const comparatorCost = results.reduce((sum, r) => sum + (r.comparatorCost ?? 0), 0);
1352
+ const durationMs = Date.now() - startTime;
1353
+ const logFolder = logPath ? path.dirname(logPath) : void 0;
1354
+ const evalResult = {
1355
+ systemPrompt,
1356
+ testCases: results,
1357
+ passed,
1358
+ total,
1359
+ successRate,
1360
+ correctFields,
1361
+ totalFields,
1362
+ accuracy,
1363
+ cost,
1364
+ comparatorCost,
1365
+ ...logFolder && { logFolder }
1366
+ };
1367
+ if (logPath) writeEvalLogs(logPath, evalResult, durationMs, config.perTestThreshold);
1368
+ return evalResult;
1369
+ }
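Putting the pieces together, a hedged sketch of calling evaluate(); the field names and the myWorkflow helper are invented for illustration, and any async (input, systemPrompt) => ({ output, cost?, additionalContext? }) function satisfies the executor contract.

const result = await evaluate({
  systemPrompt: "Extract the carrier and premium from the quote.",
  testCases: [
    { input: { quote: "Acme Mutual, $1,250" }, expected: { carrier: "Acme Mutual", premium: 1250 } }
  ],
  executor: async (input, systemPrompt) => ({ output: await myWorkflow(input, systemPrompt), cost: 0 }), // hypothetical workflow call
  comparators: { carrier: exact, premium: within({ tolerance: 5 }) },
  storeLogs: true // writes rawData.json under ./didactic-logs/eval_<timestamp>_<id>/
});
console.log(result.successRate, result.cost + result.comparatorCost);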
1370
+ /**
1371
+ * Recursively compare expected vs actual, returning field-level results.
1372
+ * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
1373
+ */
1374
+ async function compareFields(opts) {
1375
+ const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, llmConfig } = opts;
1376
+ const results = {};
1377
+ const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
1378
+ if (Array.isArray(expected)) {
1379
+ if (!Array.isArray(actual)) return { [path$1]: {
1380
+ passed: false,
1381
+ expected,
1382
+ actual
1383
+ } };
1384
+ if (expected.length === 0) return {};
1385
+ const fieldComparator = comparators[getFieldName(path$1)];
1386
+ const isUnordered = fieldComparator && typeof fieldComparator === "function" && "_unordered" in fieldComparator && fieldComparator._unordered === true;
1387
+ let itemComparators;
1388
+ if (isUnordered) itemComparators = fieldComparator._nestedComparators || comparators;
1389
+ else if (fieldComparator && typeof fieldComparator === "object" && !("_unordered" in fieldComparator)) itemComparators = fieldComparator;
1390
+ else itemComparators = comparators;
1391
+ let matchedPairs;
1392
+ if (isUnordered) matchedPairs = (await matchArrays(expected, actual, itemComparators)).assignments;
1393
+ else {
1394
+ matchedPairs = [];
1395
+ for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
1396
+ }
1397
+ const matchedIndices = new Set(matchedPairs.map(([i]) => i));
1398
+ for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, await compareFields({
1399
+ expected: expected[expIdx],
1400
+ actual: actual[actIdx],
1401
+ comparators: itemComparators,
1402
+ path: indexPath(expIdx),
1403
+ expectedParent,
1404
+ actualParent,
1405
+ llmConfig
1406
+ }));
1407
+ const hasArrayComparator = fieldComparator !== void 0;
1408
+ for (let i = 0; i < expected.length; i++) {
1409
+ if (matchedIndices.has(i)) continue;
1410
+ const item = expected[i];
1411
+ if (isObject(item)) {
1412
+ for (const [field, value] of Object.entries(item)) if (field in itemComparators) results[`${indexPath(i)}.${field}`] = {
1413
+ passed: false,
1414
+ expected: value,
1415
+ actual: void 0
1416
+ };
1417
+ } else if (hasArrayComparator) results[indexPath(i)] = {
1418
+ passed: false,
1419
+ expected: item,
1420
+ actual: void 0
1421
+ };
1422
+ }
1423
+ return results;
1424
+ }
1425
+ if (isObject(expected)) {
1426
+ if (!isObject(actual)) return { [path$1]: {
1427
+ passed: false,
1428
+ expected,
1429
+ actual
1430
+ } };
1431
+ for (const [field, expValue] of Object.entries(expected)) {
1432
+ const fieldPath = path$1 ? `${path$1}.${field}` : field;
1433
+ const fieldConfig = comparators[field];
1434
+ if (fieldConfig === void 0) continue;
1435
+ let fieldComparators;
1436
+ if (fieldConfig && typeof fieldConfig === "object" && !("_unordered" in fieldConfig)) fieldComparators = fieldConfig;
1437
+ else fieldComparators = comparators;
1438
+ Object.assign(results, await compareFields({
1439
+ expected: expValue,
1440
+ actual: actual[field],
1441
+ comparators: fieldComparators,
1442
+ path: fieldPath,
1443
+ expectedParent: expected,
1444
+ actualParent: actual,
1445
+ llmConfig
1446
+ }));
1447
+ }
1448
+ return results;
1449
+ }
1450
+ const fieldName = getFieldName(path$1);
1451
+ let comparatorConfig = comparators[fieldName];
1452
+ if (!comparatorConfig && fieldName === "") comparatorConfig = exact;
1453
+ if (!comparatorConfig) return {};
1454
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected, actual, {
1455
+ expectedParent,
1456
+ actualParent,
1457
+ llmConfig
1458
+ });
1459
+ return { [path$1]: {
1460
+ ...result,
1461
+ expected,
1462
+ actual
1463
+ } };
1464
+ }
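To illustrate the path patterns named in the docblock above, a nested comparators object produces keys like the following; the data and field names are invented.

const fields = await compareFields({
  expected: { quotes: [{ carrier: "Acme", premium: 100 }] },
  actual: { quotes: [{ carrier: "Acme", premium: 104 }] },
  comparators: { quotes: { carrier: exact, premium: within({ tolerance: 5 }) } }
});
// fields is keyed by path, e.g. "quotes[0].carrier" and "quotes[0].premium",
// each mapping to { passed, expected, actual, ... }.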
1465
+ function isObject(value) {
1466
+ return value !== null && typeof value === "object" && !Array.isArray(value);
1467
+ }
1468
+ function getFieldName(path$1) {
1469
+ return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
1470
+ }
1471
+
1472
+ //#endregion
1473
+ //#region src/optimizer/prompts.ts
1474
+ /**
1475
+ * Default system prompt for patch generation.
1476
+ * Analyzes failures and suggests specific, focused changes to improve the prompt.
1477
+ */
1478
+ const DEFAULT_PATCH_SYSTEM_PROMPT = `
1479
+ You are optimizing a system prompt for an LLM workflow.
1480
+ Analyze the failure and suggest a specific, focused change to improve the prompt.
1481
+ Do NOT overfit. Be generalizable.
1482
+
1483
+ <examples>
1484
+ VERY IMPORTANT, CRITICAL!!!
1485
+ Examples MUST be anonymized.
1486
+ NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
1487
+ - example: (for an invoice processor)
1488
+ - task: extract data from parsed invoices
1489
+ - failure context: (returned expected: true, actual: false)
1490
+ - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
1491
+
1492
+ - example: (for a calendar app)
1493
+ - task: extract cost from calendar event
1494
+ - failure context: (cost expected: 123.45, actual: 167.89)
1495
+ - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction; you should look for the expected value in the context and determine how the prompt could be modified to achieve the expected output.)
1496
+ </examples>
1497
+ `;
1498
+ /**
1499
+ * Default system prompt for merging patches.
1500
+ * Combines multiple patches into a coherent system prompt.
1501
+ */
1502
+ const DEFAULT_MERGE_SYSTEM_PROMPT = `
1503
+ You are an expert LLM prompt editor.
1504
+ You are merging improvements into a system prompt.
1505
+ Incorporate the suggestions while keeping the prompt clear and coherent.
1506
+ `;
1507
+ /**
1508
+ * Builds the user prompt for patch generation.
1509
+ * Formats the failure context and current prompt for the LLM.
1510
+ */
1511
+ function buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures) {
1512
+ let userContent = `
1513
+ Current system prompt:
1514
+ ---
1515
+ ${currentPrompt}
1516
+ ---
1517
+
1518
+ A test case failed:
1519
+ ${formatFailure(failure)}
1520
+ `;
1521
+ if (previousBetterPrompt) {
1522
+ const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
1523
+ userContent += `
1524
+ Note: The current prompt is a REGRESSION from a better-performing version.
1525
+ Previous (better) prompt for reference:
1526
+ ---
1527
+ ${previousBetterPrompt}
1528
+ ---
1529
+
1530
+ The failures the better prompt had:
1531
+ ${failuresContext}
1532
+
1533
+ Your changes introduced new failures instead of fixing the above.
1534
+ Analyze what changed between the two prompts that might have caused this regression.
1535
+ Are there any new failures that were not present in the previous better prompt?
1536
+ Are there any failures that were present in the previous better prompt but not in the current prompt?
1537
+ Did any of our patches contradict any of the new failures?
1538
+ `;
1539
+ }
1540
+ userContent += `
1541
+ Suggest a specific change to the system prompt that would fix this failure.
1542
+ Be concise. Output ONLY the suggested patch/change, not the full prompt.
1543
+ DO NOT overfit the prompt to the test case.
1544
+ Generalize examples if you choose to use them.
1545
+ `;
1546
+ return userContent;
1547
+ }
1548
+ /**
1549
+ * Builds the user prompt for merging patches.
1550
+ * Formats the current prompt and suggested patches for the LLM.
1551
+ */
1552
+ function buildMergeUserPrompt(patches, currentPrompt) {
1553
+ return `
1554
+ Current prompt:
1555
+ ---
1556
+ ${currentPrompt}
1557
+ ---
1558
+
1559
+ Suggested improvements:
1560
+ ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
1561
+
1562
+ Create a single improved system prompt that incorporates these suggestions.
1563
+ Be mindful of the size of the new prompt.
1564
+ Use discretion when merging the patches; if you see duplicate information, emphasize it but don't repeat it.
1565
+ Output ONLY the new system prompt, nothing else.
1566
+ Respect enums.
1567
+ `;
1046
1568
  }
1047
1569
 
1048
1570
  //#endregion
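A minimal sketch of overriding these defaults when calling `optimize` — `patchSystemPrompt` and `mergeSystemPrompt` are read by `generatePatch`/`mergePatches` below, while the other option values here are purely illustrative:

```ts
// Sketch only: optimize() requires apiKey and a targetSuccessRate between 0 and 1 (validated below).
const result = await optimize(evalConfig, {
  provider: LLMProviders.anthropic_claude_opus,
  apiKey: process.env.ANTHROPIC_API_KEY,
  targetSuccessRate: 0.9,
  maxIterations: 3,
  // When omitted, these fall back to DEFAULT_PATCH_SYSTEM_PROMPT / DEFAULT_MERGE_SYSTEM_PROMPT above.
  patchSystemPrompt: "You are optimizing a system prompt... (custom patch instructions)",
  mergeSystemPrompt: "You are an expert LLM prompt editor... (custom merge instructions)",
});
```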
1049
- //#region src/optimizer.ts
1571
+ //#region src/optimizer/optimizer.ts
1050
1572
  async function optimize(evalConfig, config) {
1051
1573
  if (!config.apiKey) throw new Error("apiKey is required");
1052
- if (!config.systemPrompt) throw new Error("systemPrompt is required");
1053
1574
  if (config.targetSuccessRate < 0 || config.targetSuccessRate > 1) throw new Error("targetSuccessRate must be between 0 and 1");
1054
1575
  const iterationLogs = [];
1055
1576
  const maxIterations = config.maxIterations ?? (config.maxCost !== void 0 ? Infinity : 5);
1056
1577
  const startTime = /* @__PURE__ */ new Date();
1578
+ const model = PROVIDER_SPECS[config.provider].model;
1057
1579
  const logContext = {
1058
1580
  config,
1059
1581
  startTime,
1060
- model: PROVIDER_SPECS[config.provider].model,
1582
+ model,
1061
1583
  perTestThreshold: evalConfig.perTestThreshold,
1062
1584
  rateLimitBatch: evalConfig.rateLimitBatch,
1063
1585
  rateLimitPause: evalConfig.rateLimitPause
@@ -1114,6 +1636,8 @@ async function optimize(evalConfig, config) {
1114
1636
  totalCost: cumulativeCost
1115
1637
  };
1116
1638
  };
1639
+ const testCount = evalConfig.testCases?.length ?? 0;
1640
+ logOptimizerHeader(model, config.targetSuccessRate, testCount);
1117
1641
  for (let i = 1; i <= maxIterations; i++) {
1118
1642
  const iterationStart = Date.now();
1119
1643
  let iterInputTokens = 0;
@@ -1127,7 +1651,7 @@ async function optimize(evalConfig, config) {
1127
1651
  });
1128
1652
  cumulativeCost += result.cost;
1129
1653
  logEvaluationResult(result, cumulativeCost, Date.now() - evalStart);
1130
- const regressed = i > 1 && result.successRate < bestSuccessRate;
1654
+ const regressed = i > 1 && result.successRate <= bestSuccessRate;
1131
1655
  if (regressed) logRegressionDetected(bestSuccessRate);
1132
1656
  if (result.successRate > bestSuccessRate) {
1133
1657
  bestSuccessRate = result.successRate;
@@ -1140,10 +1664,6 @@ async function optimize(evalConfig, config) {
1140
1664
  return finalizeOptimization(true, currentPrompt);
1141
1665
  }
1142
1666
  const failures = result.testCases.filter((tc) => !tc.passed);
1143
- if (failures.length === 0) {
1144
- recordIteration(i, currentPrompt, result, result.cost, Date.now() - iterationStart, iterInputTokens, iterOutputTokens);
1145
- return finalizeOptimization(true, currentPrompt);
1146
- }
1147
1667
  logTargetFailures(config.targetSuccessRate, failures.length);
1148
1668
  if (config.maxCost !== void 0 && cumulativeCost >= config.maxCost) {
1149
1669
  logCostLimitReached(cumulativeCost);
@@ -1152,7 +1672,9 @@ async function optimize(evalConfig, config) {
1152
1672
  }
1153
1673
  logPatchGenerationStart(failures.length);
1154
1674
  const patchStart = Date.now();
1155
- const patchSettled = await Promise.allSettled(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)));
1675
+ const patchProgress = createProgressUpdater("patches");
1676
+ const patchSettled = await trackPromiseProgress(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)), (completed, total) => patchProgress.update(completed, total));
1677
+ patchProgress.finish();
1156
1678
  const patchResults = patchSettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
1157
1679
  const failedPatchCount = patchSettled.filter((r) => r.status === "rejected").length;
1158
1680
  if (failedPatchCount > 0) logPatchGenerationFailures(failedPatchCount, failures.length);
@@ -1192,154 +1714,165 @@ async function optimize(evalConfig, config) {
1192
1714
  }
1193
1715
  return finalizeOptimization(false, bestPrompt);
1194
1716
  }
1195
- async function callLLM(messages, config, useThinking = false) {
1196
- const spec = PROVIDER_SPECS[config.provider];
1197
- try {
1198
- if (config.provider.startsWith("anthropic")) {
1199
- const client = new Anthropic({ apiKey: config.apiKey });
1200
- const streamOptions = {
1201
- model: spec.model,
1202
- max_tokens: spec.maxTokens,
1203
- system: messages.find((m) => m.role === "system")?.content,
1204
- messages: messages.filter((m) => m.role !== "system").map((m) => ({
1205
- role: m.role,
1206
- content: m.content
1207
- }))
1208
- };
1209
- if (useThinking) streamOptions.thinking = {
1210
- type: "enabled",
1211
- budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
1212
- };
1213
- const finalMessage = await client.messages.stream(streamOptions).finalMessage();
1214
- const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
1215
- const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
1216
- const inputTokens = finalMessage.usage.input_tokens;
1217
- const outputTokens = finalMessage.usage.output_tokens;
1218
- return {
1219
- text,
1220
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
1221
- inputTokens,
1222
- outputTokens
1223
- };
1224
- }
1225
- if (config.provider.startsWith("openai")) {
1226
- const client = new OpenAI({ apiKey: config.apiKey });
1227
- const completionOptions = {
1228
- model: spec.model,
1229
- messages: messages.map((m) => ({
1230
- role: m.role,
1231
- content: m.content
1232
- })),
1233
- max_completion_tokens: spec.maxTokens
1234
- };
1235
- if (useThinking) completionOptions.reasoning_effort = "xhigh";
1236
- const response = await client.chat.completions.create(completionOptions);
1237
- const text = response.choices[0].message.content ?? "";
1238
- const inputTokens = response.usage?.prompt_tokens ?? 0;
1239
- const outputTokens = response.usage?.completion_tokens ?? 0;
1240
- return {
1241
- text,
1242
- cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
1243
- inputTokens,
1244
- outputTokens
1245
- };
1246
- }
1247
- throw new Error(`Unsupported provider: ${config.provider}`);
1248
- } catch (error) {
1249
- const message = error instanceof Error ? error.message : String(error);
1250
- throw new Error(`LLM call failed (${spec.model}): ${message}`);
1251
- }
1252
- }
1253
1717
  async function generatePatch(failure, currentPrompt, config, previousBetterPrompt, previousBetterPromptFailures) {
1254
- let userContent = `
1255
- Current system prompt:
1256
- ---
1257
- ${currentPrompt}
1258
- ---
1259
-
1260
- A test case failed:
1261
- ${formatFailure(failure)}
1262
- `;
1263
- if (previousBetterPrompt) {
1264
- const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
1265
- userContent += `
1266
- Note: The current prompt is a REGRESSION from a better-performing version.
1267
- Previous (better) prompt for reference:
1268
- ---
1269
- ${previousBetterPrompt}
1270
- ---
1271
-
1272
- The failures the better prompt had:
1273
- ${failuresContext}
1274
-
1275
- Your changes introduced new failures instead of fixing the above.
1276
- Analyze what changed between the two prompts that might have caused this regression.
1277
- Are there any new failures that were not present in the previous better prompt?
1278
- Are there any failures that were present in the previous better prompt but not in the current prompt?
1279
- Did any of our patches contradict any of the new failures?
1280
- `;
1281
- }
1282
- userContent += `
1283
- Suggest a specific change to the system prompt that would fix this failure.
1284
- Be concise. Output ONLY the suggested patch/change, not the full prompt.
1285
- DO NOT overfit the prompt to the test case.
1286
- Generalize examples if you choose to use them.
1287
- `;
1288
- return callLLM([{
1718
+ const userContent = buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures);
1719
+ const messages = [{
1289
1720
  role: "system",
1290
- content: `
1291
- 'You are optimizing a system prompt for an LLM workflow.
1292
- Analyze the failure and suggest a specific, focused change to improve the prompt.
1293
- Do NOT overfit. Be generalizable.
1294
-
1295
- <examples>
1296
- VERY IMPORTANT, CRITICAL!!!
1297
- Examples MUST be anonymized.
1298
- NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
1299
- - example: (for an invoice processor)
1300
- - task: extract data from parsed invoices
1301
- - failure context: (returned expected: true, actual: false)
1302
- - prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
1303
-
1304
- - example: (for a calendar app)
1305
- - task: extract cost from calendar event
1306
- - failure context: (cost expected: 123.45, actual: 167.89)
1307
- - prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
1308
- </examples>
1309
- `
1721
+ content: config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT
1310
1722
  }, {
1311
1723
  role: "user",
1312
1724
  content: userContent
1313
- }], config, config.thinking ?? false);
1725
+ }];
1726
+ return callLLM({
1727
+ provider: config.provider,
1728
+ apiKey: config.apiKey,
1729
+ messages,
1730
+ useThinking: config.thinking ?? false
1731
+ });
1314
1732
  }
1315
1733
  async function mergePatches(patches, currentPrompt, config) {
1316
- const systemContent = `
1317
- You are an expert LLM prompt editor.
1318
- You are merging improvements into a system prompt.
1319
- Incorporate the suggestions while keeping the prompt clear and coherent.
1320
- `;
1321
- const userContent = `
1322
- Current prompt:
1323
- ---
1324
- ${currentPrompt}
1325
- ---
1326
-
1327
- Suggested improvements:
1328
- ${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
1329
-
1330
- Create a single improved system prompt that incorporates these suggestions.
1331
- Be mindful of the size of the new prompt.
1332
- Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
1333
- Output ONLY the new system prompt, nothing else.
1334
- Respect enums.
1335
- `;
1336
- return callLLM([{
1734
+ const systemContent = config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT;
1735
+ const userContent = buildMergeUserPrompt(patches, currentPrompt);
1736
+ const messages = [{
1337
1737
  role: "system",
1338
1738
  content: systemContent
1339
1739
  }, {
1340
1740
  role: "user",
1341
1741
  content: userContent
1342
- }], config, config.thinking ?? false);
1742
+ }];
1743
+ return callLLM({
1744
+ provider: config.provider,
1745
+ apiKey: config.apiKey,
1746
+ messages,
1747
+ useThinking: config.thinking ?? false
1748
+ });
1749
+ }
1750
+
1751
+ //#endregion
1752
+ //#region src/eval/executors.ts
1753
+ /**
1754
+ * Creates an executor that calls an HTTP endpoint.
1755
+ *
1756
+ * @example
1757
+ * ```ts
1758
+ * const executor = endpoint('https://api.example.com/workflow', {
1759
+ * headers: { Authorization: 'Bearer token' },
1760
+ * });
1761
+ * ```
1762
+ */
1763
+ function endpoint(url, config = {}) {
1764
+ const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
1765
+ return async (input, systemPrompt) => {
1766
+ const body = typeof input === "object" && input !== null ? {
1767
+ ...input,
1768
+ systemPrompt
1769
+ } : {
1770
+ input,
1771
+ systemPrompt
1772
+ };
1773
+ const controller = new AbortController();
1774
+ const timeoutId = setTimeout(() => controller.abort(), timeout);
1775
+ try {
1776
+ const response = await fetch(url, {
1777
+ method,
1778
+ headers: {
1779
+ "Content-Type": "application/json",
1780
+ ...headers
1781
+ },
1782
+ body: JSON.stringify(body),
1783
+ signal: controller.signal
1784
+ });
1785
+ clearTimeout(timeoutId);
1786
+ if (!response.ok) {
1787
+ const text = await response.text();
1788
+ throw new Error(`HTTP ${response.status}: ${text}`);
1789
+ }
1790
+ const data = await response.json();
1791
+ const additionalContext = mapAdditionalContext?.(data);
1792
+ const cost = mapCost?.(data) ?? 0;
1793
+ if (mapResponse) return {
1794
+ output: mapResponse(data),
1795
+ additionalContext,
1796
+ cost
1797
+ };
1798
+ return {
1799
+ output: data,
1800
+ additionalContext,
1801
+ cost
1802
+ };
1803
+ } catch (error) {
1804
+ clearTimeout(timeoutId);
1805
+ throw error;
1806
+ }
1807
+ };
1808
+ }
1809
+ /**
1810
+ * Creates an executor from a local function.
1811
+ *
1812
+ * @example
1813
+ * ```ts
1814
+ * const executor = fn({
1815
+ * fn: async (input, systemPrompt) => {
1816
+ * const result = await myLLMCall(input, systemPrompt);
1817
+ * return result;
1818
+ * },
1819
+ * });
1820
+ * ```
1821
+ *
1822
+ * @example With mapResponse to extract output from a richer response:
1823
+ * ```ts
1824
+ * const executor = fn({
1825
+ * fn: async (input, systemPrompt) => await startWorkflow({ ... }),
1826
+ * mapResponse: (result) => ({ documentType: result.documentType }),
1827
+ * mapCost: (result) => result.cost,
1828
+ * mapAdditionalContext: (result) => result.metadata,
1829
+ * });
1830
+ * ```
1831
+ */
1832
+ function fn(config) {
1833
+ return async (input, systemPrompt) => {
1834
+ const raw = await config.fn(input, systemPrompt);
1835
+ return {
1836
+ output: config.mapResponse ? config.mapResponse(raw) : raw,
1837
+ additionalContext: config.mapAdditionalContext?.(raw),
1838
+ cost: config.mapCost?.(raw) ?? 0
1839
+ };
1840
+ };
1841
+ }
1842
+ /**
1843
+ * Creates a mock executor for testing.
1844
+ * Can accept either:
1845
+ * - An array of outputs (returned in sequence, cycling if more calls than outputs)
1846
+ * - A function that maps input to output
1847
+ *
1848
+ * @example Array-based:
1849
+ * ```ts
1850
+ * const executor = mock([
1851
+ * { premium: 12500, policyType: 'claims-made' },
1852
+ * { premium: 8200, policyType: 'entity' },
1853
+ * ]);
1854
+ * ```
1855
+ *
1856
+ * @example Function-based:
1857
+ * ```ts
1858
+ * const executor = mock((input) => ({
1859
+ * id: input.id,
1860
+ * processed: true,
1861
+ * }));
1862
+ * ```
1863
+ */
1864
+ function mock(outputsOrFn) {
1865
+ if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
1866
+ return { output: outputsOrFn(input, systemPrompt) };
1867
+ };
1868
+ const outputs = outputsOrFn;
1869
+ if (outputs.length === 0) throw new Error("mock() requires at least one output");
1870
+ let callIndex = 0;
1871
+ return async () => {
1872
+ const output = outputs[callIndex % outputs.length];
1873
+ callIndex++;
1874
+ return { output };
1875
+ };
1343
1876
  }
1344
1877
 
1345
1878
  //#endregion
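A small usage note on the array form of `mock`: outputs are returned in order and wrap around once exhausted, as sketched here:

```ts
const exec = mock([{ ok: true }, { ok: false }]);
await exec(); // -> { output: { ok: true } }
await exec(); // -> { output: { ok: false } }
await exec(); // -> { output: { ok: true } }  (cycles back to the first output)
```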
@@ -1392,5 +1925,5 @@ const didactic = {
1392
1925
  var src_default = didactic;
1393
1926
 
1394
1927
  //#endregion
1395
- export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, mock, name, numeric, oneOf, optimize, presence, within };
1928
+ export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, llmCompare, mock, name, numeric, oneOf, optimize, presence, unordered, within };
1396
1929
  //# sourceMappingURL=index.mjs.map