@docshield/didactic 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +333 -228
- package/dist/index.cjs +1090 -550
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +134 -65
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +134 -65
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +1085 -552
- package/dist/index.mjs.map +1 -1
- package/package.json +20 -3
package/dist/index.mjs
CHANGED
@@ -1,15 +1,21 @@
- import * as chrono from "chrono-node";
- import { differenceInDays } from "date-fns";
- import Levenshtein from "levenshtein";
  import munkres from "munkres-js";
  import Anthropic from "@anthropic-ai/sdk";
  import OpenAI from "openai";
- import * as
+ import * as chrono from "chrono-node";
+ import { differenceInDays } from "date-fns";
+ import Levenshtein from "levenshtein";
  import * as fs from "fs";
+ import * as path from "path";
+ import chalk from "chalk";
+ import ora from "ora";
+ import cliProgress from "cli-progress";
+ import figures from "figures";
+ import * as crypto from "crypto";

  //#region src/types.ts
  /**
  * Supported LLM providers.
+ * Used by both optimizer and LLM-based comparators.
  */
  let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
  LLMProviders$1["anthropic_claude_opus"] = "anthropic_claude_opus";
@@ -21,7 +27,7 @@ let LLMProviders = /* @__PURE__ */ function(LLMProviders$1) {
  }({});

  //#endregion
- //#region src/constants.ts
+ //#region src/library/constants.ts
  const PROVIDER_SPECS = {
  [LLMProviders.anthropic_claude_opus]: {
  model: "claude-opus-4-5-20251101",
@@ -36,7 +42,7 @@ const PROVIDER_SPECS = {
  costPerMillionOutput: 15
  },
  [LLMProviders.anthropic_claude_haiku]: {
- model: "claude-haiku-4-5-
+ model: "claude-haiku-4-5-20251001",
  maxTokens: 64e3,
  costPerMillionInput: 1,
  costPerMillionOutput: 5
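For orientation, the `PROVIDER_SPECS` entries touched in this hunk follow a simple shape. The interface below is a hypothetical sketch reconstructed from the field names visible in the diff; it is not a type exported by the package.

```ts
// Hypothetical shape of a PROVIDER_SPECS entry, inferred from the diff above.
interface ProviderSpec {
  model: string;                // e.g. "claude-haiku-4-5-20251001"
  maxTokens: number;            // e.g. 64e3
  costPerMillionInput: number;  // USD per 1M input tokens
  costPerMillionOutput: number; // USD per 1M output tokens
}
```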
@@ -61,7 +67,154 @@ const DEFAULT_PER_TEST_THRESHOLD = 1;
  const NAME_SUFFIXES = /(?<=\S)\s*,?\s*(inc\.?|llc\.?|ltd\.?|l\.l\.c\.?|corp\.?|corporation|company|co\.?)$/i;

  //#endregion
- //#region src/
+ //#region src/library/llm/llm-client.ts
+ /**
+ * Call an LLM provider with the given messages.
+ * Returns raw text output - caller is responsible for parsing if structured output is needed.
+ */
+ async function callLLM(config) {
+ const { provider, apiKey, messages, useThinking = false } = config;
+ const spec = PROVIDER_SPECS[provider];
+ try {
+ if (provider.startsWith("anthropic")) {
+ const client = new Anthropic({ apiKey });
+ const streamOptions = {
+ model: spec.model,
+ max_tokens: spec.maxTokens,
+ system: messages.find((m) => m.role === "system")?.content,
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
+ role: m.role,
+ content: m.content
+ }))
+ };
+ if (useThinking) streamOptions.thinking = {
+ type: "enabled",
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
+ };
+ const finalMessage = await client.messages.stream(streamOptions).finalMessage();
+ const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
+ const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
+ const inputTokens = finalMessage.usage.input_tokens;
+ const outputTokens = finalMessage.usage.output_tokens;
+ return {
+ text,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ if (provider.startsWith("openai")) {
+ const client = new OpenAI({ apiKey });
+ const completionOptions = {
+ model: spec.model,
+ messages: messages.map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ max_completion_tokens: spec.maxTokens
+ };
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
+ const response = await client.chat.completions.create(completionOptions);
+ const text = response.choices[0].message.content ?? "";
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
+ const outputTokens = response.usage?.completion_tokens ?? 0;
+ return {
+ text,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ throw new Error(`Unsupported provider: ${provider}`);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`LLM call failed (${spec.model}): ${message}`);
+ }
+ }
+ /**
+ * Call an LLM provider with structured output.
+ * Returns parsed JSON data conforming to the provided schema.
+ */
+ async function callStructuredLLM(config) {
+ const { provider, apiKey, messages, schema, useThinking = false } = config;
+ const spec = PROVIDER_SPECS[provider];
+ try {
+ if (provider.startsWith("anthropic")) {
+ const client = new Anthropic({ apiKey });
+ const baseOptions = {
+ model: spec.model,
+ max_tokens: spec.maxTokens,
+ betas: ["structured-outputs-2025-11-13"],
+ system: messages.find((m) => m.role === "system")?.content,
+ messages: messages.filter((m) => m.role !== "system").map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ output_format: {
+ type: "json_schema",
+ schema
+ }
+ };
+ const streamOptions = useThinking ? {
+ ...baseOptions,
+ thinking: {
+ type: "enabled",
+ budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
+ }
+ } : baseOptions;
+ const finalMessage = await client.beta.messages.stream(streamOptions).finalMessage();
+ const content = finalMessage.content[0];
+ if (content.type !== "text") throw new Error("Unexpected response type from LLM");
+ const data = JSON.parse(content.text);
+ const inputTokens = finalMessage.usage.input_tokens;
+ const outputTokens = finalMessage.usage.output_tokens;
+ return {
+ data,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ if (provider.startsWith("openai")) {
+ const client = new OpenAI({ apiKey });
+ const completionOptions = {
+ model: spec.model,
+ messages: messages.map((m) => ({
+ role: m.role,
+ content: m.content
+ })),
+ max_completion_tokens: spec.maxTokens,
+ response_format: {
+ type: "json_schema",
+ json_schema: {
+ name: "response",
+ strict: true,
+ schema
+ }
+ }
+ };
+ if (useThinking) completionOptions.reasoning_effort = "xhigh";
+ const response = await client.chat.completions.create(completionOptions);
+ const text = response.choices[0].message.content ?? "";
+ const data = JSON.parse(text);
+ const inputTokens = response.usage?.prompt_tokens ?? 0;
+ const outputTokens = response.usage?.completion_tokens ?? 0;
+ return {
+ data,
+ cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
+ inputTokens,
+ outputTokens
+ };
+ }
+ throw new Error(`Unsupported provider: ${provider}`);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ throw new Error(`Structured LLM call failed (${spec.model}): ${message}`);
+ }
+ }
+
+ //#endregion
+ //#region src/eval/comparators/comparators.ts
  /** Checks if actual string contains a substring. */
  function contains(substring) {
  return (_expected, actual) => {
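The hunk above prices each call from the provider spec's per-million-token rates. The snippet below is a hypothetical sketch of that arithmetic only; `TOKENS_PER_MILLION` is not shown in this diff excerpt, so its value of 1,000,000 is an assumption.

```ts
// Sketch of the cost arithmetic used by callLLM/callStructuredLLM above.
const TOKENS_PER_MILLION = 1_000_000; // assumed value of the bundle's constant

function computeCost(
  inputTokens: number,
  outputTokens: number,
  spec: { costPerMillionInput: number; costPerMillionOutput: number },
): number {
  // Token counts are priced per million tokens, input and output separately.
  return (
    (inputTokens * spec.costPerMillionInput +
      outputTokens * spec.costPerMillionOutput) /
    TOKENS_PER_MILLION
  );
}
```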
@@ -198,6 +351,103 @@ function within(config) {
  };
  };
  }
+ /** Schema for LLM comparison response. */
+ const LLM_COMPARE_SCHEMA = {
+ type: "object",
+ properties: {
+ passed: {
+ type: "boolean",
+ description: "Whether the actual value matches the expected value"
+ },
+ rationale: {
+ type: "string",
+ description: "Brief explanation of the comparison decision"
+ }
+ },
+ required: ["passed", "rationale"],
+ additionalProperties: false
+ };
+ const DEFAULT_LLM_COMPARE_SYSTEM_PROMPT = `Compare the following two values and determine if they are semantically equivalent.
+
+ Focus on whether they convey the same core meaning or information, even if expressed differently. Consider synonyms, paraphrasing, and stylistic variations as acceptable. Only mark as failed if there are substantial differences in the actual facts or meaning being conveyed.`;
+ const buildLLMCompareUserPrompt = (expected, actual) => `Expected value:
+ ${JSON.stringify(expected, null, 2)}
+
+ Actual value:
+ ${JSON.stringify(actual, null, 2)}`;
+ /**
+ * Uses an LLM to compare expected vs actual values.
+ * Returns a comparison result with rationale and cost tracking.
+ * Default provider: anthropic_claude_haiku (fastest, cheapest).
+ */
+ function llmCompare(config) {
+ const systemPrompt = config.systemPrompt ?? DEFAULT_LLM_COMPARE_SYSTEM_PROMPT;
+ return async (expected, actual, context) => {
+ try {
+ const apiKey = config.apiKey ?? context?.llmConfig?.apiKey;
+ if (!apiKey) throw new Error("llmCompare requires an apiKey. Either pass it directly to llmCompare() or set llmConfig.apiKey in eval config.");
+ const provider = config.provider ?? context?.llmConfig?.provider ?? LLMProviders.anthropic_claude_haiku;
+ const userPrompt = buildLLMCompareUserPrompt(expected, actual);
+ const result = await callStructuredLLM({
+ provider,
+ apiKey,
+ messages: [{
+ role: "system",
+ content: systemPrompt
+ }, {
+ role: "user",
+ content: userPrompt
+ }],
+ schema: LLM_COMPARE_SCHEMA
+ });
+ return {
+ passed: result.data.passed,
+ rationale: result.data.rationale,
+ cost: result.cost,
+ similarity: result.data.passed ? 1 : 0
+ };
+ } catch (error) {
+ return {
+ passed: false,
+ rationale: `LLM comparison failed: ${error instanceof Error ? error.message : String(error)}`,
+ cost: 0,
+ similarity: 0
+ };
+ }
+ };
+ }
+ /**
+ * Marks a comparator or comparator config as unordered.
+ * When applied to an array field, items will be matched by similarity
+ * rather than index position (using Hungarian algorithm).
+ *
+ * @example
+ * // Unordered array of objects
+ * lineItems: unordered({
+ *   description: name,
+ *   price: within({ tolerance: 5 })
+ * })
+ *
+ * @example
+ * // Unordered array of primitives
+ * tags: unordered(exact)
+ *
+ * @example
+ * // When entire output is an array
+ * comparators: unordered({
+ *   carrier: exact,
+ *   premium: within({ tolerance: 0.05 })
+ * })
+ */
+ function unordered(comparator) {
+ const baseFunction = typeof comparator === "function" ? comparator : () => {
+ throw new Error("unordered() base function should not be called when nested comparators exist. This is likely a bug in the evaluation logic.");
+ };
+ return Object.assign(baseFunction, {
+ _unordered: true,
+ _nestedComparators: typeof comparator === "object" ? comparator : void 0
+ });
+ }
  /**
  * Deep equality comparison with cycle detection.
  * Uses WeakSet to track visited object pairs to prevent stack overflow on circular references.
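Building on the comparator helpers added above, here is a hypothetical usage sketch combining `unordered` and `llmCompare` with the existing `exact` and `within` comparators. It follows the JSDoc examples in the hunk; the import path and export names are assumptions, not confirmed by this diff.

```ts
// Hypothetical comparator config; assumes these helpers are exported as named exports.
import { exact, within, unordered, llmCompare, LLMProviders } from "@docshield/didactic";

const comparators = {
  carrier: exact,
  premium: within({ tolerance: 0.05 }),
  // Array field matched by similarity (Hungarian algorithm) instead of index position:
  lineItems: unordered({
    description: llmCompare({ provider: LLMProviders.anthropic_claude_haiku }),
    price: within({ tolerance: 5 }),
  }),
};
```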
@@ -235,198 +485,74 @@ function normalizeNumeric(value) {
  if (value == null || value === "") return null;
  const str = String(value);
  const isNegativeParens = /^\(.*\)$/.test(str.trim());
- let cleaned = str.replace(/[^0-9
+ let cleaned = str.replace(/[^0-9.-]/g, "");
  if (isNegativeParens && !cleaned.startsWith("-")) cleaned = "-" + cleaned;
  const num = parseFloat(cleaned);
  return isNaN(num) ? null : num;
  }

  //#endregion
- //#region src/
+ //#region src/eval/comparators/matching.ts
+ function isObject$1(value) {
+ return value !== null && typeof value === "object" && !Array.isArray(value);
+ }
  /**
- *
- *
- *
- *
- * const executor = endpoint('https://api.example.com/workflow', {
- * headers: { Authorization: 'Bearer token' },
- * });
- * ```
+ * Calculate similarity score between two values (0.0 to 1.0).
+ * For arrays: recursively match and average similarity of paired elements.
+ * For objects: average similarity across all fields using comparator results.
+ * For primitives: uses exact comparison's similarity score.
  */
- function
-
-
-
-
-
-
-
-
-
-
- const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- const cost = mapCost?.(data) ?? 0;
- if (mapResponse) return {
- output: mapResponse(data),
- additionalContext,
- cost
- };
- return {
- output: data,
- additionalContext,
- cost
- };
- } catch (error) {
- clearTimeout(timeoutId);
- throw error;
- }
- };
+ async function getSimilarity(expected, actual, comparators) {
+ if (Array.isArray(expected) && Array.isArray(actual)) {
+ if (expected.length === 0 && actual.length === 0) return 1;
+ if (expected.length === 0 || actual.length === 0) return 0;
+ const result = await matchArrays(expected, actual, comparators);
+ let total$1 = 0;
+ for (const [expIdx, actIdx] of result.assignments) total$1 += await getSimilarity(expected[expIdx], actual[actIdx], comparators);
+ const maxLen = Math.max(expected.length, actual.length);
+ return total$1 / maxLen;
+ }
+ if (!isObject$1(expected) || !isObject$1(actual)) {
+ const result = exact(expected, actual);
+ return result.similarity ?? (result.passed ? 1 : 0);
+ }
+ const fields = Object.keys(expected).filter((key) => {
+ const comp = comparators[key];
+ return comp !== void 0 && typeof comp === "function";
+ });
+ if (fields.length === 0) return 1;
+ let total = 0;
+ for (const key of fields) {
+ const comparatorConfig = comparators[key];
+ const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected[key], actual[key], {
+ expectedParent: expected,
+ actualParent: actual
+ });
+ total += result.similarity ?? (result.passed ? 1 : 0);
+ }
+ return total / fields.length;
  }
  /**
- *
- *
- * @example
- * ```ts
- * const executor = fn({
- * fn: async (input, systemPrompt) => {
- * const result = await myLLMCall(input, systemPrompt);
- * return result;
- * },
- * });
- * ```
+ * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
+ * Pure matching - no pass/fail determination.
  *
- * @
- *
- *
- *
- * mapResponse: (result) => ({ documentType: result.documentType }),
- * mapCost: (result) => result.cost,
- * mapAdditionalContext: (result) => result.metadata,
- * });
- * ```
+ * @param expected - Array of expected items
+ * @param actual - Array of actual items
+ * @param comparators - Nested comparator configuration for array items
+ * @returns Matching result with assignments and unmatched indices
  */
- function
-
-
-
-
- additionalContext: config.mapAdditionalContext?.(raw),
- cost: config.mapCost?.(raw) ?? 0
- };
- };
- }
- /**
- * Creates a mock executor for testing.
- * Can accept either:
- * - An array of outputs (returned in sequence, cycling if more calls than outputs)
- * - A function that maps input to output
- *
- * @example Array-based:
- * ```ts
- * const executor = mock([
- * { premium: 12500, policyType: 'claims-made' },
- * { premium: 8200, policyType: 'entity' },
- * ]);
- * ```
- *
- * @example Function-based:
- * ```ts
- * const executor = mock((input) => ({
- * id: input.id,
- * processed: true,
- * }));
- * ```
- */
- function mock(outputsOrFn) {
- if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
- return { output: outputsOrFn(input, systemPrompt) };
- };
- const outputs = outputsOrFn;
- if (outputs.length === 0) throw new Error("mock() requires at least one output");
- let callIndex = 0;
- return async () => {
- const output = outputs[callIndex % outputs.length];
- callIndex++;
- return { output };
- };
- }
-
- //#endregion
- //#region src/matching.ts
- function isObject$1(value) {
- return value !== null && typeof value === "object" && !Array.isArray(value);
- }
- /**
- * Calculate similarity score between two values (0.0 to 1.0).
- * For arrays: recursively match and average similarity of paired elements.
- * For objects: average similarity across all fields using comparator results.
- * For primitives: uses exact comparison's similarity score.
- */
- function getSimilarity(expected, actual, comparators) {
- if (Array.isArray(expected) && Array.isArray(actual)) {
- if (expected.length === 0 && actual.length === 0) return 1;
- if (expected.length === 0 || actual.length === 0) return 0;
- const result = matchArrays(expected, actual, comparators);
- let total$1 = 0;
- for (const [expIdx, actIdx] of result.assignments) total$1 += getSimilarity(expected[expIdx], actual[actIdx], comparators);
- const maxLen = Math.max(expected.length, actual.length);
- return total$1 / maxLen;
- }
- if (!isObject$1(expected) || !isObject$1(actual)) {
- const result = exact(expected, actual);
- return result.similarity ?? (result.passed ? 1 : 0);
- }
- const fields = Object.keys(expected).filter((key) => comparators[key]);
- if (fields.length === 0) return 1;
- let total = 0;
- for (const key of fields) {
- const comparator = comparators[key];
- const result = comparator(expected[key], actual[key], {
- expectedParent: expected,
- actualParent: actual
- });
- total += result.similarity ?? (result.passed ? 1 : 0);
- }
- return total / fields.length;
- }
- /**
- * Find optimal pairing between expected and actual arrays using Hungarian algorithm.
- * Pure matching - no pass/fail determination.
- *
- * @param expected - Array of expected items
- * @param actual - Array of actual items
- * @param comparators - Map of field names to comparator functions
- * @returns Matching result with assignments and unmatched indices
- */
- function matchArrays(expected, actual, comparators = {}) {
- if (expected.length === 0) return {
- assignments: [],
- unmatchedExpected: [],
- unmatchedActual: [...Array(actual.length).keys()]
+ async function matchArrays(expected, actual, comparators = {}) {
+ if (expected.length === 0) return {
+ assignments: [],
+ unmatchedExpected: [],
+ unmatchedActual: [...Array(actual.length).keys()]
  };
  if (actual.length === 0) return {
  assignments: [],
  unmatchedExpected: [...Array(expected.length).keys()],
  unmatchedActual: []
  };
- const rawAssignments = munkres(expected.map((exp) => actual.map((act) => 1 - getSimilarity(exp, act, comparators))));
+ const rawAssignments = munkres(await Promise.all(expected.map(async (exp) => Promise.all(actual.map(async (act) => 1 - await getSimilarity(exp, act, comparators))))));
  const assignments = [];
  const matchedExp = /* @__PURE__ */ new Set();
  const matchedAct = /* @__PURE__ */ new Set();
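The `matchArrays` change above keeps the same matching idea while making similarity scoring async: every expected/actual pair gets a similarity in [0, 1], the cost matrix is `1 - similarity`, and munkres-js returns the minimum-cost assignment. A minimal sketch of that step, with made-up similarity values for illustration:

```ts
// Sketch of the Hungarian-matching step performed by matchArrays above.
import munkres from "munkres-js";

const similarity = [
  [0.9, 0.1],
  [0.2, 0.8],
];
// Convert similarities to costs and find the minimum-cost pairing.
const costMatrix = similarity.map((row) => row.map((s) => 1 - s));
const assignments = munkres(costMatrix); // [[0, 0], [1, 1]]
```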
@@ -443,212 +569,126 @@ function matchArrays(expected, actual, comparators = {}) {
  }

  //#endregion
- //#region src/
+ //#region src/optimizer/ui.ts
  /**
- *
+ * UI utilities for beautiful console output
  */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- else fields = compareFields({
- expected,
- actual: result.output,
- comparators,
- unorderedList: config.unorderedList
- });
- const passedFields = Object.values(fields).filter((f) => f.passed).length;
- const totalFields$1 = Object.values(fields).length;
- const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
- const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
- return {
- input,
- expected,
- actual: result.output,
- additionalContext: result.additionalContext,
- cost: result.cost ?? 0,
- passed: passed$1,
- fields,
- passedFields,
- totalFields: totalFields$1,
- passRate
- };
- } catch (error) {
- return {
- input,
- expected,
- actual: void 0,
- cost: 0,
- passed: false,
- fields: {},
- passedFields: 0,
- totalFields: 0,
- passRate: 0,
- error: error instanceof Error ? error.message : String(error)
- };
+ const theme = {
+ success: chalk.green,
+ error: chalk.red,
+ warning: chalk.yellow,
+ bold: chalk.bold,
+ dim: chalk.dim,
+ check: chalk.green(figures.tick),
+ cross: chalk.red(figures.cross),
+ warn: chalk.yellow(figures.warning),
+ bullet: chalk.dim(figures.bullet),
+ pointer: chalk.yellow(figures.pointer),
+ separator: chalk.dim(" · "),
+ divider: (label, width = 60) => {
+ const prefix = `━━━ ${label} `;
+ const remaining = Math.max(0, width - prefix.length);
+ return chalk.cyan.dim(prefix + "━".repeat(remaining));
+ }
+ };
+ let activeSpinner = null;
+ const spinner = {
+ start(text) {
+ if (activeSpinner) activeSpinner.stop();
+ activeSpinner = ora({
+ text,
+ spinner: "dots",
+ indent: 4
+ }).start();
+ return activeSpinner;
+ },
+ succeed(text) {
+ if (activeSpinner) {
+ activeSpinner.succeed(text);
+ activeSpinner = null;
  }
- }
-
-
-
-
- for (let i = 0; i < testCases.length; i += rateLimitBatch) {
- const batch = testCases.slice(i, i + rateLimitBatch);
- const batchResults = await Promise.all(batch.map(executeTestCase));
- results.push(...batchResults);
- const rateLimitPause = config.rateLimitPause;
- if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
+ },
+ fail(text) {
+ if (activeSpinner) {
+ activeSpinner.fail(text);
+ activeSpinner = null;
  }
- }
-
- if (
-
-
-
-
-
-
-
-
-
- totalFields += fieldResults.length;
- correctFields += fieldResults.filter((f) => f.passed).length;
+ },
+ stop() {
+ if (activeSpinner) {
+ activeSpinner.stop();
+ activeSpinner = null;
+ }
+ },
+ clear() {
+ if (activeSpinner) activeSpinner.clear();
+ },
+ isActive() {
+ return activeSpinner !== null;
  }
-
-
+ };
+ function createProgressTracker(label) {
+ let bar = null;
+ let startTime = 0;
+ let lastUpdate = 0;
+ const MIN_UPDATE_INTERVAL = 100;
  return {
-
-
-
-
-
-
-
-
-
+ start(total) {
+ spinner.stop();
+ startTime = Date.now();
+ bar = new cliProgress.SingleBar({
+ format: ` {bar} {percentage}% {value}/{total} ${label} {duration_formatted}`,
+ barCompleteChar: "█",
+ barIncompleteChar: "░",
+ barsize: 20,
+ hideCursor: true,
+ clearOnComplete: false,
+ stopOnComplete: false,
+ forceRedraw: true,
+ fps: 10
+ });
+ bar.start(total, 0, { duration_formatted: "0s" });
+ },
+ update(current) {
+ const now = Date.now();
+ if (now - lastUpdate < MIN_UPDATE_INTERVAL && bar) {
+ if (current < bar.getTotal()) return;
+ }
+ lastUpdate = now;
+ if (bar) {
+ const elapsed = Math.round((now - startTime) / 1e3);
+ bar.update(current, { duration_formatted: `${elapsed}s` });
+ }
+ },
+ stop() {
+ if (bar) {
+ const elapsed = Math.round((Date.now() - startTime) / 1e3);
+ bar.update(bar.getTotal(), { duration_formatted: `${elapsed}s` });
+ bar.stop();
+ bar = null;
+ }
+ }
  };
  }
-
-
- * Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
- */
- function compareFields(opts) {
- const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, unorderedList = false } = opts;
- const results = {};
- const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
- if (Array.isArray(expected)) {
- if (!Array.isArray(actual)) return { [path$1]: {
- passed: false,
- expected,
- actual
- } };
- if (expected.length === 0) return {};
- let matchedPairs;
- if (unorderedList) matchedPairs = matchArrays(expected, actual, comparators).assignments;
- else {
- matchedPairs = [];
- for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
- }
- const matchedIndices = new Set(matchedPairs.map(([i]) => i));
- for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, compareFields({
- expected: expected[expIdx],
- actual: actual[actIdx],
- comparators,
- path: indexPath(expIdx),
- expectedParent,
- actualParent,
- unorderedList
- }));
- const arrayFieldName = getFieldName(path$1);
- const hasArrayComparator = arrayFieldName in comparators || arrayFieldName === "";
- for (let i = 0; i < expected.length; i++) {
- if (matchedIndices.has(i)) continue;
- const item = expected[i];
- if (isObject(item)) {
- for (const [field, value] of Object.entries(item)) if (field in comparators) results[`${indexPath(i)}.${field}`] = {
- passed: false,
- expected: value,
- actual: void 0
- };
- } else if (hasArrayComparator) results[indexPath(i)] = {
- passed: false,
- expected: item,
- actual: void 0
- };
- }
- return results;
- }
- if (isObject(expected)) {
- if (!isObject(actual)) return { [path$1]: {
- passed: false,
- expected,
- actual
- } };
- for (const [field, expValue] of Object.entries(expected)) {
- const fieldPath = path$1 ? `${path$1}.${field}` : field;
- Object.assign(results, compareFields({
- expected: expValue,
- actual: actual[field],
- comparators,
- path: fieldPath,
- expectedParent: expected,
- actualParent: actual,
- unorderedList
- }));
- }
- return results;
- }
- const fieldName = getFieldName(path$1);
- const comparator = comparators[fieldName] ?? (fieldName === "" ? exact : void 0);
- if (!comparator) return {};
- const result = comparator(expected, actual, {
- expectedParent,
- actualParent
- });
- return { [path$1]: {
- ...result,
- expected,
- actual
- } };
+ function formatCost(cost) {
+ return theme.dim(`$${cost.toFixed(4)}`);
  }
- function
- return
+ function formatCostShort(cost) {
+ return theme.dim(`$${cost.toFixed(2)}`);
  }
- function
-
+ function formatDuration(ms) {
+ const totalSeconds = Math.round(ms / 1e3);
+ if (totalSeconds < 60) return `${totalSeconds}s`;
+ const minutes = Math.floor(totalSeconds / 60);
+ const seconds = totalSeconds % 60;
+ return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
+ }
+ function formatPercentage(rate) {
+ return `${(rate * 100).toFixed(1)}%`;
  }

  //#endregion
- //#region src/optimizer-logging.ts
+ //#region src/optimizer/optimizer-logging.ts
  function formatMsCompact(ms) {
  const totalSeconds = Math.round(ms / 1e3);
  if (totalSeconds < 60) return `${totalSeconds}s`;
@@ -666,12 +706,75 @@ function formatTokensCompact(tokens) {
  if (tokens >= 1e3) return `${Math.round(tokens / 1e3)}K`;
  return String(tokens);
  }
+ /**
+ * Clear any active progress line before logging
+ * Call this before all console.log statements
+ */
+ function clearProgressLine() {
+ const width = process.stdout.columns || 80;
+ process.stdout.write("\r" + " ".repeat(width) + "\r");
+ }
+ /**
+ * Create a progress updater using cli-progress for beautiful output
+ */
+ function createProgressUpdater(label) {
+ let tracker = null;
+ let total = 0;
+ return {
+ update(completed, newTotal) {
+ if (!tracker) {
+ total = newTotal;
+ tracker = createProgressTracker(label);
+ tracker.start(total);
+ }
+ tracker.update(completed);
+ },
+ finish() {
+ if (tracker) {
+ tracker.stop();
+ tracker = null;
+ }
+ },
+ clear() {
+ clearProgressLine();
+ }
+ };
+ }
+ /**
+ * Track progress of Promise.allSettled with real-time updates
+ *
+ * @param promises Array of promises to track
+ * @param onProgress Callback called when each promise settles
+ * @returns Promise.allSettled result
+ */
+ async function trackPromiseProgress(promises, onProgress) {
+ if (promises.length === 0) return [];
+ let completed = 0;
+ const total = promises.length;
+ onProgress(0, total);
+ const wrappedPromises = promises.map((promise) => promise.then((value) => {
+ completed++;
+ onProgress(completed, total);
+ return {
+ status: "fulfilled",
+ value
+ };
+ }).catch((reason) => {
+ completed++;
+ onProgress(completed, total);
+ return {
+ status: "rejected",
+ reason
+ };
+ }));
+ return Promise.all(wrappedPromises);
+ }
  function formatFailure(testCase) {
  const lines = [];
  lines.push(`Input: ${JSON.stringify(testCase.input, null, 2)}`);
  lines.push(`Expected: ${JSON.stringify(testCase.expected, null, 2)}`);
  lines.push(`Actual: ${JSON.stringify(testCase.actual, null, 2)}`);
- if (testCase.additionalContext) lines.push(`Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
+ if (testCase.additionalContext) lines.push(`Additional Context: ${JSON.stringify(testCase.additionalContext, null, 2)}`);
  lines.push("");
  lines.push("Field-level failures:");
  for (const [fieldPath, result] of Object.entries(testCase.fields)) if (!result.passed) lines.push(`  ${fieldPath || "(root)"}: expected ${JSON.stringify(result.expected)}, got ${JSON.stringify(result.actual)}`);
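For context, the `trackPromiseProgress` helper added above wraps each promise so a progress callback fires as results settle, then resolves with allSettled-style records. A hypothetical usage sketch:

```ts
// Sketch only; assumes trackPromiseProgress is in scope as defined in the hunk above.
const tasks: Promise<number>[] = [Promise.resolve(1), Promise.reject(new Error("boom"))];

const settled = await trackPromiseProgress(tasks, (done, total) => {
  console.log(`${done}/${total} settled`); // fires once per settled promise
});
// settled: [{ status: "fulfilled", value: 1 }, { status: "rejected", reason: Error("boom") }]
```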
@@ -695,56 +798,98 @@ function computeTotals(iterations) {
  totalDuration
  };
  }
- function
-
-
-
+ function logOptimizerHeader(model, targetRate, testCount) {
+ spinner.stop();
+ console.log("");
+ console.log(theme.bold("Didactic Optimizer"));
+ console.log(` ${theme.dim("Model:")} ${model}${theme.separator}${theme.dim("Target:")} ${formatPercentage(targetRate)}${theme.separator}${theme.dim("Tests:")} ${testCount}`);
  }
  function logIterationStart(iterationLabel) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(theme.divider(`Iteration ${iterationLabel}`));
+ console.log("");
  }
  function logEvaluationStart() {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.bold("Evaluating prompt")}`);
+ spinner.start("Running evals...");
  }
  function logEvaluationResult(result, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ const successIcon = result.successRate >= .9 ? theme.check : result.successRate >= .5 ? theme.warn : theme.cross;
+ console.log(` ${successIcon} ${theme.bold(formatPercentage(result.successRate))} success rate ${theme.dim(`(${result.passed}/${result.total} passed)`)}`);
+ console.log(` ${theme.dim("Cost:")} ${formatCost(result.cost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logRegressionDetected(bestSuccessRate) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.pointer} ${theme.warning("Regression")} ${theme.dim(`(was ${formatPercentage(bestSuccessRate)})`)}`);
  }
  function logTargetReached(targetSuccessRate) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} ${theme.success("Target reached!")} ${theme.dim(`(${formatPercentage(targetSuccessRate)})`)}`);
  }
  function logTargetFailures(targetSuccessRate, failureCount) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.cross} ${theme.error(`${failureCount} failures`)} to address ${theme.dim(`(target: ${formatPercentage(targetSuccessRate)})`)}`);
  }
  function logCostLimitReached(cumulativeCost) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.warn} ${theme.warning("Cost limit reached")} ${theme.dim(`($${cumulativeCost.toFixed(2)})`)}`);
  }
  function logPatchGenerationStart(failureCount) {
-
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(` ${theme.bold("Generating patches")}`);
+ spinner.start(`Generating ${failureCount} patches in parallel...`);
  }
  function logPatchGenerationResult(patchCost, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} Patches generated${theme.separator}${theme.dim("Cost:")} ${formatCost(patchCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logMergeStart() {
-
-
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(` ${theme.bold("Merging patches")}`);
+ spinner.start("Merging patches...");
  }
  function logMergeResult(mergeCost, cumulativeCost, durationMs) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.check} Merged${theme.separator}${theme.dim("Cost:")} ${formatCost(mergeCost)}${theme.separator}${theme.dim("Total:")} ${formatCostShort(cumulativeCost)}${theme.separator}${theme.dim(formatDuration(durationMs))}`);
  }
  function logPatchGenerationFailures(failedCount, totalCount) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.warn} ${theme.warning(`${failedCount}/${totalCount} patch generations failed`)}`);
  }
  function logOptimizationComplete(bestSuccessRate, targetSuccessRate, cumulativeCost) {
-
-
- console.log(
+ spinner.stop();
+ clearProgressLine();
+ console.log("");
+ console.log(theme.divider("Complete"));
+ console.log("");
+ const targetMet = bestSuccessRate >= targetSuccessRate;
+ const icon = targetMet ? theme.check : theme.cross;
+ const rateColor = targetMet ? theme.success : theme.error;
+ console.log(` ${icon} ${theme.bold("Best:")} ${rateColor(formatPercentage(bestSuccessRate))}`);
+ console.log(` ${theme.dim("Target:")} ${formatPercentage(targetSuccessRate)}${theme.separator}${theme.dim("Total Cost:")} ${formatCostShort(cumulativeCost)}`);
  }
  function logLogsWritten(logPath) {
-
+ spinner.stop();
+ clearProgressLine();
+ console.log(` ${theme.dim("Logs written to:")} ${logPath}`);
+ console.log("");
  }
  function generateConfigSection(ctx, testCaseCount) {
  const lines = [];
@@ -911,6 +1056,7 @@ function writeRawDataJson(folderPath, iterations, ctx, success) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  fields: tc.fields
  });
  });
@@ -984,6 +1130,7 @@ function writeBestRunJson(folderPath, iterations, ctx) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  failedFields: extractFailedFields(tc.fields)
  });
  else if (tc.passRate < 1) partialFailures.push({
@@ -992,13 +1139,15 @@ function writeBestRunJson(folderPath, iterations, ctx) {
  input: tc.input,
  expected: tc.expected,
  actual: tc.actual,
+ additionalContext: tc.additionalContext,
  failedFields: extractFailedFields(tc.fields)
  });
  else successes.push({
  testIndex: testIdx,
  input: tc.input,
  expected: tc.expected,
- actual: tc.actual
+ actual: tc.actual,
+ additionalContext: tc.additionalContext
  });
  });
  const report = {
@@ -1035,29 +1184,402 @@ function writeBestRunJson(folderPath, iterations, ctx) {
|
|
|
1035
1184
|
};
|
|
1036
1185
|
fs.writeFileSync(bestRunPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1037
1186
|
}
|
|
1038
|
-
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1039
|
-
const folderPath = path.dirname(logPath);
|
|
1040
|
-
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1041
|
-
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1042
|
-
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1043
|
-
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1044
|
-
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1045
|
-
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1187
|
+
function writeFinalLogs(logPath, iterationLogs, logContext, success) {
|
|
1188
|
+
const folderPath = path.dirname(logPath);
|
|
1189
|
+
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath, { recursive: true });
|
|
1190
|
+
const content = generateLogContent(iterationLogs, logContext, success);
|
|
1191
|
+
fs.writeFileSync(path.join(folderPath, "summary.md"), content, "utf-8");
|
|
1192
|
+
writePromptsFile(folderPath, iterationLogs, logContext);
|
|
1193
|
+
writeRawDataJson(folderPath, iterationLogs, logContext, success);
|
|
1194
|
+
writeBestRunJson(folderPath, iterationLogs, logContext);
|
|
1195
|
+
}
|
|
1196
|
+
|
|
1197
|
+
//#endregion
|
|
1198
|
+
//#region src/eval/eval-logging.ts
|
|
1199
|
+
/**
|
|
1200
|
+
* Write evaluation results to rawData.json
|
|
1201
|
+
*
|
|
1202
|
+
* Synchronous writes are intentional - logging runs after evaluation completes
|
|
1203
|
+
* and errors are caught. This avoids async complexity in the calling code.
|
|
1204
|
+
*/
|
|
1205
|
+
function writeEvalLogs(logPath, result, durationMs, perTestThreshold) {
|
|
1206
|
+
try {
|
|
1207
|
+
const dir = path.dirname(logPath);
|
|
1208
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
1209
|
+
const report = {
|
|
1210
|
+
metadata: {
|
|
1211
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1212
|
+
systemPrompt: result.systemPrompt,
|
|
1213
|
+
testCaseCount: result.total,
|
|
1214
|
+
perTestThreshold: perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD
|
|
1215
|
+
},
|
|
1216
|
+
summary: {
|
|
1217
|
+
passed: result.passed,
|
|
1218
|
+
total: result.total,
|
|
1219
|
+
successRate: result.successRate,
|
|
1220
|
+
correctFields: result.correctFields,
|
|
1221
|
+
totalFields: result.totalFields,
|
|
1222
|
+
accuracy: result.accuracy,
|
|
1223
|
+
executorCost: result.cost,
|
|
1224
|
+
comparatorCost: result.comparatorCost,
|
|
1225
|
+
totalCost: result.cost + result.comparatorCost,
|
|
1226
|
+
durationMs
|
|
1227
|
+
},
|
|
1228
|
+
testCases: result.testCases.map((tc, index) => ({
|
|
1229
|
+
index,
|
|
1230
|
+
passed: tc.passed,
|
|
1231
|
+
passRate: tc.passRate,
|
|
1232
|
+
input: tc.input,
|
|
1233
|
+
expected: tc.expected,
|
|
1234
|
+
actual: tc.actual,
|
|
1235
|
+
additionalContext: tc.additionalContext,
|
|
1236
|
+
executorCost: tc.cost ?? 0,
|
|
1237
|
+
comparatorCost: tc.comparatorCost ?? 0,
|
|
1238
|
+
error: tc.error,
|
|
1239
|
+
fields: tc.fields
|
|
1240
|
+
}))
|
|
1241
|
+
};
|
|
1242
|
+
fs.writeFileSync(logPath, JSON.stringify(report, null, 2), "utf-8");
|
|
1243
|
+
} catch (error) {
|
|
1244
|
+
console.error(`Failed to write eval logs to ${logPath}:`, error instanceof Error ? error.message : String(error));
|
|
1245
|
+
}
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
//#endregion
|
|
1249
|
+
//#region src/eval/eval.ts
|
|
1250
|
+
/**
|
|
1251
|
+
* Run all test cases and return results.
|
|
1252
|
+
*/
|
|
1253
|
+
async function evaluate(config) {
|
|
1254
|
+
const { testCases, systemPrompt, executor, comparators, comparatorOverride } = config;
|
|
1255
|
+
if (testCases.length === 0) throw new Error("testCases array cannot be empty");
|
|
1256
|
+
if (!executor) throw new Error("executor is required");
|
|
1257
|
+
const startTime = Date.now();
|
|
1258
|
+
const logPath = config.storeLogs ? typeof config.storeLogs === "string" ? config.storeLogs : `./didactic-logs/eval_${Date.now()}_${crypto.randomUUID().slice(0, 8)}/rawData.json` : void 0;
|
|
1259
|
+
const executeTestCase = async ({ input, expected }) => {
|
|
1260
|
+
try {
|
|
1261
|
+
const result = await executor(input, systemPrompt);
|
|
1262
|
+
let fields;
|
|
1263
|
+
if (comparatorOverride) {
|
|
1264
|
+
const compResult = await comparatorOverride(expected, result.output);
|
|
1265
|
+
fields = { "": {
|
|
1266
|
+
passed: compResult.passed,
|
|
1267
|
+
expected,
|
|
1268
|
+
actual: result.output
|
|
1269
|
+
} };
|
|
1270
|
+
} else {
|
|
1271
|
+
let comparatorConfig;
|
|
1272
|
+
if (!comparators) comparatorConfig = { "": exact };
|
|
1273
|
+
else if (typeof comparators === "function") comparatorConfig = { "": comparators };
|
|
1274
|
+
else comparatorConfig = comparators;
|
|
1275
|
+
fields = await compareFields({
|
|
1276
|
+
expected,
|
|
1277
|
+
actual: result.output,
|
|
1278
|
+
comparators: comparatorConfig,
|
|
1279
|
+
llmConfig: config.llmConfig
|
|
1280
|
+
});
|
|
1281
|
+
}
|
|
1282
|
+
const passedFields = Object.values(fields).filter((f) => f.passed).length;
|
|
1283
|
+
const totalFields$1 = Object.values(fields).length;
|
|
1284
|
+
const passRate = totalFields$1 === 0 ? 1 : passedFields / totalFields$1;
|
|
1285
|
+
const passed$1 = passRate >= (config.perTestThreshold ?? DEFAULT_PER_TEST_THRESHOLD);
|
|
1286
|
+
const comparatorCost$1 = Object.values(fields).reduce((sum, field) => sum + (field.cost ?? 0), 0);
|
|
1287
|
+
return {
|
|
1288
|
+
input,
|
|
1289
|
+
expected,
|
|
1290
|
+
actual: result.output,
|
|
1291
|
+
additionalContext: result.additionalContext,
|
|
1292
|
+
cost: result.cost ?? 0,
|
|
1293
|
+
comparatorCost: comparatorCost$1,
|
|
1294
|
+
passed: passed$1,
|
|
1295
|
+
fields,
|
|
1296
|
+
passedFields,
|
|
1297
|
+
totalFields: totalFields$1,
|
|
1298
|
+
passRate
|
|
1299
|
+
};
|
|
1300
|
+
} catch (error) {
|
|
1301
|
+
return {
|
|
1302
|
+
input,
|
|
1303
|
+
expected,
|
|
1304
|
+
actual: void 0,
|
|
1305
|
+
cost: 0,
|
|
1306
|
+
comparatorCost: 0,
|
|
1307
|
+
passed: false,
|
|
1308
|
+
fields: {},
|
|
1309
|
+
passedFields: 0,
|
|
1310
|
+
totalFields: 0,
|
|
1311
|
+
passRate: 0,
|
|
1312
|
+
error: error instanceof Error ? error.message : String(error)
|
|
1313
|
+
};
|
|
1314
|
+
}
|
|
1315
|
+
};
|
|
1316
|
+
const rateLimitBatch = config.rateLimitBatch;
|
|
1317
|
+
let results;
|
|
1318
|
+
if (rateLimitBatch && rateLimitBatch > 0) {
|
|
1319
|
+
results = [];
|
|
1320
|
+
const progress = createProgressUpdater("evals");
|
|
1321
|
+
for (let i = 0; i < testCases.length; i += rateLimitBatch) {
|
|
1322
|
+
const batch = testCases.slice(i, i + rateLimitBatch);
|
|
1323
|
+
const batchResults = await Promise.all(batch.map(executeTestCase));
|
|
1324
|
+
results.push(...batchResults);
|
|
1325
|
+
progress.update(results.length, testCases.length);
|
|
1326
|
+
const rateLimitPause = config.rateLimitPause;
|
|
1327
|
+
if (rateLimitPause && rateLimitPause > 0 && i + rateLimitBatch < testCases.length) await new Promise((r) => setTimeout(r, rateLimitPause * 1e3));
|
|
1328
|
+
}
|
|
1329
|
+
progress.finish();
|
|
1330
|
+
} else {
|
|
1331
|
+
const progress = createProgressUpdater("evals");
|
|
1332
|
+
results = (await trackPromiseProgress(testCases.map((tc) => executeTestCase(tc)), (completed, total$1) => progress.update(completed, total$1))).map((r) => r.value);
|
|
1333
|
+
progress.finish();
|
|
1334
|
+
}
|
|
1335
|
+
results.sort((a, b) => {
|
|
1336
|
+
if (a.passed !== b.passed) return a.passed ? 1 : -1;
|
|
1337
|
+
return a.passRate - b.passRate;
|
|
1338
|
+
});
|
|
1339
|
+
const passed = results.filter((r) => r.passed).length;
|
|
1340
|
+
const total = results.length;
|
|
1341
|
+
const successRate = total > 0 ? passed / total : 0;
|
|
1342
|
+
let correctFields = 0;
|
|
1343
|
+
let totalFields = 0;
|
|
1344
|
+
for (const r of results) {
|
|
1345
|
+
const fieldResults = Object.values(r.fields);
|
|
1346
|
+
totalFields += fieldResults.length;
|
|
1347
|
+
correctFields += fieldResults.filter((f) => f.passed).length;
|
|
1348
|
+
}
|
|
1349
|
+
const accuracy = totalFields > 0 ? correctFields / totalFields : 0;
|
|
1350
|
+
const cost = results.reduce((sum, r) => sum + (r.cost ?? 0), 0);
|
|
1351
|
+
const comparatorCost = results.reduce((sum, r) => sum + (r.comparatorCost ?? 0), 0);
|
|
1352
|
+
const durationMs = Date.now() - startTime;
|
|
1353
|
+
const logFolder = logPath ? path.dirname(logPath) : void 0;
|
|
1354
|
+
const evalResult = {
|
|
1355
|
+
systemPrompt,
|
|
1356
|
+
testCases: results,
|
|
1357
|
+
passed,
|
|
1358
|
+
total,
|
|
1359
|
+
successRate,
|
|
1360
|
+
correctFields,
|
|
1361
|
+
totalFields,
|
|
1362
|
+
accuracy,
|
|
1363
|
+
cost,
|
|
1364
|
+
comparatorCost,
|
|
1365
|
+
...logFolder && { logFolder }
|
|
1366
|
+
};
|
|
1367
|
+
if (logPath) writeEvalLogs(logPath, evalResult, durationMs, config.perTestThreshold);
|
|
1368
|
+
return evalResult;
|
|
1369
|
+
}
|
|
1370
|
+
+/**
+* Recursively compare expected vs actual, returning field-level results.
+* Path patterns: 'carrier', 'quote.premium', '[0]', 'quotes[0].carrier'
+*/
+async function compareFields(opts) {
+const { expected, actual, comparators, path: path$1 = "", expectedParent, actualParent, llmConfig } = opts;
+const results = {};
+const indexPath = (i) => path$1 ? `${path$1}[${i}]` : `[${i}]`;
+if (Array.isArray(expected)) {
+if (!Array.isArray(actual)) return { [path$1]: {
+passed: false,
+expected,
+actual
+} };
+if (expected.length === 0) return {};
+const fieldComparator = comparators[getFieldName(path$1)];
+const isUnordered = fieldComparator && typeof fieldComparator === "function" && "_unordered" in fieldComparator && fieldComparator._unordered === true;
+let itemComparators;
+if (isUnordered) itemComparators = fieldComparator._nestedComparators || comparators;
+else if (fieldComparator && typeof fieldComparator === "object" && !("_unordered" in fieldComparator)) itemComparators = fieldComparator;
+else itemComparators = comparators;
+let matchedPairs;
+if (isUnordered) matchedPairs = (await matchArrays(expected, actual, itemComparators)).assignments;
+else {
+matchedPairs = [];
+for (let i = 0; i < expected.length && i < actual.length; i++) matchedPairs.push([i, i]);
+}
+const matchedIndices = new Set(matchedPairs.map(([i]) => i));
+for (const [expIdx, actIdx] of matchedPairs) Object.assign(results, await compareFields({
+expected: expected[expIdx],
+actual: actual[actIdx],
+comparators: itemComparators,
+path: indexPath(expIdx),
+expectedParent,
+actualParent,
+llmConfig
+}));
+const hasArrayComparator = fieldComparator !== void 0;
+for (let i = 0; i < expected.length; i++) {
+if (matchedIndices.has(i)) continue;
+const item = expected[i];
+if (isObject(item)) {
+for (const [field, value] of Object.entries(item)) if (field in itemComparators) results[`${indexPath(i)}.${field}`] = {
+passed: false,
+expected: value,
+actual: void 0
+};
+} else if (hasArrayComparator) results[indexPath(i)] = {
+passed: false,
+expected: item,
+actual: void 0
+};
+}
+return results;
+}
+if (isObject(expected)) {
+if (!isObject(actual)) return { [path$1]: {
+passed: false,
+expected,
+actual
+} };
+for (const [field, expValue] of Object.entries(expected)) {
+const fieldPath = path$1 ? `${path$1}.${field}` : field;
+const fieldConfig = comparators[field];
+if (fieldConfig === void 0) continue;
+let fieldComparators;
+if (fieldConfig && typeof fieldConfig === "object" && !("_unordered" in fieldConfig)) fieldComparators = fieldConfig;
+else fieldComparators = comparators;
+Object.assign(results, await compareFields({
+expected: expValue,
+actual: actual[field],
+comparators: fieldComparators,
+path: fieldPath,
+expectedParent: expected,
+actualParent: actual,
+llmConfig
+}));
+}
+return results;
+}
+const fieldName = getFieldName(path$1);
+let comparatorConfig = comparators[fieldName];
+if (!comparatorConfig && fieldName === "") comparatorConfig = exact;
+if (!comparatorConfig) return {};
+const result = await (typeof comparatorConfig === "function" ? comparatorConfig : exact)(expected, actual, {
+expectedParent,
+actualParent,
+llmConfig
+});
+return { [path$1]: {
+...result,
+expected,
+actual
+} };
+}
+function isObject(value) {
+return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+function getFieldName(path$1) {
+return (path$1.split(".").pop() || "").replace(/\[\d+\]$/, "");
+}
+
+//#endregion
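As the doc comment above notes, field results are keyed by paths such as `quote.premium` and `quotes[0].carrier`, and the comparator looked up for an array or leaf is the one registered under the last path segment. A short sketch of how a nested comparator map produces those paths; `compareFields` itself is internal to the bundle, and the `unordered(...)` call shape shown here is an assumption based on the `_unordered`/`_nestedComparators` checks above:

```ts
import { exact, unordered } from "@docshield/didactic";

// The object under "quotes" is applied to each array item, so failures surface under
// keys like "quotes[0].carrier" or "quotes[1].premium".
const comparators = {
  carrier: exact,
  quotes: {
    carrier: exact,
    premium: exact,
  },
};

// Wrapping the per-item config with unordered() (assumed call shape) asks the library to
// match expected and actual array items by best assignment instead of by index.
const unorderedComparators = {
  quotes: unordered({ carrier: exact, premium: exact }),
};
```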
+//#region src/optimizer/prompts.ts
+/**
+* Default system prompt for patch generation.
+* Analyzes failures and suggests specific, focused changes to improve the prompt.
+*/
+const DEFAULT_PATCH_SYSTEM_PROMPT = `
+'You are optimizing a system prompt for an LLM workflow.
+Analyze the failure and suggest a specific, focused change to improve the prompt.
+Do NOT overfit. Be generalizable.
+
+<examples>
+VERY IMPORTANT, CRITICAL!!!
+Examples MUST be anonymized.
+NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
+- example: (for an invoice processor)
+- task: extract data from parsed invoices
+- failure context: (returned expected: true, actual: false)
+- prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
+
+- example: (for a calendar app)
+- task: extract cost from calendar event
+- failure context: (cost expected: 123.45, actual: 167.89)
+- prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to achieve the expected output.)
+</examples>
+`;
+/**
+* Default system prompt for merging patches.
+* Combines multiple patches into a coherent system prompt.
+*/
+const DEFAULT_MERGE_SYSTEM_PROMPT = `
+You are an expert LLM prompt editor.
+You are merging improvements into a system prompt.
+Incorporate the suggestions while keeping the prompt clear and coherent.
+`;
+/**
+* Builds the user prompt for patch generation.
+* Formats the failure context and current prompt for the LLM.
+*/
+function buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures) {
+let userContent = `
+Current system prompt:
+---
+${currentPrompt}
+---
+
+A test case failed:
+${formatFailure(failure)}
+`;
+if (previousBetterPrompt) {
+const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
+userContent += `
+Note: The current prompt is a REGRESSION from a better-performing version.
+Previous (better) prompt for reference:
+---
+${previousBetterPrompt}
+---
+
+The failures the better prompt had:
+${failuresContext}
+
+Your changes introduced new failures instead of fixing the above.
+Analyze what changed between the two prompts that might have caused this regression.
+Are there any new failures that were not present in the previous better prompt?
+Are there any failures that were present in the previous better prompt but not in the current prompt?
+Did any of our patches contradict any of the new failures?
+`;
+}
+userContent += `
+Suggest a specific change to the system prompt that would fix this failure.
+Be concise. Output ONLY the suggested patch/change, not the full prompt.
+DO NOT overfit the prompt to the test case.
+Generalize examples if you choose to use them.
+`;
+return userContent;
+}
+/**
+* Builds the user prompt for merging patches.
+* Formats the current prompt and suggested patches for the LLM.
+*/
+function buildMergeUserPrompt(patches, currentPrompt) {
+return `
+Current prompt:
+---
+${currentPrompt}
+---
+
+Suggested improvements:
+${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
+
+Create a single improved system prompt that incorporates these suggestions.
+Be mindful of the size of the new prompt.
+Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
+Output ONLY the new system prompt, nothing else.
+Respect enums.
+`;
 }
 
 //#endregion
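The prompt templates that used to be inlined in the optimizer now live in this module as `DEFAULT_PATCH_SYSTEM_PROMPT` and `DEFAULT_MERGE_SYSTEM_PROMPT`, and further down in this diff `generatePatch` and `mergePatches` fall back to them via `config.patchSystemPrompt ?? …` and `config.mergeSystemPrompt ?? …`. A hedged sketch of overriding them through the optimizer config; the `evalConfig` contents are abbreviated and only the option names visible in this diff are assumed:

```ts
import { optimize, LLMProviders } from "@docshield/didactic";

// evalConfig is whatever evaluation config you already use (testCases, etc.).
declare const evalConfig: Parameters<typeof optimize>[0];

const run = await optimize(evalConfig, {
  provider: LLMProviders.anthropic_claude_opus,
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  targetSuccessRate: 0.9,
  maxIterations: 5,
  // Optional overrides; omit these to use DEFAULT_PATCH_SYSTEM_PROMPT / DEFAULT_MERGE_SYSTEM_PROMPT.
  patchSystemPrompt: "You tune extraction prompts. Suggest one focused, general change.",
  mergeSystemPrompt: "Merge the suggested edits into one coherent system prompt.",
});
```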
-//#region src/optimizer.ts
+//#region src/optimizer/optimizer.ts
 async function optimize(evalConfig, config) {
 if (!config.apiKey) throw new Error("apiKey is required");
-if (!config.systemPrompt) throw new Error("systemPrompt is required");
 if (config.targetSuccessRate < 0 || config.targetSuccessRate > 1) throw new Error("targetSuccessRate must be between 0 and 1");
 const iterationLogs = [];
 const maxIterations = config.maxIterations ?? (config.maxCost !== void 0 ? Infinity : 5);
 const startTime = /* @__PURE__ */ new Date();
+const model = PROVIDER_SPECS[config.provider].model;
 const logContext = {
 config,
 startTime,
-model
+model,
 perTestThreshold: evalConfig.perTestThreshold,
 rateLimitBatch: evalConfig.rateLimitBatch,
 rateLimitPause: evalConfig.rateLimitPause
|
|
|
1114
1636
|
totalCost: cumulativeCost
|
|
1115
1637
|
};
|
|
1116
1638
|
};
|
|
1639
|
+
const testCount = evalConfig.testCases?.length ?? 0;
|
|
1640
|
+
logOptimizerHeader(model, config.targetSuccessRate, testCount);
|
|
1117
1641
|
for (let i = 1; i <= maxIterations; i++) {
|
|
1118
1642
|
const iterationStart = Date.now();
|
|
1119
1643
|
let iterInputTokens = 0;
|
|
@@ -1127,7 +1651,7 @@ async function optimize(evalConfig, config) {
 });
 cumulativeCost += result.cost;
 logEvaluationResult(result, cumulativeCost, Date.now() - evalStart);
-const regressed = i > 1 && result.successRate
+const regressed = i > 1 && result.successRate <= bestSuccessRate;
 if (regressed) logRegressionDetected(bestSuccessRate);
 if (result.successRate > bestSuccessRate) {
 bestSuccessRate = result.successRate;
@@ -1140,10 +1664,6 @@ async function optimize(evalConfig, config) {
 return finalizeOptimization(true, currentPrompt);
 }
 const failures = result.testCases.filter((tc) => !tc.passed);
-if (failures.length === 0) {
-recordIteration(i, currentPrompt, result, result.cost, Date.now() - iterationStart, iterInputTokens, iterOutputTokens);
-return finalizeOptimization(true, currentPrompt);
-}
 logTargetFailures(config.targetSuccessRate, failures.length);
 if (config.maxCost !== void 0 && cumulativeCost >= config.maxCost) {
 logCostLimitReached(cumulativeCost);
@@ -1152,7 +1672,9 @@ async function optimize(evalConfig, config) {
 }
 logPatchGenerationStart(failures.length);
 const patchStart = Date.now();
-const
+const patchProgress = createProgressUpdater("patches");
+const patchSettled = await trackPromiseProgress(failures.map((failure) => generatePatch(failure, currentPrompt, config, regressed ? bestPrompt : void 0, regressed ? bestPromptFailures : void 0)), (completed, total) => patchProgress.update(completed, total));
+patchProgress.finish();
 const patchResults = patchSettled.filter((r) => r.status === "fulfilled").map((r) => r.value);
 const failedPatchCount = patchSettled.filter((r) => r.status === "rejected").length;
 if (failedPatchCount > 0) logPatchGenerationFailures(failedPatchCount, failures.length);
@@ -1192,154 +1714,165 @@ async function optimize(evalConfig, config) {
 }
 return finalizeOptimization(false, bestPrompt);
 }
-async function callLLM(messages, config, useThinking = false) {
-const spec = PROVIDER_SPECS[config.provider];
-try {
-if (config.provider.startsWith("anthropic")) {
-const client = new Anthropic({ apiKey: config.apiKey });
-const streamOptions = {
-model: spec.model,
-max_tokens: spec.maxTokens,
-system: messages.find((m) => m.role === "system")?.content,
-messages: messages.filter((m) => m.role !== "system").map((m) => ({
-role: m.role,
-content: m.content
-}))
-};
-if (useThinking) streamOptions.thinking = {
-type: "enabled",
-budget_tokens: ANTHROPIC_THINKING_BUDGET_TOKENS
-};
-const finalMessage = await client.messages.stream(streamOptions).finalMessage();
-const textBlocks = finalMessage.content.filter((block) => block.type === "text").map((block) => block.text);
-const text = textBlocks.length > 0 ? textBlocks.join(" ") : "";
-const inputTokens = finalMessage.usage.input_tokens;
-const outputTokens = finalMessage.usage.output_tokens;
-return {
-text,
-cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
-inputTokens,
-outputTokens
-};
-}
-if (config.provider.startsWith("openai")) {
-const client = new OpenAI({ apiKey: config.apiKey });
-const completionOptions = {
-model: spec.model,
-messages: messages.map((m) => ({
-role: m.role,
-content: m.content
-})),
-max_completion_tokens: spec.maxTokens
-};
-if (useThinking) completionOptions.reasoning_effort = "xhigh";
-const response = await client.chat.completions.create(completionOptions);
-const text = response.choices[0].message.content ?? "";
-const inputTokens = response.usage?.prompt_tokens ?? 0;
-const outputTokens = response.usage?.completion_tokens ?? 0;
-return {
-text,
-cost: (inputTokens * spec.costPerMillionInput + outputTokens * spec.costPerMillionOutput) / TOKENS_PER_MILLION,
-inputTokens,
-outputTokens
-};
-}
-throw new Error(`Unsupported provider: ${config.provider}`);
-} catch (error) {
-const message = error instanceof Error ? error.message : String(error);
-throw new Error(`LLM call failed (${spec.model}): ${message}`);
-}
-}
 async function generatePatch(failure, currentPrompt, config, previousBetterPrompt, previousBetterPromptFailures) {
-
-
----
-${currentPrompt}
----
-
-A test case failed:
-${formatFailure(failure)}
-`;
-if (previousBetterPrompt) {
-const failuresContext = previousBetterPromptFailures && previousBetterPromptFailures.length > 0 ? previousBetterPromptFailures.map((f, i) => `${i + 1}. ${formatFailure(f)}`).join("\n\n") : "None recorded";
-userContent += `
-Note: The current prompt is a REGRESSION from a better-performing version.
-Previous (better) prompt for reference:
----
-${previousBetterPrompt}
----
-
-The failures the better prompt had:
-${failuresContext}
-
-Your changes introduced new failures instead of fixing the above.
-Analyze what changed between the two prompts that might have caused this regression.
-Are there any new failures that were not present in the previous better prompt?
-Are there any failures that were present in the previous better prompt but not in the current prompt?
-Did any of our patches contradict any of the new failures?
-`;
-}
-userContent += `
-Suggest a specific change to the system prompt that would fix this failure.
-Be concise. Output ONLY the suggested patch/change, not the full prompt.
-DO NOT overfit the prompt to the test case.
-Generalize examples if you choose to use them.
-`;
-return callLLM([{
+const userContent = buildPatchUserPrompt(failure, currentPrompt, previousBetterPrompt, previousBetterPromptFailures);
+const messages = [{
 role: "system",
-content:
-'You are optimizing a system prompt for an LLM workflow.
-Analyze the failure and suggest a specific, focused change to improve the prompt.
-Do NOT overfit. Be generalizable.
-
-<examples>
-VERY IMPORTANT, CRITICAL!!!
-Examples MUST be anonymized.
-NEVER use specific names, dates, or other identifying information UNLESS it's a universal fact:
-- example: (for an invoice processor)
-- task: extract data from parsed invoices
-- failure context: (returned expected: true, actual: false)
-- prompt patch: "if you see "Restocked" on a Schedule B report of a Shopify invoice, mark returned as true." <- this is kind of specific, but it's a universal fact for the problem and could satisfy other inputs.)
-
-- example: (for a calendar app)
-- task: extract cost from calendar event
-- failure context: (cost expected: 123.45, actual: 167.89)
-- prompt patch: "if you see "Daisy" in the name field, return 123.45 for cost" <- this is too specific, it's overfit to a specific failure. The spirit of the failure is an incorrect extraction, you should look for the expected in the context and determine how the prompt could be modified to acheive the expected output.)
-</examples>
-`
+content: config.patchSystemPrompt ?? DEFAULT_PATCH_SYSTEM_PROMPT
 }, {
 role: "user",
 content: userContent
-}]
+}];
+return callLLM({
+provider: config.provider,
+apiKey: config.apiKey,
+messages,
+useThinking: config.thinking ?? false
+});
 }
 async function mergePatches(patches, currentPrompt, config) {
-const systemContent =
-
-
-Incorporate the suggestions while keeping the prompt clear and coherent.
-`;
-const userContent = `
-Current prompt:
----
-${currentPrompt}
----
-
-Suggested improvements:
-${patches.map((p, i) => `${i + 1}. ${p}`).join("\n\n")}
-
-Create a single improved system prompt that incorporates these suggestions.
-Be mindful of the size of the new prompt.
-Use discretion when merging the patches, if you see duplicate information, emphasize it but don't repeat it.
-Output ONLY the new system prompt, nothing else.
-Respect enums.
-`;
-return callLLM([{
+const systemContent = config.mergeSystemPrompt ?? DEFAULT_MERGE_SYSTEM_PROMPT;
+const userContent = buildMergeUserPrompt(patches, currentPrompt);
+const messages = [{
 role: "system",
 content: systemContent
 }, {
 role: "user",
 content: userContent
-}]
+}];
+return callLLM({
+provider: config.provider,
+apiKey: config.apiKey,
+messages,
+useThinking: config.thinking ?? false
+});
+}
+
+//#endregion
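The optimizer's private positional `callLLM(messages, config, useThinking)` is gone; both `generatePatch` and `mergePatches` now delegate to the shared client, which takes a single config object and drives either cost- or iteration-bounded runs from `optimize`. A budget-bounded sketch, grounded in the `maxCost`/`maxIterations` handling visible above (the `evalConfig` contents are abbreviated):

```ts
import { optimize, LLMProviders } from "@docshield/didactic";

declare const evalConfig: Parameters<typeof optimize>[0]; // your evaluation config (testCases, …)

// When maxCost is set and maxIterations is omitted, the loop above defaults
// maxIterations to Infinity and stops once cumulative cost reaches the cap.
const run = await optimize(evalConfig, {
  provider: LLMProviders.anthropic_claude_haiku,
  apiKey: process.env.ANTHROPIC_API_KEY ?? "",
  targetSuccessRate: 0.95,
  maxCost: 2.5,   // same units as the cost fields reported by the evaluator
  thinking: true, // forwarded to the shared client as useThinking
});
```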
+//#region src/eval/executors.ts
+/**
+* Creates an executor that calls an HTTP endpoint.
+*
+* @example
+* ```ts
+* const executor = endpoint('https://api.example.com/workflow', {
+* headers: { Authorization: 'Bearer token' },
+* });
+* ```
+*/
+function endpoint(url, config = {}) {
+const { method = "POST", headers = {}, mapResponse, mapAdditionalContext, mapCost, timeout = DEFAULT_ENDPOINT_TIMEOUT_MS } = config;
+return async (input, systemPrompt) => {
+const body = typeof input === "object" && input !== null ? {
+...input,
+systemPrompt
+} : {
+input,
+systemPrompt
+};
+const controller = new AbortController();
+const timeoutId = setTimeout(() => controller.abort(), timeout);
+try {
+const response = await fetch(url, {
+method,
+headers: {
+"Content-Type": "application/json",
+...headers
+},
+body: JSON.stringify(body),
+signal: controller.signal
+});
+clearTimeout(timeoutId);
+if (!response.ok) {
+const text = await response.text();
+throw new Error(`HTTP ${response.status}: ${text}`);
+}
+const data = await response.json();
+const additionalContext = mapAdditionalContext?.(data);
+const cost = mapCost?.(data) ?? 0;
+if (mapResponse) return {
+output: mapResponse(data),
+additionalContext,
+cost
+};
+return {
+output: data,
+additionalContext,
+cost
+};
+} catch (error) {
+clearTimeout(timeoutId);
+throw error;
+}
+};
+}
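`endpoint()` posts the test input merged with the system prompt as JSON, aborts the request via `AbortController` after `timeout` ms, and lets the mapper callbacks turn the raw response into the executor's output, cost, and additional context. A fuller sketch than the JSDoc example above; the response fields read inside the mappers (`data.result`, `data.usdCost`, `data.trace`) are hypothetical, not part of the library:

```ts
import { endpoint } from "@docshield/didactic";

const executor = endpoint("https://api.example.com/workflow", {
  method: "POST",                                   // default
  headers: { Authorization: `Bearer ${process.env.API_TOKEN}` },
  timeout: 60_000,                                  // ms before the request is aborted
  mapResponse: (data: any) => data.result,          // becomes the executor's output
  mapCost: (data: any) => data.usdCost ?? 0,        // reported as the executor's cost
  mapAdditionalContext: (data: any) => data.trace,  // surfaced alongside the result
});
```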
+/**
+* Creates an executor from a local function.
+*
+* @example
+* ```ts
+* const executor = fn({
+* fn: async (input, systemPrompt) => {
+* const result = await myLLMCall(input, systemPrompt);
+* return result;
+* },
+* });
+* ```
+*
+* @example With mapResponse to extract output from a richer response:
+* ```ts
+* const executor = fn({
+* fn: async (input, systemPrompt) => await startWorkflow({ ... }),
+* mapResponse: (result) => ({ documentType: result.documentType }),
+* mapCost: (result) => result.cost,
+* mapAdditionalContext: (result) => result.metadata,
+* });
+* ```
+*/
+function fn(config) {
+return async (input, systemPrompt) => {
+const raw = await config.fn(input, systemPrompt);
+return {
+output: config.mapResponse ? config.mapResponse(raw) : raw,
+additionalContext: config.mapAdditionalContext?.(raw),
+cost: config.mapCost?.(raw) ?? 0
+};
+};
+}
+/**
+* Creates a mock executor for testing.
+* Can accept either:
+* - An array of outputs (returned in sequence, cycling if more calls than outputs)
+* - A function that maps input to output
+*
+* @example Array-based:
+* ```ts
+* const executor = mock([
+* { premium: 12500, policyType: 'claims-made' },
+* { premium: 8200, policyType: 'entity' },
+* ]);
+* ```
+*
+* @example Function-based:
+* ```ts
+* const executor = mock((input) => ({
+* id: input.id,
+* processed: true,
+* }));
+* ```
+*/
+function mock(outputsOrFn) {
+if (typeof outputsOrFn === "function") return async (input, systemPrompt) => {
+return { output: outputsOrFn(input, systemPrompt) };
+};
+const outputs = outputsOrFn;
+if (outputs.length === 0) throw new Error("mock() requires at least one output");
+let callIndex = 0;
+return async () => {
+const output = outputs[callIndex % outputs.length];
+callIndex++;
+return { output };
+};
 }
 
 //#endregion
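The array form of `mock` cycles through its fixtures (`outputs[callIndex % outputs.length]`), so a fixture list can be shorter than the number of test cases, while the function form receives `(input, systemPrompt)` per call. A quick sketch of the cycling behaviour:

```ts
import { mock } from "@docshield/didactic";

// Two canned outputs, three calls: the third call wraps around to the first output.
const executor = mock([
  { carrier: "Carrier A" },
  { carrier: "Carrier B" },
]);

const first = await executor({ id: 1 }, "system prompt");  // { output: { carrier: "Carrier A" } }
const second = await executor({ id: 2 }, "system prompt"); // { output: { carrier: "Carrier B" } }
const third = await executor({ id: 3 }, "system prompt");  // { output: { carrier: "Carrier A" } } again
```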
@@ -1392,5 +1925,5 @@ const didactic = {
 var src_default = didactic;
 
 //#endregion
-export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, mock, name, numeric, oneOf, optimize, presence, within };
+export { LLMProviders, contains, custom, date, src_default as default, didactic as didact, didactic, endpoint, evaluate, exact, fn, llmCompare, mock, name, numeric, oneOf, optimize, presence, unordered, within };
 //# sourceMappingURL=index.mjs.map
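The export list gains `llmCompare` and `unordered`, so both helpers are importable from the package root alongside the existing comparators. Their exact signatures are not shown in this bundle diff, so only the import is sketched here:

```ts
// New in this release; see the README in this version range for the argument shapes.
import { llmCompare, unordered } from "@docshield/didactic";
```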