npm - @llmops/sdk - Versions diffs - 1.0.0-beta.22 → 1.0.0-beta.23 - Mend

@llmops/sdk 1.0.0-beta.22 → 1.0.0-beta.23

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (5)

package/dist/eval.cjs CHANGED Viewed

@@ -23,6 +23,13 @@ var InlineDataset = class {
 //#endregion
 //#region src/eval/evaluate.ts
+const RESET = "\x1B[0m";
+const DIM = "\x1B[2m";
+const BOLD = "\x1B[1m";
+const CYAN = "\x1B[36m";
+const GREEN = "\x1B[32m";
+const RED = "\x1B[31m";
+const YELLOW = "\x1B[33m";
 async function pool(items, concurrency, fn) {
 	const executing = [];
 	for (const item of items) {
@@ -55,11 +62,70 @@ function computeStats(values) {
 		count: sorted.length
 	};
 }
-async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
+const isSilent = process.env.LLMOPS_EVAL_OUTPUT === "json";
+const w = process.stderr;
+function printHeader(name, total) {
+	if (isSilent) return;
+	w.write("\n");
+	w.write(`  ${BOLD}${name}${RESET}  ${DIM}(${total} datapoints)${RESET}\n`);
+	w.write(`  ${DIM}${"─".repeat(50)}${RESET}\n`);
+}
+function printDatapointResult(idx, total, dp) {
+	if (isSilent) return;
+	const label = typeof dp.data === "object" && dp.data !== null ? JSON.stringify(dp.data).slice(0, 50) : String(dp.data).slice(0, 50);
+	if (dp.error) {
+		w.write(`  ${RED}✗${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label}  ${RED}ERROR${RESET} ${DIM}${dp.error.slice(0, 60)}${RESET}\n`);
+		return;
+	}
+	const scoreStr = Object.entries(dp.scores).map(([name, val]) => {
+		if (Number.isNaN(val)) return `${DIM}${name}=NaN${RESET}`;
+		return `${val >= .8 ? GREEN : val >= .5 ? YELLOW : RED}${name}=${val.toFixed(2)}${RESET}`;
+	}).join("  ");
+	w.write(`  ${GREEN}✓${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label}  ${scoreStr}  ${DIM}${dp.durationMs}ms${RESET}\n`);
+}
+function scoreBar(score, width = 20) {
+	const filled = Math.round(score * width);
+	const empty = width - filled;
+	return "█".repeat(filled) + "░".repeat(empty);
+}
+function scoreColor(score) {
+	if (score >= .8) return GREEN;
+	if (score >= .5) return YELLOW;
+	return RED;
+}
+function printSummary(result) {
+	if (isSilent) return;
+	w.write("\n");
+	const entries = Object.entries(result.scores);
+	if (entries.length > 0) {
+		const maxNameLen = Math.max(...entries.map(([n]) => n.length), 10);
+		w.write(`  ${DIM}${"Evaluator".padEnd(maxNameLen)}  ${"Mean".padStart(6)}  ${"Bar".padEnd(20)}  ${"Min".padStart(5)}  ${"Max".padStart(5)}  ${"Med".padStart(5)}${RESET}\n`);
+		w.write(`  ${DIM}${"─".repeat(maxNameLen + 50)}${RESET}\n`);
+		for (const [name, stats] of entries) {
+			const color = scoreColor(stats.mean);
+			const bar = scoreBar(stats.mean);
+			w.write(`  ${name.padEnd(maxNameLen)}  ${color}${stats.mean.toFixed(2).padStart(6)}${RESET}  ${DIM}${bar}${RESET}  ${stats.min.toFixed(2).padStart(5)}  ${stats.max.toFixed(2).padStart(5)}  ${stats.median.toFixed(2).padStart(5)}\n`);
+		}
+	}
+	const completed = result.count - result.errors;
+	w.write("\n");
+	w.write(`  ${DIM}Duration${RESET} ${(result.durationMs / 1e3).toFixed(1)}s`);
+	w.write(`    ${DIM}Passed${RESET} ${completed}/${result.count}`);
+	if (result.errors > 0) w.write(`    ${RED}Failed ${result.errors}${RESET}`);
+	w.write(`    ${DIM}Run${RESET} ${CYAN}${result.runId.slice(0, 8)}${RESET}`);
+	w.write("\n\n");
+}
+function saveResult(result, outputDir) {
+	const dir = (0, node_path.join)(outputDir, result.name);
+	(0, node_fs.mkdirSync)(dir, { recursive: true });
+	(0, node_fs.writeFileSync)((0, node_path.join)(dir, `${Date.now()}.json`), JSON.stringify(result, null, 2));
+}
+async function runSingleExecutor(name, dataset, executor, evaluators, concurrency) {
 	const size = await dataset.size();
 	const datapoints = await dataset.slice(0, size);
 	const results = new Array(datapoints.length);
 	const startTime = Date.now();
+	printHeader(name, datapoints.length);
 	await pool(datapoints, concurrency, async (dp) => {
 		const idx = datapoints.indexOf(dp);
 		const dpStart = Date.now();
@@ -71,14 +137,16 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
 		} catch (err) {
 			error = err instanceof Error ? err.message : String(err);
 		}
-		if (!error && output !== null) for (const [name, evaluator] of Object.entries(evaluators)) try {
+		if (!error && output !== null) for (const [evalName, evaluator] of Object.entries(evaluators)) try {
 			const result = await evaluator(output, dp.target, dp.data);
-			if (typeof result === "number") scores[name] = result;
-			else for (const [subKey, subScore] of Object.entries(result)) scores[`${name}.${subKey}`] = subScore;
-		} catch {
-			scores[name] = NaN;
+			if (typeof result === "number") scores[evalName] = result;
+			else for (const [subKey, subScore] of Object.entries(result)) scores[`${evalName}.${subKey}`] = subScore;
+		} catch (evalErr) {
+			scores[evalName] = NaN;
+			const msg = evalErr instanceof Error ? evalErr.message : String(evalErr);
+			if (!isSilent) w.write(`  ${YELLOW}⚠${RESET} ${DIM}evaluator "${evalName}":${RESET} ${msg.slice(0, 80)}\n`);
 		}
-		results[idx] = {
+		const dpResult = {
 			data: dp.data,
 			target: dp.target,
 			metadata: dp.metadata,
@@ -87,33 +155,14 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
 			durationMs: Date.now() - dpStart,
 			error
 		};
+		results[idx] = dpResult;
+		printDatapointResult(idx, datapoints.length, dpResult);
 	});
 	return {
 		results,
 		durationMs: Date.now() - startTime
 	};
 }
-function printSummary(result) {
-	const lines = [];
-	lines.push("");
-	lines.push(` ${result.name}`);
-	lines.push("");
-	const completed = result.count - result.errors;
-	lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? `  ✗ ${result.errors} errors` : ""}`);
-	lines.push("");
-	lines.push(" Scores:");
-	for (const [name, stats] of Object.entries(result.scores)) lines.push(`   ${name.padEnd(16)} mean=${stats.mean.toFixed(2)}  min=${stats.min.toFixed(2)}  max=${stats.max.toFixed(2)}  median=${stats.median.toFixed(2)}`);
-	lines.push("");
-	lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
-	lines.push(` Run ID:   ${result.runId}`);
-	lines.push("");
-	process.stderr.write(lines.join("\n"));
-}
-function saveResult(result, outputDir) {
-	const dir = (0, node_path.join)(outputDir, result.name);
-	(0, node_fs.mkdirSync)(dir, { recursive: true });
-	(0, node_fs.writeFileSync)((0, node_path.join)(dir, `${result.runId}.json`), JSON.stringify(result, null, 2));
-}
 async function evaluate(options) {
 	const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
 	const runId = (0, node_crypto.randomUUID)();
@@ -121,7 +170,7 @@ async function evaluate(options) {
 	if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
 	const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
 	if (executor) {
-		const { results, durationMs } = await runSingleExecutor(dataset, executor, evaluators, concurrency);
+		const { results, durationMs } = await runSingleExecutor(name, dataset, executor, evaluators, concurrency);
 		const scoreNames = /* @__PURE__ */ new Set();
 		for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
 		const scores = {};
@@ -137,7 +186,7 @@ async function evaluate(options) {
 			metadata,
 			results
 		};
-		if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(result, null, 2));
+		if (isSilent) process.stdout.write(JSON.stringify(result, null, 2));
 		else printSummary(result);
 		saveResult(result, outputDir);
 		return result;
@@ -145,7 +194,7 @@ async function evaluate(options) {
 	const variantResults = {};
 	const totalStart = Date.now();
 	for (const [variantName, variantExecutor] of Object.entries(variants)) {
-		const { results, durationMs } = await runSingleExecutor(dataset, variantExecutor, evaluators, concurrency);
+		const { results, durationMs } = await runSingleExecutor(`${name}/${variantName}`, dataset, variantExecutor, evaluators, concurrency);
 		const scoreNames = /* @__PURE__ */ new Set();
 		for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
 		const scores = {};
@@ -162,7 +211,7 @@ async function evaluate(options) {
 			results
 		};
 		variantResults[variantName] = variantResult;
-		if (process.env.LLMOPS_EVAL_OUTPUT !== "json") printSummary(variantResult);
+		if (!isSilent) printSummary(variantResult);
 		saveResult(variantResult, outputDir);
 	}
 	const variantEvalResult = {
@@ -173,48 +222,43 @@ async function evaluate(options) {
 		metadata,
 		variants: variantResults
 	};
-	if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
+	if (isSilent) process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
 	return variantEvalResult;
 }
 //#endregion
 //#region src/eval/compare.ts
 /**
-* Load an eval run from the filesystem.
+* Load an eval result from a JSON file.
 */
-function loadRun(outputDir, name, runId) {
-	const dir = (0, node_path.join)(outputDir, name);
-	const filePath = (0, node_path.join)(dir, `${runId}.json`);
+function loadResult(filePath) {
 	try {
 		const content = (0, node_fs.readFileSync)(filePath, "utf-8");
 		return JSON.parse(content);
 	} catch {
-		try {
-			const match = (0, node_fs.readdirSync)(dir).find((f) => f.startsWith(runId) && f.endsWith(".json"));
-			if (match) {
-				const content = (0, node_fs.readFileSync)((0, node_path.join)(dir, match), "utf-8");
-				return JSON.parse(content);
-			}
-		} catch {}
-		throw new Error(`Eval run "${runId}" not found for "${name}" in ${outputDir}. Expected file: ${filePath}`);
+		throw new Error(`Could not read eval result: ${filePath}`);
 	}
 }
 /**
-* Compare two eval runs. First run ID is the baseline.
+* Compare two eval result files. First file is the baseline.
 *
-* Usage:
+* Usage with version control:
+* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
+* 2. Commit the file
+* 3. Make changes, re-run eval
+* 4. Compare: git stash the new result, compare old vs new
+*
+* Or compare two named eval files:
 * ```ts
 * const diff = await compare({
-*   name: 'support-bot',
-*   runs: [run1.runId, run2.runId],
+*   files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
 * })
 * ```
 */
 async function compare(options) {
-	const { runs, name, outputDir = "./llmops-evals" } = options;
-	if (runs.length < 2) throw new Error("compare() requires at least 2 run IDs");
-	const baselineRun = loadRun(outputDir, name, runs[0]);
-	const candidateRun = loadRun(outputDir, name, runs[1]);
+	const { files } = options;
+	const baselineRun = loadResult(files[0]);
+	const candidateRun = loadResult(files[1]);
 	const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
 	const scores = {};
 	for (const scoreName of allScoreNames) {
@@ -251,112 +295,165 @@ async function compare(options) {
 		}
 	}
 	const result = {
-		baseline: runs[0],
-		candidate: runs[1],
+		baseline: baselineRun.runId,
+		candidate: candidateRun.runId,
 		scores,
 		regressions,
 		improvements
 	};
-	const lines = [];
-	lines.push("");
-	lines.push(` compare: ${runs[0].slice(0, 8)} → ${runs[1].slice(0, 8)}`);
-	lines.push("");
-	lines.push(" Scores:");
-	for (const [scoreName, delta] of Object.entries(scores)) {
-		const sign = delta.delta >= 0 ? "+" : "";
-		const marker = delta.delta >= 0 ? "✓" : "✗";
-		lines.push(`   ${scoreName.padEnd(16)} ${delta.baseline.toFixed(2)} → ${delta.candidate.toFixed(2)}  (${sign}${delta.delta.toFixed(2)}) ${marker}`);
+	const w$1 = process.stderr;
+	const RESET$1 = "\x1B[0m";
+	const DIM$1 = "\x1B[2m";
+	const BOLD$1 = "\x1B[1m";
+	const GREEN$1 = "\x1B[32m";
+	const RED$1 = "\x1B[31m";
+	const CYAN$1 = "\x1B[36m";
+	w$1.write("\n");
+	w$1.write(`  ${BOLD$1}Compare${RESET$1}  ${DIM$1}${baselineRun.name} → ${candidateRun.name}${RESET$1}\n`);
+	w$1.write(`  ${DIM$1}${"─".repeat(50)}${RESET$1}\n\n`);
+	const scoreEntries = Object.entries(scores);
+	if (scoreEntries.length > 0) {
+		const maxNameLen = Math.max(...scoreEntries.map(([n]) => n.length), 10);
+		w$1.write(`  ${DIM$1}${"Evaluator".padEnd(maxNameLen)}  ${"Base".padStart(6)}    ${"New".padStart(6)}  ${"Delta".padStart(7)}${RESET$1}\n`);
+		w$1.write(`  ${DIM$1}${"─".repeat(maxNameLen + 30)}${RESET$1}\n`);
+		for (const [scoreName, delta] of scoreEntries) {
+			const sign = delta.delta >= 0 ? "+" : "";
+			const color = delta.delta >= 0 ? GREEN$1 : RED$1;
+			const icon = delta.delta > 0 ? "▲" : delta.delta < 0 ? "▼" : "=";
+			w$1.write(`  ${scoreName.padEnd(maxNameLen)}  ${delta.baseline.toFixed(2).padStart(6)}  ${DIM$1}→${RESET$1}  ${delta.candidate.toFixed(2).padStart(6)}  ${color}${sign}${delta.delta.toFixed(2).padStart(5)} ${icon}${RESET$1}\n`);
+		}
+		w$1.write("\n");
 	}
 	if (regressions.length > 0) {
-		lines.push("");
-		lines.push(` Regressions (${regressions.length}):`);
+		w$1.write(`  ${RED$1}▼ ${regressions.length} regression${regressions.length > 1 ? "s" : ""}${RESET$1}\n`);
 		for (const r of regressions.slice(0, 5)) {
-			const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 60);
-			lines.push(`   "${dataStr}"  ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${r.candidateScore.toFixed(2)}`);
+			const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 50);
+			w$1.write(`    ${DIM$1}${dataStr}${RESET$1}  ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${RED$1}${r.candidateScore.toFixed(2)}${RESET$1}\n`);
 		}
-		if (regressions.length > 5) lines.push(`   ... and ${regressions.length - 5} more`);
+		if (regressions.length > 5) w$1.write(`    ${DIM$1}... and ${regressions.length - 5} more${RESET$1}\n`);
+		w$1.write("\n");
 	}
 	if (improvements.length > 0) {
-		lines.push("");
-		lines.push(` Improvements (${improvements.length}):`);
+		w$1.write(`  ${GREEN$1}▲ ${improvements.length} improvement${improvements.length > 1 ? "s" : ""}${RESET$1}\n`);
 		for (const imp of improvements.slice(0, 5)) {
-			const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 60);
-			lines.push(`   "${dataStr}"  ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${imp.candidateScore.toFixed(2)}`);
+			const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 50);
+			w$1.write(`    ${DIM$1}${dataStr}${RESET$1}  ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${GREEN$1}${imp.candidateScore.toFixed(2)}${RESET$1}\n`);
 		}
-		if (improvements.length > 5) lines.push(`   ... and ${improvements.length - 5} more`);
+		if (improvements.length > 5) w$1.write(`    ${DIM$1}... and ${improvements.length - 5} more${RESET$1}\n`);
+		w$1.write("\n");
 	}
-	lines.push("");
-	process.stderr.write(lines.join("\n"));
+	if (regressions.length === 0 && improvements.length === 0) w$1.write(`  ${CYAN$1}No changes between runs${RESET$1}\n\n`);
 	return result;
 }
 //#endregion
 //#region src/eval/judge.ts
-/**
-* Simple mustache-style template interpolation.
-*/
 function interpolate(template, vars) {
 	return template.replace(/\{\{(\w+(?:\.\w+)*)\}\}/g, (_, path) => {
 		const value = path.split(".").reduce((obj, key) => obj?.[key], vars);
-		return typeof value === "string" ? value : JSON.stringify(value);
+		if (value === void 0 || value === null) return "";
+		return typeof value === "string" ? value : JSON.stringify(value, null, 2);
 	});
 }
-/**
-* Default parser: expects JSON with a `score` field, a bare number,
-* or an object of number values (multi-score).
-*/
+function buildVars(output, target, data) {
+	const vars = {
+		output: typeof output === "string" ? output : JSON.stringify(output, null, 2),
+		target: typeof target === "string" ? target : JSON.stringify(target, null, 2),
+		data: typeof data === "string" ? data : JSON.stringify(data, null, 2)
+	};
+	if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
+	if (data && typeof data === "object") for (const [k, v] of Object.entries(data)) vars[`data.${k}`] = v;
+	return vars;
+}
+const DEFAULT_SYSTEM = `You are an expert evaluator. Your job is to grade an AI system's output.
+Instructions:
+- Read the grading criteria in the user message carefully.
+- Evaluate the output objectively.
+- Return ONLY valid JSON. No markdown, no explanation outside the JSON.
+- The JSON must contain a "score" field with a number between 0.0 and 1.0.
+- You may optionally include a "reasoning" field with a brief explanation.
+Example response:
+{"score": 0.85, "reasoning": "The response is mostly accurate but misses one detail."}`;
 function defaultParse(response) {
-	const cleaned = response.replace(/```json\n?|```/g, "").trim();
+	let cleaned = response.replace(/```(?:json)?\n?/g, "").replace(/```$/g, "").trim();
+	const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
+	if (jsonMatch) cleaned = jsonMatch[0];
 	const parsed = JSON.parse(cleaned);
-	if (typeof parsed === "number") return parsed;
-	if (typeof parsed?.score === "number") return parsed.score;
+	if (typeof parsed?.score === "number") return Math.max(0, Math.min(1, parsed.score));
+	if (typeof parsed === "number") return Math.max(0, Math.min(1, parsed));
 	if (typeof parsed === "object" && parsed !== null) {
-		const entries = Object.entries(parsed).filter(([, v]) => typeof v === "number");
-		if (entries.length > 0) return Object.fromEntries(entries);
+		const entries = Object.entries(parsed).filter(([k, v]) => typeof v === "number" && k !== "reasoning");
+		if (entries.length > 0) return Object.fromEntries(entries.map(([k, v]) => [k, Math.max(0, Math.min(1, v))]));
 	}
-	throw new Error(`Could not extract score from judge response: ${response.slice(0, 200)}`);
+	throw new Error(`Could not extract score from judge response: ${response.slice(0, 300)}`);
+}
+async function callLLM(client, model, system, userMessage, temperature) {
+	const providerConfig = client.provider();
+	const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
+		method: "POST",
+		headers: {
+			"Content-Type": "application/json",
+			Authorization: `Bearer ${providerConfig.apiKey}`
+		},
+		body: JSON.stringify({
+			model,
+			temperature,
+			messages: [{
+				role: "system",
+				content: system
+			}, {
+				role: "user",
+				content: userMessage
+			}]
+		})
+	});
+	if (!response.ok) {
+		const errorBody = await response.text().catch(() => "unknown error");
+		throw new Error(`Judge LLM call failed (${response.status}): ${errorBody.slice(0, 300)}`);
+	}
+	const content = (await response.json()).choices?.[0]?.message?.content;
+	if (!content) throw new Error("Judge LLM returned empty response");
+	return content;
 }
 /**
 * Factory that returns an Evaluator which uses an LLM to score output.
 *
+* The judge:
+* - Uses a system message that instructs the LLM to return JSON scores
+* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
+* - Uses temperature 0 by default for deterministic scoring
+* - Retries on parse failure (configurable)
+* - Clamps scores to [0, 1]
+*
 * Usage:
 * ```ts
+* import { llmops } from '@llmops/sdk'
+*
+* const client = llmops()
 * const accuracy = judgeScorer({
 *   model: '@openai/gpt-4o',
-*   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
-*   ops,
+*   prompt: `Rate the accuracy of this response.
+* Expected: {{target.answer}}
+* Actual: {{output}}`,
+*   client,
 * })
 * ```
 */
 function judgeScorer(options) {
-	const { model, prompt, ops, parse = defaultParse } = options;
-	return async (output, target) => {
-		const vars = {
-			output: typeof output === "string" ? output : JSON.stringify(output),
-			target
-		};
-		if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
-		const renderedPrompt = interpolate(prompt, vars);
-		const providerConfig = ops.provider();
-		const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
-			method: "POST",
-			headers: {
-				"Content-Type": "application/json",
-				Authorization: `Bearer ${providerConfig.apiKey}`
-			},
-			body: JSON.stringify({
-				model,
-				messages: [{
-					role: "user",
-					content: renderedPrompt
-				}],
-				response_format: { type: "json_object" }
-			})
-		});
-		if (!response.ok) throw new Error(`Judge LLM call failed: ${response.status} ${await response.text()}`);
-		const content = (await response.json()).choices?.[0]?.message?.content;
-		if (!content) throw new Error("Judge LLM returned empty response");
-		return parse(content);
+	const { model, prompt, client, system = DEFAULT_SYSTEM, temperature = 0, maxRetries = 1, parse = defaultParse } = options;
+	return async (output, target, data) => {
+		const userMessage = interpolate(prompt, buildVars(output, target, data));
+		let lastError = null;
+		const attempts = 1 + maxRetries;
+		for (let attempt = 0; attempt < attempts; attempt++) try {
+			return parse(await callLLM(client, model, system, userMessage, temperature));
+		} catch (err) {
+			lastError = err instanceof Error ? err : new Error(String(err));
+			if (lastError.message.includes("Judge LLM call failed")) throw lastError;
+		}
+		throw lastError ?? /* @__PURE__ */ new Error("Judge scoring failed");
 	};
 }

package/dist/eval.d.cts CHANGED Viewed

@@ -116,12 +116,8 @@ interface VariantEvaluateResult<D = unknown, O = unknown> {
  * Options for compare().
  */
 interface CompareOptions {
-  /** Run IDs to compare. First is baseline. */
-  runs: string[];
-  /** Directory where eval results are stored. Default: './llmops-evals' */
-  outputDir?: string;
-  /** Eval name to search within. Required. */
-  name: string;
+  /** Paths to eval result JSON files. First is baseline, second is candidate. */
+  files: [string, string];
 }
 /**
  * Per-evaluator delta between two runs.
@@ -157,11 +153,38 @@ interface CompareResult {
 interface JudgeScorerOptions {
   /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
   model: string;
-  /** Prompt template. Supports {{output}}, {{target}}, {{target.*}} placeholders. */
+  /**
+   * Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
+   * {{data}}, {{data.*}} placeholders.
+   *
+   * This becomes the user message. A system message is added automatically
+   * that instructs the LLM to return a JSON score.
+   */
   prompt: string;
-  /** The llmops client instance. Judge call routed through gateway. */
-  ops: LLMOpsClient;
-  /** Custom parser for extracting score from LLM response. */
+  /**
+   * The llmops client instance. The judge call is routed through the
+   * gateway and traced like any other LLM call.
+   *
+   * ```ts
+   * const client = llmops({ telemetry: pgStore(url) })
+   * judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
+   * ```
+   */
+  client: LLMOpsClient;
+  /**
+   * Custom system message. Overrides the default grading instructions.
+   * If omitted, a default system message is used that instructs
+   * the LLM to return JSON with a "score" field (0-1).
+   */
+  system?: string;
+  /** Temperature for the judge LLM. Default: 0 (deterministic). */
+  temperature?: number;
+  /** Max retries on parse failure. Default: 1. */
+  maxRetries?: number;
+  /**
+   * Custom parser for extracting score from LLM response.
+   * Default: expects JSON with a `score` field.
+   */
   parse?: (response: string) => number | Record<string, number>;
 }
 //#endregion
@@ -170,13 +193,18 @@ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknow
 //#endregion
 //#region src/eval/compare.d.ts
 /**
- * Compare two eval runs. First run ID is the baseline.
+ * Compare two eval result files. First file is the baseline.
  *
- * Usage:
+ * Usage with version control:
+ * 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
+ * 2. Commit the file
+ * 3. Make changes, re-run eval
+ * 4. Compare: git stash the new result, compare old vs new
+ *
+ * Or compare two named eval files:
  * ```ts
  * const diff = await compare({
- *   name: 'support-bot',
- *   runs: [run1.runId, run2.runId],
+ *   files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
  * })
  * ```
  */
@@ -186,12 +214,24 @@ declare function compare(options: CompareOptions): Promise<CompareResult>;
 /**
  * Factory that returns an Evaluator which uses an LLM to score output.
  *
+ * The judge:
+ * - Uses a system message that instructs the LLM to return JSON scores
+ * - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
+ * - Uses temperature 0 by default for deterministic scoring
+ * - Retries on parse failure (configurable)
+ * - Clamps scores to [0, 1]
+ *
  * Usage:
  * ```ts
+ * import { llmops } from '@llmops/sdk'
+ *
+ * const client = llmops()
  * const accuracy = judgeScorer({
  *   model: '@openai/gpt-4o',
- *   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
- *   ops,
+ *   prompt: `Rate the accuracy of this response.
+ * Expected: {{target.answer}}
+ * Actual: {{output}}`,
+ *   client,
  * })
  * ```
  */

package/dist/eval.d.mts CHANGED Viewed

@@ -116,12 +116,8 @@ interface VariantEvaluateResult<D = unknown, O = unknown> {
  * Options for compare().
  */
 interface CompareOptions {
-  /** Run IDs to compare. First is baseline. */
-  runs: string[];
-  /** Directory where eval results are stored. Default: './llmops-evals' */
-  outputDir?: string;
-  /** Eval name to search within. Required. */
-  name: string;
+  /** Paths to eval result JSON files. First is baseline, second is candidate. */
+  files: [string, string];
 }
 /**
  * Per-evaluator delta between two runs.
@@ -157,11 +153,38 @@ interface CompareResult {
 interface JudgeScorerOptions {
   /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
   model: string;
-  /** Prompt template. Supports {{output}}, {{target}}, {{target.*}} placeholders. */
+  /**
+   * Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
+   * {{data}}, {{data.*}} placeholders.
+   *
+   * This becomes the user message. A system message is added automatically
+   * that instructs the LLM to return a JSON score.
+   */
   prompt: string;
-  /** The llmops client instance. Judge call routed through gateway. */
-  ops: LLMOpsClient;
-  /** Custom parser for extracting score from LLM response. */
+  /**
+   * The llmops client instance. The judge call is routed through the
+   * gateway and traced like any other LLM call.
+   *
+   * ```ts
+   * const client = llmops({ telemetry: pgStore(url) })
+   * judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
+   * ```
+   */
+  client: LLMOpsClient;
+  /**
+   * Custom system message. Overrides the default grading instructions.
+   * If omitted, a default system message is used that instructs
+   * the LLM to return JSON with a "score" field (0-1).
+   */
+  system?: string;
+  /** Temperature for the judge LLM. Default: 0 (deterministic). */
+  temperature?: number;
+  /** Max retries on parse failure. Default: 1. */
+  maxRetries?: number;
+  /**
+   * Custom parser for extracting score from LLM response.
+   * Default: expects JSON with a `score` field.
+   */
   parse?: (response: string) => number | Record<string, number>;
 }
 //#endregion
@@ -170,13 +193,18 @@ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknow
 //#endregion
 //#region src/eval/compare.d.ts
 /**
- * Compare two eval runs. First run ID is the baseline.
+ * Compare two eval result files. First file is the baseline.
  *
- * Usage:
+ * Usage with version control:
+ * 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
+ * 2. Commit the file
+ * 3. Make changes, re-run eval
+ * 4. Compare: git stash the new result, compare old vs new
+ *
+ * Or compare two named eval files:
  * ```ts
  * const diff = await compare({
- *   name: 'support-bot',
- *   runs: [run1.runId, run2.runId],
+ *   files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
  * })
  * ```
  */
@@ -186,12 +214,24 @@ declare function compare(options: CompareOptions): Promise<CompareResult>;
 /**
  * Factory that returns an Evaluator which uses an LLM to score output.
  *
+ * The judge:
+ * - Uses a system message that instructs the LLM to return JSON scores
+ * - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
+ * - Uses temperature 0 by default for deterministic scoring
+ * - Retries on parse failure (configurable)
+ * - Clamps scores to [0, 1]
+ *
  * Usage:
  * ```ts
+ * import { llmops } from '@llmops/sdk'
+ *
+ * const client = llmops()
  * const accuracy = judgeScorer({
  *   model: '@openai/gpt-4o',
- *   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
- *   ops,
+ *   prompt: `Rate the accuracy of this response.
+ * Expected: {{target.answer}}
+ * Actual: {{output}}`,
+ *   client,
  * })
  * ```
  */

package/dist/eval.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 import { randomUUID } from "node:crypto";
-import { mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs";
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 //#region src/eval/dataset.ts
@@ -23,6 +23,13 @@ var InlineDataset = class {
 //#endregion
 //#region src/eval/evaluate.ts
+const RESET = "\x1B[0m";
+const DIM = "\x1B[2m";
+const BOLD = "\x1B[1m";
+const CYAN = "\x1B[36m";
+const GREEN = "\x1B[32m";
+const RED = "\x1B[31m";
+const YELLOW = "\x1B[33m";
 async function pool(items, concurrency, fn) {
 	const executing = [];
 	for (const item of items) {
@@ -55,11 +62,70 @@ function computeStats(values) {
 		count: sorted.length
 	};
 }
-async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
+const isSilent = process.env.LLMOPS_EVAL_OUTPUT === "json";
+const w = process.stderr;
+function printHeader(name, total) {
+	if (isSilent) return;
+	w.write("\n");
+	w.write(`  ${BOLD}${name}${RESET}  ${DIM}(${total} datapoints)${RESET}\n`);
+	w.write(`  ${DIM}${"─".repeat(50)}${RESET}\n`);
+}
+function printDatapointResult(idx, total, dp) {
+	if (isSilent) return;
+	const label = typeof dp.data === "object" && dp.data !== null ? JSON.stringify(dp.data).slice(0, 50) : String(dp.data).slice(0, 50);
+	if (dp.error) {
+		w.write(`  ${RED}✗${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label}  ${RED}ERROR${RESET} ${DIM}${dp.error.slice(0, 60)}${RESET}\n`);
+		return;
+	}
+	const scoreStr = Object.entries(dp.scores).map(([name, val]) => {
+		if (Number.isNaN(val)) return `${DIM}${name}=NaN${RESET}`;
+		return `${val >= .8 ? GREEN : val >= .5 ? YELLOW : RED}${name}=${val.toFixed(2)}${RESET}`;
+	}).join("  ");
+	w.write(`  ${GREEN}✓${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label}  ${scoreStr}  ${DIM}${dp.durationMs}ms${RESET}\n`);
+}
+function scoreBar(score, width = 20) {
+	const filled = Math.round(score * width);
+	const empty = width - filled;
+	return "█".repeat(filled) + "░".repeat(empty);
+}
+function scoreColor(score) {
+	if (score >= .8) return GREEN;
+	if (score >= .5) return YELLOW;
+	return RED;
+}
+function printSummary(result) {
+	if (isSilent) return;
+	w.write("\n");
+	const entries = Object.entries(result.scores);
+	if (entries.length > 0) {
+		const maxNameLen = Math.max(...entries.map(([n]) => n.length), 10);
+		w.write(`  ${DIM}${"Evaluator".padEnd(maxNameLen)}  ${"Mean".padStart(6)}  ${"Bar".padEnd(20)}  ${"Min".padStart(5)}  ${"Max".padStart(5)}  ${"Med".padStart(5)}${RESET}\n`);
+		w.write(`  ${DIM}${"─".repeat(maxNameLen + 50)}${RESET}\n`);
+		for (const [name, stats] of entries) {
+			const color = scoreColor(stats.mean);
+			const bar = scoreBar(stats.mean);
+			w.write(`  ${name.padEnd(maxNameLen)}  ${color}${stats.mean.toFixed(2).padStart(6)}${RESET}  ${DIM}${bar}${RESET}  ${stats.min.toFixed(2).padStart(5)}  ${stats.max.toFixed(2).padStart(5)}  ${stats.median.toFixed(2).padStart(5)}\n`);
+		}
+	}
+	const completed = result.count - result.errors;
+	w.write("\n");
+	w.write(`  ${DIM}Duration${RESET} ${(result.durationMs / 1e3).toFixed(1)}s`);
+	w.write(`    ${DIM}Passed${RESET} ${completed}/${result.count}`);
+	if (result.errors > 0) w.write(`    ${RED}Failed ${result.errors}${RESET}`);
+	w.write(`    ${DIM}Run${RESET} ${CYAN}${result.runId.slice(0, 8)}${RESET}`);
+	w.write("\n\n");
+}
+function saveResult(result, outputDir) {
+	const dir = join(outputDir, result.name);
+	mkdirSync(dir, { recursive: true });
+	writeFileSync(join(dir, `${Date.now()}.json`), JSON.stringify(result, null, 2));
+}
+async function runSingleExecutor(name, dataset, executor, evaluators, concurrency) {
 	const size = await dataset.size();
 	const datapoints = await dataset.slice(0, size);
 	const results = new Array(datapoints.length);
 	const startTime = Date.now();
+	printHeader(name, datapoints.length);
 	await pool(datapoints, concurrency, async (dp) => {
 		const idx = datapoints.indexOf(dp);
 		const dpStart = Date.now();
@@ -71,14 +137,16 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
 		} catch (err) {
 			error = err instanceof Error ? err.message : String(err);
 		}
-		if (!error && output !== null) for (const [name, evaluator] of Object.entries(evaluators)) try {
+		if (!error && output !== null) for (const [evalName, evaluator] of Object.entries(evaluators)) try {
 			const result = await evaluator(output, dp.target, dp.data);
-			if (typeof result === "number") scores[name] = result;
-			else for (const [subKey, subScore] of Object.entries(result)) scores[`${name}.${subKey}`] = subScore;
-		} catch {
-			scores[name] = NaN;
+			if (typeof result === "number") scores[evalName] = result;
+			else for (const [subKey, subScore] of Object.entries(result)) scores[`${evalName}.${subKey}`] = subScore;
+		} catch (evalErr) {
+			scores[evalName] = NaN;
+			const msg = evalErr instanceof Error ? evalErr.message : String(evalErr);
+			if (!isSilent) w.write(`  ${YELLOW}⚠${RESET} ${DIM}evaluator "${evalName}":${RESET} ${msg.slice(0, 80)}\n`);
 		}
-		results[idx] = {
+		const dpResult = {
 			data: dp.data,
 			target: dp.target,
 			metadata: dp.metadata,
@@ -87,33 +155,14 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
 			durationMs: Date.now() - dpStart,
 			error
 		};
+		results[idx] = dpResult;
+		printDatapointResult(idx, datapoints.length, dpResult);
 	});
 	return {
 		results,
 		durationMs: Date.now() - startTime
 	};
 }
-function printSummary(result) {
-	const lines = [];
-	lines.push("");
-	lines.push(` ${result.name}`);
-	lines.push("");
-	const completed = result.count - result.errors;
-	lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? `  ✗ ${result.errors} errors` : ""}`);
-	lines.push("");
-	lines.push(" Scores:");
-	for (const [name, stats] of Object.entries(result.scores)) lines.push(`   ${name.padEnd(16)} mean=${stats.mean.toFixed(2)}  min=${stats.min.toFixed(2)}  max=${stats.max.toFixed(2)}  median=${stats.median.toFixed(2)}`);
-	lines.push("");
-	lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
-	lines.push(` Run ID:   ${result.runId}`);
-	lines.push("");
-	process.stderr.write(lines.join("\n"));
-}
-function saveResult(result, outputDir) {
-	const dir = join(outputDir, result.name);
-	mkdirSync(dir, { recursive: true });
-	writeFileSync(join(dir, `${result.runId}.json`), JSON.stringify(result, null, 2));
-}
 async function evaluate(options) {
 	const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
 	const runId = randomUUID();
@@ -121,7 +170,7 @@ async function evaluate(options) {
 	if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
 	const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
 	if (executor) {
-		const { results, durationMs } = await runSingleExecutor(dataset, executor, evaluators, concurrency);
+		const { results, durationMs } = await runSingleExecutor(name, dataset, executor, evaluators, concurrency);
 		const scoreNames = /* @__PURE__ */ new Set();
 		for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
 		const scores = {};
@@ -137,7 +186,7 @@ async function evaluate(options) {
 			metadata,
 			results
 		};
-		if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(result, null, 2));
+		if (isSilent) process.stdout.write(JSON.stringify(result, null, 2));
 		else printSummary(result);
 		saveResult(result, outputDir);
 		return result;
@@ -145,7 +194,7 @@ async function evaluate(options) {
 	const variantResults = {};
 	const totalStart = Date.now();
 	for (const [variantName, variantExecutor] of Object.entries(variants)) {
-		const { results, durationMs } = await runSingleExecutor(dataset, variantExecutor, evaluators, concurrency);
+		const { results, durationMs } = await runSingleExecutor(`${name}/${variantName}`, dataset, variantExecutor, evaluators, concurrency);
 		const scoreNames = /* @__PURE__ */ new Set();
 		for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
 		const scores = {};
@@ -162,7 +211,7 @@ async function evaluate(options) {
 			results
 		};
 		variantResults[variantName] = variantResult;
-		if (process.env.LLMOPS_EVAL_OUTPUT !== "json") printSummary(variantResult);
+		if (!isSilent) printSummary(variantResult);
 		saveResult(variantResult, outputDir);
 	}
 	const variantEvalResult = {
@@ -173,48 +222,43 @@ async function evaluate(options) {
 		metadata,
 		variants: variantResults
 	};
-	if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
+	if (isSilent) process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
 	return variantEvalResult;
 }
 //#endregion
 //#region src/eval/compare.ts
 /**
-* Load an eval run from the filesystem.
+* Load an eval result from a JSON file.
 */
-function loadRun(outputDir, name, runId) {
-	const dir = join(outputDir, name);
-	const filePath = join(dir, `${runId}.json`);
+function loadResult(filePath) {
 	try {
 		const content = readFileSync(filePath, "utf-8");
 		return JSON.parse(content);
 	} catch {
-		try {
-			const match = readdirSync(dir).find((f) => f.startsWith(runId) && f.endsWith(".json"));
-			if (match) {
-				const content = readFileSync(join(dir, match), "utf-8");
-				return JSON.parse(content);
-			}
-		} catch {}
-		throw new Error(`Eval run "${runId}" not found for "${name}" in ${outputDir}. Expected file: ${filePath}`);
+		throw new Error(`Could not read eval result: ${filePath}`);
 	}
 }
 /**
-* Compare two eval runs. First run ID is the baseline.
+* Compare two eval result files. First file is the baseline.
 *
-* Usage:
+* Usage with version control:
+* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
+* 2. Commit the file
+* 3. Make changes, re-run eval
+* 4. Compare: git stash the new result, compare old vs new
+*
+* Or compare two named eval files:
 * ```ts
 * const diff = await compare({
-*   name: 'support-bot',
-*   runs: [run1.runId, run2.runId],
+*   files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
 * })
 * ```
 */
 async function compare(options) {
-	const { runs, name, outputDir = "./llmops-evals" } = options;
-	if (runs.length < 2) throw new Error("compare() requires at least 2 run IDs");
-	const baselineRun = loadRun(outputDir, name, runs[0]);
-	const candidateRun = loadRun(outputDir, name, runs[1]);
+	const { files } = options;
+	const baselineRun = loadResult(files[0]);
+	const candidateRun = loadResult(files[1]);
 	const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
 	const scores = {};
 	for (const scoreName of allScoreNames) {
@@ -251,112 +295,165 @@ async function compare(options) {
 		}
 	}
 	const result = {
-		baseline: runs[0],
-		candidate: runs[1],
+		baseline: baselineRun.runId,
+		candidate: candidateRun.runId,
 		scores,
 		regressions,
 		improvements
 	};
-	const lines = [];
-	lines.push("");
-	lines.push(` compare: ${runs[0].slice(0, 8)} → ${runs[1].slice(0, 8)}`);
-	lines.push("");
-	lines.push(" Scores:");
-	for (const [scoreName, delta] of Object.entries(scores)) {
-		const sign = delta.delta >= 0 ? "+" : "";
-		const marker = delta.delta >= 0 ? "✓" : "✗";
-		lines.push(`   ${scoreName.padEnd(16)} ${delta.baseline.toFixed(2)} → ${delta.candidate.toFixed(2)}  (${sign}${delta.delta.toFixed(2)}) ${marker}`);
+	const w$1 = process.stderr;
+	const RESET$1 = "\x1B[0m";
+	const DIM$1 = "\x1B[2m";
+	const BOLD$1 = "\x1B[1m";
+	const GREEN$1 = "\x1B[32m";
+	const RED$1 = "\x1B[31m";
+	const CYAN$1 = "\x1B[36m";
+	w$1.write("\n");
+	w$1.write(`  ${BOLD$1}Compare${RESET$1}  ${DIM$1}${baselineRun.name} → ${candidateRun.name}${RESET$1}\n`);
+	w$1.write(`  ${DIM$1}${"─".repeat(50)}${RESET$1}\n\n`);
+	const scoreEntries = Object.entries(scores);
+	if (scoreEntries.length > 0) {
+		const maxNameLen = Math.max(...scoreEntries.map(([n]) => n.length), 10);
+		w$1.write(`  ${DIM$1}${"Evaluator".padEnd(maxNameLen)}  ${"Base".padStart(6)}    ${"New".padStart(6)}  ${"Delta".padStart(7)}${RESET$1}\n`);
+		w$1.write(`  ${DIM$1}${"─".repeat(maxNameLen + 30)}${RESET$1}\n`);
+		for (const [scoreName, delta] of scoreEntries) {
+			const sign = delta.delta >= 0 ? "+" : "";
+			const color = delta.delta >= 0 ? GREEN$1 : RED$1;
+			const icon = delta.delta > 0 ? "▲" : delta.delta < 0 ? "▼" : "=";
+			w$1.write(`  ${scoreName.padEnd(maxNameLen)}  ${delta.baseline.toFixed(2).padStart(6)}  ${DIM$1}→${RESET$1}  ${delta.candidate.toFixed(2).padStart(6)}  ${color}${sign}${delta.delta.toFixed(2).padStart(5)} ${icon}${RESET$1}\n`);
+		}
+		w$1.write("\n");
 	}
 	if (regressions.length > 0) {
-		lines.push("");
-		lines.push(` Regressions (${regressions.length}):`);
+		w$1.write(`  ${RED$1}▼ ${regressions.length} regression${regressions.length > 1 ? "s" : ""}${RESET$1}\n`);
 		for (const r of regressions.slice(0, 5)) {
-			const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 60);
-			lines.push(`   "${dataStr}"  ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${r.candidateScore.toFixed(2)}`);
+			const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 50);
+			w$1.write(`    ${DIM$1}${dataStr}${RESET$1}  ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${RED$1}${r.candidateScore.toFixed(2)}${RESET$1}\n`);
 		}
-		if (regressions.length > 5) lines.push(`   ... and ${regressions.length - 5} more`);
+		if (regressions.length > 5) w$1.write(`    ${DIM$1}... and ${regressions.length - 5} more${RESET$1}\n`);
+		w$1.write("\n");
 	}
 	if (improvements.length > 0) {
-		lines.push("");
-		lines.push(` Improvements (${improvements.length}):`);
+		w$1.write(`  ${GREEN$1}▲ ${improvements.length} improvement${improvements.length > 1 ? "s" : ""}${RESET$1}\n`);
 		for (const imp of improvements.slice(0, 5)) {
-			const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 60);
-			lines.push(`   "${dataStr}"  ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${imp.candidateScore.toFixed(2)}`);
+			const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 50);
+			w$1.write(`    ${DIM$1}${dataStr}${RESET$1}  ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${GREEN$1}${imp.candidateScore.toFixed(2)}${RESET$1}\n`);
 		}
-		if (improvements.length > 5) lines.push(`   ... and ${improvements.length - 5} more`);
+		if (improvements.length > 5) w$1.write(`    ${DIM$1}... and ${improvements.length - 5} more${RESET$1}\n`);
+		w$1.write("\n");
 	}
-	lines.push("");
-	process.stderr.write(lines.join("\n"));
+	if (regressions.length === 0 && improvements.length === 0) w$1.write(`  ${CYAN$1}No changes between runs${RESET$1}\n\n`);
 	return result;
 }
 //#endregion
 //#region src/eval/judge.ts
-/**
-* Simple mustache-style template interpolation.
-*/
 function interpolate(template, vars) {
 	return template.replace(/\{\{(\w+(?:\.\w+)*)\}\}/g, (_, path) => {
 		const value = path.split(".").reduce((obj, key) => obj?.[key], vars);
-		return typeof value === "string" ? value : JSON.stringify(value);
+		if (value === void 0 || value === null) return "";
+		return typeof value === "string" ? value : JSON.stringify(value, null, 2);
 	});
 }
-/**
-* Default parser: expects JSON with a `score` field, a bare number,
-* or an object of number values (multi-score).
-*/
+function buildVars(output, target, data) {
+	const vars = {
+		output: typeof output === "string" ? output : JSON.stringify(output, null, 2),
+		target: typeof target === "string" ? target : JSON.stringify(target, null, 2),
+		data: typeof data === "string" ? data : JSON.stringify(data, null, 2)
+	};
+	if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
+	if (data && typeof data === "object") for (const [k, v] of Object.entries(data)) vars[`data.${k}`] = v;
+	return vars;
+}
+const DEFAULT_SYSTEM = `You are an expert evaluator. Your job is to grade an AI system's output.
+Instructions:
+- Read the grading criteria in the user message carefully.
+- Evaluate the output objectively.
+- Return ONLY valid JSON. No markdown, no explanation outside the JSON.
+- The JSON must contain a "score" field with a number between 0.0 and 1.0.
+- You may optionally include a "reasoning" field with a brief explanation.
+Example response:
+{"score": 0.85, "reasoning": "The response is mostly accurate but misses one detail."}`;
 function defaultParse(response) {
-	const cleaned = response.replace(/```json\n?|```/g, "").trim();
+	let cleaned = response.replace(/```(?:json)?\n?/g, "").replace(/```$/g, "").trim();
+	const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
+	if (jsonMatch) cleaned = jsonMatch[0];
 	const parsed = JSON.parse(cleaned);
-	if (typeof parsed === "number") return parsed;
-	if (typeof parsed?.score === "number") return parsed.score;
+	if (typeof parsed?.score === "number") return Math.max(0, Math.min(1, parsed.score));
+	if (typeof parsed === "number") return Math.max(0, Math.min(1, parsed));
 	if (typeof parsed === "object" && parsed !== null) {
-		const entries = Object.entries(parsed).filter(([, v]) => typeof v === "number");
-		if (entries.length > 0) return Object.fromEntries(entries);
+		const entries = Object.entries(parsed).filter(([k, v]) => typeof v === "number" && k !== "reasoning");
+		if (entries.length > 0) return Object.fromEntries(entries.map(([k, v]) => [k, Math.max(0, Math.min(1, v))]));
 	}
-	throw new Error(`Could not extract score from judge response: ${response.slice(0, 200)}`);
+	throw new Error(`Could not extract score from judge response: ${response.slice(0, 300)}`);
+}
+async function callLLM(client, model, system, userMessage, temperature) {
+	const providerConfig = client.provider();
+	const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
+		method: "POST",
+		headers: {
+			"Content-Type": "application/json",
+			Authorization: `Bearer ${providerConfig.apiKey}`
+		},
+		body: JSON.stringify({
+			model,
+			temperature,
+			messages: [{
+				role: "system",
+				content: system
+			}, {
+				role: "user",
+				content: userMessage
+			}]
+		})
+	});
+	if (!response.ok) {
+		const errorBody = await response.text().catch(() => "unknown error");
+		throw new Error(`Judge LLM call failed (${response.status}): ${errorBody.slice(0, 300)}`);
+	}
+	const content = (await response.json()).choices?.[0]?.message?.content;
+	if (!content) throw new Error("Judge LLM returned empty response");
+	return content;
 }
 /**
 * Factory that returns an Evaluator which uses an LLM to score output.
 *
+* The judge:
+* - Uses a system message that instructs the LLM to return JSON scores
+* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
+* - Uses temperature 0 by default for deterministic scoring
+* - Retries on parse failure (configurable)
+* - Clamps scores to [0, 1]
+*
 * Usage:
 * ```ts
+* import { llmops } from '@llmops/sdk'
+*
+* const client = llmops()
 * const accuracy = judgeScorer({
 *   model: '@openai/gpt-4o',
-*   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
-*   ops,
+*   prompt: `Rate the accuracy of this response.
+* Expected: {{target.answer}}
+* Actual: {{output}}`,
+*   client,
 * })
 * ```
 */
 function judgeScorer(options) {
-	const { model, prompt, ops, parse = defaultParse } = options;
-	return async (output, target) => {
-		const vars = {
-			output: typeof output === "string" ? output : JSON.stringify(output),
-			target
-		};
-		if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
-		const renderedPrompt = interpolate(prompt, vars);
-		const providerConfig = ops.provider();
-		const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
-			method: "POST",
-			headers: {
-				"Content-Type": "application/json",
-				Authorization: `Bearer ${providerConfig.apiKey}`
-			},
-			body: JSON.stringify({
-				model,
-				messages: [{
-					role: "user",
-					content: renderedPrompt
-				}],
-				response_format: { type: "json_object" }
-			})
-		});
-		if (!response.ok) throw new Error(`Judge LLM call failed: ${response.status} ${await response.text()}`);
-		const content = (await response.json()).choices?.[0]?.message?.content;
-		if (!content) throw new Error("Judge LLM returned empty response");
-		return parse(content);
+	const { model, prompt, client, system = DEFAULT_SYSTEM, temperature = 0, maxRetries = 1, parse = defaultParse } = options;
+	return async (output, target, data) => {
+		const userMessage = interpolate(prompt, buildVars(output, target, data));
+		let lastError = null;
+		const attempts = 1 + maxRetries;
+		for (let attempt = 0; attempt < attempts; attempt++) try {
+			return parse(await callLLM(client, model, system, userMessage, temperature));
+		} catch (err) {
+			lastError = err instanceof Error ? err : new Error(String(err));
+			if (lastError.message.includes("Judge LLM call failed")) throw lastError;
+		}
+		throw lastError ?? /* @__PURE__ */ new Error("Judge scoring failed");
 	};
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@llmops/sdk",
-  "version": "1.0.0-beta.22",
+  "version": "1.0.0-beta.23",
   "description": "An LLMOps toolkit for TypeScript applications",
   "type": "module",
   "license": "Apache-2.0",
@@ -134,8 +134,8 @@
     "access": "public"
   },
   "dependencies": {
-    "@llmops/app": "^1.0.0-beta.22",
-    "@llmops/core": "^1.0.0-beta.22"
+    "@llmops/app": "^1.0.0-beta.23",
+    "@llmops/core": "^1.0.0-beta.23"
   },
   "peerDependencies": {
     "pg": "*",