@llmops/sdk 1.0.0-beta.22 → 1.0.0-beta.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval.cjs +217 -120
- package/dist/eval.d.cts +56 -16
- package/dist/eval.d.mts +56 -16
- package/dist/eval.mjs +218 -121
- package/package.json +3 -3
package/dist/eval.cjs
CHANGED
|
@@ -23,6 +23,13 @@ var InlineDataset = class {
|
|
|
23
23
|
|
|
24
24
|
//#endregion
|
|
25
25
|
//#region src/eval/evaluate.ts
|
|
26
|
+
const RESET = "\x1B[0m";
|
|
27
|
+
const DIM = "\x1B[2m";
|
|
28
|
+
const BOLD = "\x1B[1m";
|
|
29
|
+
const CYAN = "\x1B[36m";
|
|
30
|
+
const GREEN = "\x1B[32m";
|
|
31
|
+
const RED = "\x1B[31m";
|
|
32
|
+
const YELLOW = "\x1B[33m";
|
|
26
33
|
async function pool(items, concurrency, fn) {
|
|
27
34
|
const executing = [];
|
|
28
35
|
for (const item of items) {
|
|
@@ -55,11 +62,70 @@ function computeStats(values) {
|
|
|
55
62
|
count: sorted.length
|
|
56
63
|
};
|
|
57
64
|
}
|
|
58
|
-
|
|
65
|
+
const isSilent = process.env.LLMOPS_EVAL_OUTPUT === "json";
|
|
66
|
+
const w = process.stderr;
|
|
67
|
+
function printHeader(name, total) {
|
|
68
|
+
if (isSilent) return;
|
|
69
|
+
w.write("\n");
|
|
70
|
+
w.write(` ${BOLD}${name}${RESET} ${DIM}(${total} datapoints)${RESET}\n`);
|
|
71
|
+
w.write(` ${DIM}${"─".repeat(50)}${RESET}\n`);
|
|
72
|
+
}
|
|
73
|
+
function printDatapointResult(idx, total, dp) {
|
|
74
|
+
if (isSilent) return;
|
|
75
|
+
const label = typeof dp.data === "object" && dp.data !== null ? JSON.stringify(dp.data).slice(0, 50) : String(dp.data).slice(0, 50);
|
|
76
|
+
if (dp.error) {
|
|
77
|
+
w.write(` ${RED}✗${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label} ${RED}ERROR${RESET} ${DIM}${dp.error.slice(0, 60)}${RESET}\n`);
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
const scoreStr = Object.entries(dp.scores).map(([name, val]) => {
|
|
81
|
+
if (Number.isNaN(val)) return `${DIM}${name}=NaN${RESET}`;
|
|
82
|
+
return `${val >= .8 ? GREEN : val >= .5 ? YELLOW : RED}${name}=${val.toFixed(2)}${RESET}`;
|
|
83
|
+
}).join(" ");
|
|
84
|
+
w.write(` ${GREEN}✓${RESET} ${DIM}[${idx + 1}/${total}]${RESET} ${label} ${scoreStr} ${DIM}${dp.durationMs}ms${RESET}\n`);
|
|
85
|
+
}
|
|
86
|
+
function scoreBar(score, width = 20) {
|
|
87
|
+
const filled = Math.round(score * width);
|
|
88
|
+
const empty = width - filled;
|
|
89
|
+
return "█".repeat(filled) + "░".repeat(empty);
|
|
90
|
+
}
|
|
91
|
+
function scoreColor(score) {
|
|
92
|
+
if (score >= .8) return GREEN;
|
|
93
|
+
if (score >= .5) return YELLOW;
|
|
94
|
+
return RED;
|
|
95
|
+
}
|
|
96
|
+
function printSummary(result) {
|
|
97
|
+
if (isSilent) return;
|
|
98
|
+
w.write("\n");
|
|
99
|
+
const entries = Object.entries(result.scores);
|
|
100
|
+
if (entries.length > 0) {
|
|
101
|
+
const maxNameLen = Math.max(...entries.map(([n]) => n.length), 10);
|
|
102
|
+
w.write(` ${DIM}${"Evaluator".padEnd(maxNameLen)} ${"Mean".padStart(6)} ${"Bar".padEnd(20)} ${"Min".padStart(5)} ${"Max".padStart(5)} ${"Med".padStart(5)}${RESET}\n`);
|
|
103
|
+
w.write(` ${DIM}${"─".repeat(maxNameLen + 50)}${RESET}\n`);
|
|
104
|
+
for (const [name, stats] of entries) {
|
|
105
|
+
const color = scoreColor(stats.mean);
|
|
106
|
+
const bar = scoreBar(stats.mean);
|
|
107
|
+
w.write(` ${name.padEnd(maxNameLen)} ${color}${stats.mean.toFixed(2).padStart(6)}${RESET} ${DIM}${bar}${RESET} ${stats.min.toFixed(2).padStart(5)} ${stats.max.toFixed(2).padStart(5)} ${stats.median.toFixed(2).padStart(5)}\n`);
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
const completed = result.count - result.errors;
|
|
111
|
+
w.write("\n");
|
|
112
|
+
w.write(` ${DIM}Duration${RESET} ${(result.durationMs / 1e3).toFixed(1)}s`);
|
|
113
|
+
w.write(` ${DIM}Passed${RESET} ${completed}/${result.count}`);
|
|
114
|
+
if (result.errors > 0) w.write(` ${RED}Failed ${result.errors}${RESET}`);
|
|
115
|
+
w.write(` ${DIM}Run${RESET} ${CYAN}${result.runId.slice(0, 8)}${RESET}`);
|
|
116
|
+
w.write("\n\n");
|
|
117
|
+
}
|
|
118
|
+
function saveResult(result, outputDir) {
|
|
119
|
+
const dir = (0, node_path.join)(outputDir, result.name);
|
|
120
|
+
(0, node_fs.mkdirSync)(dir, { recursive: true });
|
|
121
|
+
(0, node_fs.writeFileSync)((0, node_path.join)(dir, `${Date.now()}.json`), JSON.stringify(result, null, 2));
|
|
122
|
+
}
|
|
123
|
+
async function runSingleExecutor(name, dataset, executor, evaluators, concurrency) {
|
|
59
124
|
const size = await dataset.size();
|
|
60
125
|
const datapoints = await dataset.slice(0, size);
|
|
61
126
|
const results = new Array(datapoints.length);
|
|
62
127
|
const startTime = Date.now();
|
|
128
|
+
printHeader(name, datapoints.length);
|
|
63
129
|
await pool(datapoints, concurrency, async (dp) => {
|
|
64
130
|
const idx = datapoints.indexOf(dp);
|
|
65
131
|
const dpStart = Date.now();
|
|
@@ -71,14 +137,16 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
|
|
|
71
137
|
} catch (err) {
|
|
72
138
|
error = err instanceof Error ? err.message : String(err);
|
|
73
139
|
}
|
|
74
|
-
if (!error && output !== null) for (const [
|
|
140
|
+
if (!error && output !== null) for (const [evalName, evaluator] of Object.entries(evaluators)) try {
|
|
75
141
|
const result = await evaluator(output, dp.target, dp.data);
|
|
76
|
-
if (typeof result === "number") scores[
|
|
77
|
-
else for (const [subKey, subScore] of Object.entries(result)) scores[`${
|
|
78
|
-
} catch {
|
|
79
|
-
scores[
|
|
142
|
+
if (typeof result === "number") scores[evalName] = result;
|
|
143
|
+
else for (const [subKey, subScore] of Object.entries(result)) scores[`${evalName}.${subKey}`] = subScore;
|
|
144
|
+
} catch (evalErr) {
|
|
145
|
+
scores[evalName] = NaN;
|
|
146
|
+
const msg = evalErr instanceof Error ? evalErr.message : String(evalErr);
|
|
147
|
+
if (!isSilent) w.write(` ${YELLOW}⚠${RESET} ${DIM}evaluator "${evalName}":${RESET} ${msg.slice(0, 80)}\n`);
|
|
80
148
|
}
|
|
81
|
-
|
|
149
|
+
const dpResult = {
|
|
82
150
|
data: dp.data,
|
|
83
151
|
target: dp.target,
|
|
84
152
|
metadata: dp.metadata,
|
|
@@ -87,33 +155,14 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
|
|
|
87
155
|
durationMs: Date.now() - dpStart,
|
|
88
156
|
error
|
|
89
157
|
};
|
|
158
|
+
results[idx] = dpResult;
|
|
159
|
+
printDatapointResult(idx, datapoints.length, dpResult);
|
|
90
160
|
});
|
|
91
161
|
return {
|
|
92
162
|
results,
|
|
93
163
|
durationMs: Date.now() - startTime
|
|
94
164
|
};
|
|
95
165
|
}
|
|
96
|
-
function printSummary(result) {
|
|
97
|
-
const lines = [];
|
|
98
|
-
lines.push("");
|
|
99
|
-
lines.push(` ${result.name}`);
|
|
100
|
-
lines.push("");
|
|
101
|
-
const completed = result.count - result.errors;
|
|
102
|
-
lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? ` ✗ ${result.errors} errors` : ""}`);
|
|
103
|
-
lines.push("");
|
|
104
|
-
lines.push(" Scores:");
|
|
105
|
-
for (const [name, stats] of Object.entries(result.scores)) lines.push(` ${name.padEnd(16)} mean=${stats.mean.toFixed(2)} min=${stats.min.toFixed(2)} max=${stats.max.toFixed(2)} median=${stats.median.toFixed(2)}`);
|
|
106
|
-
lines.push("");
|
|
107
|
-
lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
|
|
108
|
-
lines.push(` Run ID: ${result.runId}`);
|
|
109
|
-
lines.push("");
|
|
110
|
-
process.stderr.write(lines.join("\n"));
|
|
111
|
-
}
|
|
112
|
-
function saveResult(result, outputDir) {
|
|
113
|
-
const dir = (0, node_path.join)(outputDir, result.name);
|
|
114
|
-
(0, node_fs.mkdirSync)(dir, { recursive: true });
|
|
115
|
-
(0, node_fs.writeFileSync)((0, node_path.join)(dir, `${result.runId}.json`), JSON.stringify(result, null, 2));
|
|
116
|
-
}
|
|
117
166
|
async function evaluate(options) {
|
|
118
167
|
const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
|
|
119
168
|
const runId = (0, node_crypto.randomUUID)();
|
|
@@ -121,7 +170,7 @@ async function evaluate(options) {
|
|
|
121
170
|
if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
|
|
122
171
|
const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
|
|
123
172
|
if (executor) {
|
|
124
|
-
const { results, durationMs } = await runSingleExecutor(dataset, executor, evaluators, concurrency);
|
|
173
|
+
const { results, durationMs } = await runSingleExecutor(name, dataset, executor, evaluators, concurrency);
|
|
125
174
|
const scoreNames = /* @__PURE__ */ new Set();
|
|
126
175
|
for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
|
|
127
176
|
const scores = {};
|
|
@@ -137,7 +186,7 @@ async function evaluate(options) {
|
|
|
137
186
|
metadata,
|
|
138
187
|
results
|
|
139
188
|
};
|
|
140
|
-
if (
|
|
189
|
+
if (isSilent) process.stdout.write(JSON.stringify(result, null, 2));
|
|
141
190
|
else printSummary(result);
|
|
142
191
|
saveResult(result, outputDir);
|
|
143
192
|
return result;
|
|
@@ -145,7 +194,7 @@ async function evaluate(options) {
|
|
|
145
194
|
const variantResults = {};
|
|
146
195
|
const totalStart = Date.now();
|
|
147
196
|
for (const [variantName, variantExecutor] of Object.entries(variants)) {
|
|
148
|
-
const { results, durationMs } = await runSingleExecutor(dataset, variantExecutor, evaluators, concurrency);
|
|
197
|
+
const { results, durationMs } = await runSingleExecutor(`${name}/${variantName}`, dataset, variantExecutor, evaluators, concurrency);
|
|
149
198
|
const scoreNames = /* @__PURE__ */ new Set();
|
|
150
199
|
for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
|
|
151
200
|
const scores = {};
|
|
@@ -162,7 +211,7 @@ async function evaluate(options) {
|
|
|
162
211
|
results
|
|
163
212
|
};
|
|
164
213
|
variantResults[variantName] = variantResult;
|
|
165
|
-
if (
|
|
214
|
+
if (!isSilent) printSummary(variantResult);
|
|
166
215
|
saveResult(variantResult, outputDir);
|
|
167
216
|
}
|
|
168
217
|
const variantEvalResult = {
|
|
@@ -173,48 +222,43 @@ async function evaluate(options) {
|
|
|
173
222
|
metadata,
|
|
174
223
|
variants: variantResults
|
|
175
224
|
};
|
|
176
|
-
if (
|
|
225
|
+
if (isSilent) process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
|
|
177
226
|
return variantEvalResult;
|
|
178
227
|
}
|
|
179
228
|
|
|
180
229
|
//#endregion
|
|
181
230
|
//#region src/eval/compare.ts
|
|
182
231
|
/**
|
|
183
|
-
* Load an eval
|
|
232
|
+
* Load an eval result from a JSON file.
|
|
184
233
|
*/
|
|
185
|
-
function
|
|
186
|
-
const dir = (0, node_path.join)(outputDir, name);
|
|
187
|
-
const filePath = (0, node_path.join)(dir, `${runId}.json`);
|
|
234
|
+
function loadResult(filePath) {
|
|
188
235
|
try {
|
|
189
236
|
const content = (0, node_fs.readFileSync)(filePath, "utf-8");
|
|
190
237
|
return JSON.parse(content);
|
|
191
238
|
} catch {
|
|
192
|
-
|
|
193
|
-
const match = (0, node_fs.readdirSync)(dir).find((f) => f.startsWith(runId) && f.endsWith(".json"));
|
|
194
|
-
if (match) {
|
|
195
|
-
const content = (0, node_fs.readFileSync)((0, node_path.join)(dir, match), "utf-8");
|
|
196
|
-
return JSON.parse(content);
|
|
197
|
-
}
|
|
198
|
-
} catch {}
|
|
199
|
-
throw new Error(`Eval run "${runId}" not found for "${name}" in ${outputDir}. Expected file: ${filePath}`);
|
|
239
|
+
throw new Error(`Could not read eval result: ${filePath}`);
|
|
200
240
|
}
|
|
201
241
|
}
|
|
202
242
|
/**
|
|
203
|
-
* Compare two eval
|
|
243
|
+
* Compare two eval result files. First file is the baseline.
|
|
204
244
|
*
|
|
205
|
-
* Usage:
|
|
245
|
+
* Usage with version control:
|
|
246
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
247
|
+
* 2. Commit the file
|
|
248
|
+
* 3. Make changes, re-run eval
|
|
249
|
+
* 4. Compare: git stash the new result, compare old vs new
|
|
250
|
+
*
|
|
251
|
+
* Or compare two named eval files:
|
|
206
252
|
* ```ts
|
|
207
253
|
* const diff = await compare({
|
|
208
|
-
*
|
|
209
|
-
* runs: [run1.runId, run2.runId],
|
|
254
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
210
255
|
* })
|
|
211
256
|
* ```
|
|
212
257
|
*/
|
|
213
258
|
async function compare(options) {
|
|
214
|
-
const {
|
|
215
|
-
|
|
216
|
-
const
|
|
217
|
-
const candidateRun = loadRun(outputDir, name, runs[1]);
|
|
259
|
+
const { files } = options;
|
|
260
|
+
const baselineRun = loadResult(files[0]);
|
|
261
|
+
const candidateRun = loadResult(files[1]);
|
|
218
262
|
const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
|
|
219
263
|
const scores = {};
|
|
220
264
|
for (const scoreName of allScoreNames) {
|
|
@@ -251,112 +295,165 @@ async function compare(options) {
|
|
|
251
295
|
}
|
|
252
296
|
}
|
|
253
297
|
const result = {
|
|
254
|
-
baseline:
|
|
255
|
-
candidate:
|
|
298
|
+
baseline: baselineRun.runId,
|
|
299
|
+
candidate: candidateRun.runId,
|
|
256
300
|
scores,
|
|
257
301
|
regressions,
|
|
258
302
|
improvements
|
|
259
303
|
};
|
|
260
|
-
const
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
304
|
+
const w$1 = process.stderr;
|
|
305
|
+
const RESET$1 = "\x1B[0m";
|
|
306
|
+
const DIM$1 = "\x1B[2m";
|
|
307
|
+
const BOLD$1 = "\x1B[1m";
|
|
308
|
+
const GREEN$1 = "\x1B[32m";
|
|
309
|
+
const RED$1 = "\x1B[31m";
|
|
310
|
+
const CYAN$1 = "\x1B[36m";
|
|
311
|
+
w$1.write("\n");
|
|
312
|
+
w$1.write(` ${BOLD$1}Compare${RESET$1} ${DIM$1}${baselineRun.name} → ${candidateRun.name}${RESET$1}\n`);
|
|
313
|
+
w$1.write(` ${DIM$1}${"─".repeat(50)}${RESET$1}\n\n`);
|
|
314
|
+
const scoreEntries = Object.entries(scores);
|
|
315
|
+
if (scoreEntries.length > 0) {
|
|
316
|
+
const maxNameLen = Math.max(...scoreEntries.map(([n]) => n.length), 10);
|
|
317
|
+
w$1.write(` ${DIM$1}${"Evaluator".padEnd(maxNameLen)} ${"Base".padStart(6)} ${"New".padStart(6)} ${"Delta".padStart(7)}${RESET$1}\n`);
|
|
318
|
+
w$1.write(` ${DIM$1}${"─".repeat(maxNameLen + 30)}${RESET$1}\n`);
|
|
319
|
+
for (const [scoreName, delta] of scoreEntries) {
|
|
320
|
+
const sign = delta.delta >= 0 ? "+" : "";
|
|
321
|
+
const color = delta.delta >= 0 ? GREEN$1 : RED$1;
|
|
322
|
+
const icon = delta.delta > 0 ? "▲" : delta.delta < 0 ? "▼" : "=";
|
|
323
|
+
w$1.write(` ${scoreName.padEnd(maxNameLen)} ${delta.baseline.toFixed(2).padStart(6)} ${DIM$1}→${RESET$1} ${delta.candidate.toFixed(2).padStart(6)} ${color}${sign}${delta.delta.toFixed(2).padStart(5)} ${icon}${RESET$1}\n`);
|
|
324
|
+
}
|
|
325
|
+
w$1.write("\n");
|
|
269
326
|
}
|
|
270
327
|
if (regressions.length > 0) {
|
|
271
|
-
|
|
272
|
-
lines.push(` Regressions (${regressions.length}):`);
|
|
328
|
+
w$1.write(` ${RED$1}▼ ${regressions.length} regression${regressions.length > 1 ? "s" : ""}${RESET$1}\n`);
|
|
273
329
|
for (const r of regressions.slice(0, 5)) {
|
|
274
|
-
const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0,
|
|
275
|
-
|
|
330
|
+
const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 50);
|
|
331
|
+
w$1.write(` ${DIM$1}${dataStr}${RESET$1} ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${RED$1}${r.candidateScore.toFixed(2)}${RESET$1}\n`);
|
|
276
332
|
}
|
|
277
|
-
if (regressions.length > 5)
|
|
333
|
+
if (regressions.length > 5) w$1.write(` ${DIM$1}... and ${regressions.length - 5} more${RESET$1}\n`);
|
|
334
|
+
w$1.write("\n");
|
|
278
335
|
}
|
|
279
336
|
if (improvements.length > 0) {
|
|
280
|
-
|
|
281
|
-
lines.push(` Improvements (${improvements.length}):`);
|
|
337
|
+
w$1.write(` ${GREEN$1}▲ ${improvements.length} improvement${improvements.length > 1 ? "s" : ""}${RESET$1}\n`);
|
|
282
338
|
for (const imp of improvements.slice(0, 5)) {
|
|
283
|
-
const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0,
|
|
284
|
-
|
|
339
|
+
const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 50);
|
|
340
|
+
w$1.write(` ${DIM$1}${dataStr}${RESET$1} ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${GREEN$1}${imp.candidateScore.toFixed(2)}${RESET$1}\n`);
|
|
285
341
|
}
|
|
286
|
-
if (improvements.length > 5)
|
|
342
|
+
if (improvements.length > 5) w$1.write(` ${DIM$1}... and ${improvements.length - 5} more${RESET$1}\n`);
|
|
343
|
+
w$1.write("\n");
|
|
287
344
|
}
|
|
288
|
-
|
|
289
|
-
process.stderr.write(lines.join("\n"));
|
|
345
|
+
if (regressions.length === 0 && improvements.length === 0) w$1.write(` ${CYAN$1}No changes between runs${RESET$1}\n\n`);
|
|
290
346
|
return result;
|
|
291
347
|
}
|
|
292
348
|
|
|
293
349
|
//#endregion
|
|
294
350
|
//#region src/eval/judge.ts
|
|
295
|
-
/**
|
|
296
|
-
* Simple mustache-style template interpolation.
|
|
297
|
-
*/
|
|
298
351
|
function interpolate(template, vars) {
|
|
299
352
|
return template.replace(/\{\{(\w+(?:\.\w+)*)\}\}/g, (_, path) => {
|
|
300
353
|
const value = path.split(".").reduce((obj, key) => obj?.[key], vars);
|
|
301
|
-
|
|
354
|
+
if (value === void 0 || value === null) return "";
|
|
355
|
+
return typeof value === "string" ? value : JSON.stringify(value, null, 2);
|
|
302
356
|
});
|
|
303
357
|
}
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
358
|
+
function buildVars(output, target, data) {
|
|
359
|
+
const vars = {
|
|
360
|
+
output: typeof output === "string" ? output : JSON.stringify(output, null, 2),
|
|
361
|
+
target: typeof target === "string" ? target : JSON.stringify(target, null, 2),
|
|
362
|
+
data: typeof data === "string" ? data : JSON.stringify(data, null, 2)
|
|
363
|
+
};
|
|
364
|
+
if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
|
|
365
|
+
if (data && typeof data === "object") for (const [k, v] of Object.entries(data)) vars[`data.${k}`] = v;
|
|
366
|
+
return vars;
|
|
367
|
+
}
|
|
368
|
+
const DEFAULT_SYSTEM = `You are an expert evaluator. Your job is to grade an AI system's output.
|
|
369
|
+
|
|
370
|
+
Instructions:
|
|
371
|
+
- Read the grading criteria in the user message carefully.
|
|
372
|
+
- Evaluate the output objectively.
|
|
373
|
+
- Return ONLY valid JSON. No markdown, no explanation outside the JSON.
|
|
374
|
+
- The JSON must contain a "score" field with a number between 0.0 and 1.0.
|
|
375
|
+
- You may optionally include a "reasoning" field with a brief explanation.
|
|
376
|
+
|
|
377
|
+
Example response:
|
|
378
|
+
{"score": 0.85, "reasoning": "The response is mostly accurate but misses one detail."}`;
|
|
308
379
|
function defaultParse(response) {
|
|
309
|
-
|
|
380
|
+
let cleaned = response.replace(/```(?:json)?\n?/g, "").replace(/```$/g, "").trim();
|
|
381
|
+
const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
|
|
382
|
+
if (jsonMatch) cleaned = jsonMatch[0];
|
|
310
383
|
const parsed = JSON.parse(cleaned);
|
|
311
|
-
if (typeof parsed === "number") return parsed;
|
|
312
|
-
if (typeof parsed
|
|
384
|
+
if (typeof parsed?.score === "number") return Math.max(0, Math.min(1, parsed.score));
|
|
385
|
+
if (typeof parsed === "number") return Math.max(0, Math.min(1, parsed));
|
|
313
386
|
if (typeof parsed === "object" && parsed !== null) {
|
|
314
|
-
const entries = Object.entries(parsed).filter(([, v]) => typeof v === "number");
|
|
315
|
-
if (entries.length > 0) return Object.fromEntries(entries);
|
|
387
|
+
const entries = Object.entries(parsed).filter(([k, v]) => typeof v === "number" && k !== "reasoning");
|
|
388
|
+
if (entries.length > 0) return Object.fromEntries(entries.map(([k, v]) => [k, Math.max(0, Math.min(1, v))]));
|
|
316
389
|
}
|
|
317
|
-
throw new Error(`Could not extract score from judge response: ${response.slice(0,
|
|
390
|
+
throw new Error(`Could not extract score from judge response: ${response.slice(0, 300)}`);
|
|
391
|
+
}
|
|
392
|
+
async function callLLM(client, model, system, userMessage, temperature) {
|
|
393
|
+
const providerConfig = client.provider();
|
|
394
|
+
const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
|
|
395
|
+
method: "POST",
|
|
396
|
+
headers: {
|
|
397
|
+
"Content-Type": "application/json",
|
|
398
|
+
Authorization: `Bearer ${providerConfig.apiKey}`
|
|
399
|
+
},
|
|
400
|
+
body: JSON.stringify({
|
|
401
|
+
model,
|
|
402
|
+
temperature,
|
|
403
|
+
messages: [{
|
|
404
|
+
role: "system",
|
|
405
|
+
content: system
|
|
406
|
+
}, {
|
|
407
|
+
role: "user",
|
|
408
|
+
content: userMessage
|
|
409
|
+
}]
|
|
410
|
+
})
|
|
411
|
+
});
|
|
412
|
+
if (!response.ok) {
|
|
413
|
+
const errorBody = await response.text().catch(() => "unknown error");
|
|
414
|
+
throw new Error(`Judge LLM call failed (${response.status}): ${errorBody.slice(0, 300)}`);
|
|
415
|
+
}
|
|
416
|
+
const content = (await response.json()).choices?.[0]?.message?.content;
|
|
417
|
+
if (!content) throw new Error("Judge LLM returned empty response");
|
|
418
|
+
return content;
|
|
318
419
|
}
|
|
319
420
|
/**
|
|
320
421
|
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
321
422
|
*
|
|
423
|
+
* The judge:
|
|
424
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
425
|
+
* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
|
|
426
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
427
|
+
* - Retries on parse failure (configurable)
|
|
428
|
+
* - Clamps scores to [0, 1]
|
|
429
|
+
*
|
|
322
430
|
* Usage:
|
|
323
431
|
* ```ts
|
|
432
|
+
* import { llmops } from '@llmops/sdk'
|
|
433
|
+
*
|
|
434
|
+
* const client = llmops()
|
|
324
435
|
* const accuracy = judgeScorer({
|
|
325
436
|
* model: '@openai/gpt-4o',
|
|
326
|
-
* prompt:
|
|
327
|
-
*
|
|
437
|
+
* prompt: `Rate the accuracy of this response.
|
|
438
|
+
* Expected: {{target.answer}}
|
|
439
|
+
* Actual: {{output}}`,
|
|
440
|
+
* client,
|
|
328
441
|
* })
|
|
329
442
|
* ```
|
|
330
443
|
*/
|
|
331
444
|
function judgeScorer(options) {
|
|
332
|
-
const { model, prompt,
|
|
333
|
-
return async (output, target) => {
|
|
334
|
-
const
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
"Content-Type": "application/json",
|
|
345
|
-
Authorization: `Bearer ${providerConfig.apiKey}`
|
|
346
|
-
},
|
|
347
|
-
body: JSON.stringify({
|
|
348
|
-
model,
|
|
349
|
-
messages: [{
|
|
350
|
-
role: "user",
|
|
351
|
-
content: renderedPrompt
|
|
352
|
-
}],
|
|
353
|
-
response_format: { type: "json_object" }
|
|
354
|
-
})
|
|
355
|
-
});
|
|
356
|
-
if (!response.ok) throw new Error(`Judge LLM call failed: ${response.status} ${await response.text()}`);
|
|
357
|
-
const content = (await response.json()).choices?.[0]?.message?.content;
|
|
358
|
-
if (!content) throw new Error("Judge LLM returned empty response");
|
|
359
|
-
return parse(content);
|
|
445
|
+
const { model, prompt, client, system = DEFAULT_SYSTEM, temperature = 0, maxRetries = 1, parse = defaultParse } = options;
|
|
446
|
+
return async (output, target, data) => {
|
|
447
|
+
const userMessage = interpolate(prompt, buildVars(output, target, data));
|
|
448
|
+
let lastError = null;
|
|
449
|
+
const attempts = 1 + maxRetries;
|
|
450
|
+
for (let attempt = 0; attempt < attempts; attempt++) try {
|
|
451
|
+
return parse(await callLLM(client, model, system, userMessage, temperature));
|
|
452
|
+
} catch (err) {
|
|
453
|
+
lastError = err instanceof Error ? err : new Error(String(err));
|
|
454
|
+
if (lastError.message.includes("Judge LLM call failed")) throw lastError;
|
|
455
|
+
}
|
|
456
|
+
throw lastError ?? /* @__PURE__ */ new Error("Judge scoring failed");
|
|
360
457
|
};
|
|
361
458
|
}
|
|
362
459
|
|
package/dist/eval.d.cts
CHANGED
|
@@ -116,12 +116,8 @@ interface VariantEvaluateResult<D = unknown, O = unknown> {
|
|
|
116
116
|
* Options for compare().
|
|
117
117
|
*/
|
|
118
118
|
interface CompareOptions {
|
|
119
|
-
/**
|
|
120
|
-
|
|
121
|
-
/** Directory where eval results are stored. Default: './llmops-evals' */
|
|
122
|
-
outputDir?: string;
|
|
123
|
-
/** Eval name to search within. Required. */
|
|
124
|
-
name: string;
|
|
119
|
+
/** Paths to eval result JSON files. First is baseline, second is candidate. */
|
|
120
|
+
files: [string, string];
|
|
125
121
|
}
|
|
126
122
|
/**
|
|
127
123
|
* Per-evaluator delta between two runs.
|
|
@@ -157,11 +153,38 @@ interface CompareResult {
|
|
|
157
153
|
interface JudgeScorerOptions {
|
|
158
154
|
/** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
|
|
159
155
|
model: string;
|
|
160
|
-
/**
|
|
156
|
+
/**
|
|
157
|
+
* Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
|
|
158
|
+
* {{data}}, {{data.*}} placeholders.
|
|
159
|
+
*
|
|
160
|
+
* This becomes the user message. A system message is added automatically
|
|
161
|
+
* that instructs the LLM to return a JSON score.
|
|
162
|
+
*/
|
|
161
163
|
prompt: string;
|
|
162
|
-
/**
|
|
163
|
-
|
|
164
|
-
|
|
164
|
+
/**
|
|
165
|
+
* The llmops client instance. The judge call is routed through the
|
|
166
|
+
* gateway and traced like any other LLM call.
|
|
167
|
+
*
|
|
168
|
+
* ```ts
|
|
169
|
+
* const client = llmops({ telemetry: pgStore(url) })
|
|
170
|
+
* judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
|
|
171
|
+
* ```
|
|
172
|
+
*/
|
|
173
|
+
client: LLMOpsClient;
|
|
174
|
+
/**
|
|
175
|
+
* Custom system message. Overrides the default grading instructions.
|
|
176
|
+
* If omitted, a default system message is used that instructs
|
|
177
|
+
* the LLM to return JSON with a "score" field (0-1).
|
|
178
|
+
*/
|
|
179
|
+
system?: string;
|
|
180
|
+
/** Temperature for the judge LLM. Default: 0 (deterministic). */
|
|
181
|
+
temperature?: number;
|
|
182
|
+
/** Max retries on parse failure. Default: 1. */
|
|
183
|
+
maxRetries?: number;
|
|
184
|
+
/**
|
|
185
|
+
* Custom parser for extracting score from LLM response.
|
|
186
|
+
* Default: expects JSON with a `score` field.
|
|
187
|
+
*/
|
|
165
188
|
parse?: (response: string) => number | Record<string, number>;
|
|
166
189
|
}
|
|
167
190
|
//#endregion
|
|
@@ -170,13 +193,18 @@ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknow
|
|
|
170
193
|
//#endregion
|
|
171
194
|
//#region src/eval/compare.d.ts
|
|
172
195
|
/**
|
|
173
|
-
* Compare two eval
|
|
196
|
+
* Compare two eval result files. First file is the baseline.
|
|
174
197
|
*
|
|
175
|
-
* Usage:
|
|
198
|
+
* Usage with version control:
|
|
199
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
200
|
+
* 2. Commit the file
|
|
201
|
+
* 3. Make changes, re-run eval
|
|
202
|
+
* 4. Compare: git stash the new result, compare old vs new
|
|
203
|
+
*
|
|
204
|
+
* Or compare two named eval files:
|
|
176
205
|
* ```ts
|
|
177
206
|
* const diff = await compare({
|
|
178
|
-
*
|
|
179
|
-
* runs: [run1.runId, run2.runId],
|
|
207
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
180
208
|
* })
|
|
181
209
|
* ```
|
|
182
210
|
*/
|
|
@@ -186,12 +214,24 @@ declare function compare(options: CompareOptions): Promise<CompareResult>;
|
|
|
186
214
|
/**
|
|
187
215
|
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
188
216
|
*
|
|
217
|
+
* The judge:
|
|
218
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
219
|
+
* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
|
|
220
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
221
|
+
* - Retries on parse failure (configurable)
|
|
222
|
+
* - Clamps scores to [0, 1]
|
|
223
|
+
*
|
|
189
224
|
* Usage:
|
|
190
225
|
* ```ts
|
|
226
|
+
* import { llmops } from '@llmops/sdk'
|
|
227
|
+
*
|
|
228
|
+
* const client = llmops()
|
|
191
229
|
* const accuracy = judgeScorer({
|
|
192
230
|
* model: '@openai/gpt-4o',
|
|
193
|
-
* prompt:
|
|
194
|
-
*
|
|
231
|
+
* prompt: `Rate the accuracy of this response.
|
|
232
|
+
* Expected: {{target.answer}}
|
|
233
|
+
* Actual: {{output}}`,
|
|
234
|
+
* client,
|
|
195
235
|
* })
|
|
196
236
|
* ```
|
|
197
237
|
*/
|
package/dist/eval.d.mts
CHANGED
|
@@ -116,12 +116,8 @@ interface VariantEvaluateResult<D = unknown, O = unknown> {
|
|
|
116
116
|
* Options for compare().
|
|
117
117
|
*/
|
|
118
118
|
interface CompareOptions {
|
|
119
|
-
/**
|
|
120
|
-
|
|
121
|
-
/** Directory where eval results are stored. Default: './llmops-evals' */
|
|
122
|
-
outputDir?: string;
|
|
123
|
-
/** Eval name to search within. Required. */
|
|
124
|
-
name: string;
|
|
119
|
+
/** Paths to eval result JSON files. First is baseline, second is candidate. */
|
|
120
|
+
files: [string, string];
|
|
125
121
|
}
|
|
126
122
|
/**
|
|
127
123
|
* Per-evaluator delta between two runs.
|
|
@@ -157,11 +153,38 @@ interface CompareResult {
|
|
|
157
153
|
interface JudgeScorerOptions {
|
|
158
154
|
/** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
|
|
159
155
|
model: string;
|
|
160
|
-
/**
|
|
156
|
+
/**
|
|
157
|
+
* Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
|
|
158
|
+
* {{data}}, {{data.*}} placeholders.
|
|
159
|
+
*
|
|
160
|
+
* This becomes the user message. A system message is added automatically
|
|
161
|
+
* that instructs the LLM to return a JSON score.
|
|
162
|
+
*/
|
|
161
163
|
prompt: string;
|
|
162
|
-
/**
|
|
163
|
-
|
|
164
|
-
|
|
164
|
+
/**
|
|
165
|
+
* The llmops client instance. The judge call is routed through the
|
|
166
|
+
* gateway and traced like any other LLM call.
|
|
167
|
+
*
|
|
168
|
+
* ```ts
|
|
169
|
+
* const client = llmops({ telemetry: pgStore(url) })
|
|
170
|
+
* judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
|
|
171
|
+
* ```
|
|
172
|
+
*/
|
|
173
|
+
client: LLMOpsClient;
|
|
174
|
+
/**
|
|
175
|
+
* Custom system message. Overrides the default grading instructions.
|
|
176
|
+
* If omitted, a default system message is used that instructs
|
|
177
|
+
* the LLM to return JSON with a "score" field (0-1).
|
|
178
|
+
*/
|
|
179
|
+
system?: string;
|
|
180
|
+
/** Temperature for the judge LLM. Default: 0 (deterministic). */
|
|
181
|
+
temperature?: number;
|
|
182
|
+
/** Max retries on parse failure. Default: 1. */
|
|
183
|
+
maxRetries?: number;
|
|
184
|
+
/**
|
|
185
|
+
* Custom parser for extracting score from LLM response.
|
|
186
|
+
* Default: expects JSON with a `score` field.
|
|
187
|
+
*/
|
|
165
188
|
parse?: (response: string) => number | Record<string, number>;
|
|
166
189
|
}
|
|
167
190
|
//#endregion
|
|
@@ -170,13 +193,18 @@ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknow
|
|
|
170
193
|
//#endregion
|
|
171
194
|
//#region src/eval/compare.d.ts
|
|
172
195
|
/**
|
|
173
|
-
* Compare two eval
|
|
196
|
+
* Compare two eval result files. First file is the baseline.
|
|
174
197
|
*
|
|
175
|
-
* Usage:
|
|
198
|
+
* Usage with version control:
|
|
199
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
200
|
+
* 2. Commit the file
|
|
201
|
+
* 3. Make changes, re-run eval
|
|
202
|
+
* 4. Compare: git stash the new result, compare old vs new
|
|
203
|
+
*
|
|
204
|
+
* Or compare two named eval files:
|
|
176
205
|
* ```ts
|
|
177
206
|
* const diff = await compare({
|
|
178
|
-
*
|
|
179
|
-
* runs: [run1.runId, run2.runId],
|
|
207
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
180
208
|
* })
|
|
181
209
|
* ```
|
|
182
210
|
*/
|
|
@@ -186,12 +214,24 @@ declare function compare(options: CompareOptions): Promise<CompareResult>;
|
|
|
186
214
|
/**
|
|
187
215
|
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
188
216
|
*
|
|
217
|
+
* The judge:
|
|
218
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
219
|
+
* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
|
|
220
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
221
|
+
* - Retries on parse failure (configurable)
|
|
222
|
+
* - Clamps scores to [0, 1]
|
|
223
|
+
*
|
|
189
224
|
* Usage:
|
|
190
225
|
* ```ts
|
|
226
|
+
* import { llmops } from '@llmops/sdk'
|
|
227
|
+
*
|
|
228
|
+
* const client = llmops()
|
|
191
229
|
* const accuracy = judgeScorer({
|
|
192
230
|
* model: '@openai/gpt-4o',
|
|
193
|
-
* prompt:
|
|
194
|
-
*
|
|
231
|
+
* prompt: `Rate the accuracy of this response.
|
|
232
|
+
* Expected: {{target.answer}}
|
|
233
|
+
* Actual: {{output}}`,
|
|
234
|
+
* client,
|
|
195
235
|
* })
|
|
196
236
|
* ```
|
|
197
237
|
*/
|
package/dist/eval.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { randomUUID } from "node:crypto";
|
|
2
|
-
import { mkdirSync, readFileSync,
|
|
2
|
+
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
|
|
5
5
|
//#region src/eval/dataset.ts
|
|
@@ -23,6 +23,13 @@ var InlineDataset = class {
|
|
|
23
23
|
|
|
24
24
|
//#endregion
|
|
25
25
|
//#region src/eval/evaluate.ts
|
|
26
|
+
const RESET = "\x1B[0m";
|
|
27
|
+
const DIM = "\x1B[2m";
|
|
28
|
+
const BOLD = "\x1B[1m";
|
|
29
|
+
const CYAN = "\x1B[36m";
|
|
30
|
+
const GREEN = "\x1B[32m";
|
|
31
|
+
const RED = "\x1B[31m";
|
|
32
|
+
const YELLOW = "\x1B[33m";
|
|
26
33
|
async function pool(items, concurrency, fn) {
|
|
27
34
|
const executing = [];
|
|
28
35
|
for (const item of items) {
|
|
@@ -55,11 +62,70 @@ function computeStats(values) {
|
|
|
55
62
|
count: sorted.length
|
|
56
63
|
};
|
|
57
64
|
}
|
|
58
|
-
|
|
65
|
+
const isSilent = process.env.LLMOPS_EVAL_OUTPUT === "json";
|
|
66
|
+
const w = process.stderr;
|
|
67
|
+
function printHeader(name, total) {
  // Banner for one eval run: suite name plus datapoint count, then a rule.
  // Suppressed entirely when JSON output mode is active.
  if (isSilent) return;
  const title = ` ${BOLD}${name}${RESET} ${DIM}(${total} datapoints)${RESET}\n`;
  const rule = ` ${DIM}${"─".repeat(50)}${RESET}\n`;
  w.write("\n");
  w.write(title);
  w.write(rule);
}
|
|
73
|
+
function printDatapointResult(idx, total, dp) {
  // One progress line per datapoint: ✗ + truncated error on failure,
  // ✓ + colorized per-evaluator scores + duration on success.
  if (isSilent) return;
  const rawLabel = typeof dp.data === "object" && dp.data !== null
    ? JSON.stringify(dp.data)
    : String(dp.data);
  const label = rawLabel.slice(0, 50);
  const counter = `${DIM}[${idx + 1}/${total}]${RESET}`;
  if (dp.error) {
    w.write(` ${RED}✗${RESET} ${counter} ${label} ${RED}ERROR${RESET} ${DIM}${dp.error.slice(0, 60)}${RESET}\n`);
    return;
  }
  const parts = [];
  for (const [name, val] of Object.entries(dp.scores)) {
    if (Number.isNaN(val)) {
      parts.push(`${DIM}${name}=NaN${RESET}`);
    } else {
      // Traffic-light coloring: green ≥ 0.8, yellow ≥ 0.5, red below.
      const color = val >= 0.8 ? GREEN : val >= 0.5 ? YELLOW : RED;
      parts.push(`${color}${name}=${val.toFixed(2)}${RESET}`);
    }
  }
  w.write(` ${GREEN}✓${RESET} ${counter} ${label} ${parts.join(" ")} ${DIM}${dp.durationMs}ms${RESET}\n`);
}
|
|
86
|
+
function scoreBar(score, width = 20) {
  // Render a fixed-width unicode progress bar for a score in [0, 1].
  const filledCount = Math.round(score * width);
  const emptyCount = width - filledCount;
  return "█".repeat(filledCount).concat("░".repeat(emptyCount));
}
|
|
91
|
+
function scoreColor(score) {
  // Traffic-light coloring: green ≥ 0.8, yellow ≥ 0.5, red below.
  return score >= 0.8 ? GREEN : score >= 0.5 ? YELLOW : RED;
}
|
|
96
|
+
function printSummary(result) {
  // Aggregate table (mean/min/max/median plus a bar per evaluator) followed
  // by duration, pass counts, and the run id prefix. Skipped in JSON mode.
  if (isSilent) return;
  w.write("\n");
  const entries = Object.entries(result.scores);
  if (entries.length > 0) {
    // Column width tracks the longest evaluator name, with a floor of 10.
    const nameWidth = Math.max(...entries.map(([evaluatorName]) => evaluatorName.length), 10);
    w.write(` ${DIM}${"Evaluator".padEnd(nameWidth)} ${"Mean".padStart(6)} ${"Bar".padEnd(20)} ${"Min".padStart(5)} ${"Max".padStart(5)} ${"Med".padStart(5)}${RESET}\n`);
    w.write(` ${DIM}${"─".repeat(nameWidth + 50)}${RESET}\n`);
    for (const [evaluatorName, stats] of entries) {
      const row = [
        ` ${evaluatorName.padEnd(nameWidth)}`,
        `${scoreColor(stats.mean)}${stats.mean.toFixed(2).padStart(6)}${RESET}`,
        `${DIM}${scoreBar(stats.mean)}${RESET}`,
        stats.min.toFixed(2).padStart(5),
        stats.max.toFixed(2).padStart(5),
        `${stats.median.toFixed(2).padStart(5)}\n`
      ].join(" ");
      w.write(row);
    }
  }
  const completed = result.count - result.errors;
  w.write("\n");
  w.write(` ${DIM}Duration${RESET} ${(result.durationMs / 1e3).toFixed(1)}s`);
  w.write(` ${DIM}Passed${RESET} ${completed}/${result.count}`);
  if (result.errors > 0) w.write(` ${RED}Failed ${result.errors}${RESET}`);
  w.write(` ${DIM}Run${RESET} ${CYAN}${result.runId.slice(0, 8)}${RESET}`);
  w.write("\n\n");
}
|
|
118
|
+
function saveResult(result, outputDir) {
  // Persist the full result as pretty-printed JSON under <outputDir>/<name>/,
  // using a millisecond timestamp as the file name.
  const targetDir = join(outputDir, result.name);
  mkdirSync(targetDir, { recursive: true });
  const filePath = join(targetDir, `${Date.now()}.json`);
  writeFileSync(filePath, JSON.stringify(result, null, 2));
}
|
|
123
|
+
async function runSingleExecutor(name, dataset, executor, evaluators, concurrency) {
|
|
59
124
|
const size = await dataset.size();
|
|
60
125
|
const datapoints = await dataset.slice(0, size);
|
|
61
126
|
const results = new Array(datapoints.length);
|
|
62
127
|
const startTime = Date.now();
|
|
128
|
+
printHeader(name, datapoints.length);
|
|
63
129
|
await pool(datapoints, concurrency, async (dp) => {
|
|
64
130
|
const idx = datapoints.indexOf(dp);
|
|
65
131
|
const dpStart = Date.now();
|
|
@@ -71,14 +137,16 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
|
|
|
71
137
|
} catch (err) {
|
|
72
138
|
error = err instanceof Error ? err.message : String(err);
|
|
73
139
|
}
|
|
74
|
-
if (!error && output !== null) for (const [
|
|
140
|
+
if (!error && output !== null) for (const [evalName, evaluator] of Object.entries(evaluators)) try {
|
|
75
141
|
const result = await evaluator(output, dp.target, dp.data);
|
|
76
|
-
if (typeof result === "number") scores[
|
|
77
|
-
else for (const [subKey, subScore] of Object.entries(result)) scores[`${
|
|
78
|
-
} catch {
|
|
79
|
-
scores[
|
|
142
|
+
if (typeof result === "number") scores[evalName] = result;
|
|
143
|
+
else for (const [subKey, subScore] of Object.entries(result)) scores[`${evalName}.${subKey}`] = subScore;
|
|
144
|
+
} catch (evalErr) {
|
|
145
|
+
scores[evalName] = NaN;
|
|
146
|
+
const msg = evalErr instanceof Error ? evalErr.message : String(evalErr);
|
|
147
|
+
if (!isSilent) w.write(` ${YELLOW}⚠${RESET} ${DIM}evaluator "${evalName}":${RESET} ${msg.slice(0, 80)}\n`);
|
|
80
148
|
}
|
|
81
|
-
|
|
149
|
+
const dpResult = {
|
|
82
150
|
data: dp.data,
|
|
83
151
|
target: dp.target,
|
|
84
152
|
metadata: dp.metadata,
|
|
@@ -87,33 +155,14 @@ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
|
|
|
87
155
|
durationMs: Date.now() - dpStart,
|
|
88
156
|
error
|
|
89
157
|
};
|
|
158
|
+
results[idx] = dpResult;
|
|
159
|
+
printDatapointResult(idx, datapoints.length, dpResult);
|
|
90
160
|
});
|
|
91
161
|
return {
|
|
92
162
|
results,
|
|
93
163
|
durationMs: Date.now() - startTime
|
|
94
164
|
};
|
|
95
165
|
}
|
|
96
|
-
function printSummary(result) {
|
|
97
|
-
const lines = [];
|
|
98
|
-
lines.push("");
|
|
99
|
-
lines.push(` ${result.name}`);
|
|
100
|
-
lines.push("");
|
|
101
|
-
const completed = result.count - result.errors;
|
|
102
|
-
lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? ` ✗ ${result.errors} errors` : ""}`);
|
|
103
|
-
lines.push("");
|
|
104
|
-
lines.push(" Scores:");
|
|
105
|
-
for (const [name, stats] of Object.entries(result.scores)) lines.push(` ${name.padEnd(16)} mean=${stats.mean.toFixed(2)} min=${stats.min.toFixed(2)} max=${stats.max.toFixed(2)} median=${stats.median.toFixed(2)}`);
|
|
106
|
-
lines.push("");
|
|
107
|
-
lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
|
|
108
|
-
lines.push(` Run ID: ${result.runId}`);
|
|
109
|
-
lines.push("");
|
|
110
|
-
process.stderr.write(lines.join("\n"));
|
|
111
|
-
}
|
|
112
|
-
function saveResult(result, outputDir) {
|
|
113
|
-
const dir = join(outputDir, result.name);
|
|
114
|
-
mkdirSync(dir, { recursive: true });
|
|
115
|
-
writeFileSync(join(dir, `${result.runId}.json`), JSON.stringify(result, null, 2));
|
|
116
|
-
}
|
|
117
166
|
async function evaluate(options) {
|
|
118
167
|
const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
|
|
119
168
|
const runId = randomUUID();
|
|
@@ -121,7 +170,7 @@ async function evaluate(options) {
|
|
|
121
170
|
if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
|
|
122
171
|
const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
|
|
123
172
|
if (executor) {
|
|
124
|
-
const { results, durationMs } = await runSingleExecutor(dataset, executor, evaluators, concurrency);
|
|
173
|
+
const { results, durationMs } = await runSingleExecutor(name, dataset, executor, evaluators, concurrency);
|
|
125
174
|
const scoreNames = /* @__PURE__ */ new Set();
|
|
126
175
|
for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
|
|
127
176
|
const scores = {};
|
|
@@ -137,7 +186,7 @@ async function evaluate(options) {
|
|
|
137
186
|
metadata,
|
|
138
187
|
results
|
|
139
188
|
};
|
|
140
|
-
if (
|
|
189
|
+
if (isSilent) process.stdout.write(JSON.stringify(result, null, 2));
|
|
141
190
|
else printSummary(result);
|
|
142
191
|
saveResult(result, outputDir);
|
|
143
192
|
return result;
|
|
@@ -145,7 +194,7 @@ async function evaluate(options) {
|
|
|
145
194
|
const variantResults = {};
|
|
146
195
|
const totalStart = Date.now();
|
|
147
196
|
for (const [variantName, variantExecutor] of Object.entries(variants)) {
|
|
148
|
-
const { results, durationMs } = await runSingleExecutor(dataset, variantExecutor, evaluators, concurrency);
|
|
197
|
+
const { results, durationMs } = await runSingleExecutor(`${name}/${variantName}`, dataset, variantExecutor, evaluators, concurrency);
|
|
149
198
|
const scoreNames = /* @__PURE__ */ new Set();
|
|
150
199
|
for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
|
|
151
200
|
const scores = {};
|
|
@@ -162,7 +211,7 @@ async function evaluate(options) {
|
|
|
162
211
|
results
|
|
163
212
|
};
|
|
164
213
|
variantResults[variantName] = variantResult;
|
|
165
|
-
if (
|
|
214
|
+
if (!isSilent) printSummary(variantResult);
|
|
166
215
|
saveResult(variantResult, outputDir);
|
|
167
216
|
}
|
|
168
217
|
const variantEvalResult = {
|
|
@@ -173,48 +222,43 @@ async function evaluate(options) {
|
|
|
173
222
|
metadata,
|
|
174
223
|
variants: variantResults
|
|
175
224
|
};
|
|
176
|
-
if (
|
|
225
|
+
if (isSilent) process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
|
|
177
226
|
return variantEvalResult;
|
|
178
227
|
}
|
|
179
228
|
|
|
180
229
|
//#endregion
|
|
181
230
|
//#region src/eval/compare.ts
|
|
182
231
|
/**
|
|
183
|
-
* Load an eval
|
|
232
|
+
* Load an eval result from a JSON file.
|
|
184
233
|
*/
|
|
185
|
-
function
|
|
186
|
-
const dir = join(outputDir, name);
|
|
187
|
-
const filePath = join(dir, `${runId}.json`);
|
|
234
|
+
/**
 * Load an eval result from a JSON file.
 *
 * @param {string} filePath - Path to a result file previously written by saveResult.
 * @returns {object} The parsed eval result.
 * @throws {Error} When the file cannot be read or contains invalid JSON; the
 *   underlying failure is preserved as `cause` for debugging.
 */
function loadResult(filePath) {
  try {
    return JSON.parse(readFileSync(filePath, "utf-8"));
  } catch (err) {
    // Keep the original error attached so callers can tell a missing file
    // (ENOENT) apart from corrupt JSON (SyntaxError).
    throw new Error(`Could not read eval result: ${filePath}`, { cause: err });
  }
}
|
|
202
242
|
/**
|
|
203
|
-
* Compare two eval
|
|
243
|
+
* Compare two eval result files. First file is the baseline.
|
|
204
244
|
*
|
|
205
|
-
* Usage:
|
|
245
|
+
* Usage with version control:
|
|
246
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
247
|
+
* 2. Commit the file
|
|
248
|
+
* 3. Make changes, re-run eval
|
|
249
|
+
* 4. Compare: git stash the new result, compare old vs new
|
|
250
|
+
*
|
|
251
|
+
* Or compare two named eval files:
|
|
206
252
|
* ```ts
|
|
207
253
|
* const diff = await compare({
|
|
208
|
-
*
|
|
209
|
-
* runs: [run1.runId, run2.runId],
|
|
254
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
210
255
|
* })
|
|
211
256
|
* ```
|
|
212
257
|
*/
|
|
213
258
|
async function compare(options) {
|
|
214
|
-
const {
|
|
215
|
-
|
|
216
|
-
const
|
|
217
|
-
const candidateRun = loadRun(outputDir, name, runs[1]);
|
|
259
|
+
const { files } = options;
|
|
260
|
+
const baselineRun = loadResult(files[0]);
|
|
261
|
+
const candidateRun = loadResult(files[1]);
|
|
218
262
|
const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
|
|
219
263
|
const scores = {};
|
|
220
264
|
for (const scoreName of allScoreNames) {
|
|
@@ -251,112 +295,165 @@ async function compare(options) {
|
|
|
251
295
|
}
|
|
252
296
|
}
|
|
253
297
|
const result = {
|
|
254
|
-
baseline:
|
|
255
|
-
candidate:
|
|
298
|
+
baseline: baselineRun.runId,
|
|
299
|
+
candidate: candidateRun.runId,
|
|
256
300
|
scores,
|
|
257
301
|
regressions,
|
|
258
302
|
improvements
|
|
259
303
|
};
|
|
260
|
-
const
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
304
|
+
const w$1 = process.stderr;
|
|
305
|
+
const RESET$1 = "\x1B[0m";
|
|
306
|
+
const DIM$1 = "\x1B[2m";
|
|
307
|
+
const BOLD$1 = "\x1B[1m";
|
|
308
|
+
const GREEN$1 = "\x1B[32m";
|
|
309
|
+
const RED$1 = "\x1B[31m";
|
|
310
|
+
const CYAN$1 = "\x1B[36m";
|
|
311
|
+
w$1.write("\n");
|
|
312
|
+
w$1.write(` ${BOLD$1}Compare${RESET$1} ${DIM$1}${baselineRun.name} → ${candidateRun.name}${RESET$1}\n`);
|
|
313
|
+
w$1.write(` ${DIM$1}${"─".repeat(50)}${RESET$1}\n\n`);
|
|
314
|
+
const scoreEntries = Object.entries(scores);
|
|
315
|
+
if (scoreEntries.length > 0) {
|
|
316
|
+
const maxNameLen = Math.max(...scoreEntries.map(([n]) => n.length), 10);
|
|
317
|
+
w$1.write(` ${DIM$1}${"Evaluator".padEnd(maxNameLen)} ${"Base".padStart(6)} ${"New".padStart(6)} ${"Delta".padStart(7)}${RESET$1}\n`);
|
|
318
|
+
w$1.write(` ${DIM$1}${"─".repeat(maxNameLen + 30)}${RESET$1}\n`);
|
|
319
|
+
for (const [scoreName, delta] of scoreEntries) {
|
|
320
|
+
const sign = delta.delta >= 0 ? "+" : "";
|
|
321
|
+
const color = delta.delta >= 0 ? GREEN$1 : RED$1;
|
|
322
|
+
const icon = delta.delta > 0 ? "▲" : delta.delta < 0 ? "▼" : "=";
|
|
323
|
+
w$1.write(` ${scoreName.padEnd(maxNameLen)} ${delta.baseline.toFixed(2).padStart(6)} ${DIM$1}→${RESET$1} ${delta.candidate.toFixed(2).padStart(6)} ${color}${sign}${delta.delta.toFixed(2).padStart(5)} ${icon}${RESET$1}\n`);
|
|
324
|
+
}
|
|
325
|
+
w$1.write("\n");
|
|
269
326
|
}
|
|
270
327
|
if (regressions.length > 0) {
|
|
271
|
-
|
|
272
|
-
lines.push(` Regressions (${regressions.length}):`);
|
|
328
|
+
w$1.write(` ${RED$1}▼ ${regressions.length} regression${regressions.length > 1 ? "s" : ""}${RESET$1}\n`);
|
|
273
329
|
for (const r of regressions.slice(0, 5)) {
|
|
274
|
-
const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0,
|
|
275
|
-
|
|
330
|
+
const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 50);
|
|
331
|
+
w$1.write(` ${DIM$1}${dataStr}${RESET$1} ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${RED$1}${r.candidateScore.toFixed(2)}${RESET$1}\n`);
|
|
276
332
|
}
|
|
277
|
-
if (regressions.length > 5)
|
|
333
|
+
if (regressions.length > 5) w$1.write(` ${DIM$1}... and ${regressions.length - 5} more${RESET$1}\n`);
|
|
334
|
+
w$1.write("\n");
|
|
278
335
|
}
|
|
279
336
|
if (improvements.length > 0) {
|
|
280
|
-
|
|
281
|
-
lines.push(` Improvements (${improvements.length}):`);
|
|
337
|
+
w$1.write(` ${GREEN$1}▲ ${improvements.length} improvement${improvements.length > 1 ? "s" : ""}${RESET$1}\n`);
|
|
282
338
|
for (const imp of improvements.slice(0, 5)) {
|
|
283
|
-
const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0,
|
|
284
|
-
|
|
339
|
+
const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 50);
|
|
340
|
+
w$1.write(` ${DIM$1}${dataStr}${RESET$1} ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${GREEN$1}${imp.candidateScore.toFixed(2)}${RESET$1}\n`);
|
|
285
341
|
}
|
|
286
|
-
if (improvements.length > 5)
|
|
342
|
+
if (improvements.length > 5) w$1.write(` ${DIM$1}... and ${improvements.length - 5} more${RESET$1}\n`);
|
|
343
|
+
w$1.write("\n");
|
|
287
344
|
}
|
|
288
|
-
|
|
289
|
-
process.stderr.write(lines.join("\n"));
|
|
345
|
+
if (regressions.length === 0 && improvements.length === 0) w$1.write(` ${CYAN$1}No changes between runs${RESET$1}\n\n`);
|
|
290
346
|
return result;
|
|
291
347
|
}
|
|
292
348
|
|
|
293
349
|
//#endregion
|
|
294
350
|
//#region src/eval/judge.ts
|
|
295
|
-
/**
|
|
296
|
-
* Simple mustache-style template interpolation.
|
|
297
|
-
*/
|
|
298
351
|
/**
 * Simple mustache-style template interpolation.
 *
 * Replaces `{{path}}` placeholders with values from `vars`. A dotted path
 * such as `target.answer` is resolved first as a literal flat key (the shape
 * buildVars produces), then by walking nested objects. Missing values render
 * as an empty string; non-string values are pretty-printed as JSON.
 */
function interpolate(template, vars) {
  return template.replace(/\{\{(\w+(?:\.\w+)*)\}\}/g, (_, path) => {
    // Fix: flat keys like "target.answer" (set by buildVars) were previously
    // unreachable — the lookup always split on "." and walked nested objects,
    // but vars.target is a JSON *string*, so the walk yielded undefined and
    // every dotted placeholder rendered empty.
    const value = Object.hasOwn(vars, path)
      ? vars[path]
      : path.split(".").reduce((obj, key) => obj?.[key], vars);
    if (value === void 0 || value === null) return "";
    return typeof value === "string" ? value : JSON.stringify(value, null, 2);
  });
}
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
358
|
+
function buildVars(output, target, data) {
  // Stringify the three top-level template values, then expose each field of
  // object-valued target/data under a flat dotted key (e.g. "target.answer")
  // so prompts can reference individual fields.
  const asText = (value) => (typeof value === "string" ? value : JSON.stringify(value, null, 2));
  const vars = {
    output: asText(output),
    target: asText(target),
    data: asText(data)
  };
  for (const [prefix, source] of [["target", target], ["data", data]]) {
    if (source && typeof source === "object") {
      for (const [field, fieldValue] of Object.entries(source)) {
        vars[`${prefix}.${field}`] = fieldValue;
      }
    }
  }
  return vars;
}
|
|
368
|
+
const DEFAULT_SYSTEM = `You are an expert evaluator. Your job is to grade an AI system's output.
|
|
369
|
+
|
|
370
|
+
Instructions:
|
|
371
|
+
- Read the grading criteria in the user message carefully.
|
|
372
|
+
- Evaluate the output objectively.
|
|
373
|
+
- Return ONLY valid JSON. No markdown, no explanation outside the JSON.
|
|
374
|
+
- The JSON must contain a "score" field with a number between 0.0 and 1.0.
|
|
375
|
+
- You may optionally include a "reasoning" field with a brief explanation.
|
|
376
|
+
|
|
377
|
+
Example response:
|
|
378
|
+
{"score": 0.85, "reasoning": "The response is mostly accurate but misses one detail."}`;
|
|
308
379
|
function defaultParse(response) {
  // Strip markdown code fences, then isolate the outermost {...} span in case
  // the model wrapped its JSON in prose.
  let candidate = response.replace(/```(?:json)?\n?/g, "").replace(/```$/g, "").trim();
  const objectMatch = candidate.match(/\{[\s\S]*\}/);
  if (objectMatch) candidate = objectMatch[0];
  const parsed = JSON.parse(candidate);
  const clamp01 = (x) => Math.max(0, Math.min(1, x));
  // Preferred shape: {"score": 0.85, ...} → single clamped number.
  if (typeof parsed?.score === "number") return clamp01(parsed.score);
  // Bare numeric response.
  if (typeof parsed === "number") return clamp01(parsed);
  // Multi-metric object: keep every numeric field except "reasoning".
  if (typeof parsed === "object" && parsed !== null) {
    const numericEntries = Object.entries(parsed).filter(
      ([key, value]) => typeof value === "number" && key !== "reasoning"
    );
    if (numericEntries.length > 0) {
      return Object.fromEntries(numericEntries.map(([key, value]) => [key, clamp01(value)]));
    }
  }
  throw new Error(`Could not extract score from judge response: ${response.slice(0, 300)}`);
}
|
|
392
|
+
async function callLLM(client, model, system, userMessage, temperature) {
  // Issue a chat-completions request through the client's provider config
  // (baseURL, apiKey, fetch) and return the first choice's message content.
  const provider = client.provider();
  const response = await provider.fetch(`${provider.baseURL}/chat/completions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${provider.apiKey}`
    },
    body: JSON.stringify({
      model,
      temperature,
      messages: [
        { role: "system", content: system },
        { role: "user", content: userMessage }
      ]
    })
  });
  if (!response.ok) {
    // Best-effort body capture; never let the error-path read itself throw.
    const errorBody = await response.text().catch(() => "unknown error");
    throw new Error(`Judge LLM call failed (${response.status}): ${errorBody.slice(0, 300)}`);
  }
  const payload = await response.json();
  const content = payload.choices?.[0]?.message?.content;
  if (!content) throw new Error("Judge LLM returned empty response");
  return content;
}
|
|
319
420
|
/**
|
|
320
421
|
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
321
422
|
*
|
|
423
|
+
* The judge:
|
|
424
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
425
|
+
* - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
|
|
426
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
427
|
+
* - Retries on parse failure (configurable)
|
|
428
|
+
* - Clamps scores to [0, 1]
|
|
429
|
+
*
|
|
322
430
|
* Usage:
|
|
323
431
|
* ```ts
|
|
432
|
+
* import { llmops } from '@llmops/sdk'
|
|
433
|
+
*
|
|
434
|
+
* const client = llmops()
|
|
324
435
|
* const accuracy = judgeScorer({
|
|
325
436
|
* model: '@openai/gpt-4o',
|
|
326
|
-
* prompt:
|
|
327
|
-
*
|
|
437
|
+
* prompt: `Rate the accuracy of this response.
|
|
438
|
+
* Expected: {{target.answer}}
|
|
439
|
+
* Actual: {{output}}`,
|
|
440
|
+
* client,
|
|
328
441
|
* })
|
|
329
442
|
* ```
|
|
330
443
|
*/
|
|
331
444
|
function judgeScorer(options) {
  const {
    model,
    prompt,
    client,
    system = DEFAULT_SYSTEM,
    temperature = 0,
    maxRetries = 1,
    parse = defaultParse
  } = options;
  // Returns an Evaluator: renders the prompt with output/target/data, asks
  // the judge LLM, and parses a score. Parse failures are retried up to
  // maxRetries extra times; HTTP/transport failures are thrown immediately.
  return async (output, target, data) => {
    const userMessage = interpolate(prompt, buildVars(output, target, data));
    const maxAttempts = maxRetries + 1;
    let failure = null;
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      try {
        const raw = await callLLM(client, model, system, userMessage, temperature);
        return parse(raw);
      } catch (err) {
        failure = err instanceof Error ? err : new Error(String(err));
        // Transport/HTTP errors (tagged by callLLM's message) are not
        // retryable — surface them right away.
        if (failure.message.includes("Judge LLM call failed")) throw failure;
      }
    }
    throw failure ?? new Error("Judge scoring failed");
  };
}
|
|
362
459
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@llmops/sdk",
|
|
3
|
-
"version": "1.0.0-beta.
|
|
3
|
+
"version": "1.0.0-beta.23",
|
|
4
4
|
"description": "An LLMOps toolkit for TypeScript applications",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "Apache-2.0",
|
|
@@ -134,8 +134,8 @@
|
|
|
134
134
|
"access": "public"
|
|
135
135
|
},
|
|
136
136
|
"dependencies": {
|
|
137
|
-
"@llmops/app": "^1.0.0-beta.
|
|
138
|
-
"@llmops/core": "^1.0.0-beta.
|
|
137
|
+
"@llmops/app": "^1.0.0-beta.23",
|
|
138
|
+
"@llmops/core": "^1.0.0-beta.23"
|
|
139
139
|
},
|
|
140
140
|
"peerDependencies": {
|
|
141
141
|
"pg": "*",
|