@llmops/sdk 1.0.0-beta.19 → 1.0.0-beta.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents.d.cts +1 -1
- package/dist/agents.d.mts +1 -1
- package/dist/agents.mjs +1 -1
- package/dist/eval.cjs +367 -0
- package/dist/eval.d.cts +200 -0
- package/dist/eval.d.mts +200 -0
- package/dist/eval.mjs +364 -0
- package/dist/express.d.cts +2 -2
- package/dist/express.d.mts +2 -2
- package/dist/hono.d.cts +2 -2
- package/dist/hono.d.mts +2 -2
- package/dist/{index-BpwfOQEm.d.cts → index-BZLzywwb.d.mts} +1 -1
- package/dist/{index-CniO6im-.d.mts → index-lgspeSNr.d.cts} +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.mts +3 -3
- package/dist/index.mjs +2 -2
- package/dist/nextjs.d.cts +2 -2
- package/dist/nextjs.d.mts +2 -2
- package/dist/store/d1.d.cts +1 -1
- package/dist/store/d1.d.mts +1 -1
- package/dist/store/pg.d.cts +1 -1
- package/dist/store/pg.d.mts +1 -1
- package/dist/store/pg.mjs +2 -2
- package/dist/store/sqlite.d.cts +1 -1
- package/dist/store/sqlite.d.mts +1 -1
- package/dist/store/sqlite.mjs +1 -1
- package/dist/types.d.cts +1 -1
- package/dist/types.d.mts +1 -1
- package/package.json +13 -3
- /package/dist/{agents-exporter-BY7BXquG.mjs → agents-exporter-CGxTzDeQ.mjs} +0 -0
- /package/dist/{agents-exporter-CI29gyrT.d.mts → agents-exporter-CehKIArI.d.mts} +0 -0
- /package/dist/{agents-exporter-YNq4HFTK.d.cts → agents-exporter-DkqkCcIx.d.cts} +0 -0
- /package/dist/{chunk-rZg9L66_.mjs → chunk-CxwUPGYo.mjs} +0 -0
- /package/dist/{constants-BVeqLv6F.mjs → constants-BvnYU_pl.mjs} +0 -0
- /package/dist/{interface-BGkC9Ml4.d.mts → interface-BbAwy96d.d.cts} +0 -0
- /package/dist/{interface-Dix77UtO.d.cts → interface-Dz7B6QN1.d.mts} +0 -0
package/dist/agents.d.cts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-YNq4HFTK.cjs";
|
|
1
|
+
import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-DkqkCcIx.cjs";
|
|
2
2
|
export { AgentsSpan, AgentsSpanData, AgentsSpanError, AgentsTrace, AgentsTracingExporter, LLMOpsAgentsExporterConfig, createLLMOpsAgentsExporter };
|
package/dist/agents.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-CI29gyrT.mjs";
|
|
1
|
+
import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-CehKIArI.mjs";
|
|
2
2
|
export { AgentsSpan, AgentsSpanData, AgentsSpanError, AgentsTrace, AgentsTracingExporter, LLMOpsAgentsExporterConfig, createLLMOpsAgentsExporter };
|
package/dist/agents.mjs
CHANGED
package/dist/eval.cjs
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
let node_crypto = require("node:crypto");
|
|
2
|
+
let node_fs = require("node:fs");
|
|
3
|
+
let node_path = require("node:path");
|
|
4
|
+
|
|
5
|
+
//#region src/eval/dataset.ts
|
|
6
|
+
/**
 * Dataset adapter backed by an in-memory array of datapoints.
 *
 * The array passed to the constructor is stored by reference (it is not
 * copied), and every accessor reads straight from it.
 */
var InlineDataset = class {
  /**
   * @param items - Array of datapoints to expose.
   */
  constructor(items) {
    this.items = items;
  }

  /** @returns The number of stored datapoints. */
  size() {
    const { length } = this.items;
    return length;
  }

  /**
   * @param index - Zero-based position.
   * @returns The datapoint at `index`, or `undefined` when out of range.
   */
  get(index) {
    const { items } = this;
    return items[index];
  }

  /**
   * @param start - Inclusive start index.
   * @param end - Exclusive end index.
   * @returns A new array holding the datapoints in `[start, end)`.
   */
  slice(start, end) {
    const { items } = this;
    return items.slice(start, end);
  }
};
|
|
23
|
+
|
|
24
|
+
//#endregion
|
|
25
|
+
//#region src/eval/evaluate.ts
|
|
26
|
+
async function pool(items, concurrency, fn) {
|
|
27
|
+
const executing = [];
|
|
28
|
+
for (const item of items) {
|
|
29
|
+
const p = fn(item).then(() => {
|
|
30
|
+
executing.splice(executing.indexOf(p), 1);
|
|
31
|
+
});
|
|
32
|
+
executing.push(p);
|
|
33
|
+
if (executing.length >= concurrency) await Promise.race(executing);
|
|
34
|
+
}
|
|
35
|
+
await Promise.all(executing);
|
|
36
|
+
}
|
|
37
|
+
function computeStats(values) {
|
|
38
|
+
const valid = values.filter((v) => !Number.isNaN(v));
|
|
39
|
+
if (valid.length === 0) return {
|
|
40
|
+
mean: 0,
|
|
41
|
+
min: 0,
|
|
42
|
+
max: 0,
|
|
43
|
+
median: 0,
|
|
44
|
+
count: 0
|
|
45
|
+
};
|
|
46
|
+
const sorted = [...valid].sort((a, b) => a - b);
|
|
47
|
+
const sum = sorted.reduce((a, b) => a + b, 0);
|
|
48
|
+
const mid = Math.floor(sorted.length / 2);
|
|
49
|
+
const median = sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
|
|
50
|
+
return {
|
|
51
|
+
mean: sum / sorted.length,
|
|
52
|
+
min: sorted[0],
|
|
53
|
+
max: sorted[sorted.length - 1],
|
|
54
|
+
median,
|
|
55
|
+
count: sorted.length
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
/**
 * Runs one executor over every datapoint of a dataset and scores each output.
 *
 * Each datapoint is paired with its position up front. Looking the position
 * up with `indexOf` inside the worker is quadratic and, when the same
 * datapoint object occurs more than once, maps every occurrence to the first
 * slot and leaves holes in `results`.
 *
 * @param dataset - Object exposing `size()` and `slice(start, end)`; both may be async.
 * @param executor - Function under test; receives `dp.data`.
 * @param evaluators - Map of score name to evaluator `(output, target, data)`.
 * @param concurrency - Forwarded to `pool` as the in-flight limit.
 * @returns `{ results, durationMs }`, with `results` in dataset order.
 */
async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
  const size = await dataset.size();
  const datapoints = await dataset.slice(0, size);
  const results = new Array(datapoints.length);
  const startTime = Date.now();
  const indexed = datapoints.map((dp, idx) => ({ dp, idx }));
  await pool(indexed, concurrency, async ({ dp, idx }) => {
    const dpStart = Date.now();
    let output = null;
    let error;
    const scores = {};
    try {
      output = await executor(dp.data);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // An empty message must still count as a failure downstream, where
      // errors are detected by truthiness of `error`.
      error = message || "Unknown error";
    }
    // Evaluators are skipped when the executor failed or produced `null`.
    if (error === undefined && output !== null) {
      for (const [name, evaluator] of Object.entries(evaluators)) {
        try {
          const result = await evaluator(output, dp.target, dp.data);
          if (typeof result === "number") scores[name] = result;
          // Multi-score evaluators are flattened to "<evaluator>.<subKey>".
          else for (const [subKey, subScore] of Object.entries(result)) scores[`${name}.${subKey}`] = subScore;
        } catch {
          // A failing evaluator is recorded as NaN.
          scores[name] = NaN;
        }
      }
    }
    results[idx] = {
      data: dp.data,
      target: dp.target,
      metadata: dp.metadata,
      output,
      scores,
      durationMs: Date.now() - dpStart,
      error
    };
  });
  return {
    results,
    durationMs: Date.now() - startTime
  };
}
|
|
96
|
+
function printSummary(result) {
|
|
97
|
+
const lines = [];
|
|
98
|
+
lines.push("");
|
|
99
|
+
lines.push(` ${result.name}`);
|
|
100
|
+
lines.push("");
|
|
101
|
+
const completed = result.count - result.errors;
|
|
102
|
+
lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? ` ✗ ${result.errors} errors` : ""}`);
|
|
103
|
+
lines.push("");
|
|
104
|
+
lines.push(" Scores:");
|
|
105
|
+
for (const [name, stats] of Object.entries(result.scores)) lines.push(` ${name.padEnd(16)} mean=${stats.mean.toFixed(2)} min=${stats.min.toFixed(2)} max=${stats.max.toFixed(2)} median=${stats.median.toFixed(2)}`);
|
|
106
|
+
lines.push("");
|
|
107
|
+
lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
|
|
108
|
+
lines.push(` Run ID: ${result.runId}`);
|
|
109
|
+
lines.push("");
|
|
110
|
+
process.stderr.write(lines.join("\n"));
|
|
111
|
+
}
|
|
112
|
+
/**
 * Persists an evaluation result as pretty-printed JSON at
 * `<outputDir>/<result.name>/<result.runId>.json`.
 *
 * The target directory is created recursively, so a `result.name` that
 * contains path separators produces nested directories.
 *
 * @param result - Result to serialise; `name` and `runId` determine the path.
 * @param outputDir - Root directory for stored runs.
 */
function saveResult(result, outputDir) {
  const runDir = node_path.join(outputDir, result.name);
  node_fs.mkdirSync(runDir, { recursive: true });
  const target = node_path.join(runDir, `${result.runId}.json`);
  const payload = JSON.stringify(result, null, 2);
  node_fs.writeFileSync(target, payload);
}
|
|
117
|
+
/**
 * Runs an evaluation: executes one executor, or several named variants,
 * over a dataset, scores every output, prints or emits the result and
 * stores it on disk.
 *
 * Output mode is controlled by `LLMOPS_EVAL_OUTPUT`: when it equals "json"
 * the final result is written to stdout as JSON, otherwise a text summary
 * goes through `printSummary`. Each per-executor result is saved with
 * `saveResult`; the wrapper object returned for variants is not saved.
 *
 * @param options - See `EvaluateOptions`; exactly one of `executor` or
 *   `variants` must be provided.
 * @returns The single-executor result, or a wrapper holding one result per variant.
 * @throws Error when both or neither of `executor` / `variants` are given.
 */
async function evaluate(options) {
  const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
  const runId = node_crypto.randomUUID();
  if (executor && variants) throw new Error("evaluate(): provide either executor or variants, not both");
  if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
  const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
  // Read on every check rather than cached.
  const jsonMode = () => process.env.LLMOPS_EVAL_OUTPUT === "json";

  // Runs one executor and folds its per-datapoint scores into a result object.
  const runOne = async (label, fn) => {
    const { results, durationMs } = await runSingleExecutor(dataset, fn, evaluators, concurrency);
    const scoreNames = /* @__PURE__ */ new Set();
    for (const r of results) Object.keys(r.scores).forEach((key) => scoreNames.add(key));
    const scores = {};
    // A datapoint without a given score contributes NaN to its statistics.
    for (const scoreName of scoreNames) scores[scoreName] = computeStats(results.map((r) => r.scores[scoreName] ?? NaN));
    return {
      name: label,
      runId,
      group,
      scores,
      durationMs,
      count: results.length,
      errors: results.filter((r) => r.error).length,
      metadata,
      results
    };
  };

  if (executor) {
    const result = await runOne(name, executor);
    if (jsonMode()) process.stdout.write(JSON.stringify(result, null, 2));
    else printSummary(result);
    saveResult(result, outputDir);
    return result;
  }

  const variantResults = {};
  const totalStart = Date.now();
  // Variants run one after another; all of them share the same runId.
  for (const [variantName, variantExecutor] of Object.entries(variants)) {
    const variantResult = await runOne(`${name}/${variantName}`, variantExecutor);
    variantResults[variantName] = variantResult;
    if (!jsonMode()) printSummary(variantResult);
    saveResult(variantResult, outputDir);
  }
  const variantEvalResult = {
    name,
    runId,
    group,
    durationMs: Date.now() - totalStart,
    metadata,
    variants: variantResults
  };
  if (jsonMode()) process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
  return variantEvalResult;
}
|
|
179
|
+
|
|
180
|
+
//#endregion
|
|
181
|
+
//#region src/eval/compare.ts
|
|
182
|
+
/**
 * Load an eval run from the filesystem.
 *
 * Looks for `<outputDir>/<name>/<runId>.json`. When that file does not
 * exist, falls back to the first `.json` file in the directory whose name
 * starts with `runId`, so a run-ID prefix can be used.
 *
 * Read and parse failures are reported as such instead of being folded into
 * the "not found" error, so a corrupt run file is no longer reported as
 * missing.
 *
 * @param outputDir - Root directory where runs are stored.
 * @param name - Eval name (sub-directory of `outputDir`).
 * @param runId - Full run ID or a prefix of one.
 * @returns The parsed JSON content of the run file.
 * @throws Error when no matching file exists, or when the file cannot be
 *   read or is not valid JSON.
 */
function loadRun(outputDir, name, runId) {
  const dir = node_path.join(outputDir, name);
  const filePath = node_path.join(dir, `${runId}.json`);
  let resolvedPath = filePath;
  if (!node_fs.existsSync(filePath)) {
    let match;
    try {
      match = node_fs.readdirSync(dir).find((f) => f.startsWith(runId) && f.endsWith(".json"));
    } catch {
      // A missing or unreadable directory simply means the run is not there.
    }
    if (!match) throw new Error(`Eval run "${runId}" not found for "${name}" in ${outputDir}. Expected file: ${filePath}`);
    resolvedPath = node_path.join(dir, match);
  }
  let content;
  try {
    content = node_fs.readFileSync(resolvedPath, "utf-8");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read eval run "${runId}" from ${resolvedPath}: ${reason}`);
  }
  try {
    return JSON.parse(content);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Eval run file ${resolvedPath} is not valid JSON: ${reason}`);
  }
}
|
|
202
|
+
/**
 * Compare two eval runs. First run ID is the baseline.
 *
 * Only `runs[0]` (baseline) and `runs[1]` (candidate) are loaded; extra IDs
 * are ignored. For every score name present in either run the mean is
 * compared, with a missing score counted as 0. Datapoints are then compared
 * position by position over the shorter of the two result lists, and any
 * pair whose two scores are both non-NaN and differ is recorded as a
 * regression or an improvement. A text report is written to stderr.
 *
 * `outputDir` defaults to `LLMOPS_EVAL_OUTPUT_DIR` and then to
 * "./llmops-evals", the same resolution `evaluate()` uses, so runs written
 * with the env var set can be found without passing the option.
 *
 * Usage:
 * ```ts
 * const diff = await compare({
 *   name: 'support-bot',
 *   runs: [run1.runId, run2.runId],
 * })
 * ```
 *
 * @param options - `runs`, `name` and optional `outputDir`.
 * @returns Baseline/candidate IDs, per-score deltas, regressions and improvements.
 * @throws Error when fewer than two run IDs are given or a run cannot be loaded.
 */
async function compare(options) {
  const { runs, name, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
  if (runs.length < 2) throw new Error("compare() requires at least 2 run IDs");
  const baselineRun = loadRun(outputDir, name, runs[0]);
  const candidateRun = loadRun(outputDir, name, runs[1]);
  const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
  const scores = {};
  for (const scoreName of allScoreNames) {
    const baselineMean = baselineRun.scores[scoreName]?.mean ?? 0;
    const candidateMean = candidateRun.scores[scoreName]?.mean ?? 0;
    scores[scoreName] = {
      baseline: baselineMean,
      candidate: candidateMean,
      delta: candidateMean - baselineMean
    };
  }
  const regressions = [];
  const improvements = [];
  const minLen = Math.min(baselineRun.results.length, candidateRun.results.length);
  for (let i = 0; i < minLen; i++) {
    const baselineResult = baselineRun.results[i];
    const candidateResult = candidateRun.results[i];
    for (const scoreName of allScoreNames) {
      // NaN serialises to null in JSON, so both null and a missing key map back to NaN.
      const baselineScore = baselineResult.scores[scoreName] ?? NaN;
      const candidateScore = candidateResult.scores[scoreName] ?? NaN;
      if (Number.isNaN(baselineScore) || Number.isNaN(candidateScore)) continue;
      if (candidateScore === baselineScore) continue;
      const entry = {
        data: baselineResult.data,
        evaluator: scoreName,
        baselineScore,
        candidateScore
      };
      if (candidateScore < baselineScore) regressions.push(entry);
      else improvements.push(entry);
    }
  }
  const result = {
    baseline: runs[0],
    candidate: runs[1],
    scores,
    regressions,
    improvements
  };
  const lines = [];
  // String data is shown in full; anything else as at most 60 chars of JSON.
  // JSON.stringify returns undefined for undefined input, hence the fallback.
  const preview = (data) => {
    if (typeof data === "string") return data;
    const json = JSON.stringify(data);
    return (json === undefined ? String(data) : json).slice(0, 60);
  };
  // Appends a titled section listing at most the first five changes.
  const appendChanges = (title, changes) => {
    if (changes.length === 0) return;
    lines.push("");
    lines.push(` ${title} (${changes.length}):`);
    for (const c of changes.slice(0, 5)) lines.push(` "${preview(c.data)}" ${c.evaluator}: ${c.baselineScore.toFixed(2)} → ${c.candidateScore.toFixed(2)}`);
    if (changes.length > 5) lines.push(` ... and ${changes.length - 5} more`);
  };
  lines.push("");
  lines.push(` compare: ${runs[0].slice(0, 8)} → ${runs[1].slice(0, 8)}`);
  lines.push("");
  lines.push(" Scores:");
  for (const [scoreName, delta] of Object.entries(scores)) {
    const sign = delta.delta >= 0 ? "+" : "";
    const marker = delta.delta >= 0 ? "✓" : "✗";
    lines.push(` ${scoreName.padEnd(16)} ${delta.baseline.toFixed(2)} → ${delta.candidate.toFixed(2)} (${sign}${delta.delta.toFixed(2)}) ${marker}`);
  }
  appendChanges("Regressions", regressions);
  appendChanges("Improvements", improvements);
  lines.push("");
  process.stderr.write(lines.join("\n"));
  return result;
}
|
|
292
|
+
|
|
293
|
+
//#endregion
|
|
294
|
+
//#region src/eval/judge.ts
|
|
295
|
+
/**
 * Mustache-style `{{path}}` substitution.
 *
 * A placeholder is a dot-separated chain of word characters with no
 * surrounding whitespace (e.g. `{{output}}`, `{{target.answer}}`). The path
 * is resolved by walking `vars` one segment at a time. String values are
 * inserted as-is; everything else goes through `JSON.stringify`, so an
 * unresolved path renders as the text "undefined".
 *
 * @param template - Text containing placeholders.
 * @param vars - Root object the paths are resolved against.
 * @returns The template with every placeholder replaced.
 */
function interpolate(template, vars) {
  const placeholder = /\{\{(\w+(?:\.\w+)*)\}\}/g;
  return template.replace(placeholder, (_match, path) => {
    let current = vars;
    for (const segment of path.split(".")) current = current?.[segment];
    if (typeof current === "string") return current;
    return JSON.stringify(current);
  });
}
|
|
304
|
+
/**
 * Default parser: expects JSON with a `score` field, a bare number,
 * or an object of number values (multi-score).
 *
 * Markdown code fences are stripped first (case-insensitively). If the
 * remaining text is not valid JSON as a whole, the span from the first `{`
 * to the last `}` is tried, so a JSON object wrapped in prose is still
 * accepted. Every failure, including unparseable input, is reported with
 * the same "Could not extract score" error rather than a raw SyntaxError.
 *
 * @param response - Raw text returned by the judge model.
 * @returns A single score, or a record of the numeric fields of the object.
 * @throws Error when no score can be extracted.
 */
function defaultParse(response) {
  const fail = () => new Error(`Could not extract score from judge response: ${response.slice(0, 200)}`);
  const cleaned = response.replace(/```json\n?|```/gi, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    const start = cleaned.indexOf("{");
    const end = cleaned.lastIndexOf("}");
    if (start === -1 || end <= start) throw fail();
    try {
      parsed = JSON.parse(cleaned.slice(start, end + 1));
    } catch {
      throw fail();
    }
  }
  if (typeof parsed === "number") return parsed;
  if (typeof parsed?.score === "number") return parsed.score;
  if (typeof parsed === "object" && parsed !== null) {
    // Multi-score: keep only the numeric fields.
    const entries = Object.entries(parsed).filter(([, v]) => typeof v === "number");
    if (entries.length > 0) return Object.fromEntries(entries);
  }
  throw fail();
}
|
|
319
|
+
/**
 * Factory that returns an Evaluator which uses an LLM to score output.
 *
 * The prompt template is rendered with `interpolate` using three variables:
 * `output` (stringified with JSON.stringify unless already a string),
 * `target` and `data`. Nested fields are reached with dotted paths such as
 * `{{target.answer}}` or `{{data.question}}`. The rendered prompt is POSTed
 * as a single user message to `<baseURL>/chat/completions` of the provider
 * returned by `ops.provider()`, requesting a JSON-object response, and the
 * first choice's message content is passed to `parse`.
 *
 * Usage:
 * ```ts
 * const accuracy = judgeScorer({
 *   model: '@openai/gpt-4o',
 *   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
 *   ops,
 * })
 * ```
 *
 * @param options - `model`, `prompt`, `ops` and an optional `parse` override
 *   (defaults to `defaultParse`).
 * @returns An async evaluator `(output, target, data)` resolving to whatever `parse` returns.
 * @throws Error (from the returned evaluator) when the HTTP call is not ok
 *   or the response has no message content.
 */
function judgeScorer(options) {
  const { model, prompt, ops, parse = defaultParse } = options;
  return async (output, target, data) => {
    // Dotted placeholders are resolved by walking these objects, so no
    // flattened "target.<key>" entries are needed.
    const vars = {
      output: typeof output === "string" ? output : JSON.stringify(output),
      target,
      data
    };
    const renderedPrompt = interpolate(prompt, vars);
    const providerConfig = ops.provider();
    const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${providerConfig.apiKey}`
      },
      body: JSON.stringify({
        model,
        messages: [{
          role: "user",
          content: renderedPrompt
        }],
        response_format: { type: "json_object" }
      })
    });
    if (!response.ok) throw new Error(`Judge LLM call failed: ${response.status} ${await response.text()}`);
    const content = (await response.json()).choices?.[0]?.message?.content;
    if (!content) throw new Error("Judge LLM returned empty response");
    return parse(content);
  };
}
|
|
362
|
+
|
|
363
|
+
//#endregion
|
|
364
|
+
exports.InlineDataset = InlineDataset;
|
|
365
|
+
exports.compare = compare;
|
|
366
|
+
exports.evaluate = evaluate;
|
|
367
|
+
exports.judgeScorer = judgeScorer;
|
package/dist/eval.d.cts
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import "./agents-exporter-DkqkCcIx.cjs";
|
|
2
|
+
import { t as LLMOpsClient } from "./index-lgspeSNr.cjs";
|
|
3
|
+
|
|
4
|
+
//#region src/eval/dataset.d.ts
|
|
5
|
+
|
|
6
|
+
/**
 * Contract for a source of datapoints that `evaluate()` can read from.
 *
 * Plain arrays do not need to implement this; they are wrapped in
 * `InlineDataset` automatically. Every member may answer synchronously or
 * with a promise.
 * Future: CSVDataset, JSONLDataset, S3Dataset.
 */
interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
  /** Total number of datapoints. */
  size(): number | Promise<number>;
  /** Datapoint at the given zero-based index. */
  get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
  /** Datapoints from `start` up to, but not including, `end`. */
  slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
}
|
|
16
|
+
/**
 * Synchronous, array-backed implementation of `EvaluationDataset`.
 */
declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
  private items;
  /** @param items - Datapoints to expose through the dataset interface. */
  constructor(items: Datapoint<D, T>[]);
  /** Number of datapoints held. */
  size(): number;
  /** Datapoint at the given zero-based index. */
  get(index: number): Datapoint<D, T>;
  /** Datapoints from `start` up to, but not including, `end`. */
  slice(start: number, end: number): Datapoint<D, T>[];
}
|
|
26
|
+
//#endregion
|
|
27
|
+
//#region src/eval/types.d.ts
|
|
28
|
+
/**
 * One entry of an evaluation dataset.
 */
interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
  /** Input handed to the executor. */
  data: D;
  /** Optional expected value, passed to evaluators as their second argument. */
  target?: T;
  /** Optional free-form metadata, copied onto the datapoint's result. */
  metadata?: Record<string, unknown>;
}
|
|
36
|
+
/**
 * Scoring function applied to an executor's output.
 *
 * Receives the output, the datapoint's target and the datapoint's data.
 * Returns a single number (0-1) or an object of named scores, either
 * directly or as a promise.
 */
type Evaluator<O = unknown, T = unknown, D = unknown> = (
  output: O,
  target?: T,
  data?: D
) => number | Record<string, number> | Promise<number | Record<string, number>>;
|
|
41
|
+
/**
 * The function under test: maps a datapoint's `data` to an output,
 * synchronously or as a promise.
 */
type Executor<D = Record<string, unknown>, O = unknown> = (
  data: D
) => O | Promise<O>;
|
|
45
|
+
/**
 * Configuration for evaluate().
 */
interface EvaluateOptions<D, T, O> {
  /** Name of this evaluation run. Required. */
  name: string;
  /** Dataset — inline array of datapoints or an EvaluationDataset */
  data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
  /** The function under test. Provide either executor or variants, not both. */
  executor?: Executor<D, O>;
  /** Named variants for side-by-side comparison. Keys become variant labels. */
  variants?: Record<string, Executor<D, O>>;
  /**
   * Named evaluator functions. Keys become score names.
   * Each evaluator is called with (output, target, data), so the third
   * parameter is typed with the dataset's data type `D`.
   */
  evaluators: Record<string, Evaluator<O, T, D>>;
  /** Maximum concurrent datapoints. Default: 5 */
  concurrency?: number;
  /** Group name for tracking score progression across runs. */
  group?: string;
  /** Metadata attached to the entire run. */
  metadata?: Record<string, unknown>;
  /**
   * Output directory for JSON results.
   * Default: the LLMOPS_EVAL_OUTPUT_DIR environment variable when set,
   * otherwise './llmops-evals'.
   */
  outputDir?: string;
}
|
|
68
|
+
/**
 * Result for a single datapoint.
 */
interface DatapointResult<D = unknown, O = unknown> {
  /** The datapoint's input, as passed to the executor. */
  data: D;
  /** The datapoint's expected value, if it had one. */
  target?: unknown;
  /** Metadata copied from the datapoint. */
  metadata?: Record<string, unknown>;
  /**
   * Executor output. `null` when the executor threw, because the runner
   * initialises it to `null` and only assigns it on success.
   */
  output: O | null;
  /**
   * Scores keyed by evaluator name, or `<evaluator>.<subKey>` for
   * multi-score evaluators. NaN marks an evaluator that threw. Empty when
   * the executor failed or returned `null`.
   */
  scores: Record<string, number>;
  /** Wall-clock time spent on this datapoint, in milliseconds. */
  durationMs: number;
  /** Message of the error thrown by the executor, if any. */
  error?: string;
}
|
|
80
|
+
/**
 * Summary statistics of one score across all datapoints of a run.
 * NaN scores are left out; all fields are 0 when nothing valid remains.
 */
interface ScoreStats {
  /** Arithmetic mean of the valid scores. */
  mean: number;
  /** Smallest valid score. */
  min: number;
  /** Largest valid score. */
  max: number;
  /** Median of the valid scores. */
  median: number;
  /** Number of valid scores that went into the statistics. */
  count: number;
}
|
|
90
|
+
/**
 * Outcome of running one executor over a dataset.
 */
interface EvaluateResult<D = unknown, O = unknown> {
  /** Evaluation name; `<name>/<variant>` for a variant's result. */
  name: string;
  /** Identifier of the run. */
  runId: string;
  /** Group name, when one was supplied. */
  group?: string;
  /** Aggregated statistics per score name. */
  scores: Record<string, ScoreStats>;
  /** Total run time in milliseconds. */
  durationMs: number;
  /** Number of datapoint results. */
  count: number;
  /** Number of datapoints whose executor reported an error. */
  errors: number;
  /** Run-level metadata, when supplied. */
  metadata?: Record<string, unknown>;
  /** Per-datapoint results, in dataset order. */
  results: DatapointResult<D, O>[];
}
|
|
104
|
+
/**
 * Returned by evaluate() when `variants` is used: one EvaluateResult per
 * variant, gathered under a shared run.
 */
interface VariantEvaluateResult<D = unknown, O = unknown> {
  /** Evaluation name. */
  name: string;
  /** Identifier of the run. */
  runId: string;
  /** Group name, when one was supplied. */
  group?: string;
  /** Time taken to run all variants, in milliseconds. */
  durationMs: number;
  /** Run-level metadata, when supplied. */
  metadata?: Record<string, unknown>;
  /** Results keyed by variant label. */
  variants: Record<string, EvaluateResult<D, O>>;
}
|
|
115
|
+
/**
 * Input for compare().
 */
interface CompareOptions {
  /** Run IDs to compare; at least two are required. First is baseline. */
  runs: string[];
  /** Directory where eval results are stored. Default: './llmops-evals' */
  outputDir?: string;
  /** Eval name to search within. Required. */
  name: string;
}
|
|
126
|
+
/**
 * Mean-score comparison of one evaluator between two runs.
 */
interface ScoreDelta {
  /** Mean score in the baseline run. */
  baseline: number;
  /** Mean score in the candidate run. */
  candidate: number;
  /** `candidate - baseline`; negative when the candidate scored lower. */
  delta: number;
}
|
|
134
|
+
/**
 * Outcome of compare(): per-score mean deltas plus the individual
 * datapoints whose score went down or up.
 */
interface CompareResult {
  /** Run ID used as the baseline. */
  baseline: string;
  /** Run ID used as the candidate. */
  candidate: string;
  /** Mean-score comparison per score name. */
  scores: Record<string, ScoreDelta>;
  /** Datapoint/evaluator pairs where the candidate scored lower. */
  regressions: Array<{
    data: unknown;
    evaluator: string;
    baselineScore: number;
    candidateScore: number;
  }>;
  /** Datapoint/evaluator pairs where the candidate scored higher. */
  improvements: Array<{
    data: unknown;
    evaluator: string;
    baselineScore: number;
    candidateScore: number;
  }>;
}
|
|
154
|
+
/**
 * Configuration accepted by judgeScorer().
 */
interface JudgeScorerOptions {
  /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
  model: string;
  /** Prompt template. Supports {{output}}, {{target}}, {{target.*}} placeholders. */
  prompt: string;
  /** The llmops client instance. Judge call routed through gateway. */
  ops: LLMOpsClient;
  /**
   * Custom parser for extracting score from LLM response.
   * Receives the message content and returns one score or named scores.
   */
  parse?: (response: string) => number | Record<string, number>;
}
|
|
167
|
+
//#endregion
|
|
168
|
+
//#region src/eval/evaluate.d.ts
|
|
169
|
+
/**
 * Runs an evaluation over a dataset.
 *
 * Resolves to an EvaluateResult when `executor` is given and to a
 * VariantEvaluateResult when `variants` is given.
 */
declare function evaluate<
  D = Record<string, unknown>,
  T = Record<string, unknown>,
  O = unknown
>(
  options: EvaluateOptions<D, T, O>
): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
|
|
170
|
+
//#endregion
|
|
171
|
+
//#region src/eval/compare.d.ts
|
|
172
|
+
/**
 * Compares two stored eval runs; the first run ID is treated as the baseline.
 *
 * Usage:
 * ```ts
 * const diff = await compare({
 *   name: 'support-bot',
 *   runs: [run1.runId, run2.runId],
 * })
 * ```
 */
declare function compare(
  options: CompareOptions
): Promise<CompareResult>;
|
|
184
|
+
//#endregion
|
|
185
|
+
//#region src/eval/judge.d.ts
|
|
186
|
+
/**
 * Builds an Evaluator that asks an LLM to score the executor output.
 *
 * Usage:
 * ```ts
 * const accuracy = judgeScorer({
 *   model: '@openai/gpt-4o',
 *   prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
 *   ops,
 * })
 * ```
 */
declare function judgeScorer(
  options: JudgeScorerOptions
): Evaluator;
|
|
199
|
+
//#endregion
|
|
200
|
+
export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };
|