@llmops/sdk 1.0.0-beta.19 → 1.0.0-beta.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/agents.d.cts +1 -1
  2. package/dist/agents.d.mts +1 -1
  3. package/dist/agents.mjs +1 -1
  4. package/dist/eval.cjs +367 -0
  5. package/dist/eval.d.cts +200 -0
  6. package/dist/eval.d.mts +200 -0
  7. package/dist/eval.mjs +364 -0
  8. package/dist/express.d.cts +2 -2
  9. package/dist/express.d.mts +2 -2
  10. package/dist/hono.d.cts +2 -2
  11. package/dist/hono.d.mts +2 -2
  12. package/dist/{index-BpwfOQEm.d.cts → index-BZLzywwb.d.mts} +1 -1
  13. package/dist/{index-CniO6im-.d.mts → index-lgspeSNr.d.cts} +1 -1
  14. package/dist/index.d.cts +3 -3
  15. package/dist/index.d.mts +3 -3
  16. package/dist/index.mjs +2 -2
  17. package/dist/nextjs.d.cts +2 -2
  18. package/dist/nextjs.d.mts +2 -2
  19. package/dist/store/d1.d.cts +1 -1
  20. package/dist/store/d1.d.mts +1 -1
  21. package/dist/store/pg.d.cts +1 -1
  22. package/dist/store/pg.d.mts +1 -1
  23. package/dist/store/pg.mjs +2 -2
  24. package/dist/store/sqlite.d.cts +1 -1
  25. package/dist/store/sqlite.d.mts +1 -1
  26. package/dist/store/sqlite.mjs +1 -1
  27. package/dist/types.d.cts +1 -1
  28. package/dist/types.d.mts +1 -1
  29. package/package.json +13 -3
  30. /package/dist/{agents-exporter-BY7BXquG.mjs → agents-exporter-CGxTzDeQ.mjs} +0 -0
  31. /package/dist/{agents-exporter-CI29gyrT.d.mts → agents-exporter-CehKIArI.d.mts} +0 -0
  32. /package/dist/{agents-exporter-YNq4HFTK.d.cts → agents-exporter-DkqkCcIx.d.cts} +0 -0
  33. /package/dist/{chunk-rZg9L66_.mjs → chunk-CxwUPGYo.mjs} +0 -0
  34. /package/dist/{constants-BVeqLv6F.mjs → constants-BvnYU_pl.mjs} +0 -0
  35. /package/dist/{interface-BGkC9Ml4.d.mts → interface-BbAwy96d.d.cts} +0 -0
  36. /package/dist/{interface-Dix77UtO.d.cts → interface-Dz7B6QN1.d.mts} +0 -0
package/dist/agents.d.cts CHANGED
@@ -1,2 +1,2 @@
1
- import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-YNq4HFTK.cjs";
1
+ import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-DkqkCcIx.cjs";
2
2
  export { AgentsSpan, AgentsSpanData, AgentsSpanError, AgentsTrace, AgentsTracingExporter, LLMOpsAgentsExporterConfig, createLLMOpsAgentsExporter };
package/dist/agents.d.mts CHANGED
@@ -1,2 +1,2 @@
1
- import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-CI29gyrT.mjs";
1
+ import { a as AgentsTracingExporter, i as AgentsTrace, n as AgentsSpanData, o as LLMOpsAgentsExporterConfig, r as AgentsSpanError, s as createLLMOpsAgentsExporter, t as AgentsSpan } from "./agents-exporter-CehKIArI.mjs";
2
2
  export { AgentsSpan, AgentsSpanData, AgentsSpanError, AgentsTrace, AgentsTracingExporter, LLMOpsAgentsExporterConfig, createLLMOpsAgentsExporter };
package/dist/agents.mjs CHANGED
@@ -1,3 +1,3 @@
1
- import { t as createLLMOpsAgentsExporter } from "./agents-exporter-BY7BXquG.mjs";
1
+ import { t as createLLMOpsAgentsExporter } from "./agents-exporter-CGxTzDeQ.mjs";
2
2
 
3
3
  export { createLLMOpsAgentsExporter };
package/dist/eval.cjs ADDED
@@ -0,0 +1,367 @@
1
+ let node_crypto = require("node:crypto");
2
+ let node_fs = require("node:fs");
3
+ let node_path = require("node:path");
4
+
5
+ //#region src/eval/dataset.ts
6
+ /**
7
+ * Wraps a plain array as an EvaluationDataset.
8
+ */
9
+ var InlineDataset = class {
10
+ constructor(items) {
11
+ this.items = items;
12
+ }
13
+ size() {
14
+ return this.items.length;
15
+ }
16
+ get(index) {
17
+ return this.items[index];
18
+ }
19
+ slice(start, end) {
20
+ return this.items.slice(start, end);
21
+ }
22
+ };
23
+
24
+ //#endregion
25
+ //#region src/eval/evaluate.ts
26
+ async function pool(items, concurrency, fn) {
27
+ const executing = [];
28
+ for (const item of items) {
29
+ const p = fn(item).then(() => {
30
+ executing.splice(executing.indexOf(p), 1);
31
+ });
32
+ executing.push(p);
33
+ if (executing.length >= concurrency) await Promise.race(executing);
34
+ }
35
+ await Promise.all(executing);
36
+ }
37
+ function computeStats(values) {
38
+ const valid = values.filter((v) => !Number.isNaN(v));
39
+ if (valid.length === 0) return {
40
+ mean: 0,
41
+ min: 0,
42
+ max: 0,
43
+ median: 0,
44
+ count: 0
45
+ };
46
+ const sorted = [...valid].sort((a, b) => a - b);
47
+ const sum = sorted.reduce((a, b) => a + b, 0);
48
+ const mid = Math.floor(sorted.length / 2);
49
+ const median = sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
50
+ return {
51
+ mean: sum / sorted.length,
52
+ min: sorted[0],
53
+ max: sorted[sorted.length - 1],
54
+ median,
55
+ count: sorted.length
56
+ };
57
+ }
58
+ async function runSingleExecutor(dataset, executor, evaluators, concurrency) {
59
+ const size = await dataset.size();
60
+ const datapoints = await dataset.slice(0, size);
61
+ const results = new Array(datapoints.length);
62
+ const startTime = Date.now();
63
+ await pool(datapoints, concurrency, async (dp) => {
64
+ const idx = datapoints.indexOf(dp);
65
+ const dpStart = Date.now();
66
+ let output = null;
67
+ let error;
68
+ const scores = {};
69
+ try {
70
+ output = await executor(dp.data);
71
+ } catch (err) {
72
+ error = err instanceof Error ? err.message : String(err);
73
+ }
74
+ if (!error && output !== null) for (const [name, evaluator] of Object.entries(evaluators)) try {
75
+ const result = await evaluator(output, dp.target, dp.data);
76
+ if (typeof result === "number") scores[name] = result;
77
+ else for (const [subKey, subScore] of Object.entries(result)) scores[`${name}.${subKey}`] = subScore;
78
+ } catch {
79
+ scores[name] = NaN;
80
+ }
81
+ results[idx] = {
82
+ data: dp.data,
83
+ target: dp.target,
84
+ metadata: dp.metadata,
85
+ output,
86
+ scores,
87
+ durationMs: Date.now() - dpStart,
88
+ error
89
+ };
90
+ });
91
+ return {
92
+ results,
93
+ durationMs: Date.now() - startTime
94
+ };
95
+ }
96
+ function printSummary(result) {
97
+ const lines = [];
98
+ lines.push("");
99
+ lines.push(` ${result.name}`);
100
+ lines.push("");
101
+ const completed = result.count - result.errors;
102
+ lines.push(` ✓ ${completed}/${result.count} completed${result.errors > 0 ? ` ✗ ${result.errors} errors` : ""}`);
103
+ lines.push("");
104
+ lines.push(" Scores:");
105
+ for (const [name, stats] of Object.entries(result.scores)) lines.push(` ${name.padEnd(16)} mean=${stats.mean.toFixed(2)} min=${stats.min.toFixed(2)} max=${stats.max.toFixed(2)} median=${stats.median.toFixed(2)}`);
106
+ lines.push("");
107
+ lines.push(` Duration: ${(result.durationMs / 1e3).toFixed(1)}s`);
108
+ lines.push(` Run ID: ${result.runId}`);
109
+ lines.push("");
110
+ process.stderr.write(lines.join("\n"));
111
+ }
112
+ function saveResult(result, outputDir) {
113
+ const dir = (0, node_path.join)(outputDir, result.name);
114
+ (0, node_fs.mkdirSync)(dir, { recursive: true });
115
+ (0, node_fs.writeFileSync)((0, node_path.join)(dir, `${result.runId}.json`), JSON.stringify(result, null, 2));
116
+ }
117
+ async function evaluate(options) {
118
+ const { name, data, executor, variants, evaluators, concurrency = 5, group, metadata, outputDir = process.env.LLMOPS_EVAL_OUTPUT_DIR || "./llmops-evals" } = options;
119
+ const runId = (0, node_crypto.randomUUID)();
120
+ if (executor && variants) throw new Error("evaluate(): provide either executor or variants, not both");
121
+ if (!executor && !variants) throw new Error("evaluate(): provide either executor or variants");
122
+ const dataset = Array.isArray(data) ? new InlineDataset(data) : data;
123
+ if (executor) {
124
+ const { results, durationMs } = await runSingleExecutor(dataset, executor, evaluators, concurrency);
125
+ const scoreNames = /* @__PURE__ */ new Set();
126
+ for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
127
+ const scores = {};
128
+ for (const scoreName of scoreNames) scores[scoreName] = computeStats(results.map((r) => r.scores[scoreName] ?? NaN));
129
+ const result = {
130
+ name,
131
+ runId,
132
+ group,
133
+ scores,
134
+ durationMs,
135
+ count: results.length,
136
+ errors: results.filter((r) => r.error).length,
137
+ metadata,
138
+ results
139
+ };
140
+ if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(result, null, 2));
141
+ else printSummary(result);
142
+ saveResult(result, outputDir);
143
+ return result;
144
+ }
145
+ const variantResults = {};
146
+ const totalStart = Date.now();
147
+ for (const [variantName, variantExecutor] of Object.entries(variants)) {
148
+ const { results, durationMs } = await runSingleExecutor(dataset, variantExecutor, evaluators, concurrency);
149
+ const scoreNames = /* @__PURE__ */ new Set();
150
+ for (const r of results) for (const key of Object.keys(r.scores)) scoreNames.add(key);
151
+ const scores = {};
152
+ for (const scoreName of scoreNames) scores[scoreName] = computeStats(results.map((r) => r.scores[scoreName] ?? NaN));
153
+ const variantResult = {
154
+ name: `${name}/${variantName}`,
155
+ runId,
156
+ group,
157
+ scores,
158
+ durationMs,
159
+ count: results.length,
160
+ errors: results.filter((r) => r.error).length,
161
+ metadata,
162
+ results
163
+ };
164
+ variantResults[variantName] = variantResult;
165
+ if (process.env.LLMOPS_EVAL_OUTPUT !== "json") printSummary(variantResult);
166
+ saveResult(variantResult, outputDir);
167
+ }
168
+ const variantEvalResult = {
169
+ name,
170
+ runId,
171
+ group,
172
+ durationMs: Date.now() - totalStart,
173
+ metadata,
174
+ variants: variantResults
175
+ };
176
+ if (process.env.LLMOPS_EVAL_OUTPUT === "json") process.stdout.write(JSON.stringify(variantEvalResult, null, 2));
177
+ return variantEvalResult;
178
+ }
179
+
180
+ //#endregion
181
+ //#region src/eval/compare.ts
182
+ /**
183
+ * Load an eval run from the filesystem.
184
+ */
185
+ function loadRun(outputDir, name, runId) {
186
+ const dir = (0, node_path.join)(outputDir, name);
187
+ const filePath = (0, node_path.join)(dir, `${runId}.json`);
188
+ try {
189
+ const content = (0, node_fs.readFileSync)(filePath, "utf-8");
190
+ return JSON.parse(content);
191
+ } catch {
192
+ try {
193
+ const match = (0, node_fs.readdirSync)(dir).find((f) => f.startsWith(runId) && f.endsWith(".json"));
194
+ if (match) {
195
+ const content = (0, node_fs.readFileSync)((0, node_path.join)(dir, match), "utf-8");
196
+ return JSON.parse(content);
197
+ }
198
+ } catch {}
199
+ throw new Error(`Eval run "${runId}" not found for "${name}" in ${outputDir}. Expected file: ${filePath}`);
200
+ }
201
+ }
202
+ /**
203
+ * Compare two eval runs. First run ID is the baseline.
204
+ *
205
+ * Usage:
206
+ * ```ts
207
+ * const diff = await compare({
208
+ * name: 'support-bot',
209
+ * runs: [run1.runId, run2.runId],
210
+ * })
211
+ * ```
212
+ */
213
+ async function compare(options) {
214
+ const { runs, name, outputDir = "./llmops-evals" } = options;
215
+ if (runs.length < 2) throw new Error("compare() requires at least 2 run IDs");
216
+ const baselineRun = loadRun(outputDir, name, runs[0]);
217
+ const candidateRun = loadRun(outputDir, name, runs[1]);
218
+ const allScoreNames = new Set([...Object.keys(baselineRun.scores), ...Object.keys(candidateRun.scores)]);
219
+ const scores = {};
220
+ for (const scoreName of allScoreNames) {
221
+ const baselineMean = baselineRun.scores[scoreName]?.mean ?? 0;
222
+ const candidateMean = candidateRun.scores[scoreName]?.mean ?? 0;
223
+ scores[scoreName] = {
224
+ baseline: baselineMean,
225
+ candidate: candidateMean,
226
+ delta: candidateMean - baselineMean
227
+ };
228
+ }
229
+ const regressions = [];
230
+ const improvements = [];
231
+ const minLen = Math.min(baselineRun.results.length, candidateRun.results.length);
232
+ for (let i = 0; i < minLen; i++) {
233
+ const baselineResult = baselineRun.results[i];
234
+ const candidateResult = candidateRun.results[i];
235
+ for (const scoreName of allScoreNames) {
236
+ const baselineScore = baselineResult.scores[scoreName] ?? NaN;
237
+ const candidateScore = candidateResult.scores[scoreName] ?? NaN;
238
+ if (Number.isNaN(baselineScore) || Number.isNaN(candidateScore)) continue;
239
+ if (candidateScore < baselineScore) regressions.push({
240
+ data: baselineResult.data,
241
+ evaluator: scoreName,
242
+ baselineScore,
243
+ candidateScore
244
+ });
245
+ else if (candidateScore > baselineScore) improvements.push({
246
+ data: baselineResult.data,
247
+ evaluator: scoreName,
248
+ baselineScore,
249
+ candidateScore
250
+ });
251
+ }
252
+ }
253
+ const result = {
254
+ baseline: runs[0],
255
+ candidate: runs[1],
256
+ scores,
257
+ regressions,
258
+ improvements
259
+ };
260
+ const lines = [];
261
+ lines.push("");
262
+ lines.push(` compare: ${runs[0].slice(0, 8)} → ${runs[1].slice(0, 8)}`);
263
+ lines.push("");
264
+ lines.push(" Scores:");
265
+ for (const [scoreName, delta] of Object.entries(scores)) {
266
+ const sign = delta.delta >= 0 ? "+" : "";
267
+ const marker = delta.delta >= 0 ? "✓" : "✗";
268
+ lines.push(` ${scoreName.padEnd(16)} ${delta.baseline.toFixed(2)} → ${delta.candidate.toFixed(2)} (${sign}${delta.delta.toFixed(2)}) ${marker}`);
269
+ }
270
+ if (regressions.length > 0) {
271
+ lines.push("");
272
+ lines.push(` Regressions (${regressions.length}):`);
273
+ for (const r of regressions.slice(0, 5)) {
274
+ const dataStr = typeof r.data === "string" ? r.data : JSON.stringify(r.data).slice(0, 60);
275
+ lines.push(` "${dataStr}" ${r.evaluator}: ${r.baselineScore.toFixed(2)} → ${r.candidateScore.toFixed(2)}`);
276
+ }
277
+ if (regressions.length > 5) lines.push(` ... and ${regressions.length - 5} more`);
278
+ }
279
+ if (improvements.length > 0) {
280
+ lines.push("");
281
+ lines.push(` Improvements (${improvements.length}):`);
282
+ for (const imp of improvements.slice(0, 5)) {
283
+ const dataStr = typeof imp.data === "string" ? imp.data : JSON.stringify(imp.data).slice(0, 60);
284
+ lines.push(` "${dataStr}" ${imp.evaluator}: ${imp.baselineScore.toFixed(2)} → ${imp.candidateScore.toFixed(2)}`);
285
+ }
286
+ if (improvements.length > 5) lines.push(` ... and ${improvements.length - 5} more`);
287
+ }
288
+ lines.push("");
289
+ process.stderr.write(lines.join("\n"));
290
+ return result;
291
+ }
292
+
293
+ //#endregion
294
+ //#region src/eval/judge.ts
295
+ /**
296
+ * Simple mustache-style template interpolation.
297
+ */
298
+ function interpolate(template, vars) {
299
+ return template.replace(/\{\{(\w+(?:\.\w+)*)\}\}/g, (_, path) => {
300
+ const value = path.split(".").reduce((obj, key) => obj?.[key], vars);
301
+ return typeof value === "string" ? value : JSON.stringify(value);
302
+ });
303
+ }
304
+ /**
305
+ * Default parser: expects JSON with a `score` field, a bare number,
306
+ * or an object of number values (multi-score).
307
+ */
308
+ function defaultParse(response) {
309
+ const cleaned = response.replace(/```json\n?|```/g, "").trim();
310
+ const parsed = JSON.parse(cleaned);
311
+ if (typeof parsed === "number") return parsed;
312
+ if (typeof parsed?.score === "number") return parsed.score;
313
+ if (typeof parsed === "object" && parsed !== null) {
314
+ const entries = Object.entries(parsed).filter(([, v]) => typeof v === "number");
315
+ if (entries.length > 0) return Object.fromEntries(entries);
316
+ }
317
+ throw new Error(`Could not extract score from judge response: ${response.slice(0, 200)}`);
318
+ }
319
+ /**
320
+ * Factory that returns an Evaluator which uses an LLM to score output.
321
+ *
322
+ * Usage:
323
+ * ```ts
324
+ * const accuracy = judgeScorer({
325
+ * model: '@openai/gpt-4o',
326
+ * prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
327
+ * ops,
328
+ * })
329
+ * ```
330
+ */
331
+ function judgeScorer(options) {
332
+ const { model, prompt, ops, parse = defaultParse } = options;
333
+ return async (output, target) => {
334
+ const vars = {
335
+ output: typeof output === "string" ? output : JSON.stringify(output),
336
+ target
337
+ };
338
+ if (target && typeof target === "object") for (const [k, v] of Object.entries(target)) vars[`target.${k}`] = v;
339
+ const renderedPrompt = interpolate(prompt, vars);
340
+ const providerConfig = ops.provider();
341
+ const response = await providerConfig.fetch(`${providerConfig.baseURL}/chat/completions`, {
342
+ method: "POST",
343
+ headers: {
344
+ "Content-Type": "application/json",
345
+ Authorization: `Bearer ${providerConfig.apiKey}`
346
+ },
347
+ body: JSON.stringify({
348
+ model,
349
+ messages: [{
350
+ role: "user",
351
+ content: renderedPrompt
352
+ }],
353
+ response_format: { type: "json_object" }
354
+ })
355
+ });
356
+ if (!response.ok) throw new Error(`Judge LLM call failed: ${response.status} ${await response.text()}`);
357
+ const content = (await response.json()).choices?.[0]?.message?.content;
358
+ if (!content) throw new Error("Judge LLM returned empty response");
359
+ return parse(content);
360
+ };
361
+ }
362
+
363
+ //#endregion
364
+ exports.InlineDataset = InlineDataset;
365
+ exports.compare = compare;
366
+ exports.evaluate = evaluate;
367
+ exports.judgeScorer = judgeScorer;
package/dist/eval.d.cts ADDED
@@ -0,0 +1,200 @@
1
+ import "./agents-exporter-DkqkCcIx.cjs";
2
+ import { t as LLMOpsClient } from "./index-lgspeSNr.cjs";
3
+
4
+ //#region src/eval/dataset.d.ts
5
+
6
+ /**
7
+ * Interface for custom dataset sources.
8
+ * Built-in: inline arrays are wrapped in InlineDataset automatically.
9
+ * Future: CSVDataset, JSONLDataset, S3Dataset.
10
+ */
11
+ interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
12
+ size(): number | Promise<number>;
13
+ get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
14
+ slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
15
+ }
16
+ /**
17
+ * Wraps a plain array as an EvaluationDataset.
18
+ */
19
+ declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
20
+ private items;
21
+ constructor(items: Datapoint<D, T>[]);
22
+ size(): number;
23
+ get(index: number): Datapoint<D, T>;
24
+ slice(start: number, end: number): Datapoint<D, T>[];
25
+ }
26
+ //#endregion
27
+ //#region src/eval/types.d.ts
28
+ /**
29
+ * A single datapoint in a dataset.
30
+ */
31
+ interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
32
+ data: D;
33
+ target?: T;
34
+ metadata?: Record<string, unknown>;
35
+ }
36
+ /**
37
+ * An evaluator scores executor output.
38
+ * Returns a single number (0-1) or an object of named scores.
39
+ */
40
+ type Evaluator<O = unknown, T = unknown, D = unknown> = (output: O, target?: T, data?: D) => number | Record<string, number> | Promise<number | Record<string, number>>;
41
+ /**
42
+ * An executor is the function under test.
43
+ */
44
+ type Executor<D = Record<string, unknown>, O = unknown> = (data: D) => O | Promise<O>;
45
+ /**
46
+ * Configuration for evaluate().
47
+ */
48
+ interface EvaluateOptions<D, T, O> {
49
+ /** Name of this evaluation run. Required. */
50
+ name: string;
51
+ /** Dataset — inline array of datapoints or an EvaluationDataset */
52
+ data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
53
+ /** The function under test. Provide either executor or variants, not both. */
54
+ executor?: Executor<D, O>;
55
+ /** Named variants for side-by-side comparison. Keys become variant labels. */
56
+ variants?: Record<string, Executor<D, O>>;
57
+ /** Named evaluator functions. Keys become score names. */
58
+ evaluators: Record<string, Evaluator<O, T>>;
59
+ /** Maximum concurrent datapoints. Default: 5 */
60
+ concurrency?: number;
61
+ /** Group name for tracking score progression across runs. */
62
+ group?: string;
63
+ /** Metadata attached to the entire run. */
64
+ metadata?: Record<string, unknown>;
65
+ /** Output directory for JSON results. Default: './llmops-evals' */
66
+ outputDir?: string;
67
+ }
68
+ /**
69
+ * Result for a single datapoint.
70
+ */
71
+ interface DatapointResult<D = unknown, O = unknown> {
72
+ data: D;
73
+ target?: unknown;
74
+ metadata?: Record<string, unknown>;
75
+ output: O;
76
+ scores: Record<string, number>;
77
+ durationMs: number;
78
+ error?: string;
79
+ }
80
+ /**
81
+ * Aggregated score statistics for one evaluator.
82
+ */
83
+ interface ScoreStats {
84
+ mean: number;
85
+ min: number;
86
+ max: number;
87
+ median: number;
88
+ count: number;
89
+ }
90
+ /**
91
+ * Summary of an evaluation run.
92
+ */
93
+ interface EvaluateResult<D = unknown, O = unknown> {
94
+ name: string;
95
+ runId: string;
96
+ group?: string;
97
+ scores: Record<string, ScoreStats>;
98
+ durationMs: number;
99
+ count: number;
100
+ errors: number;
101
+ metadata?: Record<string, unknown>;
102
+ results: DatapointResult<D, O>[];
103
+ }
104
+ /**
105
+ * When variants are used, wraps per-variant results.
106
+ */
107
+ interface VariantEvaluateResult<D = unknown, O = unknown> {
108
+ name: string;
109
+ runId: string;
110
+ group?: string;
111
+ durationMs: number;
112
+ metadata?: Record<string, unknown>;
113
+ variants: Record<string, EvaluateResult<D, O>>;
114
+ }
115
+ /**
116
+ * Options for compare().
117
+ */
118
+ interface CompareOptions {
119
+ /** Run IDs to compare. First is baseline. */
120
+ runs: string[];
121
+ /** Directory where eval results are stored. Default: './llmops-evals' */
122
+ outputDir?: string;
123
+ /** Eval name to search within. Required. */
124
+ name: string;
125
+ }
126
+ /**
127
+ * Per-evaluator delta between two runs.
128
+ */
129
+ interface ScoreDelta {
130
+ baseline: number;
131
+ candidate: number;
132
+ delta: number;
133
+ }
134
+ /**
135
+ * Result of comparing two runs.
136
+ */
137
+ interface CompareResult {
138
+ baseline: string;
139
+ candidate: string;
140
+ scores: Record<string, ScoreDelta>;
141
+ regressions: Array<{
142
+ data: unknown;
143
+ evaluator: string;
144
+ baselineScore: number;
145
+ candidateScore: number;
146
+ }>;
147
+ improvements: Array<{
148
+ data: unknown;
149
+ evaluator: string;
150
+ baselineScore: number;
151
+ candidateScore: number;
152
+ }>;
153
+ }
154
+ /**
155
+ * Options for judgeScorer().
156
+ */
157
+ interface JudgeScorerOptions {
158
+ /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
159
+ model: string;
160
+ /** Prompt template. Supports {{output}}, {{target}}, {{target.*}} placeholders. */
161
+ prompt: string;
162
+ /** The llmops client instance. Judge call routed through gateway. */
163
+ ops: LLMOpsClient;
164
+ /** Custom parser for extracting score from LLM response. */
165
+ parse?: (response: string) => number | Record<string, number>;
166
+ }
167
+ //#endregion
168
+ //#region src/eval/evaluate.d.ts
169
+ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknown>, O = unknown>(options: EvaluateOptions<D, T, O>): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
170
+ //#endregion
171
+ //#region src/eval/compare.d.ts
172
+ /**
173
+ * Compare two eval runs. First run ID is the baseline.
174
+ *
175
+ * Usage:
176
+ * ```ts
177
+ * const diff = await compare({
178
+ * name: 'support-bot',
179
+ * runs: [run1.runId, run2.runId],
180
+ * })
181
+ * ```
182
+ */
183
+ declare function compare(options: CompareOptions): Promise<CompareResult>;
184
+ //#endregion
185
+ //#region src/eval/judge.d.ts
186
+ /**
187
+ * Factory that returns an Evaluator which uses an LLM to score output.
188
+ *
189
+ * Usage:
190
+ * ```ts
191
+ * const accuracy = judgeScorer({
192
+ * model: '@openai/gpt-4o',
193
+ * prompt: 'Rate accuracy 0-1. Expected: {{target.answer}} Actual: {{output}}',
194
+ * ops,
195
+ * })
196
+ * ```
197
+ */
198
+ declare function judgeScorer(options: JudgeScorerOptions): Evaluator;
199
+ //#endregion
200
+ export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };