@llmops/sdk 1.0.0-beta.2 → 1.0.0-beta.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents.cjs +1 -1
- package/dist/agents.d.cts +1 -1
- package/dist/agents.d.mts +1 -1
- package/dist/agents.mjs +1 -1
- package/dist/chunk-CxwUPGYo.mjs +21 -0
- package/dist/constants--ywcWP7q.cjs +18 -0
- package/dist/constants-BvnYU_pl.mjs +12 -0
- package/dist/eval.cjs +464 -0
- package/dist/eval.d.cts +240 -0
- package/dist/eval.d.mts +240 -0
- package/dist/eval.mjs +461 -0
- package/dist/express.cjs +29 -2
- package/dist/express.d.cts +7 -3
- package/dist/express.d.mts +7 -3
- package/dist/express.mjs +28 -1
- package/dist/hono.d.cts +2 -2
- package/dist/hono.d.mts +2 -2
- package/dist/{index-05byZKeu.d.mts → index-BZLzywwb.d.mts} +1 -1
- package/dist/{index-Beb26ZNG.d.cts → index-lgspeSNr.d.cts} +1 -1
- package/dist/index.cjs +3 -3
- package/dist/index.d.cts +4 -4
- package/dist/index.d.mts +4 -4
- package/dist/index.mjs +3 -3
- package/dist/interface-BbAwy96d.d.cts +223 -0
- package/dist/interface-Dz7B6QN1.d.mts +223 -0
- package/dist/nextjs.d.cts +2 -2
- package/dist/nextjs.d.mts +2 -2
- package/dist/store/d1.cjs +512 -0
- package/dist/store/d1.d.cts +60 -0
- package/dist/store/d1.d.mts +60 -0
- package/dist/store/d1.mjs +511 -0
- package/dist/store/pg.cjs +13634 -6
- package/dist/store/pg.d.cts +38 -2
- package/dist/store/pg.d.mts +38 -2
- package/dist/store/pg.mjs +13618 -2
- package/dist/store/sqlite.cjs +541 -0
- package/dist/store/sqlite.d.cts +50 -0
- package/dist/store/sqlite.d.mts +50 -0
- package/dist/store/sqlite.mjs +541 -0
- package/dist/types.d.cts +2 -2
- package/dist/types.d.mts +2 -2
- package/package.json +48 -3
- package/dist/express-B-wbCza5.cjs +0 -35
- package/dist/express-DMtc0d_Y.mjs +0 -30
- package/dist/index-DnWGper4.d.cts +0 -7
- package/dist/index-Dvz-L2Hf.d.mts +0 -7
- /package/dist/{agents-exporter-vcpgCF69.mjs → agents-exporter-CGxTzDeQ.mjs} +0 -0
- /package/dist/{agents-exporter-BZHCcFSd.d.mts → agents-exporter-CehKIArI.d.mts} +0 -0
- /package/dist/{agents-exporter-BuTq2n2y.cjs → agents-exporter-DizRE7CQ.cjs} +0 -0
- /package/dist/{agents-exporter-uzN3bkth.d.cts → agents-exporter-DkqkCcIx.d.cts} +0 -0
package/dist/eval.d.cts
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import "./agents-exporter-DkqkCcIx.cjs";
|
|
2
|
+
import { t as LLMOpsClient } from "./index-lgspeSNr.cjs";
|
|
3
|
+
|
|
4
|
+
//#region src/eval/dataset.d.ts
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Interface for custom dataset sources.
|
|
8
|
+
* Built-in: inline arrays are wrapped in InlineDataset automatically.
|
|
9
|
+
* Planned for a future release (not yet available): CSVDataset, JSONLDataset, S3Dataset.
|
|
10
|
+
*/
|
|
11
|
+
interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
|
|
12
|
+
size(): number | Promise<number>;
|
|
13
|
+
get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
|
|
14
|
+
slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Wraps a plain array as an EvaluationDataset.
|
|
18
|
+
*/
|
|
19
|
+
declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
|
|
20
|
+
private items;
|
|
21
|
+
constructor(items: Datapoint<D, T>[]);
|
|
22
|
+
size(): number;
|
|
23
|
+
get(index: number): Datapoint<D, T>;
|
|
24
|
+
slice(start: number, end: number): Datapoint<D, T>[];
|
|
25
|
+
}
|
|
26
|
+
//#endregion
|
|
27
|
+
//#region src/eval/types.d.ts
|
|
28
|
+
/**
|
|
29
|
+
* A single datapoint in a dataset.
|
|
30
|
+
*/
|
|
31
|
+
interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
|
|
32
|
+
data: D;
|
|
33
|
+
target?: T;
|
|
34
|
+
metadata?: Record<string, unknown>;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* An evaluator scores executor output.
|
|
38
|
+
* Returns a single score (0-1) or an object of named scores, either directly or as a Promise.
|
|
39
|
+
*/
|
|
40
|
+
type Evaluator<O = unknown, T = unknown, D = unknown> = (output: O, target?: T, data?: D) => number | Record<string, number> | Promise<number | Record<string, number>>;
|
|
41
|
+
/**
|
|
42
|
+
* An executor is the function under test.
|
|
43
|
+
*/
|
|
44
|
+
type Executor<D = Record<string, unknown>, O = unknown> = (data: D) => O | Promise<O>;
|
|
45
|
+
/**
|
|
46
|
+
* Configuration for evaluate().
|
|
47
|
+
*/
|
|
48
|
+
interface EvaluateOptions<D, T, O> {
|
|
49
|
+
/** Name of this evaluation run. Required. */
|
|
50
|
+
name: string;
|
|
51
|
+
/** Dataset — an inline array of datapoints (wrapped in InlineDataset automatically) or a custom EvaluationDataset. */
|
|
52
|
+
data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
|
|
53
|
+
/** The function under test. Provide either executor or variants, not both. */
|
|
54
|
+
executor?: Executor<D, O>;
|
|
55
|
+
/** Named variants for side-by-side comparison. Keys become variant labels. */
|
|
56
|
+
variants?: Record<string, Executor<D, O>>;
|
|
57
|
+
/** Named evaluator functions. Keys become score names. */
|
|
58
|
+
evaluators: Record<string, Evaluator<O, T>>;
|
|
59
|
+
/** Maximum number of datapoints processed concurrently. Default: 5. */
|
|
60
|
+
concurrency?: number;
|
|
61
|
+
/** Group name for tracking score progression across runs. */
|
|
62
|
+
group?: string;
|
|
63
|
+
/** Metadata attached to the entire run. */
|
|
64
|
+
metadata?: Record<string, unknown>;
|
|
65
|
+
/** Output directory for JSON results. Default: './llmops-evals' */
|
|
66
|
+
outputDir?: string;
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Result for a single datapoint.
|
|
70
|
+
*/
|
|
71
|
+
interface DatapointResult<D = unknown, O = unknown> {
|
|
72
|
+
data: D;
|
|
73
|
+
target?: unknown;
|
|
74
|
+
metadata?: Record<string, unknown>;
|
|
75
|
+
output: O;
|
|
76
|
+
scores: Record<string, number>;
|
|
77
|
+
durationMs: number;
|
|
78
|
+
error?: string;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Aggregated score statistics for one evaluator.
|
|
82
|
+
*/
|
|
83
|
+
interface ScoreStats {
|
|
84
|
+
mean: number;
|
|
85
|
+
min: number;
|
|
86
|
+
max: number;
|
|
87
|
+
median: number;
|
|
88
|
+
count: number;
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Summary of an evaluation run.
|
|
92
|
+
*/
|
|
93
|
+
interface EvaluateResult<D = unknown, O = unknown> {
|
|
94
|
+
name: string;
|
|
95
|
+
runId: string;
|
|
96
|
+
group?: string;
|
|
97
|
+
scores: Record<string, ScoreStats>;
|
|
98
|
+
durationMs: number;
|
|
99
|
+
count: number;
|
|
100
|
+
errors: number;
|
|
101
|
+
metadata?: Record<string, unknown>;
|
|
102
|
+
results: DatapointResult<D, O>[];
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Result shape when `variants` is used: wraps one EvaluateResult per variant, keyed by variant label.
|
|
106
|
+
*/
|
|
107
|
+
interface VariantEvaluateResult<D = unknown, O = unknown> {
|
|
108
|
+
name: string;
|
|
109
|
+
runId: string;
|
|
110
|
+
group?: string;
|
|
111
|
+
durationMs: number;
|
|
112
|
+
metadata?: Record<string, unknown>;
|
|
113
|
+
variants: Record<string, EvaluateResult<D, O>>;
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Options for compare().
|
|
117
|
+
*/
|
|
118
|
+
interface CompareOptions {
|
|
119
|
+
/** Paths to eval result JSON files. First is baseline, second is candidate. */
|
|
120
|
+
files: [string, string];
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Per-evaluator delta between two runs.
|
|
124
|
+
*/
|
|
125
|
+
interface ScoreDelta {
|
|
126
|
+
baseline: number;
|
|
127
|
+
candidate: number;
|
|
128
|
+
delta: number;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Result of comparing two runs.
|
|
132
|
+
*/
|
|
133
|
+
interface CompareResult {
|
|
134
|
+
baseline: string;
|
|
135
|
+
candidate: string;
|
|
136
|
+
scores: Record<string, ScoreDelta>;
|
|
137
|
+
regressions: Array<{
|
|
138
|
+
data: unknown;
|
|
139
|
+
evaluator: string;
|
|
140
|
+
baselineScore: number;
|
|
141
|
+
candidateScore: number;
|
|
142
|
+
}>;
|
|
143
|
+
improvements: Array<{
|
|
144
|
+
data: unknown;
|
|
145
|
+
evaluator: string;
|
|
146
|
+
baselineScore: number;
|
|
147
|
+
candidateScore: number;
|
|
148
|
+
}>;
|
|
149
|
+
}
|
|
150
|
+
/**
|
|
151
|
+
* Options for judgeScorer().
|
|
152
|
+
*/
|
|
153
|
+
interface JudgeScorerOptions {
|
|
154
|
+
/** Model identifier, e.g. '@openai/gpt-4o'. The judge call is routed through the gateway. */
|
|
155
|
+
model: string;
|
|
156
|
+
/**
|
|
157
|
+
* Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
|
|
158
|
+
* {{data}}, {{data.*}} placeholders.
|
|
159
|
+
*
|
|
160
|
+
* This becomes the user message. A system message is added automatically
|
|
161
|
+
* that instructs the LLM to return a JSON score (override it with `system`).
|
|
162
|
+
*/
|
|
163
|
+
prompt: string;
|
|
164
|
+
/**
|
|
165
|
+
* The llmops client instance. The judge call is routed through the
|
|
166
|
+
* gateway and traced like any other LLM call.
|
|
167
|
+
*
|
|
168
|
+
* ```ts
|
|
169
|
+
* const client = llmops({ telemetry: pgStore(url) })
|
|
170
|
+
* judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
|
|
171
|
+
* ```
|
|
172
|
+
*/
|
|
173
|
+
client: LLMOpsClient;
|
|
174
|
+
/**
|
|
175
|
+
* Custom system message. Overrides the default grading instructions.
|
|
176
|
+
* If omitted, a default system message is used that instructs
|
|
177
|
+
* the LLM to return JSON with a "score" field (0-1).
|
|
178
|
+
*/
|
|
179
|
+
system?: string;
|
|
180
|
+
/** Temperature for the judge LLM. Default: 0 (deterministic). */
|
|
181
|
+
temperature?: number;
|
|
182
|
+
/** Max retries on parse failure. Default: 1. */
|
|
183
|
+
maxRetries?: number;
|
|
184
|
+
/**
|
|
185
|
+
* Custom parser for extracting score from LLM response.
|
|
186
|
+
* Default: expects JSON with a `score` field.
|
|
187
|
+
*/
|
|
188
|
+
parse?: (response: string) => number | Record<string, number>;
|
|
189
|
+
}
|
|
190
|
+
//#endregion
|
|
191
|
+
//#region src/eval/evaluate.d.ts
|
|
192
|
+
declare function evaluate<D = Record<string, unknown>, T = Record<string, unknown>, O = unknown>(options: EvaluateOptions<D, T, O>): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
|
|
193
|
+
//#endregion
|
|
194
|
+
//#region src/eval/compare.d.ts
|
|
195
|
+
/**
|
|
196
|
+
* Compare two eval result files. The first file is the baseline, the second is the candidate.
|
|
197
|
+
*
|
|
198
|
+
* Usage with version control:
|
|
199
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
200
|
+
* 2. Commit the file
|
|
201
|
+
* 3. Make changes, re-run eval
|
|
202
|
+
* 4. Compare: use git to recover the committed (old) result alongside the new one, then compare old vs new
|
|
203
|
+
*
|
|
204
|
+
* Or compare two named eval files:
|
|
205
|
+
* ```ts
|
|
206
|
+
* const diff = await compare({
|
|
207
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
208
|
+
* })
|
|
209
|
+
* ```
|
|
210
|
+
*/
|
|
211
|
+
declare function compare(options: CompareOptions): Promise<CompareResult>;
|
|
212
|
+
//#endregion
|
|
213
|
+
//#region src/eval/judge.d.ts
|
|
214
|
+
/**
|
|
215
|
+
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
216
|
+
*
|
|
217
|
+
* The judge:
|
|
218
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
219
|
+
* - Interpolates {{output}}, {{target}}, {{data}} (plus {{target.*}} and {{data.*}} fields) in the prompt
|
|
220
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
221
|
+
* - Retries on parse failure (configurable)
|
|
222
|
+
* - Clamps scores to [0, 1]
|
|
223
|
+
*
|
|
224
|
+
* Usage:
|
|
225
|
+
* ```ts
|
|
226
|
+
* import { llmops } from '@llmops/sdk'
|
|
227
|
+
*
|
|
228
|
+
* const client = llmops()
|
|
229
|
+
* const accuracy = judgeScorer({
|
|
230
|
+
* model: '@openai/gpt-4o',
|
|
231
|
+
* prompt: `Rate the accuracy of this response.
|
|
232
|
+
* Expected: {{target.answer}}
|
|
233
|
+
* Actual: {{output}}`,
|
|
234
|
+
* client,
|
|
235
|
+
* })
|
|
236
|
+
* ```
|
|
237
|
+
*/
|
|
238
|
+
declare function judgeScorer(options: JudgeScorerOptions): Evaluator;
|
|
239
|
+
//#endregion
|
|
240
|
+
export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };
|
package/dist/eval.d.mts
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
import "./agents-exporter-CehKIArI.mjs";
|
|
2
|
+
import { t as LLMOpsClient } from "./index-BZLzywwb.mjs";
|
|
3
|
+
|
|
4
|
+
//#region src/eval/dataset.d.ts
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Interface for custom dataset sources.
|
|
8
|
+
* Built-in: inline arrays are wrapped in InlineDataset automatically.
|
|
9
|
+
* Planned for a future release (not yet available): CSVDataset, JSONLDataset, S3Dataset.
|
|
10
|
+
*/
|
|
11
|
+
interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
|
|
12
|
+
size(): number | Promise<number>;
|
|
13
|
+
get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
|
|
14
|
+
slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Wraps a plain array as an EvaluationDataset.
|
|
18
|
+
*/
|
|
19
|
+
declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
|
|
20
|
+
private items;
|
|
21
|
+
constructor(items: Datapoint<D, T>[]);
|
|
22
|
+
size(): number;
|
|
23
|
+
get(index: number): Datapoint<D, T>;
|
|
24
|
+
slice(start: number, end: number): Datapoint<D, T>[];
|
|
25
|
+
}
|
|
26
|
+
//#endregion
|
|
27
|
+
//#region src/eval/types.d.ts
|
|
28
|
+
/**
|
|
29
|
+
* A single datapoint in a dataset.
|
|
30
|
+
*/
|
|
31
|
+
interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
|
|
32
|
+
data: D;
|
|
33
|
+
target?: T;
|
|
34
|
+
metadata?: Record<string, unknown>;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* An evaluator scores executor output.
|
|
38
|
+
* Returns a single score (0-1) or an object of named scores, either directly or as a Promise.
|
|
39
|
+
*/
|
|
40
|
+
type Evaluator<O = unknown, T = unknown, D = unknown> = (output: O, target?: T, data?: D) => number | Record<string, number> | Promise<number | Record<string, number>>;
|
|
41
|
+
/**
|
|
42
|
+
* An executor is the function under test.
|
|
43
|
+
*/
|
|
44
|
+
type Executor<D = Record<string, unknown>, O = unknown> = (data: D) => O | Promise<O>;
|
|
45
|
+
/**
|
|
46
|
+
* Configuration for evaluate().
|
|
47
|
+
*/
|
|
48
|
+
interface EvaluateOptions<D, T, O> {
|
|
49
|
+
/** Name of this evaluation run. Required. */
|
|
50
|
+
name: string;
|
|
51
|
+
/** Dataset — an inline array of datapoints (wrapped in InlineDataset automatically) or a custom EvaluationDataset. */
|
|
52
|
+
data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
|
|
53
|
+
/** The function under test. Provide either executor or variants, not both. */
|
|
54
|
+
executor?: Executor<D, O>;
|
|
55
|
+
/** Named variants for side-by-side comparison. Keys become variant labels. */
|
|
56
|
+
variants?: Record<string, Executor<D, O>>;
|
|
57
|
+
/** Named evaluator functions. Keys become score names. */
|
|
58
|
+
evaluators: Record<string, Evaluator<O, T>>;
|
|
59
|
+
/** Maximum number of datapoints processed concurrently. Default: 5. */
|
|
60
|
+
concurrency?: number;
|
|
61
|
+
/** Group name for tracking score progression across runs. */
|
|
62
|
+
group?: string;
|
|
63
|
+
/** Metadata attached to the entire run. */
|
|
64
|
+
metadata?: Record<string, unknown>;
|
|
65
|
+
/** Output directory for JSON results. Default: './llmops-evals' */
|
|
66
|
+
outputDir?: string;
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Result for a single datapoint.
|
|
70
|
+
*/
|
|
71
|
+
interface DatapointResult<D = unknown, O = unknown> {
|
|
72
|
+
data: D;
|
|
73
|
+
target?: unknown;
|
|
74
|
+
metadata?: Record<string, unknown>;
|
|
75
|
+
output: O;
|
|
76
|
+
scores: Record<string, number>;
|
|
77
|
+
durationMs: number;
|
|
78
|
+
error?: string;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Aggregated score statistics for one evaluator.
|
|
82
|
+
*/
|
|
83
|
+
interface ScoreStats {
|
|
84
|
+
mean: number;
|
|
85
|
+
min: number;
|
|
86
|
+
max: number;
|
|
87
|
+
median: number;
|
|
88
|
+
count: number;
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Summary of an evaluation run.
|
|
92
|
+
*/
|
|
93
|
+
interface EvaluateResult<D = unknown, O = unknown> {
|
|
94
|
+
name: string;
|
|
95
|
+
runId: string;
|
|
96
|
+
group?: string;
|
|
97
|
+
scores: Record<string, ScoreStats>;
|
|
98
|
+
durationMs: number;
|
|
99
|
+
count: number;
|
|
100
|
+
errors: number;
|
|
101
|
+
metadata?: Record<string, unknown>;
|
|
102
|
+
results: DatapointResult<D, O>[];
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Result shape when `variants` is used: wraps one EvaluateResult per variant, keyed by variant label.
|
|
106
|
+
*/
|
|
107
|
+
interface VariantEvaluateResult<D = unknown, O = unknown> {
|
|
108
|
+
name: string;
|
|
109
|
+
runId: string;
|
|
110
|
+
group?: string;
|
|
111
|
+
durationMs: number;
|
|
112
|
+
metadata?: Record<string, unknown>;
|
|
113
|
+
variants: Record<string, EvaluateResult<D, O>>;
|
|
114
|
+
}
|
|
115
|
+
/**
|
|
116
|
+
* Options for compare().
|
|
117
|
+
*/
|
|
118
|
+
interface CompareOptions {
|
|
119
|
+
/** Paths to eval result JSON files. First is baseline, second is candidate. */
|
|
120
|
+
files: [string, string];
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Per-evaluator delta between two runs.
|
|
124
|
+
*/
|
|
125
|
+
interface ScoreDelta {
|
|
126
|
+
baseline: number;
|
|
127
|
+
candidate: number;
|
|
128
|
+
delta: number;
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Result of comparing two runs.
|
|
132
|
+
*/
|
|
133
|
+
interface CompareResult {
|
|
134
|
+
baseline: string;
|
|
135
|
+
candidate: string;
|
|
136
|
+
scores: Record<string, ScoreDelta>;
|
|
137
|
+
regressions: Array<{
|
|
138
|
+
data: unknown;
|
|
139
|
+
evaluator: string;
|
|
140
|
+
baselineScore: number;
|
|
141
|
+
candidateScore: number;
|
|
142
|
+
}>;
|
|
143
|
+
improvements: Array<{
|
|
144
|
+
data: unknown;
|
|
145
|
+
evaluator: string;
|
|
146
|
+
baselineScore: number;
|
|
147
|
+
candidateScore: number;
|
|
148
|
+
}>;
|
|
149
|
+
}
|
|
150
|
+
/**
|
|
151
|
+
* Options for judgeScorer().
|
|
152
|
+
*/
|
|
153
|
+
interface JudgeScorerOptions {
|
|
154
|
+
/** Model identifier, e.g. '@openai/gpt-4o'. The judge call is routed through the gateway. */
|
|
155
|
+
model: string;
|
|
156
|
+
/**
|
|
157
|
+
* Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
|
|
158
|
+
* {{data}}, {{data.*}} placeholders.
|
|
159
|
+
*
|
|
160
|
+
* This becomes the user message. A system message is added automatically
|
|
161
|
+
* that instructs the LLM to return a JSON score (override it with `system`).
|
|
162
|
+
*/
|
|
163
|
+
prompt: string;
|
|
164
|
+
/**
|
|
165
|
+
* The llmops client instance. The judge call is routed through the
|
|
166
|
+
* gateway and traced like any other LLM call.
|
|
167
|
+
*
|
|
168
|
+
* ```ts
|
|
169
|
+
* const client = llmops({ telemetry: pgStore(url) })
|
|
170
|
+
* judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
|
|
171
|
+
* ```
|
|
172
|
+
*/
|
|
173
|
+
client: LLMOpsClient;
|
|
174
|
+
/**
|
|
175
|
+
* Custom system message. Overrides the default grading instructions.
|
|
176
|
+
* If omitted, a default system message is used that instructs
|
|
177
|
+
* the LLM to return JSON with a "score" field (0-1).
|
|
178
|
+
*/
|
|
179
|
+
system?: string;
|
|
180
|
+
/** Temperature for the judge LLM. Default: 0 (deterministic). */
|
|
181
|
+
temperature?: number;
|
|
182
|
+
/** Max retries on parse failure. Default: 1. */
|
|
183
|
+
maxRetries?: number;
|
|
184
|
+
/**
|
|
185
|
+
* Custom parser for extracting score from LLM response.
|
|
186
|
+
* Default: expects JSON with a `score` field.
|
|
187
|
+
*/
|
|
188
|
+
parse?: (response: string) => number | Record<string, number>;
|
|
189
|
+
}
|
|
190
|
+
//#endregion
|
|
191
|
+
//#region src/eval/evaluate.d.ts
|
|
192
|
+
declare function evaluate<D = Record<string, unknown>, T = Record<string, unknown>, O = unknown>(options: EvaluateOptions<D, T, O>): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
|
|
193
|
+
//#endregion
|
|
194
|
+
//#region src/eval/compare.d.ts
|
|
195
|
+
/**
|
|
196
|
+
* Compare two eval result files. The first file is the baseline, the second is the candidate.
|
|
197
|
+
*
|
|
198
|
+
* Usage with version control:
|
|
199
|
+
* 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
|
|
200
|
+
* 2. Commit the file
|
|
201
|
+
* 3. Make changes, re-run eval
|
|
202
|
+
* 4. Compare: use git to recover the committed (old) result alongside the new one, then compare old vs new
|
|
203
|
+
*
|
|
204
|
+
* Or compare two named eval files:
|
|
205
|
+
* ```ts
|
|
206
|
+
* const diff = await compare({
|
|
207
|
+
* files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
|
|
208
|
+
* })
|
|
209
|
+
* ```
|
|
210
|
+
*/
|
|
211
|
+
declare function compare(options: CompareOptions): Promise<CompareResult>;
|
|
212
|
+
//#endregion
|
|
213
|
+
//#region src/eval/judge.d.ts
|
|
214
|
+
/**
|
|
215
|
+
* Factory that returns an Evaluator which uses an LLM to score output.
|
|
216
|
+
*
|
|
217
|
+
* The judge:
|
|
218
|
+
* - Uses a system message that instructs the LLM to return JSON scores
|
|
219
|
+
* - Interpolates {{output}}, {{target}}, {{data}} (plus {{target.*}} and {{data.*}} fields) in the prompt
|
|
220
|
+
* - Uses temperature 0 by default for deterministic scoring
|
|
221
|
+
* - Retries on parse failure (configurable)
|
|
222
|
+
* - Clamps scores to [0, 1]
|
|
223
|
+
*
|
|
224
|
+
* Usage:
|
|
225
|
+
* ```ts
|
|
226
|
+
* import { llmops } from '@llmops/sdk'
|
|
227
|
+
*
|
|
228
|
+
* const client = llmops()
|
|
229
|
+
* const accuracy = judgeScorer({
|
|
230
|
+
* model: '@openai/gpt-4o',
|
|
231
|
+
* prompt: `Rate the accuracy of this response.
|
|
232
|
+
* Expected: {{target.answer}}
|
|
233
|
+
* Actual: {{output}}`,
|
|
234
|
+
* client,
|
|
235
|
+
* })
|
|
236
|
+
* ```
|
|
237
|
+
*/
|
|
238
|
+
declare function judgeScorer(options: JudgeScorerOptions): Evaluator;
|
|
239
|
+
//#endregion
|
|
240
|
+
export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };
|