@llmops/sdk 1.0.0-beta.2 → 1.0.0-beta.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/dist/agents.cjs +1 -1
  2. package/dist/agents.d.cts +1 -1
  3. package/dist/agents.d.mts +1 -1
  4. package/dist/agents.mjs +1 -1
  5. package/dist/chunk-CxwUPGYo.mjs +21 -0
  6. package/dist/constants--ywcWP7q.cjs +18 -0
  7. package/dist/constants-BvnYU_pl.mjs +12 -0
  8. package/dist/eval.cjs +464 -0
  9. package/dist/eval.d.cts +240 -0
  10. package/dist/eval.d.mts +240 -0
  11. package/dist/eval.mjs +461 -0
  12. package/dist/express.cjs +29 -2
  13. package/dist/express.d.cts +7 -3
  14. package/dist/express.d.mts +7 -3
  15. package/dist/express.mjs +28 -1
  16. package/dist/hono.d.cts +2 -2
  17. package/dist/hono.d.mts +2 -2
  18. package/dist/{index-05byZKeu.d.mts → index-BZLzywwb.d.mts} +1 -1
  19. package/dist/{index-Beb26ZNG.d.cts → index-lgspeSNr.d.cts} +1 -1
  20. package/dist/index.cjs +3 -3
  21. package/dist/index.d.cts +4 -4
  22. package/dist/index.d.mts +4 -4
  23. package/dist/index.mjs +3 -3
  24. package/dist/interface-BbAwy96d.d.cts +223 -0
  25. package/dist/interface-Dz7B6QN1.d.mts +223 -0
  26. package/dist/nextjs.d.cts +2 -2
  27. package/dist/nextjs.d.mts +2 -2
  28. package/dist/store/d1.cjs +512 -0
  29. package/dist/store/d1.d.cts +60 -0
  30. package/dist/store/d1.d.mts +60 -0
  31. package/dist/store/d1.mjs +511 -0
  32. package/dist/store/pg.cjs +13634 -6
  33. package/dist/store/pg.d.cts +38 -2
  34. package/dist/store/pg.d.mts +38 -2
  35. package/dist/store/pg.mjs +13618 -2
  36. package/dist/store/sqlite.cjs +541 -0
  37. package/dist/store/sqlite.d.cts +50 -0
  38. package/dist/store/sqlite.d.mts +50 -0
  39. package/dist/store/sqlite.mjs +541 -0
  40. package/dist/types.d.cts +2 -2
  41. package/dist/types.d.mts +2 -2
  42. package/package.json +48 -3
  43. package/dist/express-B-wbCza5.cjs +0 -35
  44. package/dist/express-DMtc0d_Y.mjs +0 -30
  45. package/dist/index-DnWGper4.d.cts +0 -7
  46. package/dist/index-Dvz-L2Hf.d.mts +0 -7
  47. /package/dist/{agents-exporter-vcpgCF69.mjs → agents-exporter-CGxTzDeQ.mjs} +0 -0
  48. /package/dist/{agents-exporter-BZHCcFSd.d.mts → agents-exporter-CehKIArI.d.mts} +0 -0
  49. /package/dist/{agents-exporter-BuTq2n2y.cjs → agents-exporter-DizRE7CQ.cjs} +0 -0
  50. /package/dist/{agents-exporter-uzN3bkth.d.cts → agents-exporter-DkqkCcIx.d.cts} +0 -0
package/dist/eval.d.cts @@ -0,0 +1,240 @@
1
+ import "./agents-exporter-DkqkCcIx.cjs";
2
+ import { t as LLMOpsClient } from "./index-lgspeSNr.cjs";
3
+
4
+ //#region src/eval/dataset.d.ts
5
+
6
+ /**
7
+ * Interface for custom dataset sources.
8
+ * Built-in: inline arrays are wrapped in InlineDataset automatically.
9
+ * Future: CSVDataset, JSONLDataset, S3Dataset.
10
+ */
11
+ interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
12
+ size(): number | Promise<number>;
13
+ get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
14
+ slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
15
+ }
16
+ /**
17
+ * Wraps a plain array as an EvaluationDataset.
18
+ */
19
+ declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
20
+ private items;
21
+ constructor(items: Datapoint<D, T>[]);
22
+ size(): number;
23
+ get(index: number): Datapoint<D, T>;
24
+ slice(start: number, end: number): Datapoint<D, T>[];
25
+ }
26
+ //#endregion
27
+ //#region src/eval/types.d.ts
28
+ /**
29
+ * A single datapoint in a dataset.
30
+ */
31
+ interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
32
+ data: D;
33
+ target?: T;
34
+ metadata?: Record<string, unknown>;
35
+ }
36
+ /**
37
+ * An evaluator scores executor output.
38
+ * Returns a single number (0-1) or an object of named scores.
39
+ */
40
+ type Evaluator<O = unknown, T = unknown, D = unknown> = (output: O, target?: T, data?: D) => number | Record<string, number> | Promise<number | Record<string, number>>;
41
+ /**
42
+ * An executor is the function under test.
43
+ */
44
+ type Executor<D = Record<string, unknown>, O = unknown> = (data: D) => O | Promise<O>;
45
+ /**
46
+ * Configuration for evaluate().
47
+ */
48
+ interface EvaluateOptions<D, T, O> {
49
+ /** Name of this evaluation run. Required. */
50
+ name: string;
51
+ /** Dataset — inline array of datapoints or an EvaluationDataset */
52
+ data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
53
+ /** The function under test. Provide either executor or variants, not both. */
54
+ executor?: Executor<D, O>;
55
+ /** Named variants for side-by-side comparison. Keys become variant labels. */
56
+ variants?: Record<string, Executor<D, O>>;
57
+ /** Named evaluator functions. Keys become score names. */
58
+ evaluators: Record<string, Evaluator<O, T>>;
59
+ /** Maximum concurrent datapoints. Default: 5 */
60
+ concurrency?: number;
61
+ /** Group name for tracking score progression across runs. */
62
+ group?: string;
63
+ /** Metadata attached to the entire run. */
64
+ metadata?: Record<string, unknown>;
65
+ /** Output directory for JSON results. Default: './llmops-evals' */
66
+ outputDir?: string;
67
+ }
68
+ /**
69
+ * Result for a single datapoint.
70
+ */
71
+ interface DatapointResult<D = unknown, O = unknown> {
72
+ data: D;
73
+ target?: unknown;
74
+ metadata?: Record<string, unknown>;
75
+ output: O;
76
+ scores: Record<string, number>;
77
+ durationMs: number;
78
+ error?: string;
79
+ }
80
+ /**
81
+ * Aggregated score statistics for one evaluator.
82
+ */
83
+ interface ScoreStats {
84
+ mean: number;
85
+ min: number;
86
+ max: number;
87
+ median: number;
88
+ count: number;
89
+ }
90
+ /**
91
+ * Summary of an evaluation run.
92
+ */
93
+ interface EvaluateResult<D = unknown, O = unknown> {
94
+ name: string;
95
+ runId: string;
96
+ group?: string;
97
+ scores: Record<string, ScoreStats>;
98
+ durationMs: number;
99
+ count: number;
100
+ errors: number;
101
+ metadata?: Record<string, unknown>;
102
+ results: DatapointResult<D, O>[];
103
+ }
104
+ /**
105
+ * When variants are used, wraps per-variant results.
106
+ */
107
+ interface VariantEvaluateResult<D = unknown, O = unknown> {
108
+ name: string;
109
+ runId: string;
110
+ group?: string;
111
+ durationMs: number;
112
+ metadata?: Record<string, unknown>;
113
+ variants: Record<string, EvaluateResult<D, O>>;
114
+ }
115
+ /**
116
+ * Options for compare().
117
+ */
118
+ interface CompareOptions {
119
+ /** Paths to eval result JSON files. First is baseline, second is candidate. */
120
+ files: [string, string];
121
+ }
122
+ /**
123
+ * Per-evaluator delta between two runs.
124
+ */
125
+ interface ScoreDelta {
126
+ baseline: number;
127
+ candidate: number;
128
+ delta: number;
129
+ }
130
+ /**
131
+ * Result of comparing two runs.
132
+ */
133
+ interface CompareResult {
134
+ baseline: string;
135
+ candidate: string;
136
+ scores: Record<string, ScoreDelta>;
137
+ regressions: Array<{
138
+ data: unknown;
139
+ evaluator: string;
140
+ baselineScore: number;
141
+ candidateScore: number;
142
+ }>;
143
+ improvements: Array<{
144
+ data: unknown;
145
+ evaluator: string;
146
+ baselineScore: number;
147
+ candidateScore: number;
148
+ }>;
149
+ }
150
+ /**
151
+ * Options for judgeScorer().
152
+ */
153
+ interface JudgeScorerOptions {
154
+ /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
155
+ model: string;
156
+ /**
157
+ * Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
158
+ * {{data}}, {{data.*}} placeholders.
159
+ *
160
+ * This becomes the user message. A system message is added automatically
161
+ * that instructs the LLM to return a JSON score.
162
+ */
163
+ prompt: string;
164
+ /**
165
+ * The llmops client instance. The judge call is routed through the
166
+ * gateway and traced like any other LLM call.
167
+ *
168
+ * ```ts
169
+ * const client = llmops({ telemetry: pgStore(url) })
170
+ * judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
171
+ * ```
172
+ */
173
+ client: LLMOpsClient;
174
+ /**
175
+ * Custom system message. Overrides the default grading instructions.
176
+ * If omitted, a default system message is used that instructs
177
+ * the LLM to return JSON with a "score" field (0-1).
178
+ */
179
+ system?: string;
180
+ /** Temperature for the judge LLM. Default: 0 (deterministic). */
181
+ temperature?: number;
182
+ /** Max retries on parse failure. Default: 1. */
183
+ maxRetries?: number;
184
+ /**
185
+ * Custom parser for extracting score from LLM response.
186
+ * Default: expects JSON with a `score` field.
187
+ */
188
+ parse?: (response: string) => number | Record<string, number>;
189
+ }
190
+ //#endregion
191
+ //#region src/eval/evaluate.d.ts
192
+ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknown>, O = unknown>(options: EvaluateOptions<D, T, O>): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
193
+ //#endregion
194
+ //#region src/eval/compare.d.ts
195
+ /**
196
+ * Compare two eval result files. First file is the baseline.
197
+ *
198
+ * Usage with version control:
199
+ * 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
200
+ * 2. Commit the file
201
+ * 3. Make changes, re-run eval
202
+ * 4. Compare: git stash the new result, compare old vs new
203
+ *
204
+ * Or compare two named eval files:
205
+ * ```ts
206
+ * const diff = await compare({
207
+ * files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
208
+ * })
209
+ * ```
210
+ */
211
+ declare function compare(options: CompareOptions): Promise<CompareResult>;
212
+ //#endregion
213
+ //#region src/eval/judge.d.ts
214
+ /**
215
+ * Factory that returns an Evaluator which uses an LLM to score output.
216
+ *
217
+ * The judge:
218
+ * - Uses a system message that instructs the LLM to return JSON scores
219
+ * - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
220
+ * - Uses temperature 0 by default for deterministic scoring
221
+ * - Retries on parse failure (configurable)
222
+ * - Clamps scores to [0, 1]
223
+ *
224
+ * Usage:
225
+ * ```ts
226
+ * import { llmops } from '@llmops/sdk'
227
+ *
228
+ * const client = llmops()
229
+ * const accuracy = judgeScorer({
230
+ * model: '@openai/gpt-4o',
231
+ * prompt: `Rate the accuracy of this response.
232
+ * Expected: {{target.answer}}
233
+ * Actual: {{output}}`,
234
+ * client,
235
+ * })
236
+ * ```
237
+ */
238
+ declare function judgeScorer(options: JudgeScorerOptions): Evaluator;
239
+ //#endregion
240
+ export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };
package/dist/eval.d.mts @@ -0,0 +1,240 @@
1
+ import "./agents-exporter-CehKIArI.mjs";
2
+ import { t as LLMOpsClient } from "./index-BZLzywwb.mjs";
3
+
4
+ //#region src/eval/dataset.d.ts
5
+
6
+ /**
7
+ * Interface for custom dataset sources.
8
+ * Built-in: inline arrays are wrapped in InlineDataset automatically.
9
+ * Future: CSVDataset, JSONLDataset, S3Dataset.
10
+ */
11
+ interface EvaluationDataset<D = Record<string, unknown>, T = Record<string, unknown>> {
12
+ size(): number | Promise<number>;
13
+ get(index: number): Datapoint<D, T> | Promise<Datapoint<D, T>>;
14
+ slice(start: number, end: number): Datapoint<D, T>[] | Promise<Datapoint<D, T>[]>;
15
+ }
16
+ /**
17
+ * Wraps a plain array as an EvaluationDataset.
18
+ */
19
+ declare class InlineDataset<D, T> implements EvaluationDataset<D, T> {
20
+ private items;
21
+ constructor(items: Datapoint<D, T>[]);
22
+ size(): number;
23
+ get(index: number): Datapoint<D, T>;
24
+ slice(start: number, end: number): Datapoint<D, T>[];
25
+ }
26
+ //#endregion
27
+ //#region src/eval/types.d.ts
28
+ /**
29
+ * A single datapoint in a dataset.
30
+ */
31
+ interface Datapoint<D = Record<string, unknown>, T = Record<string, unknown>> {
32
+ data: D;
33
+ target?: T;
34
+ metadata?: Record<string, unknown>;
35
+ }
36
+ /**
37
+ * An evaluator scores executor output.
38
+ * Returns a single number (0-1) or an object of named scores.
39
+ */
40
+ type Evaluator<O = unknown, T = unknown, D = unknown> = (output: O, target?: T, data?: D) => number | Record<string, number> | Promise<number | Record<string, number>>;
41
+ /**
42
+ * An executor is the function under test.
43
+ */
44
+ type Executor<D = Record<string, unknown>, O = unknown> = (data: D) => O | Promise<O>;
45
+ /**
46
+ * Configuration for evaluate().
47
+ */
48
+ interface EvaluateOptions<D, T, O> {
49
+ /** Name of this evaluation run. Required. */
50
+ name: string;
51
+ /** Dataset — inline array of datapoints or an EvaluationDataset */
52
+ data: Datapoint<D, T>[] | EvaluationDataset<D, T>;
53
+ /** The function under test. Provide either executor or variants, not both. */
54
+ executor?: Executor<D, O>;
55
+ /** Named variants for side-by-side comparison. Keys become variant labels. */
56
+ variants?: Record<string, Executor<D, O>>;
57
+ /** Named evaluator functions. Keys become score names. */
58
+ evaluators: Record<string, Evaluator<O, T>>;
59
+ /** Maximum concurrent datapoints. Default: 5 */
60
+ concurrency?: number;
61
+ /** Group name for tracking score progression across runs. */
62
+ group?: string;
63
+ /** Metadata attached to the entire run. */
64
+ metadata?: Record<string, unknown>;
65
+ /** Output directory for JSON results. Default: './llmops-evals' */
66
+ outputDir?: string;
67
+ }
68
+ /**
69
+ * Result for a single datapoint.
70
+ */
71
+ interface DatapointResult<D = unknown, O = unknown> {
72
+ data: D;
73
+ target?: unknown;
74
+ metadata?: Record<string, unknown>;
75
+ output: O;
76
+ scores: Record<string, number>;
77
+ durationMs: number;
78
+ error?: string;
79
+ }
80
+ /**
81
+ * Aggregated score statistics for one evaluator.
82
+ */
83
+ interface ScoreStats {
84
+ mean: number;
85
+ min: number;
86
+ max: number;
87
+ median: number;
88
+ count: number;
89
+ }
90
+ /**
91
+ * Summary of an evaluation run.
92
+ */
93
+ interface EvaluateResult<D = unknown, O = unknown> {
94
+ name: string;
95
+ runId: string;
96
+ group?: string;
97
+ scores: Record<string, ScoreStats>;
98
+ durationMs: number;
99
+ count: number;
100
+ errors: number;
101
+ metadata?: Record<string, unknown>;
102
+ results: DatapointResult<D, O>[];
103
+ }
104
+ /**
105
+ * When variants are used, wraps per-variant results.
106
+ */
107
+ interface VariantEvaluateResult<D = unknown, O = unknown> {
108
+ name: string;
109
+ runId: string;
110
+ group?: string;
111
+ durationMs: number;
112
+ metadata?: Record<string, unknown>;
113
+ variants: Record<string, EvaluateResult<D, O>>;
114
+ }
115
+ /**
116
+ * Options for compare().
117
+ */
118
+ interface CompareOptions {
119
+ /** Paths to eval result JSON files. First is baseline, second is candidate. */
120
+ files: [string, string];
121
+ }
122
+ /**
123
+ * Per-evaluator delta between two runs.
124
+ */
125
+ interface ScoreDelta {
126
+ baseline: number;
127
+ candidate: number;
128
+ delta: number;
129
+ }
130
+ /**
131
+ * Result of comparing two runs.
132
+ */
133
+ interface CompareResult {
134
+ baseline: string;
135
+ candidate: string;
136
+ scores: Record<string, ScoreDelta>;
137
+ regressions: Array<{
138
+ data: unknown;
139
+ evaluator: string;
140
+ baselineScore: number;
141
+ candidateScore: number;
142
+ }>;
143
+ improvements: Array<{
144
+ data: unknown;
145
+ evaluator: string;
146
+ baselineScore: number;
147
+ candidateScore: number;
148
+ }>;
149
+ }
150
+ /**
151
+ * Options for judgeScorer().
152
+ */
153
+ interface JudgeScorerOptions {
154
+ /** Model identifier — routed through the gateway. e.g. '@openai/gpt-4o' */
155
+ model: string;
156
+ /**
157
+ * Grading prompt. Supports {{output}}, {{target}}, {{target.*}},
158
+ * {{data}}, {{data.*}} placeholders.
159
+ *
160
+ * This becomes the user message. A system message is added automatically
161
+ * that instructs the LLM to return a JSON score.
162
+ */
163
+ prompt: string;
164
+ /**
165
+ * The llmops client instance. The judge call is routed through the
166
+ * gateway and traced like any other LLM call.
167
+ *
168
+ * ```ts
169
+ * const client = llmops({ telemetry: pgStore(url) })
170
+ * judgeScorer({ model: '@openai/gpt-4o', prompt: '...', client })
171
+ * ```
172
+ */
173
+ client: LLMOpsClient;
174
+ /**
175
+ * Custom system message. Overrides the default grading instructions.
176
+ * If omitted, a default system message is used that instructs
177
+ * the LLM to return JSON with a "score" field (0-1).
178
+ */
179
+ system?: string;
180
+ /** Temperature for the judge LLM. Default: 0 (deterministic). */
181
+ temperature?: number;
182
+ /** Max retries on parse failure. Default: 1. */
183
+ maxRetries?: number;
184
+ /**
185
+ * Custom parser for extracting score from LLM response.
186
+ * Default: expects JSON with a `score` field.
187
+ */
188
+ parse?: (response: string) => number | Record<string, number>;
189
+ }
190
+ //#endregion
191
+ //#region src/eval/evaluate.d.ts
192
+ declare function evaluate<D = Record<string, unknown>, T = Record<string, unknown>, O = unknown>(options: EvaluateOptions<D, T, O>): Promise<EvaluateResult<D, O> | VariantEvaluateResult<D, O>>;
193
+ //#endregion
194
+ //#region src/eval/compare.d.ts
195
+ /**
196
+ * Compare two eval result files. First file is the baseline.
197
+ *
198
+ * Usage with version control:
199
+ * 1. Run eval → results saved to ./llmops-evals/my-eval.eval.json
200
+ * 2. Commit the file
201
+ * 3. Make changes, re-run eval
202
+ * 4. Compare: git stash the new result, compare old vs new
203
+ *
204
+ * Or compare two named eval files:
205
+ * ```ts
206
+ * const diff = await compare({
207
+ * files: ['./llmops-evals/baseline.eval.json', './llmops-evals/candidate.eval.json'],
208
+ * })
209
+ * ```
210
+ */
211
+ declare function compare(options: CompareOptions): Promise<CompareResult>;
212
+ //#endregion
213
+ //#region src/eval/judge.d.ts
214
+ /**
215
+ * Factory that returns an Evaluator which uses an LLM to score output.
216
+ *
217
+ * The judge:
218
+ * - Uses a system message that instructs the LLM to return JSON scores
219
+ * - Interpolates {{output}}, {{target}}, {{data}} and their fields in the prompt
220
+ * - Uses temperature 0 by default for deterministic scoring
221
+ * - Retries on parse failure (configurable)
222
+ * - Clamps scores to [0, 1]
223
+ *
224
+ * Usage:
225
+ * ```ts
226
+ * import { llmops } from '@llmops/sdk'
227
+ *
228
+ * const client = llmops()
229
+ * const accuracy = judgeScorer({
230
+ * model: '@openai/gpt-4o',
231
+ * prompt: `Rate the accuracy of this response.
232
+ * Expected: {{target.answer}}
233
+ * Actual: {{output}}`,
234
+ * client,
235
+ * })
236
+ * ```
237
+ */
238
+ declare function judgeScorer(options: JudgeScorerOptions): Evaluator;
239
+ //#endregion
240
+ export { type CompareOptions, type CompareResult, type Datapoint, type DatapointResult, type EvaluateOptions, type EvaluateResult, type EvaluationDataset, type Evaluator, type Executor, InlineDataset, type JudgeScorerOptions, type ScoreDelta, type ScoreStats, type VariantEvaluateResult, compare, evaluate, judgeScorer };