evalsense 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +235 -98
- package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
- package/dist/chunk-4BKZPVY4.cjs.map +1 -0
- package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
- package/dist/chunk-IUVDDMJ3.js.map +1 -0
- package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
- package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
- package/dist/chunk-TDGWDK2L.js +1108 -0
- package/dist/chunk-TDGWDK2L.js.map +1 -0
- package/dist/cli.cjs +11 -11
- package/dist/cli.js +1 -1
- package/dist/index-CATqAHNK.d.cts +416 -0
- package/dist/index-CoMpaW-K.d.ts +416 -0
- package/dist/index.cjs +507 -580
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +210 -161
- package/dist/index.d.ts +210 -161
- package/dist/index.js +455 -524
- package/dist/index.js.map +1 -1
- package/dist/metrics/index.cjs +103 -342
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +260 -31
- package/dist/metrics/index.d.ts +260 -31
- package/dist/metrics/index.js +24 -312
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/opinionated/index.cjs +5 -5
- package/dist/metrics/opinionated/index.d.cts +2 -163
- package/dist/metrics/opinionated/index.d.ts +2 -163
- package/dist/metrics/opinionated/index.js +1 -1
- package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
- package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
- package/package.json +1 -1
- package/dist/chunk-BFGA2NUB.cjs.map +0 -1
- package/dist/chunk-IYLSY7NX.js.map +0 -1
- package/dist/chunk-RZFLCWTW.cjs +0 -942
- package/dist/chunk-RZFLCWTW.cjs.map +0 -1
- package/dist/chunk-Z3U6AUWX.js +0 -925
- package/dist/chunk-Z3U6AUWX.js.map +0 -1
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
import { L as LLMClient, a as MetricOutput } from './types-D0hzfyKm.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Type definitions for the LLM metric factory
|
|
5
|
+
*
|
|
6
|
+
* Provides a declarative API for creating LLM-based evaluation metrics
|
|
7
|
+
* with unified record input (no more parallel arrays).
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * A record with id and arbitrary fields for evaluation.
 *
 * All LLM metrics expect unified records where each record contains
 * all fields needed for evaluation (output, context, query, etc.).
 *
 * @example
 * ```ts
 * // Hallucination metric (needs output + context)
 * const records: EvalRecord[] = [
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 *   { id: "2", output: "Berlin is Germany's capital", context: "Berlin is the capital of Germany" },
 * ];
 *
 * // Toxicity metric (needs only output)
 * const records: EvalRecord[] = [
 *   { id: "1", output: "Thank you for your question" },
 * ];
 * ```
 */
interface EvalRecord {
  /** Unique identifier for the record; used to correlate results with inputs. */
  id: string;
  /** Arbitrary evaluation fields (e.g. output, context, query, source). */
  [field: string]: unknown;
}
|
|
34
|
+
/**
 * Input field specification for createLLMMetric.
 *
 * Can be a string (required field) or an object with an explicit required flag.
 *
 * @example
 * ```ts
 * // Required fields
 * inputs: ["output", "context"]
 *
 * // Optional context field
 * inputs: ["output", { name: "context", required: false }]
 * ```
 */
type InputSpec = string | {
  /** Field name to read from each record. */
  name: string;
  /** Whether the field must be present on every record. */
  required: boolean;
};
|
|
52
|
+
/**
 * Response field type for JSON schema generation.
 *
 * Maps each declared response field to the JSON type the LLM must return.
 */
type ResponseFieldType = "string" | "number" | "boolean" | "array";
|
|
56
|
+
/**
 * Label threshold configuration.
 *
 * Sorted by min descending at runtime to find the matching label.
 *
 * @example
 * ```ts
 * labels: [
 *   { min: 0.7, label: "high" },
 *   { min: 0.4, label: "medium" },
 *   { min: 0, label: "low" },
 * ]
 * ```
 */
interface LabelThreshold {
  /** Inclusive lower bound of the score range this label applies to. */
  min: number;
  /** Label assigned when the score is at or above `min`. */
  label: string;
}
|
|
74
|
+
/**
 * Configuration for creating an LLM-based metric.
 *
 * This declarative configuration replaces 90+ lines of boilerplate
 * with ~15 lines of configuration.
 *
 * @example
 * ```ts
 * const answerCorrectness = createLLMMetric({
 *   name: "answer-correctness",
 *   inputs: ["output", "reference"],
 *   prompt: ANSWER_CORRECTNESS_PROMPT,
 *   responseFields: { score: "number", reasoning: "string" },
 *   labels: [
 *     { min: 0.8, label: "correct" },
 *     { min: 0.5, label: "partial" },
 *     { min: 0, label: "incorrect" },
 *   ],
 * });
 * ```
 */
interface LLMMetricConfig {
  /**
   * Metric name - used in MetricOutput and error messages.
   */
  name: string;
  /**
   * Field names to extract from records for prompt filling.
   *
   * - string: required field (e.g., "output")
   * - { name: string, required: boolean }: explicit requirement
   *
   * The "output" field is always required and should be first.
   *
   * @example
   * ```ts
   * // Toxicity: only output needed
   * inputs: ["output"]
   *
   * // Hallucination: output + context
   * inputs: ["output", "context"]
   *
   * // Relevance: output + query
   * inputs: ["output", "query"]
   *
   * // Faithfulness: output + source
   * inputs: ["output", "source"]
   *
   * // Optional context
   * inputs: ["output", { name: "context", required: false }]
   * ```
   */
  inputs: InputSpec[];
  /**
   * Prompt template with {variable} placeholders.
   *
   * Variables are filled from record fields using fillPrompt().
   * Use the same names as specified in `inputs`.
   *
   * @example
   * ```ts
   * prompt: `
   * Context: {context}
   * Output: {output}
   *
   * Evaluate for hallucinations...
   * `
   * ```
   */
  prompt: string;
  /**
   * Optional batch prompt template.
   *
   * Uses {items} placeholder which receives JSON array of all records.
   * If not provided, batch mode falls back to per-row evaluation.
   */
  batchPrompt?: string;
  /**
   * Response fields and their types.
   *
   * Generates JSON schema for structured LLM outputs.
   * All fields listed here become required in the schema.
   *
   * @example
   * ```ts
   * responseFields: {
   *   score: "number",
   *   reasoning: "string",
   *   categories: "array",
   * }
   * ```
   */
  responseFields: Record<string, ResponseFieldType>;
  /**
   * Which response field to use as the primary score.
   *
   * @default "score"
   */
  scoreField?: string;
  /**
   * Optional label field from response to use directly.
   *
   * If specified, uses this field from LLM response as the label
   * instead of computing from score thresholds.
   *
   * @example
   * ```ts
   * // Toxicity uses "severity" field directly
   * labelField: "severity"
   * ```
   */
  labelField?: string;
  /**
   * Label thresholds for score-to-label conversion.
   *
   * Applied in descending min order. Ignored if labelField is set.
   *
   * @example
   * ```ts
   * labels: [
   *   { min: 0.7, label: "high" },
   *   { min: 0.4, label: "medium" },
   *   { min: 0, label: "low" },
   * ]
   * ```
   */
  labels?: LabelThreshold[];
  /**
   * Default evaluation mode for this metric.
   *
   * - "per-row": One LLM call per record (higher accuracy)
   * - "batch": Single LLM call for all records (lower cost)
   *
   * @default "per-row"
   */
  defaultMode?: "per-row" | "batch";
}
|
|
211
|
+
/**
 * Options for calling an LLM metric.
 */
interface LLMMetricOptions {
  /**
   * Override the default evaluation mode for this call.
   */
  evaluationMode?: "per-row" | "batch";
  /**
   * Override the global LLM client for this call.
   */
  llmClient?: LLMClient;
  /**
   * Custom prompt template override.
   */
  customPrompt?: string;
}
|
|
228
|
+
/**
 * The resulting metric function from createLLMMetric.
 *
 * Takes unified records and returns a MetricOutput array.
 *
 * @example
 * ```ts
 * const results = await hallucination([
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 * ]);
 * ```
 */
type LLMMetric = (records: EvalRecord[], options?: LLMMetricOptions) => Promise<MetricOutput[]>;
|
|
241
|
+
|
|
242
|
+
/**
 * Hallucination detection metric (LLM-based).
 *
 * Detects statements in the output that are not supported by the provided
 * context. Uses LLM evaluation for accurate hallucination detection.
 *
 * Score interpretation:
 * - 0.0 = No hallucinations (all claims fully supported)
 * - 0.5 = Some unsupported claims
 * - 1.0 = Severe hallucinations (most/all claims unsupported)
 *
 * Labels:
 * - "true" = Hallucination detected (score >= 0.5)
 * - "false" = No hallucination (score < 0.5)
 *
 * @example
 * ```ts
 * import { setLLMClient, hallucination } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await hallucination([
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 *   { id: "2", output: "Berlin is Germany's capital", context: "Berlin is the capital of Germany" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await hallucination(records, { evaluationMode: "batch" });
 * ```
 */
declare const hallucination: LLMMetric;
|
|
284
|
+
|
|
285
|
+
/**
 * Relevance metric (LLM-based).
 *
 * Measures how relevant the output is to the input query.
 * Uses LLM evaluation for accurate relevance assessment.
 *
 * Score interpretation:
 * - 0.0 = Completely irrelevant (doesn't address the query at all)
 * - 0.5 = Partially relevant (addresses some aspects but misses key points)
 * - 1.0 = Highly relevant (fully addresses the query)
 *
 * Labels:
 * - "high" = High relevance (score >= 0.7)
 * - "medium" = Medium relevance (score >= 0.4)
 * - "low" = Low relevance (score < 0.4)
 *
 * @example
 * ```ts
 * import { setLLMClient, relevance } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await relevance([
 *   { id: "1", output: "Paris is the capital of France.", query: "What is the capital of France?" },
 *   { id: "2", output: "I like pizza.", query: "What is the weather today?" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await relevance(records, { evaluationMode: "batch" });
 * ```
 */
declare const relevance: LLMMetric;
|
|
327
|
+
|
|
328
|
+
/**
 * Faithfulness metric (LLM-based).
 *
 * Measures how faithful the output is to the source material.
 * Uses LLM evaluation to detect contradictions and misrepresentations.
 *
 * Score interpretation:
 * - 0.0 = Unfaithful (contradicts or misrepresents source)
 * - 0.5 = Partially faithful (some accurate, some distortions)
 * - 1.0 = Fully faithful (accurate representation of source)
 *
 * Labels:
 * - "high" = High faithfulness (score >= 0.7)
 * - "medium" = Medium faithfulness (score >= 0.4)
 * - "low" = Low faithfulness (score < 0.4)
 *
 * @example
 * ```ts
 * import { setLLMClient, faithfulness } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await faithfulness([
 *   { id: "1", output: "Revenue increased by 15%", source: "Revenue increased by 15% in Q4" },
 *   { id: "2", output: "Sales dropped sharply", source: "Sales increased modestly" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await faithfulness(records, { evaluationMode: "batch" });
 * ```
 */
declare const faithfulness: LLMMetric;
|
|
370
|
+
|
|
371
|
+
/**
 * Toxicity detection metric (LLM-based).
 *
 * Detects potentially toxic, harmful, or inappropriate content.
 * Uses LLM evaluation for nuanced toxicity detection.
 *
 * Score interpretation:
 * - 0.0 = No toxicity (completely safe, appropriate)
 * - 0.3 = Mild (minor profanity or slightly inappropriate)
 * - 0.6 = Moderate (clear toxic elements)
 * - 1.0 = Severe (extremely toxic, harmful, or dangerous)
 *
 * Labels (from LLM's severity field):
 * - "none" = No toxicity
 * - "mild" = Mild toxicity
 * - "moderate" = Moderate toxicity
 * - "severe" = Severe toxicity
 *
 * @example
 * ```ts
 * import { setLLMClient, toxicity } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await toxicity([
 *   { id: "1", output: "Thank you for your question" },
 *   { id: "2", output: "That's a stupid question" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await toxicity(records, { evaluationMode: "batch" });
 * ```
 */
declare const toxicity: LLMMetric;
|
|
415
|
+
|
|
416
|
+
export { type EvalRecord as E, type InputSpec as I, type LLMMetricConfig as L, type ResponseFieldType as R, type LLMMetric as a, type LLMMetricOptions as b, type LabelThreshold as c, faithfulness as f, hallucination as h, relevance as r, toxicity as t };
|