evalsense 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +235 -98
  2. package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
  3. package/dist/chunk-4BKZPVY4.cjs.map +1 -0
  4. package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
  5. package/dist/chunk-IUVDDMJ3.js.map +1 -0
  6. package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
  7. package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
  8. package/dist/chunk-TDGWDK2L.js +1108 -0
  9. package/dist/chunk-TDGWDK2L.js.map +1 -0
  10. package/dist/cli.cjs +11 -11
  11. package/dist/cli.js +1 -1
  12. package/dist/index-CATqAHNK.d.cts +416 -0
  13. package/dist/index-CoMpaW-K.d.ts +416 -0
  14. package/dist/index.cjs +507 -580
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +210 -161
  17. package/dist/index.d.ts +210 -161
  18. package/dist/index.js +455 -524
  19. package/dist/index.js.map +1 -1
  20. package/dist/metrics/index.cjs +103 -342
  21. package/dist/metrics/index.cjs.map +1 -1
  22. package/dist/metrics/index.d.cts +260 -31
  23. package/dist/metrics/index.d.ts +260 -31
  24. package/dist/metrics/index.js +24 -312
  25. package/dist/metrics/index.js.map +1 -1
  26. package/dist/metrics/opinionated/index.cjs +5 -5
  27. package/dist/metrics/opinionated/index.d.cts +2 -163
  28. package/dist/metrics/opinionated/index.d.ts +2 -163
  29. package/dist/metrics/opinionated/index.js +1 -1
  30. package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
  31. package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
  32. package/package.json +1 -1
  33. package/dist/chunk-BFGA2NUB.cjs.map +0 -1
  34. package/dist/chunk-IYLSY7NX.js.map +0 -1
  35. package/dist/chunk-RZFLCWTW.cjs +0 -942
  36. package/dist/chunk-RZFLCWTW.cjs.map +0 -1
  37. package/dist/chunk-Z3U6AUWX.js +0 -925
  38. package/dist/chunk-Z3U6AUWX.js.map +0 -1
@@ -0,0 +1,416 @@
1
+ import { L as LLMClient, a as MetricOutput } from './types-D0hzfyKm.cjs';
2
+
3
/**
 * Type definitions for the LLM metric factory
 *
 * Provides a declarative API for creating LLM-based evaluation metrics
 * with unified record input (no more parallel arrays).
 */

/**
 * A record with id and arbitrary fields for evaluation
 *
 * All LLM metrics expect unified records where each record contains
 * all fields needed for evaluation (output, context, query, etc.).
 *
 * @example
 * ```ts
 * // Hallucination metric (needs output + context)
 * const records: EvalRecord[] = [
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 *   { id: "2", output: "Berlin is Germany's capital", context: "Berlin is the capital of Germany" },
 * ];
 *
 * // Toxicity metric (needs only output)
 * const records: EvalRecord[] = [
 *   { id: "1", output: "Thank you for your question" },
 * ];
 * ```
 */
interface EvalRecord {
  /** Unique identifier for the record; carried through to the metric result. */
  id: string;
  /** Arbitrary evaluation fields (e.g. `output`, `context`, `query`, `source`). */
  [field: string]: unknown;
}
34
/**
 * Input field specification for createLLMMetric
 *
 * Can be a string (required field) or an object with explicit required flag.
 *
 * @example
 * ```ts
 * // Required fields
 * inputs: ["output", "context"]
 *
 * // Optional context field
 * inputs: ["output", { name: "context", required: false }]
 * ```
 */
type InputSpec = string | {
  /** Name of the field to read from each EvalRecord. */
  name: string;
  /** Whether the field must be present on every record. */
  required: boolean;
};
52
/**
 * Response field type for JSON schema generation
 *
 * Each value names the JSON type a response field is expected to hold
 * when the structured-output schema is built from `responseFields`.
 */
type ResponseFieldType = "string" | "number" | "boolean" | "array";
56
/**
 * Label threshold configuration
 *
 * Sorted by min descending at runtime to find matching label.
 *
 * @example
 * ```ts
 * labels: [
 *   { min: 0.7, label: "high" },
 *   { min: 0.4, label: "medium" },
 *   { min: 0, label: "low" },
 * ]
 * ```
 */
interface LabelThreshold {
  /** Inclusive lower bound: the label applies when score >= min. */
  min: number;
  /** Label emitted when this threshold matches. */
  label: string;
}
74
/**
 * Configuration for creating an LLM-based metric
 *
 * This declarative configuration replaces 90+ lines of boilerplate
 * with ~15 lines of configuration.
 *
 * @example
 * ```ts
 * const answerCorrectness = createLLMMetric({
 *   name: "answer-correctness",
 *   inputs: ["output", "reference"],
 *   prompt: ANSWER_CORRECTNESS_PROMPT,
 *   responseFields: { score: "number", reasoning: "string" },
 *   labels: [
 *     { min: 0.8, label: "correct" },
 *     { min: 0.5, label: "partial" },
 *     { min: 0, label: "incorrect" },
 *   ],
 * });
 * ```
 */
interface LLMMetricConfig {
  /**
   * Metric name - used in MetricOutput and error messages
   */
  name: string;
  /**
   * Field names to extract from records for prompt filling.
   *
   * - string: required field (e.g., "output")
   * - { name: string, required: boolean }: explicit requirement
   *
   * The "output" field is always required and should be first.
   *
   * @example
   * ```ts
   * // Toxicity: only output needed
   * inputs: ["output"]
   *
   * // Hallucination: output + context
   * inputs: ["output", "context"]
   *
   * // Relevance: output + query
   * inputs: ["output", "query"]
   *
   * // Faithfulness: output + source
   * inputs: ["output", "source"]
   *
   * // Optional context
   * inputs: ["output", { name: "context", required: false }]
   * ```
   */
  inputs: InputSpec[];
  /**
   * Prompt template with {variable} placeholders.
   *
   * Variables are filled from record fields using fillPrompt().
   * Use the same names as specified in `inputs`.
   *
   * @example
   * ```ts
   * prompt: `
   * Context: {context}
   * Output: {output}
   *
   * Evaluate for hallucinations...
   * `
   * ```
   */
  prompt: string;
  /**
   * Optional batch prompt template.
   *
   * Uses {items} placeholder which receives JSON array of all records.
   * If not provided, batch mode falls back to per-row evaluation.
   */
  batchPrompt?: string;
  /**
   * Response fields and their types.
   *
   * Generates JSON schema for structured LLM outputs.
   * All fields listed here become required in the schema.
   *
   * @example
   * ```ts
   * responseFields: {
   *   score: "number",
   *   reasoning: "string",
   *   categories: "array",
   * }
   * ```
   */
  responseFields: Record<string, ResponseFieldType>;
  /**
   * Which response field to use as the primary score.
   *
   * @default "score"
   */
  scoreField?: string;
  /**
   * Optional label field from response to use directly.
   *
   * If specified, uses this field from LLM response as the label
   * instead of computing from score thresholds.
   *
   * @example
   * ```ts
   * // Toxicity uses "severity" field directly
   * labelField: "severity"
   * ```
   */
  labelField?: string;
  /**
   * Label thresholds for score-to-label conversion.
   *
   * Applied in descending min order. Ignored if labelField is set.
   *
   * @example
   * ```ts
   * labels: [
   *   { min: 0.7, label: "high" },
   *   { min: 0.4, label: "medium" },
   *   { min: 0, label: "low" },
   * ]
   * ```
   */
  labels?: LabelThreshold[];
  /**
   * Default evaluation mode for this metric.
   *
   * - "per-row": One LLM call per record (higher accuracy)
   * - "batch": Single LLM call for all records (lower cost)
   *
   * @default "per-row"
   */
  defaultMode?: "per-row" | "batch";
}
211
/**
 * Options for calling an LLM metric
 *
 * All fields are optional; omitted fields fall back to the metric's
 * configured defaults and the globally registered LLM client.
 */
interface LLMMetricOptions {
  /**
   * Override the default evaluation mode
   */
  evaluationMode?: "per-row" | "batch";
  /**
   * Override the global LLM client for this call
   */
  llmClient?: LLMClient;
  /**
   * Custom prompt template override
   */
  customPrompt?: string;
}
228
/**
 * The resulting metric function from createLLMMetric
 *
 * Takes unified records and returns MetricOutput array.
 *
 * @param records - Unified evaluation records (each with an `id` plus the
 *   fields declared in the metric's `inputs`).
 * @param options - Optional per-call overrides (mode, client, prompt).
 * @returns One MetricOutput per input record.
 *
 * @example
 * ```ts
 * const results = await hallucination([
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 * ]);
 * ```
 */
type LLMMetric = (records: EvalRecord[], options?: LLMMetricOptions) => Promise<MetricOutput[]>;
241
+
242
/**
 * Hallucination detection metric (LLM-based)
 *
 * Detects statements in the output that are not supported by the provided
 * context. Uses LLM evaluation for accurate hallucination detection.
 *
 * Score interpretation:
 * - 0.0 = No hallucinations (all claims fully supported)
 * - 0.5 = Some unsupported claims
 * - 1.0 = Severe hallucinations (most/all claims unsupported)
 *
 * Labels:
 * - "true" = Hallucination detected (score >= 0.5)
 * - "false" = No hallucination (score < 0.5)
 *
 * @example
 * ```ts
 * import { setLLMClient, hallucination } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await hallucination([
 *   { id: "1", output: "Paris has 50M people", context: "Paris has 2.1M residents" },
 *   { id: "2", output: "Berlin is Germany's capital", context: "Berlin is the capital of Germany" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await hallucination(records, { evaluationMode: "batch" });
 * ```
 */
declare const hallucination: LLMMetric;
284
+
285
/**
 * Relevance metric (LLM-based)
 *
 * Measures how relevant the output is to the input query.
 * Uses LLM evaluation for accurate relevance assessment.
 *
 * Score interpretation:
 * - 0.0 = Completely irrelevant (doesn't address the query at all)
 * - 0.5 = Partially relevant (addresses some aspects but misses key points)
 * - 1.0 = Highly relevant (fully addresses the query)
 *
 * Labels:
 * - "high" = High relevance (score >= 0.7)
 * - "medium" = Medium relevance (score >= 0.4)
 * - "low" = Low relevance (score < 0.4)
 *
 * @example
 * ```ts
 * import { setLLMClient, relevance } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await relevance([
 *   { id: "1", output: "Paris is the capital of France.", query: "What is the capital of France?" },
 *   { id: "2", output: "I like pizza.", query: "What is the weather today?" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await relevance(records, { evaluationMode: "batch" });
 * ```
 */
declare const relevance: LLMMetric;
327
+
328
/**
 * Faithfulness metric (LLM-based)
 *
 * Measures how faithful the output is to the source material.
 * Uses LLM evaluation to detect contradictions and misrepresentations.
 *
 * Score interpretation:
 * - 0.0 = Unfaithful (contradicts or misrepresents source)
 * - 0.5 = Partially faithful (some accurate, some distortions)
 * - 1.0 = Fully faithful (accurate representation of source)
 *
 * Labels:
 * - "high" = High faithfulness (score >= 0.7)
 * - "medium" = Medium faithfulness (score >= 0.4)
 * - "low" = Low faithfulness (score < 0.4)
 *
 * @example
 * ```ts
 * import { setLLMClient, faithfulness } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await faithfulness([
 *   { id: "1", output: "Revenue increased by 15%", source: "Revenue increased by 15% in Q4" },
 *   { id: "2", output: "Sales dropped sharply", source: "Sales increased modestly" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await faithfulness(records, { evaluationMode: "batch" });
 * ```
 */
declare const faithfulness: LLMMetric;
370
+
371
/**
 * Toxicity detection metric (LLM-based)
 *
 * Detects potentially toxic, harmful, or inappropriate content.
 * Uses LLM evaluation for nuanced toxicity detection.
 *
 * Score interpretation:
 * - 0.0 = No toxicity (completely safe, appropriate)
 * - 0.3 = Mild (minor profanity or slightly inappropriate)
 * - 0.6 = Moderate (clear toxic elements)
 * - 1.0 = Severe (extremely toxic, harmful, or dangerous)
 *
 * Labels (from LLM's severity field):
 * - "none" = No toxicity
 * - "mild" = Mild toxicity
 * - "moderate" = Moderate toxicity
 * - "severe" = Severe toxicity
 *
 * @example
 * ```ts
 * import { setLLMClient, toxicity } from "evalsense/metrics";
 *
 * setLLMClient({ async complete(prompt) { ... } });
 *
 * const results = await toxicity([
 *   { id: "1", output: "Thank you for your question" },
 *   { id: "2", output: "That's a stupid question" },
 * ]);
 *
 * // With batch mode for lower cost:
 * const batchResults = await toxicity(records, { evaluationMode: "batch" });
 * ```
 */
declare const toxicity: LLMMetric;
415

// Bundler-generated re-exports: single-letter aliases are the mangled names
// used by the other chunk files; do not rename them.
export { type EvalRecord as E, type InputSpec as I, type LLMMetricConfig as L, type ResponseFieldType as R, type LLMMetric as a, type LLMMetricOptions as b, type LabelThreshold as c, faithfulness as f, hallucination as h, relevance as r, toxicity as t };