evalsense 0.3.2 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +235 -98
  2. package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
  3. package/dist/chunk-4BKZPVY4.cjs.map +1 -0
  4. package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
  5. package/dist/chunk-IUVDDMJ3.js.map +1 -0
  6. package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
  7. package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
  8. package/dist/chunk-TDGWDK2L.js +1108 -0
  9. package/dist/chunk-TDGWDK2L.js.map +1 -0
  10. package/dist/cli.cjs +11 -11
  11. package/dist/cli.js +1 -1
  12. package/dist/index-CATqAHNK.d.cts +416 -0
  13. package/dist/index-CoMpaW-K.d.ts +416 -0
  14. package/dist/index.cjs +507 -580
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +210 -161
  17. package/dist/index.d.ts +210 -161
  18. package/dist/index.js +455 -524
  19. package/dist/index.js.map +1 -1
  20. package/dist/metrics/index.cjs +103 -342
  21. package/dist/metrics/index.cjs.map +1 -1
  22. package/dist/metrics/index.d.cts +260 -31
  23. package/dist/metrics/index.d.ts +260 -31
  24. package/dist/metrics/index.js +24 -312
  25. package/dist/metrics/index.js.map +1 -1
  26. package/dist/metrics/opinionated/index.cjs +5 -5
  27. package/dist/metrics/opinionated/index.d.cts +2 -163
  28. package/dist/metrics/opinionated/index.d.ts +2 -163
  29. package/dist/metrics/opinionated/index.js +1 -1
  30. package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
  31. package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
  32. package/package.json +1 -1
  33. package/dist/chunk-BFGA2NUB.cjs.map +0 -1
  34. package/dist/chunk-IYLSY7NX.js.map +0 -1
  35. package/dist/chunk-RZFLCWTW.cjs +0 -942
  36. package/dist/chunk-RZFLCWTW.cjs.map +0 -1
  37. package/dist/chunk-Z3U6AUWX.js +0 -925
  38. package/dist/chunk-Z3U6AUWX.js.map +0 -1
@@ -1,48 +1,99 @@
1
- export { FaithfulnessConfig, HallucinationConfig, RelevanceConfig, ToxicityConfig, faithfulness, hallucination, relevance, toxicity } from './opinionated/index.cjs';
2
- import { M as MetricFn, a as MetricConfig, b as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-C71p0wzM.cjs';
1
+ import { L as LLMMetricConfig, a as LLMMetric } from '../index-CATqAHNK.cjs';
2
+ export { E as EvalRecord, I as InputSpec, b as LLMMetricOptions, c as LabelThreshold, R as ResponseFieldType, f as faithfulness, h as hallucination, r as relevance, t as toxicity } from '../index-CATqAHNK.cjs';
3
+ import { M as MetricFn, a as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-D0hzfyKm.cjs';
3
4
 
4
5
  /**
5
- * Custom metric registration
6
+ * Factory function for creating LLM-based metrics
7
+ *
8
+ * Reduces metric definition from 90+ lines to ~15 lines with a declarative API.
9
+ * Eliminates parallel array matching and provides unified record input.
10
+ *
11
+ * @example
12
+ * ```ts
13
+ * const answerCorrectness = createLLMMetric({
14
+ * name: "answer-correctness",
15
+ * inputs: ["output", "reference"],
16
+ * prompt: ANSWER_CORRECTNESS_PROMPT,
17
+ * responseFields: { score: "number", reasoning: "string" },
18
+ * labels: [
19
+ * { min: 0.8, label: "correct" },
20
+ * { min: 0.5, label: "partial" },
21
+ * { min: 0, label: "incorrect" },
22
+ * ],
23
+ * });
24
+ *
25
+ * // Usage with unified records
26
+ * const results = await answerCorrectness([
27
+ * { id: "1", output: "Paris", reference: "Paris is the capital" },
28
+ * ]);
29
+ * ```
6
30
  */
7
31
 
8
32
  /**
9
- * Registers a custom metric
33
+ * Creates an LLM-based metric function
34
+ *
35
+ * This factory function eliminates boilerplate by:
36
+ * - Handling LLM client validation
37
+ * - Managing structured output with fallback to text parsing
38
+ * - Normalizing scores to 0-1 range
39
+ * - Converting scores to labels using thresholds
40
+ * - Supporting both per-row and batch evaluation modes
41
+ * - Providing consistent error handling
42
+ *
43
+ * @param config - Metric configuration
44
+ * @returns A metric function that takes unified records
10
45
  *
11
46
  * @example
12
47
  * ```ts
13
- * registerMetric("custom-relevance", async ({ outputs, query }) => {
14
- * // Custom evaluation logic
15
- * return outputs.map(o => ({
16
- * id: o.id,
17
- * metric: "custom-relevance",
18
- * score: evaluateRelevance(o.output, query),
19
- * }));
48
+ * // Create a custom LLM metric
49
+ * const myMetric = createLLMMetric({
50
+ * name: "my-metric",
51
+ * inputs: ["output", "reference"],
52
+ * prompt: `
53
+ * Reference: {reference}
54
+ * Output: {output}
55
+ *
56
+ * Evaluate the output against the reference...
57
+ * `,
58
+ * responseFields: { score: "number", reasoning: "string" },
59
+ * labels: [
60
+ * { min: 0.7, label: "good" },
61
+ * { min: 0.4, label: "fair" },
62
+ * { min: 0, label: "poor" },
63
+ * ],
20
64
  * });
65
+ *
66
+ * // Use with unified records
67
+ * const results = await myMetric([
68
+ * { id: "1", output: "answer A", reference: "correct answer" },
69
+ * { id: "2", output: "answer B", reference: "expected B" },
70
+ * ]);
21
71
  * ```
22
72
  */
23
- declare function registerMetric(name: string, fn: MetricFn): void;
24
- /**
25
- * Gets a registered custom metric
26
- */
27
- declare function getMetric(name: string): MetricFn | undefined;
28
- /**
29
- * Runs a registered metric
30
- */
31
- declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
32
- /**
33
- * Lists all registered custom metrics
34
- */
35
- declare function listMetrics(): string[];
36
- /**
37
- * Unregisters a metric (mainly for testing)
38
- */
39
- declare function unregisterMetric(name: string): boolean;
73
+ declare function createLLMMetric(config: LLMMetricConfig): LLMMetric;
74
+
40
75
  /**
41
- * Clears all registered metrics (mainly for testing)
76
+ * Custom metric utilities
77
+ *
78
+ * Provides simple pattern-based and keyword-based metrics for non-LLM use cases.
79
+ * For LLM-based custom metrics, use createLLMMetric() from evalsense/metrics.
42
80
  */
43
- declare function clearMetrics(): void;
81
+
44
82
  /**
45
83
  * Creates a simple string-matching metric
84
+ *
85
+ * @example
86
+ * ```ts
87
+ * const containsCodeMetric = createPatternMetric("contains-code", [
88
+ * /```[\s\S]*?```/,
89
+ * /function\s+\w+\s*\(/,
90
+ * /const\s+\w+\s*=/,
91
+ * ]);
92
+ *
93
+ * const results = await containsCodeMetric({
94
+ * outputs: [{ id: "1", output: "const x = 5" }]
95
+ * });
96
+ * ```
46
97
  */
47
98
  declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
48
99
  matchScore?: number;
@@ -50,6 +101,19 @@ declare function createPatternMetric(name: string, patterns: RegExp[], options?:
50
101
  }): MetricFn;
51
102
  /**
52
103
  * Creates a keyword-based metric
104
+ *
105
+ * @example
106
+ * ```ts
107
+ * const techTermsMetric = createKeywordMetric("tech-terms", [
108
+ * "machine learning",
109
+ * "neural network",
110
+ * "algorithm",
111
+ * ], { threshold: 0.3 });
112
+ *
113
+ * const results = await techTermsMetric({
114
+ * outputs: [{ id: "1", output: "This uses a neural network algorithm." }]
115
+ * });
116
+ * ```
53
117
  */
54
118
  declare function createKeywordMetric(name: string, keywords: string[], options?: {
55
119
  caseSensitive?: boolean;
@@ -108,6 +172,13 @@ declare function delay(ms: number): Promise<void>;
108
172
  * across all LLM-based metrics, with support for per-call overrides.
109
173
  */
110
174
 
175
+ /**
176
+ * Global defaults for LLM metrics
177
+ */
178
+ interface LLMDefaults {
179
+ /** Default evaluation mode for all metrics */
180
+ evaluationMode?: "per-row" | "batch";
181
+ }
111
182
  /**
112
183
  * Sets the global LLM client for all metrics
113
184
  *
@@ -144,6 +215,122 @@ declare function resetLLMClient(): void;
144
215
  * @returns The client to use (override or global)
145
216
  */
146
217
  declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
218
+ /**
219
+ * Executes a function with a scoped LLM client
220
+ *
221
+ * The client is automatically restored after the function completes,
222
+ * even if an error is thrown. This is ideal for testing scenarios
223
+ * where you want to use a mock client without affecting other tests.
224
+ *
225
+ * @param client - The LLM client to use for this scope
226
+ * @param fn - The async function to execute with the scoped client
227
+ * @returns The result of the function
228
+ *
229
+ * @example
230
+ * ```ts
231
+ * // No need for beforeEach(() => resetLLMClient())
232
+ * it("test with mock client", async () => {
233
+ * const result = await withLLMClient(mockClient, async () => {
234
+ * return hallucination([{ id: "1", output: "test", context: "ctx" }]);
235
+ * });
236
+ * expect(result[0].score).toBe(0.5);
237
+ * });
238
+ * ```
239
+ */
240
+ declare function withLLMClient<T>(client: LLMClient, fn: () => Promise<T>): Promise<T>;
241
+ /**
242
+ * Sets global defaults for LLM metrics
243
+ *
244
+ * @param defaults - Default options to apply to all metrics
245
+ *
246
+ * @example
247
+ * ```ts
248
+ * // Make all metrics use batch mode by default
249
+ * setDefaults({ evaluationMode: "batch" });
250
+ * ```
251
+ */
252
+ declare function setDefaults(defaults: LLMDefaults): void;
253
+ /**
254
+ * Gets the current global defaults
255
+ *
256
+ * @returns Current global defaults
257
+ */
258
+ declare function getDefaults(): LLMDefaults;
259
+ /**
260
+ * Resets global defaults to empty
261
+ */
262
+ declare function resetDefaults(): void;
263
+ /**
264
+ * Provider types for configureLLM
265
+ */
266
+ type LLMProvider = "openai" | "anthropic" | "openrouter" | "custom";
267
+ /**
268
+ * Options for configureLLM
269
+ */
270
+ interface ConfigureLLMOptions {
271
+ /** LLM provider to use */
272
+ provider: LLMProvider;
273
+ /** API key (auto-detects from environment if not provided) */
274
+ apiKey?: string;
275
+ /** Model to use (provider-specific defaults apply if not set) */
276
+ model?: string;
277
+ /** Temperature for generation */
278
+ temperature?: number;
279
+ /** Max tokens per completion */
280
+ maxTokens?: number;
281
+ /** Custom client (required when provider is "custom") */
282
+ client?: LLMClient;
283
+ /** Global defaults to set */
284
+ defaults?: LLMDefaults;
285
+ }
286
+ /**
287
+ * Options for auto-detection
288
+ */
289
+ interface ConfigureLLMAutoOptions {
290
+ /** Model to use (optional, uses provider default if not set) */
291
+ model?: string;
292
+ /** Temperature for generation */
293
+ temperature?: number;
294
+ /** Max tokens per completion */
295
+ maxTokens?: number;
296
+ /** Global defaults to set */
297
+ defaults?: LLMDefaults;
298
+ }
299
+ /**
300
+ * One-step LLM configuration
301
+ *
302
+ * Simplifies LLM setup by combining adapter creation and client setting
303
+ * into a single call with environment variable auto-detection.
304
+ *
305
+ * @param options - Configuration options
306
+ * @returns The configured LLM client
307
+ *
308
+ * @example
309
+ * ```ts
310
+ * // Explicit provider (API key from environment)
311
+ * configureLLM({ provider: "openai", model: "gpt-4" });
312
+ *
313
+ * // With explicit API key
314
+ * configureLLM({
315
+ * provider: "anthropic",
316
+ * apiKey: "sk-ant-...",
317
+ * model: "claude-3-5-sonnet-20241022"
318
+ * });
319
+ *
320
+ * // With global defaults
321
+ * configureLLM({
322
+ * provider: "openai",
323
+ * defaults: { evaluationMode: "batch" }
324
+ * });
325
+ *
326
+ * // Custom client
327
+ * configureLLM({ provider: "custom", client: myClient });
328
+ * ```
329
+ */
330
+ declare function configureLLM(options: ConfigureLLMOptions): LLMClient;
331
+ declare namespace configureLLM {
332
+ var auto: (options?: ConfigureLLMAutoOptions) => LLMClient;
333
+ }
147
334
 
148
335
  /**
149
336
  * Utilities for LLM-based metric evaluation
@@ -547,4 +734,46 @@ interface OpenRouterAdapterOptions {
547
734
  */
548
735
  declare function createOpenRouterAdapter(apiKey: string, options?: OpenRouterAdapterOptions): LLMClient;
549
736
 
550
- export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, clearMetrics, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getLLMClient, getMetric, listMetrics, normalizeScore, parseJSONResponse, registerMetric, requireLLMClient, resetLLMClient, runMetric, scoreToLabel, setLLMClient, unregisterMetric, validateResponse, withTimeout };
737
+ /**
738
+ * Metrics module - entry point for evalsense/metrics
739
+ *
740
+ * Provides LLM-based metrics, custom metric utilities, and LLM client management.
741
+ */
742
+
743
+ /**
744
+ * Testing utilities for LLM metrics
745
+ *
746
+ * Provides convenient access to all testing-related functions in one namespace.
747
+ *
748
+ * @example
749
+ * ```ts
750
+ * import { testing } from "evalsense/metrics";
751
+ *
752
+ * describe("My tests", () => {
753
+ * beforeEach(testing.reset);
754
+ *
755
+ * it("test with mock", async () => {
756
+ * const result = await testing.withClient(
757
+ * testing.mock({ response: { score: 0.8 } }),
758
+ * async () => hallucination([...])
759
+ * );
760
+ * });
761
+ * });
762
+ * ```
763
+ */
764
+ declare const testing: {
765
+ /** Resets global LLM client and defaults */
766
+ reset: () => void;
767
+ /** Creates a mock LLM client */
768
+ mock: typeof createMockLLMClient;
769
+ /** Executes function with scoped LLM client */
770
+ withClient: typeof withLLMClient;
771
+ /** Creates a mock client that returns sequential responses */
772
+ sequentialMock: typeof createSequentialMockClient;
773
+ /** Creates a mock client that always errors */
774
+ errorMock: typeof createErrorMockClient;
775
+ /** Creates a spy mock client that records all prompts */
776
+ spyMock: typeof createSpyMockClient;
777
+ };
778
+
779
+ export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type ConfigureLLMAutoOptions, type ConfigureLLMOptions, type LLMDefaults, LLMMetric, LLMMetricConfig, type LLMProvider, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, configureLLM, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createLLMMetric, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getDefaults, getLLMClient, normalizeScore, parseJSONResponse, requireLLMClient, resetDefaults, resetLLMClient, scoreToLabel, setDefaults, setLLMClient, testing, validateResponse, withLLMClient, withTimeout };
@@ -1,48 +1,99 @@
1
- export { FaithfulnessConfig, HallucinationConfig, RelevanceConfig, ToxicityConfig, faithfulness, hallucination, relevance, toxicity } from './opinionated/index.js';
2
- import { M as MetricFn, a as MetricConfig, b as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-C71p0wzM.js';
1
+ import { L as LLMMetricConfig, a as LLMMetric } from '../index-CoMpaW-K.js';
2
+ export { E as EvalRecord, I as InputSpec, b as LLMMetricOptions, c as LabelThreshold, R as ResponseFieldType, f as faithfulness, h as hallucination, r as relevance, t as toxicity } from '../index-CoMpaW-K.js';
3
+ import { M as MetricFn, a as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-D0hzfyKm.js';
3
4
 
4
5
  /**
5
- * Custom metric registration
6
+ * Factory function for creating LLM-based metrics
7
+ *
8
+ * Reduces metric definition from 90+ lines to ~15 lines with a declarative API.
9
+ * Eliminates parallel array matching and provides unified record input.
10
+ *
11
+ * @example
12
+ * ```ts
13
+ * const answerCorrectness = createLLMMetric({
14
+ * name: "answer-correctness",
15
+ * inputs: ["output", "reference"],
16
+ * prompt: ANSWER_CORRECTNESS_PROMPT,
17
+ * responseFields: { score: "number", reasoning: "string" },
18
+ * labels: [
19
+ * { min: 0.8, label: "correct" },
20
+ * { min: 0.5, label: "partial" },
21
+ * { min: 0, label: "incorrect" },
22
+ * ],
23
+ * });
24
+ *
25
+ * // Usage with unified records
26
+ * const results = await answerCorrectness([
27
+ * { id: "1", output: "Paris", reference: "Paris is the capital" },
28
+ * ]);
29
+ * ```
6
30
  */
7
31
 
8
32
  /**
9
- * Registers a custom metric
33
+ * Creates an LLM-based metric function
34
+ *
35
+ * This factory function eliminates boilerplate by:
36
+ * - Handling LLM client validation
37
+ * - Managing structured output with fallback to text parsing
38
+ * - Normalizing scores to 0-1 range
39
+ * - Converting scores to labels using thresholds
40
+ * - Supporting both per-row and batch evaluation modes
41
+ * - Providing consistent error handling
42
+ *
43
+ * @param config - Metric configuration
44
+ * @returns A metric function that takes unified records
10
45
  *
11
46
  * @example
12
47
  * ```ts
13
- * registerMetric("custom-relevance", async ({ outputs, query }) => {
14
- * // Custom evaluation logic
15
- * return outputs.map(o => ({
16
- * id: o.id,
17
- * metric: "custom-relevance",
18
- * score: evaluateRelevance(o.output, query),
19
- * }));
48
+ * // Create a custom LLM metric
49
+ * const myMetric = createLLMMetric({
50
+ * name: "my-metric",
51
+ * inputs: ["output", "reference"],
52
+ * prompt: `
53
+ * Reference: {reference}
54
+ * Output: {output}
55
+ *
56
+ * Evaluate the output against the reference...
57
+ * `,
58
+ * responseFields: { score: "number", reasoning: "string" },
59
+ * labels: [
60
+ * { min: 0.7, label: "good" },
61
+ * { min: 0.4, label: "fair" },
62
+ * { min: 0, label: "poor" },
63
+ * ],
20
64
  * });
65
+ *
66
+ * // Use with unified records
67
+ * const results = await myMetric([
68
+ * { id: "1", output: "answer A", reference: "correct answer" },
69
+ * { id: "2", output: "answer B", reference: "expected B" },
70
+ * ]);
21
71
  * ```
22
72
  */
23
- declare function registerMetric(name: string, fn: MetricFn): void;
24
- /**
25
- * Gets a registered custom metric
26
- */
27
- declare function getMetric(name: string): MetricFn | undefined;
28
- /**
29
- * Runs a registered metric
30
- */
31
- declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
32
- /**
33
- * Lists all registered custom metrics
34
- */
35
- declare function listMetrics(): string[];
36
- /**
37
- * Unregisters a metric (mainly for testing)
38
- */
39
- declare function unregisterMetric(name: string): boolean;
73
+ declare function createLLMMetric(config: LLMMetricConfig): LLMMetric;
74
+
40
75
  /**
41
- * Clears all registered metrics (mainly for testing)
76
+ * Custom metric utilities
77
+ *
78
+ * Provides simple pattern-based and keyword-based metrics for non-LLM use cases.
79
+ * For LLM-based custom metrics, use createLLMMetric() from evalsense/metrics.
42
80
  */
43
- declare function clearMetrics(): void;
81
+
44
82
  /**
45
83
  * Creates a simple string-matching metric
84
+ *
85
+ * @example
86
+ * ```ts
87
+ * const containsCodeMetric = createPatternMetric("contains-code", [
88
+ * /```[\s\S]*?```/,
89
+ * /function\s+\w+\s*\(/,
90
+ * /const\s+\w+\s*=/,
91
+ * ]);
92
+ *
93
+ * const results = await containsCodeMetric({
94
+ * outputs: [{ id: "1", output: "const x = 5" }]
95
+ * });
96
+ * ```
46
97
  */
47
98
  declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
48
99
  matchScore?: number;
@@ -50,6 +101,19 @@ declare function createPatternMetric(name: string, patterns: RegExp[], options?:
50
101
  }): MetricFn;
51
102
  /**
52
103
  * Creates a keyword-based metric
104
+ *
105
+ * @example
106
+ * ```ts
107
+ * const techTermsMetric = createKeywordMetric("tech-terms", [
108
+ * "machine learning",
109
+ * "neural network",
110
+ * "algorithm",
111
+ * ], { threshold: 0.3 });
112
+ *
113
+ * const results = await techTermsMetric({
114
+ * outputs: [{ id: "1", output: "This uses a neural network algorithm." }]
115
+ * });
116
+ * ```
53
117
  */
54
118
  declare function createKeywordMetric(name: string, keywords: string[], options?: {
55
119
  caseSensitive?: boolean;
@@ -108,6 +172,13 @@ declare function delay(ms: number): Promise<void>;
108
172
  * across all LLM-based metrics, with support for per-call overrides.
109
173
  */
110
174
 
175
+ /**
176
+ * Global defaults for LLM metrics
177
+ */
178
+ interface LLMDefaults {
179
+ /** Default evaluation mode for all metrics */
180
+ evaluationMode?: "per-row" | "batch";
181
+ }
111
182
  /**
112
183
  * Sets the global LLM client for all metrics
113
184
  *
@@ -144,6 +215,122 @@ declare function resetLLMClient(): void;
144
215
  * @returns The client to use (override or global)
145
216
  */
146
217
  declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
218
+ /**
219
+ * Executes a function with a scoped LLM client
220
+ *
221
+ * The client is automatically restored after the function completes,
222
+ * even if an error is thrown. This is ideal for testing scenarios
223
+ * where you want to use a mock client without affecting other tests.
224
+ *
225
+ * @param client - The LLM client to use for this scope
226
+ * @param fn - The async function to execute with the scoped client
227
+ * @returns The result of the function
228
+ *
229
+ * @example
230
+ * ```ts
231
+ * // No need for beforeEach(() => resetLLMClient())
232
+ * it("test with mock client", async () => {
233
+ * const result = await withLLMClient(mockClient, async () => {
234
+ * return hallucination([{ id: "1", output: "test", context: "ctx" }]);
235
+ * });
236
+ * expect(result[0].score).toBe(0.5);
237
+ * });
238
+ * ```
239
+ */
240
+ declare function withLLMClient<T>(client: LLMClient, fn: () => Promise<T>): Promise<T>;
241
+ /**
242
+ * Sets global defaults for LLM metrics
243
+ *
244
+ * @param defaults - Default options to apply to all metrics
245
+ *
246
+ * @example
247
+ * ```ts
248
+ * // Make all metrics use batch mode by default
249
+ * setDefaults({ evaluationMode: "batch" });
250
+ * ```
251
+ */
252
+ declare function setDefaults(defaults: LLMDefaults): void;
253
+ /**
254
+ * Gets the current global defaults
255
+ *
256
+ * @returns Current global defaults
257
+ */
258
+ declare function getDefaults(): LLMDefaults;
259
+ /**
260
+ * Resets global defaults to empty
261
+ */
262
+ declare function resetDefaults(): void;
263
+ /**
264
+ * Provider types for configureLLM
265
+ */
266
+ type LLMProvider = "openai" | "anthropic" | "openrouter" | "custom";
267
+ /**
268
+ * Options for configureLLM
269
+ */
270
+ interface ConfigureLLMOptions {
271
+ /** LLM provider to use */
272
+ provider: LLMProvider;
273
+ /** API key (auto-detects from environment if not provided) */
274
+ apiKey?: string;
275
+ /** Model to use (provider-specific defaults apply if not set) */
276
+ model?: string;
277
+ /** Temperature for generation */
278
+ temperature?: number;
279
+ /** Max tokens per completion */
280
+ maxTokens?: number;
281
+ /** Custom client (required when provider is "custom") */
282
+ client?: LLMClient;
283
+ /** Global defaults to set */
284
+ defaults?: LLMDefaults;
285
+ }
286
+ /**
287
+ * Options for auto-detection
288
+ */
289
+ interface ConfigureLLMAutoOptions {
290
+ /** Model to use (optional, uses provider default if not set) */
291
+ model?: string;
292
+ /** Temperature for generation */
293
+ temperature?: number;
294
+ /** Max tokens per completion */
295
+ maxTokens?: number;
296
+ /** Global defaults to set */
297
+ defaults?: LLMDefaults;
298
+ }
299
+ /**
300
+ * One-step LLM configuration
301
+ *
302
+ * Simplifies LLM setup by combining adapter creation and client setting
303
+ * into a single call with environment variable auto-detection.
304
+ *
305
+ * @param options - Configuration options
306
+ * @returns The configured LLM client
307
+ *
308
+ * @example
309
+ * ```ts
310
+ * // Explicit provider (API key from environment)
311
+ * configureLLM({ provider: "openai", model: "gpt-4" });
312
+ *
313
+ * // With explicit API key
314
+ * configureLLM({
315
+ * provider: "anthropic",
316
+ * apiKey: "sk-ant-...",
317
+ * model: "claude-3-5-sonnet-20241022"
318
+ * });
319
+ *
320
+ * // With global defaults
321
+ * configureLLM({
322
+ * provider: "openai",
323
+ * defaults: { evaluationMode: "batch" }
324
+ * });
325
+ *
326
+ * // Custom client
327
+ * configureLLM({ provider: "custom", client: myClient });
328
+ * ```
329
+ */
330
+ declare function configureLLM(options: ConfigureLLMOptions): LLMClient;
331
+ declare namespace configureLLM {
332
+ var auto: (options?: ConfigureLLMAutoOptions) => LLMClient;
333
+ }
147
334
 
148
335
  /**
149
336
  * Utilities for LLM-based metric evaluation
@@ -547,4 +734,46 @@ interface OpenRouterAdapterOptions {
547
734
  */
548
735
  declare function createOpenRouterAdapter(apiKey: string, options?: OpenRouterAdapterOptions): LLMClient;
549
736
 
550
- export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, clearMetrics, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getLLMClient, getMetric, listMetrics, normalizeScore, parseJSONResponse, registerMetric, requireLLMClient, resetLLMClient, runMetric, scoreToLabel, setLLMClient, unregisterMetric, validateResponse, withTimeout };
737
+ /**
738
+ * Metrics module - entry point for evalsense/metrics
739
+ *
740
+ * Provides LLM-based metrics, custom metric utilities, and LLM client management.
741
+ */
742
+
743
+ /**
744
+ * Testing utilities for LLM metrics
745
+ *
746
+ * Provides convenient access to all testing-related functions in one namespace.
747
+ *
748
+ * @example
749
+ * ```ts
750
+ * import { testing } from "evalsense/metrics";
751
+ *
752
+ * describe("My tests", () => {
753
+ * beforeEach(testing.reset);
754
+ *
755
+ * it("test with mock", async () => {
756
+ * const result = await testing.withClient(
757
+ * testing.mock({ response: { score: 0.8 } }),
758
+ * async () => hallucination([...])
759
+ * );
760
+ * });
761
+ * });
762
+ * ```
763
+ */
764
+ declare const testing: {
765
+ /** Resets global LLM client and defaults */
766
+ reset: () => void;
767
+ /** Creates a mock LLM client */
768
+ mock: typeof createMockLLMClient;
769
+ /** Executes function with scoped LLM client */
770
+ withClient: typeof withLLMClient;
771
+ /** Creates a mock client that returns sequential responses */
772
+ sequentialMock: typeof createSequentialMockClient;
773
+ /** Creates a mock client that always errors */
774
+ errorMock: typeof createErrorMockClient;
775
+ /** Creates a spy mock client that records all prompts */
776
+ spyMock: typeof createSpyMockClient;
777
+ };
778
+
779
+ export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type ConfigureLLMAutoOptions, type ConfigureLLMOptions, type LLMDefaults, LLMMetric, LLMMetricConfig, type LLMProvider, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, configureLLM, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createLLMMetric, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getDefaults, getLLMClient, normalizeScore, parseJSONResponse, requireLLMClient, resetDefaults, resetLLMClient, scoreToLabel, setDefaults, setLLMClient, testing, validateResponse, withLLMClient, withTimeout };