evalsense 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +235 -98
- package/dist/{chunk-BFGA2NUB.cjs → chunk-4BKZPVY4.cjs} +13 -6
- package/dist/chunk-4BKZPVY4.cjs.map +1 -0
- package/dist/{chunk-IYLSY7NX.js → chunk-IUVDDMJ3.js} +13 -6
- package/dist/chunk-IUVDDMJ3.js.map +1 -0
- package/dist/chunk-NCCQRZ2Y.cjs +1141 -0
- package/dist/chunk-NCCQRZ2Y.cjs.map +1 -0
- package/dist/chunk-TDGWDK2L.js +1108 -0
- package/dist/chunk-TDGWDK2L.js.map +1 -0
- package/dist/cli.cjs +11 -11
- package/dist/cli.js +1 -1
- package/dist/index-CATqAHNK.d.cts +416 -0
- package/dist/index-CoMpaW-K.d.ts +416 -0
- package/dist/index.cjs +507 -580
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +210 -161
- package/dist/index.d.ts +210 -161
- package/dist/index.js +455 -524
- package/dist/index.js.map +1 -1
- package/dist/metrics/index.cjs +103 -342
- package/dist/metrics/index.cjs.map +1 -1
- package/dist/metrics/index.d.cts +260 -31
- package/dist/metrics/index.d.ts +260 -31
- package/dist/metrics/index.js +24 -312
- package/dist/metrics/index.js.map +1 -1
- package/dist/metrics/opinionated/index.cjs +5 -5
- package/dist/metrics/opinionated/index.d.cts +2 -163
- package/dist/metrics/opinionated/index.d.ts +2 -163
- package/dist/metrics/opinionated/index.js +1 -1
- package/dist/{types-C71p0wzM.d.cts → types-D0hzfyKm.d.cts} +1 -13
- package/dist/{types-C71p0wzM.d.ts → types-D0hzfyKm.d.ts} +1 -13
- package/package.json +1 -1
- package/dist/chunk-BFGA2NUB.cjs.map +0 -1
- package/dist/chunk-IYLSY7NX.js.map +0 -1
- package/dist/chunk-RZFLCWTW.cjs +0 -942
- package/dist/chunk-RZFLCWTW.cjs.map +0 -1
- package/dist/chunk-Z3U6AUWX.js +0 -925
- package/dist/chunk-Z3U6AUWX.js.map +0 -1
package/dist/metrics/index.d.cts
CHANGED
|
@@ -1,48 +1,99 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import { L as LLMMetricConfig, a as LLMMetric } from '../index-CATqAHNK.cjs';
|
|
2
|
+
export { E as EvalRecord, I as InputSpec, b as LLMMetricOptions, c as LabelThreshold, R as ResponseFieldType, f as faithfulness, h as hallucination, r as relevance, t as toxicity } from '../index-CATqAHNK.cjs';
|
|
3
|
+
import { M as MetricFn, a as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-D0hzfyKm.cjs';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
|
-
*
|
|
6
|
+
* Factory function for creating LLM-based metrics
|
|
7
|
+
*
|
|
8
|
+
* Reduces metric definition from 90+ lines to ~15 lines with a declarative API.
|
|
9
|
+
* Eliminates parallel array matching and provides unified record input.
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts
|
|
13
|
+
* const answerCorrectness = createLLMMetric({
|
|
14
|
+
* name: "answer-correctness",
|
|
15
|
+
* inputs: ["output", "reference"],
|
|
16
|
+
* prompt: ANSWER_CORRECTNESS_PROMPT,
|
|
17
|
+
* responseFields: { score: "number", reasoning: "string" },
|
|
18
|
+
* labels: [
|
|
19
|
+
* { min: 0.8, label: "correct" },
|
|
20
|
+
* { min: 0.5, label: "partial" },
|
|
21
|
+
* { min: 0, label: "incorrect" },
|
|
22
|
+
* ],
|
|
23
|
+
* });
|
|
24
|
+
*
|
|
25
|
+
* // Usage with unified records
|
|
26
|
+
* const results = await answerCorrectness([
|
|
27
|
+
* { id: "1", output: "Paris", reference: "Paris is the capital" },
|
|
28
|
+
* ]);
|
|
29
|
+
* ```
|
|
6
30
|
*/
|
|
7
31
|
|
|
8
32
|
/**
|
|
9
|
-
*
|
|
33
|
+
* Creates an LLM-based metric function
|
|
34
|
+
*
|
|
35
|
+
* This factory function eliminates boilerplate by:
|
|
36
|
+
* - Handling LLM client validation
|
|
37
|
+
* - Managing structured output with fallback to text parsing
|
|
38
|
+
* - Normalizing scores to 0-1 range
|
|
39
|
+
* - Converting scores to labels using thresholds
|
|
40
|
+
* - Supporting both per-row and batch evaluation modes
|
|
41
|
+
* - Providing consistent error handling
|
|
42
|
+
*
|
|
43
|
+
* @param config - Metric configuration
|
|
44
|
+
* @returns A metric function that takes unified records
|
|
10
45
|
*
|
|
11
46
|
* @example
|
|
12
47
|
* ```ts
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
48
|
+
* // Create a custom LLM metric
|
|
49
|
+
* const myMetric = createLLMMetric({
|
|
50
|
+
* name: "my-metric",
|
|
51
|
+
* inputs: ["output", "reference"],
|
|
52
|
+
* prompt: `
|
|
53
|
+
* Reference: {reference}
|
|
54
|
+
* Output: {output}
|
|
55
|
+
*
|
|
56
|
+
* Evaluate the output against the reference...
|
|
57
|
+
* `,
|
|
58
|
+
* responseFields: { score: "number", reasoning: "string" },
|
|
59
|
+
* labels: [
|
|
60
|
+
* { min: 0.7, label: "good" },
|
|
61
|
+
* { min: 0.4, label: "fair" },
|
|
62
|
+
* { min: 0, label: "poor" },
|
|
63
|
+
* ],
|
|
20
64
|
* });
|
|
65
|
+
*
|
|
66
|
+
* // Use with unified records
|
|
67
|
+
* const results = await myMetric([
|
|
68
|
+
* { id: "1", output: "answer A", reference: "correct answer" },
|
|
69
|
+
* { id: "2", output: "answer B", reference: "expected B" },
|
|
70
|
+
* ]);
|
|
21
71
|
* ```
|
|
22
72
|
*/
|
|
23
|
-
declare function
|
|
24
|
-
|
|
25
|
-
* Gets a registered custom metric
|
|
26
|
-
*/
|
|
27
|
-
declare function getMetric(name: string): MetricFn | undefined;
|
|
28
|
-
/**
|
|
29
|
-
* Runs a registered metric
|
|
30
|
-
*/
|
|
31
|
-
declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
|
|
32
|
-
/**
|
|
33
|
-
* Lists all registered custom metrics
|
|
34
|
-
*/
|
|
35
|
-
declare function listMetrics(): string[];
|
|
36
|
-
/**
|
|
37
|
-
* Unregisters a metric (mainly for testing)
|
|
38
|
-
*/
|
|
39
|
-
declare function unregisterMetric(name: string): boolean;
|
|
73
|
+
declare function createLLMMetric(config: LLMMetricConfig): LLMMetric;
|
|
74
|
+
|
|
40
75
|
/**
|
|
41
|
-
*
|
|
76
|
+
* Custom metric utilities
|
|
77
|
+
*
|
|
78
|
+
* Provides simple pattern-based and keyword-based metrics for non-LLM use cases.
|
|
79
|
+
* For LLM-based custom metrics, use createLLMMetric() from evalsense/metrics.
|
|
42
80
|
*/
|
|
43
|
-
|
|
81
|
+
|
|
44
82
|
/**
|
|
45
83
|
* Creates a simple string-matching metric
|
|
84
|
+
*
|
|
85
|
+
* @example
|
|
86
|
+
* ```ts
|
|
87
|
+
* const containsCodeMetric = createPatternMetric("contains-code", [
|
|
88
|
+
* /```[\s\S]*?```/,
|
|
89
|
+
* /function\s+\w+\s*\(/,
|
|
90
|
+
* /const\s+\w+\s*=/,
|
|
91
|
+
* ]);
|
|
92
|
+
*
|
|
93
|
+
* const results = await containsCodeMetric({
|
|
94
|
+
* outputs: [{ id: "1", output: "const x = 5" }]
|
|
95
|
+
* });
|
|
96
|
+
* ```
|
|
46
97
|
*/
|
|
47
98
|
declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
|
|
48
99
|
matchScore?: number;
|
|
@@ -50,6 +101,19 @@ declare function createPatternMetric(name: string, patterns: RegExp[], options?:
|
|
|
50
101
|
}): MetricFn;
|
|
51
102
|
/**
|
|
52
103
|
* Creates a keyword-based metric
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```ts
|
|
107
|
+
* const techTermsMetric = createKeywordMetric("tech-terms", [
|
|
108
|
+
* "machine learning",
|
|
109
|
+
* "neural network",
|
|
110
|
+
* "algorithm",
|
|
111
|
+
* ], { threshold: 0.3 });
|
|
112
|
+
*
|
|
113
|
+
* const results = await techTermsMetric({
|
|
114
|
+
* outputs: [{ id: "1", output: "This uses a neural network algorithm." }]
|
|
115
|
+
* });
|
|
116
|
+
* ```
|
|
53
117
|
*/
|
|
54
118
|
declare function createKeywordMetric(name: string, keywords: string[], options?: {
|
|
55
119
|
caseSensitive?: boolean;
|
|
@@ -108,6 +172,13 @@ declare function delay(ms: number): Promise<void>;
|
|
|
108
172
|
* across all LLM-based metrics, with support for per-call overrides.
|
|
109
173
|
*/
|
|
110
174
|
|
|
175
|
+
/**
|
|
176
|
+
* Global defaults for LLM metrics
|
|
177
|
+
*/
|
|
178
|
+
interface LLMDefaults {
|
|
179
|
+
/** Default evaluation mode for all metrics */
|
|
180
|
+
evaluationMode?: "per-row" | "batch";
|
|
181
|
+
}
|
|
111
182
|
/**
|
|
112
183
|
* Sets the global LLM client for all metrics
|
|
113
184
|
*
|
|
@@ -144,6 +215,122 @@ declare function resetLLMClient(): void;
|
|
|
144
215
|
* @returns The client to use (override or global)
|
|
145
216
|
*/
|
|
146
217
|
declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
|
|
218
|
+
/**
|
|
219
|
+
* Executes a function with a scoped LLM client
|
|
220
|
+
*
|
|
221
|
+
* The client is automatically restored after the function completes,
|
|
222
|
+
* even if an error is thrown. This is ideal for testing scenarios
|
|
223
|
+
* where you want to use a mock client without affecting other tests.
|
|
224
|
+
*
|
|
225
|
+
* @param client - The LLM client to use for this scope
|
|
226
|
+
* @param fn - The async function to execute with the scoped client
|
|
227
|
+
* @returns The result of the function
|
|
228
|
+
*
|
|
229
|
+
* @example
|
|
230
|
+
* ```ts
|
|
231
|
+
* // No need for beforeEach(() => resetLLMClient())
|
|
232
|
+
* it("test with mock client", async () => {
|
|
233
|
+
* const result = await withLLMClient(mockClient, async () => {
|
|
234
|
+
* return hallucination([{ id: "1", output: "test", context: "ctx" }]);
|
|
235
|
+
* });
|
|
236
|
+
* expect(result[0].score).toBe(0.5);
|
|
237
|
+
* });
|
|
238
|
+
* ```
|
|
239
|
+
*/
|
|
240
|
+
declare function withLLMClient<T>(client: LLMClient, fn: () => Promise<T>): Promise<T>;
|
|
241
|
+
/**
|
|
242
|
+
* Sets global defaults for LLM metrics
|
|
243
|
+
*
|
|
244
|
+
* @param defaults - Default options to apply to all metrics
|
|
245
|
+
*
|
|
246
|
+
* @example
|
|
247
|
+
* ```ts
|
|
248
|
+
* // Make all metrics use batch mode by default
|
|
249
|
+
* setDefaults({ evaluationMode: "batch" });
|
|
250
|
+
* ```
|
|
251
|
+
*/
|
|
252
|
+
declare function setDefaults(defaults: LLMDefaults): void;
|
|
253
|
+
/**
|
|
254
|
+
* Gets the current global defaults
|
|
255
|
+
*
|
|
256
|
+
* @returns Current global defaults
|
|
257
|
+
*/
|
|
258
|
+
declare function getDefaults(): LLMDefaults;
|
|
259
|
+
/**
|
|
260
|
+
* Resets global defaults to empty
|
|
261
|
+
*/
|
|
262
|
+
declare function resetDefaults(): void;
|
|
263
|
+
/**
|
|
264
|
+
* Provider types for configureLLM
|
|
265
|
+
*/
|
|
266
|
+
type LLMProvider = "openai" | "anthropic" | "openrouter" | "custom";
|
|
267
|
+
/**
|
|
268
|
+
* Options for configureLLM
|
|
269
|
+
*/
|
|
270
|
+
interface ConfigureLLMOptions {
|
|
271
|
+
/** LLM provider to use */
|
|
272
|
+
provider: LLMProvider;
|
|
273
|
+
/** API key (auto-detects from environment if not provided) */
|
|
274
|
+
apiKey?: string;
|
|
275
|
+
/** Model to use (provider-specific defaults apply if not set) */
|
|
276
|
+
model?: string;
|
|
277
|
+
/** Temperature for generation */
|
|
278
|
+
temperature?: number;
|
|
279
|
+
/** Max tokens per completion */
|
|
280
|
+
maxTokens?: number;
|
|
281
|
+
/** Custom client (required when provider is "custom") */
|
|
282
|
+
client?: LLMClient;
|
|
283
|
+
/** Global defaults to set */
|
|
284
|
+
defaults?: LLMDefaults;
|
|
285
|
+
}
|
|
286
|
+
/**
|
|
287
|
+
* Options for auto-detection
|
|
288
|
+
*/
|
|
289
|
+
interface ConfigureLLMAutoOptions {
|
|
290
|
+
/** Model to use (optional, uses provider default if not set) */
|
|
291
|
+
model?: string;
|
|
292
|
+
/** Temperature for generation */
|
|
293
|
+
temperature?: number;
|
|
294
|
+
/** Max tokens per completion */
|
|
295
|
+
maxTokens?: number;
|
|
296
|
+
/** Global defaults to set */
|
|
297
|
+
defaults?: LLMDefaults;
|
|
298
|
+
}
|
|
299
|
+
/**
|
|
300
|
+
* One-step LLM configuration
|
|
301
|
+
*
|
|
302
|
+
* Simplifies LLM setup by combining adapter creation and client setting
|
|
303
|
+
* into a single call with environment variable auto-detection.
|
|
304
|
+
*
|
|
305
|
+
* @param options - Configuration options
|
|
306
|
+
* @returns The configured LLM client
|
|
307
|
+
*
|
|
308
|
+
* @example
|
|
309
|
+
* ```ts
|
|
310
|
+
* // Explicit provider (API key from environment)
|
|
311
|
+
* configureLLM({ provider: "openai", model: "gpt-4" });
|
|
312
|
+
*
|
|
313
|
+
* // With explicit API key
|
|
314
|
+
* configureLLM({
|
|
315
|
+
* provider: "anthropic",
|
|
316
|
+
* apiKey: "sk-ant-...",
|
|
317
|
+
* model: "claude-3-5-sonnet-20241022"
|
|
318
|
+
* });
|
|
319
|
+
*
|
|
320
|
+
* // With global defaults
|
|
321
|
+
* configureLLM({
|
|
322
|
+
* provider: "openai",
|
|
323
|
+
* defaults: { evaluationMode: "batch" }
|
|
324
|
+
* });
|
|
325
|
+
*
|
|
326
|
+
* // Custom client
|
|
327
|
+
* configureLLM({ provider: "custom", client: myClient });
|
|
328
|
+
* ```
|
|
329
|
+
*/
|
|
330
|
+
declare function configureLLM(options: ConfigureLLMOptions): LLMClient;
|
|
331
|
+
declare namespace configureLLM {
|
|
332
|
+
var auto: (options?: ConfigureLLMAutoOptions) => LLMClient;
|
|
333
|
+
}
|
|
147
334
|
|
|
148
335
|
/**
|
|
149
336
|
* Utilities for LLM-based metric evaluation
|
|
@@ -547,4 +734,46 @@ interface OpenRouterAdapterOptions {
|
|
|
547
734
|
*/
|
|
548
735
|
declare function createOpenRouterAdapter(apiKey: string, options?: OpenRouterAdapterOptions): LLMClient;
|
|
549
736
|
|
|
550
|
-
|
|
737
|
+
/**
|
|
738
|
+
* Metrics module - entry point for evalsense/metrics
|
|
739
|
+
*
|
|
740
|
+
* Provides LLM-based metrics, custom metric utilities, and LLM client management.
|
|
741
|
+
*/
|
|
742
|
+
|
|
743
|
+
/**
|
|
744
|
+
* Testing utilities for LLM metrics
|
|
745
|
+
*
|
|
746
|
+
* Provides convenient access to all testing-related functions in one namespace.
|
|
747
|
+
*
|
|
748
|
+
* @example
|
|
749
|
+
* ```ts
|
|
750
|
+
* import { testing } from "evalsense/metrics";
|
|
751
|
+
*
|
|
752
|
+
* describe("My tests", () => {
|
|
753
|
+
* beforeEach(testing.reset);
|
|
754
|
+
*
|
|
755
|
+
* it("test with mock", async () => {
|
|
756
|
+
* const result = await testing.withClient(
|
|
757
|
+
* testing.mock({ response: { score: 0.8 } }),
|
|
758
|
+
* async () => hallucination([...])
|
|
759
|
+
* );
|
|
760
|
+
* });
|
|
761
|
+
* });
|
|
762
|
+
* ```
|
|
763
|
+
*/
|
|
764
|
+
declare const testing: {
|
|
765
|
+
/** Resets global LLM client and defaults */
|
|
766
|
+
reset: () => void;
|
|
767
|
+
/** Creates a mock LLM client */
|
|
768
|
+
mock: typeof createMockLLMClient;
|
|
769
|
+
/** Executes function with scoped LLM client */
|
|
770
|
+
withClient: typeof withLLMClient;
|
|
771
|
+
/** Creates a mock client that returns sequential responses */
|
|
772
|
+
sequentialMock: typeof createSequentialMockClient;
|
|
773
|
+
/** Creates a mock client that always errors */
|
|
774
|
+
errorMock: typeof createErrorMockClient;
|
|
775
|
+
/** Creates a spy mock client that records all prompts */
|
|
776
|
+
spyMock: typeof createSpyMockClient;
|
|
777
|
+
};
|
|
778
|
+
|
|
779
|
+
export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type ConfigureLLMAutoOptions, type ConfigureLLMOptions, type LLMDefaults, LLMMetric, LLMMetricConfig, type LLMProvider, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, configureLLM, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createLLMMetric, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getDefaults, getLLMClient, normalizeScore, parseJSONResponse, requireLLMClient, resetDefaults, resetLLMClient, scoreToLabel, setDefaults, setLLMClient, testing, validateResponse, withLLMClient, withTimeout };
|
package/dist/metrics/index.d.ts
CHANGED
|
@@ -1,48 +1,99 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import { L as LLMMetricConfig, a as LLMMetric } from '../index-CoMpaW-K.js';
|
|
2
|
+
export { E as EvalRecord, I as InputSpec, b as LLMMetricOptions, c as LabelThreshold, R as ResponseFieldType, f as faithfulness, h as hallucination, r as relevance, t as toxicity } from '../index-CoMpaW-K.js';
|
|
3
|
+
import { M as MetricFn, a as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-D0hzfyKm.js';
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
|
-
*
|
|
6
|
+
* Factory function for creating LLM-based metrics
|
|
7
|
+
*
|
|
8
|
+
* Reduces metric definition from 90+ lines to ~15 lines with a declarative API.
|
|
9
|
+
* Eliminates parallel array matching and provides unified record input.
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts
|
|
13
|
+
* const answerCorrectness = createLLMMetric({
|
|
14
|
+
* name: "answer-correctness",
|
|
15
|
+
* inputs: ["output", "reference"],
|
|
16
|
+
* prompt: ANSWER_CORRECTNESS_PROMPT,
|
|
17
|
+
* responseFields: { score: "number", reasoning: "string" },
|
|
18
|
+
* labels: [
|
|
19
|
+
* { min: 0.8, label: "correct" },
|
|
20
|
+
* { min: 0.5, label: "partial" },
|
|
21
|
+
* { min: 0, label: "incorrect" },
|
|
22
|
+
* ],
|
|
23
|
+
* });
|
|
24
|
+
*
|
|
25
|
+
* // Usage with unified records
|
|
26
|
+
* const results = await answerCorrectness([
|
|
27
|
+
* { id: "1", output: "Paris", reference: "Paris is the capital" },
|
|
28
|
+
* ]);
|
|
29
|
+
* ```
|
|
6
30
|
*/
|
|
7
31
|
|
|
8
32
|
/**
|
|
9
|
-
*
|
|
33
|
+
* Creates an LLM-based metric function
|
|
34
|
+
*
|
|
35
|
+
* This factory function eliminates boilerplate by:
|
|
36
|
+
* - Handling LLM client validation
|
|
37
|
+
* - Managing structured output with fallback to text parsing
|
|
38
|
+
* - Normalizing scores to 0-1 range
|
|
39
|
+
* - Converting scores to labels using thresholds
|
|
40
|
+
* - Supporting both per-row and batch evaluation modes
|
|
41
|
+
* - Providing consistent error handling
|
|
42
|
+
*
|
|
43
|
+
* @param config - Metric configuration
|
|
44
|
+
* @returns A metric function that takes unified records
|
|
10
45
|
*
|
|
11
46
|
* @example
|
|
12
47
|
* ```ts
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
48
|
+
* // Create a custom LLM metric
|
|
49
|
+
* const myMetric = createLLMMetric({
|
|
50
|
+
* name: "my-metric",
|
|
51
|
+
* inputs: ["output", "reference"],
|
|
52
|
+
* prompt: `
|
|
53
|
+
* Reference: {reference}
|
|
54
|
+
* Output: {output}
|
|
55
|
+
*
|
|
56
|
+
* Evaluate the output against the reference...
|
|
57
|
+
* `,
|
|
58
|
+
* responseFields: { score: "number", reasoning: "string" },
|
|
59
|
+
* labels: [
|
|
60
|
+
* { min: 0.7, label: "good" },
|
|
61
|
+
* { min: 0.4, label: "fair" },
|
|
62
|
+
* { min: 0, label: "poor" },
|
|
63
|
+
* ],
|
|
20
64
|
* });
|
|
65
|
+
*
|
|
66
|
+
* // Use with unified records
|
|
67
|
+
* const results = await myMetric([
|
|
68
|
+
* { id: "1", output: "answer A", reference: "correct answer" },
|
|
69
|
+
* { id: "2", output: "answer B", reference: "expected B" },
|
|
70
|
+
* ]);
|
|
21
71
|
* ```
|
|
22
72
|
*/
|
|
23
|
-
declare function
|
|
24
|
-
|
|
25
|
-
* Gets a registered custom metric
|
|
26
|
-
*/
|
|
27
|
-
declare function getMetric(name: string): MetricFn | undefined;
|
|
28
|
-
/**
|
|
29
|
-
* Runs a registered metric
|
|
30
|
-
*/
|
|
31
|
-
declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
|
|
32
|
-
/**
|
|
33
|
-
* Lists all registered custom metrics
|
|
34
|
-
*/
|
|
35
|
-
declare function listMetrics(): string[];
|
|
36
|
-
/**
|
|
37
|
-
* Unregisters a metric (mainly for testing)
|
|
38
|
-
*/
|
|
39
|
-
declare function unregisterMetric(name: string): boolean;
|
|
73
|
+
declare function createLLMMetric(config: LLMMetricConfig): LLMMetric;
|
|
74
|
+
|
|
40
75
|
/**
|
|
41
|
-
*
|
|
76
|
+
* Custom metric utilities
|
|
77
|
+
*
|
|
78
|
+
* Provides simple pattern-based and keyword-based metrics for non-LLM use cases.
|
|
79
|
+
* For LLM-based custom metrics, use createLLMMetric() from evalsense/metrics.
|
|
42
80
|
*/
|
|
43
|
-
|
|
81
|
+
|
|
44
82
|
/**
|
|
45
83
|
* Creates a simple string-matching metric
|
|
84
|
+
*
|
|
85
|
+
* @example
|
|
86
|
+
* ```ts
|
|
87
|
+
* const containsCodeMetric = createPatternMetric("contains-code", [
|
|
88
|
+
* /```[\s\S]*?```/,
|
|
89
|
+
* /function\s+\w+\s*\(/,
|
|
90
|
+
* /const\s+\w+\s*=/,
|
|
91
|
+
* ]);
|
|
92
|
+
*
|
|
93
|
+
* const results = await containsCodeMetric({
|
|
94
|
+
* outputs: [{ id: "1", output: "const x = 5" }]
|
|
95
|
+
* });
|
|
96
|
+
* ```
|
|
46
97
|
*/
|
|
47
98
|
declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
|
|
48
99
|
matchScore?: number;
|
|
@@ -50,6 +101,19 @@ declare function createPatternMetric(name: string, patterns: RegExp[], options?:
|
|
|
50
101
|
}): MetricFn;
|
|
51
102
|
/**
|
|
52
103
|
* Creates a keyword-based metric
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```ts
|
|
107
|
+
* const techTermsMetric = createKeywordMetric("tech-terms", [
|
|
108
|
+
* "machine learning",
|
|
109
|
+
* "neural network",
|
|
110
|
+
* "algorithm",
|
|
111
|
+
* ], { threshold: 0.3 });
|
|
112
|
+
*
|
|
113
|
+
* const results = await techTermsMetric({
|
|
114
|
+
* outputs: [{ id: "1", output: "This uses a neural network algorithm." }]
|
|
115
|
+
* });
|
|
116
|
+
* ```
|
|
53
117
|
*/
|
|
54
118
|
declare function createKeywordMetric(name: string, keywords: string[], options?: {
|
|
55
119
|
caseSensitive?: boolean;
|
|
@@ -108,6 +172,13 @@ declare function delay(ms: number): Promise<void>;
|
|
|
108
172
|
* across all LLM-based metrics, with support for per-call overrides.
|
|
109
173
|
*/
|
|
110
174
|
|
|
175
|
+
/**
|
|
176
|
+
* Global defaults for LLM metrics
|
|
177
|
+
*/
|
|
178
|
+
interface LLMDefaults {
|
|
179
|
+
/** Default evaluation mode for all metrics */
|
|
180
|
+
evaluationMode?: "per-row" | "batch";
|
|
181
|
+
}
|
|
111
182
|
/**
|
|
112
183
|
* Sets the global LLM client for all metrics
|
|
113
184
|
*
|
|
@@ -144,6 +215,122 @@ declare function resetLLMClient(): void;
|
|
|
144
215
|
* @returns The client to use (override or global)
|
|
145
216
|
*/
|
|
146
217
|
declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
|
|
218
|
+
/**
|
|
219
|
+
* Executes a function with a scoped LLM client
|
|
220
|
+
*
|
|
221
|
+
* The client is automatically restored after the function completes,
|
|
222
|
+
* even if an error is thrown. This is ideal for testing scenarios
|
|
223
|
+
* where you want to use a mock client without affecting other tests.
|
|
224
|
+
*
|
|
225
|
+
* @param client - The LLM client to use for this scope
|
|
226
|
+
* @param fn - The async function to execute with the scoped client
|
|
227
|
+
* @returns The result of the function
|
|
228
|
+
*
|
|
229
|
+
* @example
|
|
230
|
+
* ```ts
|
|
231
|
+
* // No need for beforeEach(() => resetLLMClient())
|
|
232
|
+
* it("test with mock client", async () => {
|
|
233
|
+
* const result = await withLLMClient(mockClient, async () => {
|
|
234
|
+
* return hallucination([{ id: "1", output: "test", context: "ctx" }]);
|
|
235
|
+
* });
|
|
236
|
+
* expect(result[0].score).toBe(0.5);
|
|
237
|
+
* });
|
|
238
|
+
* ```
|
|
239
|
+
*/
|
|
240
|
+
declare function withLLMClient<T>(client: LLMClient, fn: () => Promise<T>): Promise<T>;
|
|
241
|
+
/**
|
|
242
|
+
* Sets global defaults for LLM metrics
|
|
243
|
+
*
|
|
244
|
+
* @param defaults - Default options to apply to all metrics
|
|
245
|
+
*
|
|
246
|
+
* @example
|
|
247
|
+
* ```ts
|
|
248
|
+
* // Make all metrics use batch mode by default
|
|
249
|
+
* setDefaults({ evaluationMode: "batch" });
|
|
250
|
+
* ```
|
|
251
|
+
*/
|
|
252
|
+
declare function setDefaults(defaults: LLMDefaults): void;
|
|
253
|
+
/**
|
|
254
|
+
* Gets the current global defaults
|
|
255
|
+
*
|
|
256
|
+
* @returns Current global defaults
|
|
257
|
+
*/
|
|
258
|
+
declare function getDefaults(): LLMDefaults;
|
|
259
|
+
/**
|
|
260
|
+
* Resets global defaults to empty
|
|
261
|
+
*/
|
|
262
|
+
declare function resetDefaults(): void;
|
|
263
|
+
/**
|
|
264
|
+
* Provider types for configureLLM
|
|
265
|
+
*/
|
|
266
|
+
type LLMProvider = "openai" | "anthropic" | "openrouter" | "custom";
|
|
267
|
+
/**
|
|
268
|
+
* Options for configureLLM
|
|
269
|
+
*/
|
|
270
|
+
interface ConfigureLLMOptions {
|
|
271
|
+
/** LLM provider to use */
|
|
272
|
+
provider: LLMProvider;
|
|
273
|
+
/** API key (auto-detects from environment if not provided) */
|
|
274
|
+
apiKey?: string;
|
|
275
|
+
/** Model to use (provider-specific defaults apply if not set) */
|
|
276
|
+
model?: string;
|
|
277
|
+
/** Temperature for generation */
|
|
278
|
+
temperature?: number;
|
|
279
|
+
/** Max tokens per completion */
|
|
280
|
+
maxTokens?: number;
|
|
281
|
+
/** Custom client (required when provider is "custom") */
|
|
282
|
+
client?: LLMClient;
|
|
283
|
+
/** Global defaults to set */
|
|
284
|
+
defaults?: LLMDefaults;
|
|
285
|
+
}
|
|
286
|
+
/**
|
|
287
|
+
* Options for auto-detection
|
|
288
|
+
*/
|
|
289
|
+
interface ConfigureLLMAutoOptions {
|
|
290
|
+
/** Model to use (optional, uses provider default if not set) */
|
|
291
|
+
model?: string;
|
|
292
|
+
/** Temperature for generation */
|
|
293
|
+
temperature?: number;
|
|
294
|
+
/** Max tokens per completion */
|
|
295
|
+
maxTokens?: number;
|
|
296
|
+
/** Global defaults to set */
|
|
297
|
+
defaults?: LLMDefaults;
|
|
298
|
+
}
|
|
299
|
+
/**
|
|
300
|
+
* One-step LLM configuration
|
|
301
|
+
*
|
|
302
|
+
* Simplifies LLM setup by combining adapter creation and client setting
|
|
303
|
+
* into a single call with environment variable auto-detection.
|
|
304
|
+
*
|
|
305
|
+
* @param options - Configuration options
|
|
306
|
+
* @returns The configured LLM client
|
|
307
|
+
*
|
|
308
|
+
* @example
|
|
309
|
+
* ```ts
|
|
310
|
+
* // Explicit provider (API key from environment)
|
|
311
|
+
* configureLLM({ provider: "openai", model: "gpt-4" });
|
|
312
|
+
*
|
|
313
|
+
* // With explicit API key
|
|
314
|
+
* configureLLM({
|
|
315
|
+
* provider: "anthropic",
|
|
316
|
+
* apiKey: "sk-ant-...",
|
|
317
|
+
* model: "claude-3-5-sonnet-20241022"
|
|
318
|
+
* });
|
|
319
|
+
*
|
|
320
|
+
* // With global defaults
|
|
321
|
+
* configureLLM({
|
|
322
|
+
* provider: "openai",
|
|
323
|
+
* defaults: { evaluationMode: "batch" }
|
|
324
|
+
* });
|
|
325
|
+
*
|
|
326
|
+
* // Custom client
|
|
327
|
+
* configureLLM({ provider: "custom", client: myClient });
|
|
328
|
+
* ```
|
|
329
|
+
*/
|
|
330
|
+
declare function configureLLM(options: ConfigureLLMOptions): LLMClient;
|
|
331
|
+
declare namespace configureLLM {
|
|
332
|
+
var auto: (options?: ConfigureLLMAutoOptions) => LLMClient;
|
|
333
|
+
}
|
|
147
334
|
|
|
148
335
|
/**
|
|
149
336
|
* Utilities for LLM-based metric evaluation
|
|
@@ -547,4 +734,46 @@ interface OpenRouterAdapterOptions {
|
|
|
547
734
|
*/
|
|
548
735
|
declare function createOpenRouterAdapter(apiKey: string, options?: OpenRouterAdapterOptions): LLMClient;
|
|
549
736
|
|
|
550
|
-
|
|
737
|
+
/**
|
|
738
|
+
* Metrics module - entry point for evalsense/metrics
|
|
739
|
+
*
|
|
740
|
+
* Provides LLM-based metrics, custom metric utilities, and LLM client management.
|
|
741
|
+
*/
|
|
742
|
+
|
|
743
|
+
/**
|
|
744
|
+
* Testing utilities for LLM metrics
|
|
745
|
+
*
|
|
746
|
+
* Provides convenient access to all testing-related functions in one namespace.
|
|
747
|
+
*
|
|
748
|
+
* @example
|
|
749
|
+
* ```ts
|
|
750
|
+
* import { testing } from "evalsense/metrics";
|
|
751
|
+
*
|
|
752
|
+
* describe("My tests", () => {
|
|
753
|
+
* beforeEach(testing.reset);
|
|
754
|
+
*
|
|
755
|
+
* it("test with mock", async () => {
|
|
756
|
+
* const result = await testing.withClient(
|
|
757
|
+
* testing.mock({ response: { score: 0.8 } }),
|
|
758
|
+
* async () => hallucination([...])
|
|
759
|
+
* );
|
|
760
|
+
* });
|
|
761
|
+
* });
|
|
762
|
+
* ```
|
|
763
|
+
*/
|
|
764
|
+
declare const testing: {
|
|
765
|
+
/** Resets global LLM client and defaults */
|
|
766
|
+
reset: () => void;
|
|
767
|
+
/** Creates a mock LLM client */
|
|
768
|
+
mock: typeof createMockLLMClient;
|
|
769
|
+
/** Executes function with scoped LLM client */
|
|
770
|
+
withClient: typeof withLLMClient;
|
|
771
|
+
/** Creates a mock client that returns sequential responses */
|
|
772
|
+
sequentialMock: typeof createSequentialMockClient;
|
|
773
|
+
/** Creates a mock client that always errors */
|
|
774
|
+
errorMock: typeof createErrorMockClient;
|
|
775
|
+
/** Creates a spy mock client that records all prompts */
|
|
776
|
+
spyMock: typeof createSpyMockClient;
|
|
777
|
+
};
|
|
778
|
+
|
|
779
|
+
export { type AnthropicAdapterOptions, BINARY_THRESHOLDS, type ConfigureLLMAutoOptions, type ConfigureLLMOptions, type LLMDefaults, LLMMetric, LLMMetricConfig, type LLMProvider, type OpenAIAdapterOptions, type OpenRouterAdapterOptions, SEVERITY_THRESHOLDS, batch, batchItems, configureLLM, createAnthropicAdapter, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createLLMMetric, createMetricOutput, createMockLLMClient, createOpenAIAdapter, createOpenRouterAdapter, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getDefaults, getLLMClient, normalizeScore, parseJSONResponse, requireLLMClient, resetDefaults, resetLLMClient, scoreToLabel, setDefaults, setLLMClient, testing, validateResponse, withLLMClient, withTimeout };
|