evalsense 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +678 -0
- package/bin/evalsense.js +3 -0
- package/dist/chunk-5P7LNNO6.js +747 -0
- package/dist/chunk-5P7LNNO6.js.map +1 -0
- package/dist/chunk-BRPM6AB6.js +925 -0
- package/dist/chunk-BRPM6AB6.js.map +1 -0
- package/dist/chunk-HDJID3GC.cjs +779 -0
- package/dist/chunk-HDJID3GC.cjs.map +1 -0
- package/dist/chunk-Y23VHTD3.cjs +942 -0
- package/dist/chunk-Y23VHTD3.cjs.map +1 -0
- package/dist/cli.cjs +65 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +63 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1126 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +604 -0
- package/dist/index.d.ts +604 -0
- package/dist/index.js +1043 -0
- package/dist/index.js.map +1 -0
- package/dist/metrics/index.cjs +275 -0
- package/dist/metrics/index.cjs.map +1 -0
- package/dist/metrics/index.d.cts +299 -0
- package/dist/metrics/index.d.ts +299 -0
- package/dist/metrics/index.js +191 -0
- package/dist/metrics/index.js.map +1 -0
- package/dist/metrics/opinionated/index.cjs +24 -0
- package/dist/metrics/opinionated/index.cjs.map +1 -0
- package/dist/metrics/opinionated/index.d.cts +163 -0
- package/dist/metrics/opinionated/index.d.ts +163 -0
- package/dist/metrics/opinionated/index.js +3 -0
- package/dist/metrics/opinionated/index.js.map +1 -0
- package/dist/types-C71p0wzM.d.cts +265 -0
- package/dist/types-C71p0wzM.d.ts +265 -0
- package/package.json +91 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
export { FaithfulnessConfig, HallucinationConfig, RelevanceConfig, ToxicityConfig, faithfulness, hallucination, relevance, toxicity } from './opinionated/index.cjs';
|
|
2
|
+
import { M as MetricFn, a as MetricConfig, b as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-C71p0wzM.cjs';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Custom metric registration
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Registers a custom metric
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts
|
|
13
|
+
* registerMetric("custom-relevance", async ({ outputs, query }) => {
|
|
14
|
+
* // Custom evaluation logic
|
|
15
|
+
* return outputs.map(o => ({
|
|
16
|
+
* id: o.id,
|
|
17
|
+
* metric: "custom-relevance",
|
|
18
|
+
* score: evaluateRelevance(o.output, query),
|
|
19
|
+
* }));
|
|
20
|
+
* });
|
|
21
|
+
* ```
|
|
22
|
+
*/
|
|
23
|
+
// Registers the metric function `fn` under `name`; returns nothing.
// NOTE(review): whether re-using an existing name overwrites or throws is not visible in this declaration — confirm in the implementation.
declare function registerMetric(name: string, fn: MetricFn): void;
|
|
24
|
+
/**
|
|
25
|
+
* Gets a registered custom metric
|
|
26
|
+
*/
|
|
27
|
+
// Looks up a metric by `name`. The return type admits `undefined` (presumably for names that were never registered), so callers must handle that case.
declare function getMetric(name: string): MetricFn | undefined;
|
|
28
|
+
/**
|
|
29
|
+
* Runs a registered metric
|
|
30
|
+
*/
|
|
31
|
+
// Asynchronous: resolves to an array of MetricOutput for the metric identified by `name`, given `config`.
// NOTE(review): behavior for an unknown `name` (rejection vs. empty result) is not visible here — confirm.
declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
|
|
32
|
+
/**
|
|
33
|
+
* Lists all registered custom metrics
|
|
34
|
+
*/
|
|
35
|
+
// Takes no arguments and returns plain strings (presumably the registered names); ordering is not specified by this declaration.
declare function listMetrics(): string[];
|
|
36
|
+
/**
|
|
37
|
+
* Unregisters a metric (mainly for testing)
|
|
38
|
+
*/
|
|
39
|
+
// Returns a boolean; presumably `true` when a metric named `name` existed and was removed — TODO confirm.
declare function unregisterMetric(name: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Clears all registered metrics (mainly for testing)
|
|
42
|
+
*/
|
|
43
|
+
// Removes every registered metric in one call (intended mainly for test setup/teardown); takes no arguments and returns nothing.
declare function clearMetrics(): void;
|
|
44
|
+
/**
|
|
45
|
+
* Creates a simple string-matching metric
|
|
46
|
+
*/
|
|
47
|
+
// Builds a MetricFn named `name` from a list of regular expressions.
// `options` is optional, as are both of its fields.
declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
|
|
48
|
+
// Presumably the score reported when a pattern matches — default not visible here, TODO confirm.
matchScore?: number;
|
|
49
|
+
// Presumably the score reported when no pattern matches — default not visible here, TODO confirm.
noMatchScore?: number;
|
|
50
|
+
}): MetricFn;
|
|
51
|
+
/**
|
|
52
|
+
* Creates a keyword-based metric
|
|
53
|
+
*/
|
|
54
|
+
// Builds a MetricFn named `name` from a list of keyword strings.
// `options` is optional, as are both of its fields.
declare function createKeywordMetric(name: string, keywords: string[], options?: {
|
|
55
|
+
// Presumably toggles case-sensitive keyword comparison — default not visible here, TODO confirm.
caseSensitive?: boolean;
|
|
56
|
+
// NOTE(review): meaning and valid range of `threshold` are not visible in this declaration — confirm.
threshold?: number;
|
|
57
|
+
}): MetricFn;
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Metric utilities
|
|
61
|
+
*/
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Normalizes a score to 0-1 range
|
|
65
|
+
*/
|
|
66
|
+
// `min` and `max` are optional; presumably they describe the input range that is mapped onto 0-1, with defaults not visible here — TODO confirm.
// NOTE(review): behavior for scores outside [min, max] (clamped or not) cannot be determined from this declaration.
declare function normalizeScore(score: number, min?: number, max?: number): number;
|
|
67
|
+
/**
|
|
68
|
+
* Converts a numeric score to a label based on thresholds
|
|
69
|
+
*/
|
|
70
|
+
// Maps a numeric `score` to a label string using `thresholds`, a list of { label, min } entries.
// NOTE(review): whether `thresholds` must be pre-sorted, and what is returned when no entry applies, is not visible here — confirm.
declare function scoreToLabel(score: number, thresholds: {
|
|
71
|
+
label: string;
|
|
72
|
+
// Presumably the lowest score that earns this label — TODO confirm against the implementation.
min: number;
|
|
73
|
+
}[]): string;
|
|
74
|
+
/**
|
|
75
|
+
* Creates a metric output from a score
|
|
76
|
+
*/
|
|
77
|
+
// Builds a MetricOutput from an item `id`, a `metric` name and a numeric `score`.
// `labelThresholds` is optional and has the same { label, min } element shape that scoreToLabel accepts.
// NOTE(review): the thresholds used when `labelThresholds` is omitted are not visible in this declaration — confirm.
declare function createMetricOutput(id: string, metric: string, score: number, labelThresholds?: {
|
|
78
|
+
label: string;
|
|
79
|
+
min: number;
|
|
80
|
+
}[]): MetricOutput;
|
|
81
|
+
/**
|
|
82
|
+
* Default thresholds for binary metrics
|
|
83
|
+
*/
|
|
84
|
+
// Array of { label, min } entries — the same element shape accepted by scoreToLabel and createMetricOutput.
// The concrete labels and cut-off values are not part of this declaration.
// The array type is mutable (not `readonly`), so the type system does not prevent callers from modifying it.
declare const BINARY_THRESHOLDS: {
|
|
85
|
+
label: string;
|
|
86
|
+
min: number;
|
|
87
|
+
}[];
|
|
88
|
+
/**
|
|
89
|
+
* Default thresholds for severity metrics
|
|
90
|
+
*/
|
|
91
|
+
// Array of { label, min } entries — the same element shape accepted by scoreToLabel and createMetricOutput.
// The concrete labels and cut-off values are not part of this declaration.
// The array type is mutable (not `readonly`), so the type system does not prevent callers from modifying it.
declare const SEVERITY_THRESHOLDS: {
|
|
92
|
+
label: string;
|
|
93
|
+
min: number;
|
|
94
|
+
}[];
|
|
95
|
+
/**
|
|
96
|
+
* Batches items for parallel processing
|
|
97
|
+
*/
|
|
98
|
+
// Generic over the item type T: takes a flat array and returns an array of arrays of the same element type.
// Presumably each inner array holds at most `size` items — TODO confirm (including behavior for size <= 0).
// NOTE(review): same signature shape as `batchItems` declared later in this file.
declare function batch<T>(items: T[], size: number): T[][];
|
|
99
|
+
/**
|
|
100
|
+
* Delays execution
|
|
101
|
+
*/
|
|
102
|
+
// Returns a promise that resolves with no value; `ms` is presumably a duration in milliseconds — TODO confirm.
declare function delay(ms: number): Promise<void>;
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* LLM client management for metric evaluation
|
|
106
|
+
*
|
|
107
|
+
* Provides a global LLM client that can be configured once and used
|
|
108
|
+
* across all LLM-based metrics, with support for per-call overrides.
|
|
109
|
+
*/
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Sets the global LLM client for all metrics
|
|
113
|
+
*
|
|
114
|
+
* @example
|
|
115
|
+
* ```ts
|
|
116
|
+
* import { setLLMClient } from "evalsense/metrics";
|
|
117
|
+
*
|
|
118
|
+
* setLLMClient({
|
|
119
|
+
* async complete(prompt) {
|
|
120
|
+
* return await yourLLM.generate(prompt);
|
|
121
|
+
* }
|
|
122
|
+
* });
|
|
123
|
+
* ```
|
|
124
|
+
*/
|
|
125
|
+
// Accepts an LLMClient and returns nothing. The parameter is non-nullable, so this function cannot be used to clear the client; resetLLMClient exists for that.
declare function setLLMClient(client: LLMClient): void;
|
|
126
|
+
/**
|
|
127
|
+
* Gets the current global LLM client
|
|
128
|
+
*
|
|
129
|
+
* @returns The global client or null if not set
|
|
130
|
+
*/
|
|
131
|
+
// The "not set" sentinel in the return type is `null`, not `undefined` — check with `=== null` or a nullish check.
declare function getLLMClient(): LLMClient | null;
|
|
132
|
+
/**
|
|
133
|
+
* Resets the global LLM client
|
|
134
|
+
*
|
|
135
|
+
* Useful for testing or switching between different LLM providers.
|
|
136
|
+
*/
|
|
137
|
+
// Takes no arguments and returns nothing. Presumably leaves the global client unset so getLLMClient() yields null afterwards — TODO confirm.
declare function resetLLMClient(): void;
|
|
138
|
+
/**
|
|
139
|
+
* Validates that an LLM client is available
|
|
140
|
+
*
|
|
141
|
+
* @param client - Optional client override
|
|
142
|
+
* @param metricName - Name of the metric for error messages
|
|
143
|
+
* @throws Error if no client is configured
|
|
144
|
+
* @returns The client to use (override or global)
|
|
145
|
+
*/
|
|
146
|
+
// `client` must be passed explicitly (it may be `undefined`, but the parameter is not optional).
// The return type is a non-nullable LLMClient, so a call that returns always yields a usable client.
declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Utilities for LLM-based metric evaluation
|
|
150
|
+
*
|
|
151
|
+
* Provides helpers for prompt templating, response parsing, validation, and error handling.
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Fills a prompt template with variables
|
|
156
|
+
*
|
|
157
|
+
* @example
|
|
158
|
+
* ```ts
|
|
159
|
+
* const prompt = fillPrompt(
|
|
160
|
+
* "Context: {context}\nOutput: {output}",
|
|
161
|
+
* { context: "Paris is the capital", output: "France's capital is Paris" }
|
|
162
|
+
* );
|
|
163
|
+
* ```
|
|
164
|
+
*/
|
|
165
|
+
// `variables` maps placeholder names to replacement text; values must already be strings (Record<string, string>).
// NOTE(review): handling of placeholders with no matching key in `variables` is not visible here — confirm.
declare function fillPrompt(template: string, variables: Record<string, string>): string;
|
|
166
|
+
/**
|
|
167
|
+
* Parses a JSON response from an LLM, with fallback handling
|
|
168
|
+
*
|
|
169
|
+
* Handles:
|
|
170
|
+
* - Plain JSON strings
|
|
171
|
+
* - JSON wrapped in markdown code blocks
|
|
172
|
+
* - Malformed JSON with helpful error messages
|
|
173
|
+
*
|
|
174
|
+
* @example
|
|
175
|
+
* ```ts
|
|
176
|
+
* const result = parseJSONResponse<{ score: number }>(llmResponse);
|
|
177
|
+
* ```
|
|
178
|
+
*/
|
|
179
|
+
// T appears only in the return type, so it is chosen by the caller and is not checked by the compiler against the parsed data.
// NOTE(review): presumably no runtime validation against T happens — pair with validateResponse or manual narrowing; confirm.
declare function parseJSONResponse<T>(response: string): T;
|
|
180
|
+
/**
|
|
181
|
+
* Validates that a parsed JSON response has required fields
|
|
182
|
+
*
|
|
183
|
+
* @example
|
|
184
|
+
* ```ts
|
|
185
|
+
* validateResponse(result, ["score", "reasoning"], "hallucination");
|
|
186
|
+
* ```
|
|
187
|
+
*/
|
|
188
|
+
// Returns nothing; presumably signals missing fields by throwing, with `metricName` used in the message — TODO confirm.
// Declared as returning `void` rather than as an assertion function, so it does not narrow the type of `response` for the caller.
declare function validateResponse(response: unknown, requiredFields: string[], metricName: string): void;
|
|
189
|
+
/**
|
|
190
|
+
* Extracts a score from various formats (number, string, object with score field)
|
|
191
|
+
*/
|
|
192
|
+
// Accepts any value (`unknown`) and is typed to always return a number.
// `defaultScore` is optional; presumably it is returned when no score can be extracted — its own default is not visible here, TODO confirm.
declare function extractScore(value: unknown, defaultScore?: number): number;
|
|
193
|
+
/**
|
|
194
|
+
* Creates a JSON schema for structured LLM outputs
|
|
195
|
+
*
|
|
196
|
+
* @example
|
|
197
|
+
* ```ts
|
|
198
|
+
* const schema = createJSONSchema({
|
|
199
|
+
* score: "number",
|
|
200
|
+
* reasoning: "string"
|
|
201
|
+
* });
|
|
202
|
+
* ```
|
|
203
|
+
*/
|
|
204
|
+
// `properties` maps property names to type names given as strings (the documented example uses "number" and "string").
// `required` is an optional list of strings, presumably property names.
// NOTE(review): which properties count as required when `required` is omitted is not visible here — confirm.
declare function createJSONSchema(properties: Record<string, string>, required?: string[]): JSONSchema;
|
|
205
|
+
/**
|
|
206
|
+
* Batches an array of items into chunks
|
|
207
|
+
*
|
|
208
|
+
* Useful for batch evaluation mode to control batch size.
|
|
209
|
+
*/
|
|
210
|
+
// Generic over the item type T: takes a flat array and returns an array of arrays of the same element type.
// Presumably each chunk holds at most `batchSize` items — TODO confirm (including behavior for batchSize <= 0).
// NOTE(review): same signature shape as `batch` declared earlier in this file; check whether both are needed.
declare function batchItems<T>(items: T[], batchSize: number): T[][];
|
|
211
|
+
/**
|
|
212
|
+
* Creates a consistent error message for LLM metric failures
|
|
213
|
+
*/
|
|
214
|
+
// Declared to return an Error (not `never`), so the caller is presumably expected to throw the result itself.
// `error` is typed `unknown`, so any caught value can be passed straight through.
declare function createLLMError(metricName: string, operation: string, error: unknown, context?: {
|
|
215
|
+
// Optional; presumably the id of the item being evaluated when the failure occurred — TODO confirm.
id?: string;
|
|
216
|
+
// Optional; presumably the position of the item within its batch — TODO confirm.
index?: number;
|
|
217
|
+
}): Error;
|
|
218
|
+
/**
|
|
219
|
+
* Waits for a promise with a timeout
|
|
220
|
+
*/
|
|
221
|
+
// Resolves with the same value type as `promise`. `timeoutMs` is presumably in milliseconds and `operation` presumably labels the timeout error — TODO confirm.
// NOTE(review): the signature offers no way to cancel the underlying work, so a timeout presumably only stops waiting — confirm.
declare function withTimeout<T>(promise: Promise<T>, timeoutMs: number, operation: string): Promise<T>;
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Mock LLM client for testing
|
|
225
|
+
*
|
|
226
|
+
* Provides a configurable mock implementation of LLMClient for unit tests.
|
|
227
|
+
*/
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Configuration for mock LLM client
|
|
231
|
+
*/
|
|
232
|
+
// Every field is optional, so an empty object is a valid configuration.
// NOTE(review): precedence when both `response` and `responses` are set is not visible in this declaration — confirm.
interface MockLLMConfig {
|
|
233
|
+
/** Fixed response to return (can be string or object for JSON mode) */
|
|
234
|
+
response?: string | Record<string, unknown>;
|
|
235
|
+
/** Multiple responses for sequential calls */
|
|
236
|
+
responses?: Array<string | Record<string, unknown>>;
|
|
237
|
+
/** Delay in milliseconds before responding */
|
|
238
|
+
delay?: number;
|
|
239
|
+
/** Whether to throw an error */
|
|
240
|
+
shouldError?: boolean;
|
|
241
|
+
/** Error message to throw (presumably only used when `shouldError` is true — TODO confirm) */
|
|
242
|
+
errorMessage?: string;
|
|
243
|
+
/** Function to validate prompts; receives the prompt string, and its return value is ignored (declared `void`) */
|
|
244
|
+
onPrompt?: (prompt: string) => void;
|
|
245
|
+
}
|
|
246
|
+
/**
|
|
247
|
+
* Creates a mock LLM client for testing
|
|
248
|
+
*
|
|
249
|
+
* @example
|
|
250
|
+
* ```ts
|
|
251
|
+
* const mock = createMockLLMClient({
|
|
252
|
+
* response: JSON.stringify({ score: 0.8, reasoning: "test" }),
|
|
253
|
+
* delay: 100
|
|
254
|
+
* });
|
|
255
|
+
*
|
|
256
|
+
* setLLMClient(mock);
|
|
257
|
+
* ```
|
|
258
|
+
*/
|
|
259
|
+
// `config` is optional, so calling with no argument is valid; the defaults applied in that case are not visible in this declaration.
declare function createMockLLMClient(config?: MockLLMConfig): LLMClient;
|
|
260
|
+
/**
|
|
261
|
+
* Creates a mock client that returns sequential responses
|
|
262
|
+
*
|
|
263
|
+
* Useful for testing multiple calls with different responses.
|
|
264
|
+
*
|
|
265
|
+
* @example
|
|
266
|
+
* ```ts
|
|
267
|
+
* const mock = createSequentialMockClient([
|
|
268
|
+
* { score: 0.2, reasoning: "First call" },
|
|
269
|
+
* { score: 0.8, reasoning: "Second call" }
|
|
270
|
+
* ]);
|
|
271
|
+
* ```
|
|
272
|
+
*/
|
|
273
|
+
// `responses` has the same element type as MockLLMConfig.responses (strings or plain objects).
// NOTE(review): behavior once all responses have been consumed (cycle, repeat last, or error) is not visible here — confirm.
declare function createSequentialMockClient(responses: Array<string | Record<string, unknown>>, options?: {
|
|
274
|
+
// Presumably milliseconds to wait before each response, mirroring MockLLMConfig.delay — TODO confirm.
delay?: number;
|
|
275
|
+
}): LLMClient;
|
|
276
|
+
/**
|
|
277
|
+
* Creates a mock client that always errors
|
|
278
|
+
*
|
|
279
|
+
* Useful for testing error handling.
|
|
280
|
+
*/
|
|
281
|
+
// `errorMessage` is optional; the message used when it is omitted is not visible in this declaration.
declare function createErrorMockClient(errorMessage?: string): LLMClient;
|
|
282
|
+
/**
|
|
283
|
+
* Creates a spy mock client that records all prompts
|
|
284
|
+
*
|
|
285
|
+
* Useful for testing what prompts are being sent to the LLM.
|
|
286
|
+
*
|
|
287
|
+
* @example
|
|
288
|
+
* ```ts
|
|
289
|
+
* const { client, prompts } = createSpyMockClient({ score: 0.5 });
|
|
290
|
+
* await metric({ outputs, context, llmClient: client });
|
|
291
|
+
* console.log(prompts); // See all prompts that were sent
|
|
292
|
+
* ```
|
|
293
|
+
*/
|
|
294
|
+
// `response` is required here, and the result is an object holding both the client and the recorded prompts rather than a bare LLMClient.
declare function createSpyMockClient(response: string | Record<string, unknown>): {
|
|
295
|
+
// The mock client to hand to the code under test.
client: LLMClient;
|
|
296
|
+
// Presumably appended to as the client receives prompts, so it can be inspected after the calls (as the documented example does) — TODO confirm.
prompts: string[];
|
|
297
|
+
};
|
|
298
|
+
|
|
299
|
+
export { BINARY_THRESHOLDS, SEVERITY_THRESHOLDS, batch, batchItems, clearMetrics, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createMetricOutput, createMockLLMClient, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getLLMClient, getMetric, listMetrics, normalizeScore, parseJSONResponse, registerMetric, requireLLMClient, resetLLMClient, runMetric, scoreToLabel, setLLMClient, unregisterMetric, validateResponse, withTimeout };
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
export { FaithfulnessConfig, HallucinationConfig, RelevanceConfig, ToxicityConfig, faithfulness, hallucination, relevance, toxicity } from './opinionated/index.js';
|
|
2
|
+
import { M as MetricFn, a as MetricConfig, b as MetricOutput, L as LLMClient, J as JSONSchema } from '../types-C71p0wzM.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Custom metric registration
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Registers a custom metric
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```ts
|
|
13
|
+
* registerMetric("custom-relevance", async ({ outputs, query }) => {
|
|
14
|
+
* // Custom evaluation logic
|
|
15
|
+
* return outputs.map(o => ({
|
|
16
|
+
* id: o.id,
|
|
17
|
+
* metric: "custom-relevance",
|
|
18
|
+
* score: evaluateRelevance(o.output, query),
|
|
19
|
+
* }));
|
|
20
|
+
* });
|
|
21
|
+
* ```
|
|
22
|
+
*/
|
|
23
|
+
// Registers the metric function `fn` under `name`; returns nothing.
// NOTE(review): whether re-using an existing name overwrites or throws is not visible in this declaration — confirm in the implementation.
declare function registerMetric(name: string, fn: MetricFn): void;
|
|
24
|
+
/**
|
|
25
|
+
* Gets a registered custom metric
|
|
26
|
+
*/
|
|
27
|
+
// Looks up a metric by `name`. The return type admits `undefined` (presumably for names that were never registered), so callers must handle that case.
declare function getMetric(name: string): MetricFn | undefined;
|
|
28
|
+
/**
|
|
29
|
+
* Runs a registered metric
|
|
30
|
+
*/
|
|
31
|
+
// Asynchronous: resolves to an array of MetricOutput for the metric identified by `name`, given `config`.
// NOTE(review): behavior for an unknown `name` (rejection vs. empty result) is not visible here — confirm.
declare function runMetric(name: string, config: MetricConfig): Promise<MetricOutput[]>;
|
|
32
|
+
/**
|
|
33
|
+
* Lists all registered custom metrics
|
|
34
|
+
*/
|
|
35
|
+
// Takes no arguments and returns plain strings (presumably the registered names); ordering is not specified by this declaration.
declare function listMetrics(): string[];
|
|
36
|
+
/**
|
|
37
|
+
* Unregisters a metric (mainly for testing)
|
|
38
|
+
*/
|
|
39
|
+
// Returns a boolean; presumably `true` when a metric named `name` existed and was removed — TODO confirm.
declare function unregisterMetric(name: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Clears all registered metrics (mainly for testing)
|
|
42
|
+
*/
|
|
43
|
+
// Removes every registered metric in one call (intended mainly for test setup/teardown); takes no arguments and returns nothing.
declare function clearMetrics(): void;
|
|
44
|
+
/**
|
|
45
|
+
* Creates a simple string-matching metric
|
|
46
|
+
*/
|
|
47
|
+
// Builds a MetricFn named `name` from a list of regular expressions.
// `options` is optional, as are both of its fields.
declare function createPatternMetric(name: string, patterns: RegExp[], options?: {
|
|
48
|
+
// Presumably the score reported when a pattern matches — default not visible here, TODO confirm.
matchScore?: number;
|
|
49
|
+
// Presumably the score reported when no pattern matches — default not visible here, TODO confirm.
noMatchScore?: number;
|
|
50
|
+
}): MetricFn;
|
|
51
|
+
/**
|
|
52
|
+
* Creates a keyword-based metric
|
|
53
|
+
*/
|
|
54
|
+
// Builds a MetricFn named `name` from a list of keyword strings.
// `options` is optional, as are both of its fields.
declare function createKeywordMetric(name: string, keywords: string[], options?: {
|
|
55
|
+
// Presumably toggles case-sensitive keyword comparison — default not visible here, TODO confirm.
caseSensitive?: boolean;
|
|
56
|
+
// NOTE(review): meaning and valid range of `threshold` are not visible in this declaration — confirm.
threshold?: number;
|
|
57
|
+
}): MetricFn;
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Metric utilities
|
|
61
|
+
*/
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Normalizes a score to 0-1 range
|
|
65
|
+
*/
|
|
66
|
+
// `min` and `max` are optional; presumably they describe the input range that is mapped onto 0-1, with defaults not visible here — TODO confirm.
// NOTE(review): behavior for scores outside [min, max] (clamped or not) cannot be determined from this declaration.
declare function normalizeScore(score: number, min?: number, max?: number): number;
|
|
67
|
+
/**
|
|
68
|
+
* Converts a numeric score to a label based on thresholds
|
|
69
|
+
*/
|
|
70
|
+
// Maps a numeric `score` to a label string using `thresholds`, a list of { label, min } entries.
// NOTE(review): whether `thresholds` must be pre-sorted, and what is returned when no entry applies, is not visible here — confirm.
declare function scoreToLabel(score: number, thresholds: {
|
|
71
|
+
label: string;
|
|
72
|
+
// Presumably the lowest score that earns this label — TODO confirm against the implementation.
min: number;
|
|
73
|
+
}[]): string;
|
|
74
|
+
/**
|
|
75
|
+
* Creates a metric output from a score
|
|
76
|
+
*/
|
|
77
|
+
// Builds a MetricOutput from an item `id`, a `metric` name and a numeric `score`.
// `labelThresholds` is optional and has the same { label, min } element shape that scoreToLabel accepts.
// NOTE(review): the thresholds used when `labelThresholds` is omitted are not visible in this declaration — confirm.
declare function createMetricOutput(id: string, metric: string, score: number, labelThresholds?: {
|
|
78
|
+
label: string;
|
|
79
|
+
min: number;
|
|
80
|
+
}[]): MetricOutput;
|
|
81
|
+
/**
|
|
82
|
+
* Default thresholds for binary metrics
|
|
83
|
+
*/
|
|
84
|
+
// Array of { label, min } entries — the same element shape accepted by scoreToLabel and createMetricOutput.
// The concrete labels and cut-off values are not part of this declaration.
// The array type is mutable (not `readonly`), so the type system does not prevent callers from modifying it.
declare const BINARY_THRESHOLDS: {
|
|
85
|
+
label: string;
|
|
86
|
+
min: number;
|
|
87
|
+
}[];
|
|
88
|
+
/**
|
|
89
|
+
* Default thresholds for severity metrics
|
|
90
|
+
*/
|
|
91
|
+
// Array of { label, min } entries — the same element shape accepted by scoreToLabel and createMetricOutput.
// The concrete labels and cut-off values are not part of this declaration.
// The array type is mutable (not `readonly`), so the type system does not prevent callers from modifying it.
declare const SEVERITY_THRESHOLDS: {
|
|
92
|
+
label: string;
|
|
93
|
+
min: number;
|
|
94
|
+
}[];
|
|
95
|
+
/**
|
|
96
|
+
* Batches items for parallel processing
|
|
97
|
+
*/
|
|
98
|
+
// Generic over the item type T: takes a flat array and returns an array of arrays of the same element type.
// Presumably each inner array holds at most `size` items — TODO confirm (including behavior for size <= 0).
// NOTE(review): same signature shape as `batchItems` declared later in this file.
declare function batch<T>(items: T[], size: number): T[][];
|
|
99
|
+
/**
|
|
100
|
+
* Delays execution
|
|
101
|
+
*/
|
|
102
|
+
// Returns a promise that resolves with no value; `ms` is presumably a duration in milliseconds — TODO confirm.
declare function delay(ms: number): Promise<void>;
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* LLM client management for metric evaluation
|
|
106
|
+
*
|
|
107
|
+
* Provides a global LLM client that can be configured once and used
|
|
108
|
+
* across all LLM-based metrics, with support for per-call overrides.
|
|
109
|
+
*/
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Sets the global LLM client for all metrics
|
|
113
|
+
*
|
|
114
|
+
* @example
|
|
115
|
+
* ```ts
|
|
116
|
+
* import { setLLMClient } from "evalsense/metrics";
|
|
117
|
+
*
|
|
118
|
+
* setLLMClient({
|
|
119
|
+
* async complete(prompt) {
|
|
120
|
+
* return await yourLLM.generate(prompt);
|
|
121
|
+
* }
|
|
122
|
+
* });
|
|
123
|
+
* ```
|
|
124
|
+
*/
|
|
125
|
+
// Accepts an LLMClient and returns nothing. The parameter is non-nullable, so this function cannot be used to clear the client; resetLLMClient exists for that.
declare function setLLMClient(client: LLMClient): void;
|
|
126
|
+
/**
|
|
127
|
+
* Gets the current global LLM client
|
|
128
|
+
*
|
|
129
|
+
* @returns The global client or null if not set
|
|
130
|
+
*/
|
|
131
|
+
// The "not set" sentinel in the return type is `null`, not `undefined` — check with `=== null` or a nullish check.
declare function getLLMClient(): LLMClient | null;
|
|
132
|
+
/**
|
|
133
|
+
* Resets the global LLM client
|
|
134
|
+
*
|
|
135
|
+
* Useful for testing or switching between different LLM providers.
|
|
136
|
+
*/
|
|
137
|
+
// Takes no arguments and returns nothing. Presumably leaves the global client unset so getLLMClient() yields null afterwards — TODO confirm.
declare function resetLLMClient(): void;
|
|
138
|
+
/**
|
|
139
|
+
* Validates that an LLM client is available
|
|
140
|
+
*
|
|
141
|
+
* @param client - Optional client override
|
|
142
|
+
* @param metricName - Name of the metric for error messages
|
|
143
|
+
* @throws Error if no client is configured
|
|
144
|
+
* @returns The client to use (override or global)
|
|
145
|
+
*/
|
|
146
|
+
// `client` must be passed explicitly (it may be `undefined`, but the parameter is not optional).
// The return type is a non-nullable LLMClient, so a call that returns always yields a usable client.
declare function requireLLMClient(client: LLMClient | undefined, metricName: string): LLMClient;
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Utilities for LLM-based metric evaluation
|
|
150
|
+
*
|
|
151
|
+
* Provides helpers for prompt templating, response parsing, validation, and error handling.
|
|
152
|
+
*/
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Fills a prompt template with variables
|
|
156
|
+
*
|
|
157
|
+
* @example
|
|
158
|
+
* ```ts
|
|
159
|
+
* const prompt = fillPrompt(
|
|
160
|
+
* "Context: {context}\nOutput: {output}",
|
|
161
|
+
* { context: "Paris is the capital", output: "France's capital is Paris" }
|
|
162
|
+
* );
|
|
163
|
+
* ```
|
|
164
|
+
*/
|
|
165
|
+
// `variables` maps placeholder names to replacement text; values must already be strings (Record<string, string>).
// NOTE(review): handling of placeholders with no matching key in `variables` is not visible here — confirm.
declare function fillPrompt(template: string, variables: Record<string, string>): string;
|
|
166
|
+
/**
|
|
167
|
+
* Parses a JSON response from an LLM, with fallback handling
|
|
168
|
+
*
|
|
169
|
+
* Handles:
|
|
170
|
+
* - Plain JSON strings
|
|
171
|
+
* - JSON wrapped in markdown code blocks
|
|
172
|
+
* - Malformed JSON with helpful error messages
|
|
173
|
+
*
|
|
174
|
+
* @example
|
|
175
|
+
* ```ts
|
|
176
|
+
* const result = parseJSONResponse<{ score: number }>(llmResponse);
|
|
177
|
+
* ```
|
|
178
|
+
*/
|
|
179
|
+
// T appears only in the return type, so it is chosen by the caller and is not checked by the compiler against the parsed data.
// NOTE(review): presumably no runtime validation against T happens — pair with validateResponse or manual narrowing; confirm.
declare function parseJSONResponse<T>(response: string): T;
|
|
180
|
+
/**
|
|
181
|
+
* Validates that a parsed JSON response has required fields
|
|
182
|
+
*
|
|
183
|
+
* @example
|
|
184
|
+
* ```ts
|
|
185
|
+
* validateResponse(result, ["score", "reasoning"], "hallucination");
|
|
186
|
+
* ```
|
|
187
|
+
*/
|
|
188
|
+
// Returns nothing; presumably signals missing fields by throwing, with `metricName` used in the message — TODO confirm.
// Declared as returning `void` rather than as an assertion function, so it does not narrow the type of `response` for the caller.
declare function validateResponse(response: unknown, requiredFields: string[], metricName: string): void;
|
|
189
|
+
/**
|
|
190
|
+
* Extracts a score from various formats (number, string, object with score field)
|
|
191
|
+
*/
|
|
192
|
+
// Accepts any value (`unknown`) and is typed to always return a number.
// `defaultScore` is optional; presumably it is returned when no score can be extracted — its own default is not visible here, TODO confirm.
declare function extractScore(value: unknown, defaultScore?: number): number;
|
|
193
|
+
/**
|
|
194
|
+
* Creates a JSON schema for structured LLM outputs
|
|
195
|
+
*
|
|
196
|
+
* @example
|
|
197
|
+
* ```ts
|
|
198
|
+
* const schema = createJSONSchema({
|
|
199
|
+
* score: "number",
|
|
200
|
+
* reasoning: "string"
|
|
201
|
+
* });
|
|
202
|
+
* ```
|
|
203
|
+
*/
|
|
204
|
+
// `properties` maps property names to type names given as strings (the documented example uses "number" and "string").
// `required` is an optional list of strings, presumably property names.
// NOTE(review): which properties count as required when `required` is omitted is not visible here — confirm.
declare function createJSONSchema(properties: Record<string, string>, required?: string[]): JSONSchema;
|
|
205
|
+
/**
|
|
206
|
+
* Batches an array of items into chunks
|
|
207
|
+
*
|
|
208
|
+
* Useful for batch evaluation mode to control batch size.
|
|
209
|
+
*/
|
|
210
|
+
// Generic over the item type T: takes a flat array and returns an array of arrays of the same element type.
// Presumably each chunk holds at most `batchSize` items — TODO confirm (including behavior for batchSize <= 0).
// NOTE(review): same signature shape as `batch` declared earlier in this file; check whether both are needed.
declare function batchItems<T>(items: T[], batchSize: number): T[][];
|
|
211
|
+
/**
|
|
212
|
+
* Creates a consistent error message for LLM metric failures
|
|
213
|
+
*/
|
|
214
|
+
// Declared to return an Error (not `never`), so the caller is presumably expected to throw the result itself.
// `error` is typed `unknown`, so any caught value can be passed straight through.
declare function createLLMError(metricName: string, operation: string, error: unknown, context?: {
|
|
215
|
+
// Optional; presumably the id of the item being evaluated when the failure occurred — TODO confirm.
id?: string;
|
|
216
|
+
// Optional; presumably the position of the item within its batch — TODO confirm.
index?: number;
|
|
217
|
+
}): Error;
|
|
218
|
+
/**
|
|
219
|
+
* Waits for a promise with a timeout
|
|
220
|
+
*/
|
|
221
|
+
// Resolves with the same value type as `promise`. `timeoutMs` is presumably in milliseconds and `operation` presumably labels the timeout error — TODO confirm.
// NOTE(review): the signature offers no way to cancel the underlying work, so a timeout presumably only stops waiting — confirm.
declare function withTimeout<T>(promise: Promise<T>, timeoutMs: number, operation: string): Promise<T>;
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Mock LLM client for testing
|
|
225
|
+
*
|
|
226
|
+
* Provides a configurable mock implementation of LLMClient for unit tests.
|
|
227
|
+
*/
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Configuration for mock LLM client
|
|
231
|
+
*/
|
|
232
|
+
// Every field is optional, so an empty object is a valid configuration.
// NOTE(review): precedence when both `response` and `responses` are set is not visible in this declaration — confirm.
interface MockLLMConfig {
|
|
233
|
+
/** Fixed response to return (can be string or object for JSON mode) */
|
|
234
|
+
response?: string | Record<string, unknown>;
|
|
235
|
+
/** Multiple responses for sequential calls */
|
|
236
|
+
responses?: Array<string | Record<string, unknown>>;
|
|
237
|
+
/** Delay in milliseconds before responding */
|
|
238
|
+
delay?: number;
|
|
239
|
+
/** Whether to throw an error */
|
|
240
|
+
shouldError?: boolean;
|
|
241
|
+
/** Error message to throw (presumably only used when `shouldError` is true — TODO confirm) */
|
|
242
|
+
errorMessage?: string;
|
|
243
|
+
/** Function to validate prompts; receives the prompt string, and its return value is ignored (declared `void`) */
|
|
244
|
+
onPrompt?: (prompt: string) => void;
|
|
245
|
+
}
|
|
246
|
+
/**
|
|
247
|
+
* Creates a mock LLM client for testing
|
|
248
|
+
*
|
|
249
|
+
* @example
|
|
250
|
+
* ```ts
|
|
251
|
+
* const mock = createMockLLMClient({
|
|
252
|
+
* response: JSON.stringify({ score: 0.8, reasoning: "test" }),
|
|
253
|
+
* delay: 100
|
|
254
|
+
* });
|
|
255
|
+
*
|
|
256
|
+
* setLLMClient(mock);
|
|
257
|
+
* ```
|
|
258
|
+
*/
|
|
259
|
+
// `config` is optional, so calling with no argument is valid; the defaults applied in that case are not visible in this declaration.
declare function createMockLLMClient(config?: MockLLMConfig): LLMClient;
|
|
260
|
+
/**
|
|
261
|
+
* Creates a mock client that returns sequential responses
|
|
262
|
+
*
|
|
263
|
+
* Useful for testing multiple calls with different responses.
|
|
264
|
+
*
|
|
265
|
+
* @example
|
|
266
|
+
* ```ts
|
|
267
|
+
* const mock = createSequentialMockClient([
|
|
268
|
+
* { score: 0.2, reasoning: "First call" },
|
|
269
|
+
* { score: 0.8, reasoning: "Second call" }
|
|
270
|
+
* ]);
|
|
271
|
+
* ```
|
|
272
|
+
*/
|
|
273
|
+
// `responses` has the same element type as MockLLMConfig.responses (strings or plain objects).
// NOTE(review): behavior once all responses have been consumed (cycle, repeat last, or error) is not visible here — confirm.
declare function createSequentialMockClient(responses: Array<string | Record<string, unknown>>, options?: {
|
|
274
|
+
// Presumably milliseconds to wait before each response, mirroring MockLLMConfig.delay — TODO confirm.
delay?: number;
|
|
275
|
+
}): LLMClient;
|
|
276
|
+
/**
|
|
277
|
+
* Creates a mock client that always errors
|
|
278
|
+
*
|
|
279
|
+
* Useful for testing error handling.
|
|
280
|
+
*/
|
|
281
|
+
// `errorMessage` is optional; the message used when it is omitted is not visible in this declaration.
declare function createErrorMockClient(errorMessage?: string): LLMClient;
|
|
282
|
+
/**
|
|
283
|
+
* Creates a spy mock client that records all prompts
|
|
284
|
+
*
|
|
285
|
+
* Useful for testing what prompts are being sent to the LLM.
|
|
286
|
+
*
|
|
287
|
+
* @example
|
|
288
|
+
* ```ts
|
|
289
|
+
* const { client, prompts } = createSpyMockClient({ score: 0.5 });
|
|
290
|
+
* await metric({ outputs, context, llmClient: client });
|
|
291
|
+
* console.log(prompts); // See all prompts that were sent
|
|
292
|
+
* ```
|
|
293
|
+
*/
|
|
294
|
+
// `response` is required here, and the result is an object holding both the client and the recorded prompts rather than a bare LLMClient.
declare function createSpyMockClient(response: string | Record<string, unknown>): {
|
|
295
|
+
// The mock client to hand to the code under test.
client: LLMClient;
|
|
296
|
+
// Presumably appended to as the client receives prompts, so it can be inspected after the calls (as the documented example does) — TODO confirm.
prompts: string[];
|
|
297
|
+
};
|
|
298
|
+
|
|
299
|
+
export { BINARY_THRESHOLDS, SEVERITY_THRESHOLDS, batch, batchItems, clearMetrics, createErrorMockClient, createJSONSchema, createKeywordMetric, createLLMError, createMetricOutput, createMockLLMClient, createPatternMetric, createSequentialMockClient, createSpyMockClient, delay, extractScore, fillPrompt, getLLMClient, getMetric, listMetrics, normalizeScore, parseJSONResponse, registerMetric, requireLLMClient, resetLLMClient, runMetric, scoreToLabel, setLLMClient, unregisterMetric, validateResponse, withTimeout };
|