@elsium-ai/testing 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1105 -21
- package/package.json +6 -6
package/README.md
CHANGED
|
@@ -13,36 +13,1120 @@ npm install @elsium-ai/testing --save-dev
|
|
|
13
13
|
|
|
14
14
|
## What's Inside
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
16
|
+
| Category | Exports | Description |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| **Mock Provider** | `mockProvider`, `MockProvider`, `MockProviderOptions`, `MockResponseConfig` | Zero-latency LLM provider for unit tests |
|
|
19
|
+
| **Fixtures** | `createFixture`, `loadFixture`, `createRecorder`, `Fixture`, `FixtureEntry`, `FixtureRecorder` | Record, save, and replay request/response pairs |
|
|
20
|
+
| **Eval** | `runEvalSuite`, `formatEvalReport`, `EvalCase`, `EvalCriterion`, `EvalResult`, `CriterionResult`, `EvalSuiteConfig`, `EvalSuiteResult`, `LLMJudge` | Evaluation framework with built-in and custom criteria |
|
|
21
|
+
| **Snapshot** | `createSnapshotStore`, `hashOutput`, `testSnapshot`, `PromptSnapshot`, `SnapshotStore`, `SnapshotTestResult` | Hash-based snapshot testing for LLM outputs |
|
|
22
|
+
| **Prompts** | `createPromptRegistry`, `definePrompt`, `PromptDefinition`, `PromptDiff`, `DiffLine`, `PromptRegistry` | Versioned prompt registry with diff and rendering |
|
|
23
|
+
| **Regression** | `createRegressionSuite`, `RegressionBaseline`, `RegressionResult`, `RegressionDetail`, `RegressionSuite` | Baseline-driven regression detection |
|
|
24
|
+
| **Replay** | `createReplayRecorder`, `createReplayPlayer`, `ReplayEntry`, `ReplayRecorder`, `ReplayPlayer` | Record and replay raw LLM completion calls |
|
|
25
|
+
| **Pinning** | `createPinStore`, `pinOutput`, `Pin`, `PinStore`, `PinResult` | Pin expected outputs and detect drift |
|
|
26
|
+
| **Determinism** | `assertDeterministic`, `assertStable`, `DeterminismResult`, `StabilityResult` | Verify output consistency across repeated runs |
|
|
23
27
|
|
|
24
|
-
|
|
28
|
+
---
|
|
25
29
|
|
|
26
|
-
|
|
27
|
-
|
|
30
|
+
## Mock Provider
|
|
31
|
+
|
|
32
|
+
Create a mock `LLMProvider` that returns pre-configured responses without making real API calls.
|
|
33
|
+
|
|
34
|
+
### `MockResponseConfig`
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
interface MockResponseConfig {
|
|
38
|
+
content?: string
|
|
39
|
+
toolCalls?: Array<{
|
|
40
|
+
id?: string
|
|
41
|
+
name: string
|
|
42
|
+
arguments: Record<string, unknown>
|
|
43
|
+
}>
|
|
44
|
+
stopReason?: 'end_turn' | 'tool_use' | 'max_tokens' | 'stop_sequence'
|
|
45
|
+
usage?: Partial<TokenUsage>
|
|
46
|
+
model?: string
|
|
47
|
+
delay?: number
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### `MockProviderOptions`
|
|
52
|
+
|
|
53
|
+
```ts
|
|
54
|
+
interface MockProviderOptions {
|
|
55
|
+
responses?: MockResponseConfig[]
|
|
56
|
+
defaultResponse?: MockResponseConfig
|
|
57
|
+
onRequest?: (request: CompletionRequest) => void
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
| Field | Description |
|
|
62
|
+
|---|---|
|
|
63
|
+
| `responses` | Ordered list of responses returned sequentially per call |
|
|
64
|
+
| `defaultResponse` | Fallback response used when `responses` is exhausted |
|
|
65
|
+
| `onRequest` | Callback invoked on every request (useful for assertions) |
|
|
66
|
+
|
|
67
|
+
### `MockProvider`
|
|
68
|
+
|
|
69
|
+
```ts
|
|
70
|
+
interface MockProvider extends LLMProvider {
|
|
71
|
+
readonly calls: CompletionRequest[]
|
|
72
|
+
readonly callCount: number
|
|
73
|
+
reset(): void
|
|
74
|
+
}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Extends the standard `LLMProvider` interface with inspection helpers. `calls` stores every `CompletionRequest` received, `callCount` returns the total, and `reset()` clears both the call log and the response index.
|
|
78
|
+
|
|
79
|
+
### `mockProvider()`
|
|
80
|
+
|
|
81
|
+
Creates a mock provider instance.
|
|
82
|
+
|
|
83
|
+
```ts
|
|
84
|
+
function mockProvider(options?: MockProviderOptions): MockProvider
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
| Parameter | Type | Default | Description |
|
|
88
|
+
|---|---|---|---|
|
|
89
|
+
| `options` | `MockProviderOptions` | `{}` | Configuration for responses and callbacks |
|
|
90
|
+
|
|
91
|
+
**Returns:** `MockProvider`
|
|
92
|
+
|
|
93
|
+
```ts
|
|
94
|
+
import { mockProvider } from '@elsium-ai/testing'
|
|
95
|
+
|
|
96
|
+
const mock = mockProvider({
|
|
97
|
+
responses: [
|
|
98
|
+
{ content: 'Hello!' },
|
|
99
|
+
{ content: 'Goodbye!', stopReason: 'end_turn' },
|
|
100
|
+
],
|
|
101
|
+
defaultResponse: { content: 'Default reply' },
|
|
102
|
+
onRequest: (req) => console.log('Model:', req.model),
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
const first = await mock.complete({ messages: [{ role: 'user', content: 'Hi' }] })
|
|
106
|
+
// first.message.content === 'Hello!'
|
|
107
|
+
|
|
108
|
+
console.log(mock.callCount) // 1
|
|
109
|
+
mock.reset()
|
|
110
|
+
console.log(mock.callCount) // 0
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Fixtures
|
|
116
|
+
|
|
117
|
+
Capture request/response pairs as reusable fixtures that can be serialized to JSON and replayed as mock providers.
|
|
118
|
+
|
|
119
|
+
### `FixtureEntry`
|
|
120
|
+
|
|
121
|
+
```ts
|
|
122
|
+
interface FixtureEntry {
|
|
123
|
+
request: {
|
|
124
|
+
messages: Array<{ role: string; content: string }>
|
|
125
|
+
model?: string
|
|
126
|
+
system?: string
|
|
127
|
+
}
|
|
128
|
+
response: MockResponseConfig
|
|
129
|
+
timestamp?: string
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### `Fixture`
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
interface Fixture {
|
|
137
|
+
readonly name: string
|
|
138
|
+
readonly entries: FixtureEntry[]
|
|
139
|
+
toProvider(options?: { matching?: 'sequential' | 'request-hash' }): MockProvider
|
|
140
|
+
toJSON(): string
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
| Method | Description |
|
|
145
|
+
|---|---|
|
|
146
|
+
| `toProvider()` | Converts the fixture into a `MockProvider`. Pass `{ matching: 'request-hash' }` to match responses by message content hash instead of sequential order. |
|
|
147
|
+
| `toJSON()` | Serializes the fixture (with timestamps) to a JSON string. |
|
|
148
|
+
|
|
149
|
+
### `createFixture()`
|
|
150
|
+
|
|
151
|
+
Creates a fixture from a name and an array of entries.
|
|
152
|
+
|
|
153
|
+
```ts
|
|
154
|
+
function createFixture(name: string, entries: FixtureEntry[]): Fixture
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
| Parameter | Type | Description |
|
|
158
|
+
|---|---|---|
|
|
159
|
+
| `name` | `string` | Human-readable fixture name |
|
|
160
|
+
| `entries` | `FixtureEntry[]` | Array of request/response pairs |
|
|
161
|
+
|
|
162
|
+
**Returns:** `Fixture`
|
|
163
|
+
|
|
164
|
+
```ts
|
|
165
|
+
import { createFixture } from '@elsium-ai/testing'
|
|
166
|
+
|
|
167
|
+
const fixture = createFixture('greeting-flow', [
|
|
168
|
+
{
|
|
169
|
+
request: { messages: [{ role: 'user', content: 'Hello' }] },
|
|
170
|
+
response: { content: 'Hi there!' },
|
|
171
|
+
},
|
|
172
|
+
])
|
|
173
|
+
|
|
174
|
+
const provider = fixture.toProvider()
|
|
175
|
+
const res = await provider.complete({
|
|
176
|
+
messages: [{ role: 'user', content: 'Hello' }],
|
|
177
|
+
})
|
|
178
|
+
// res.message.content === 'Hi there!'
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### `loadFixture()`
|
|
182
|
+
|
|
183
|
+
Deserializes a JSON string back into a `Fixture`.
|
|
184
|
+
|
|
185
|
+
```ts
|
|
186
|
+
function loadFixture(json: string): Fixture
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
| Parameter | Type | Description |
|
|
190
|
+
|---|---|---|
|
|
191
|
+
| `json` | `string` | JSON string previously produced by `fixture.toJSON()` |
|
|
192
|
+
|
|
193
|
+
**Returns:** `Fixture`
|
|
194
|
+
|
|
195
|
+
```ts
|
|
196
|
+
import { createFixture, loadFixture } from '@elsium-ai/testing'
|
|
197
|
+
|
|
198
|
+
const original = createFixture('test', [
|
|
199
|
+
{
|
|
200
|
+
request: { messages: [{ role: 'user', content: 'ping' }] },
|
|
201
|
+
response: { content: 'pong' },
|
|
202
|
+
},
|
|
203
|
+
])
|
|
204
|
+
|
|
205
|
+
const json = original.toJSON()
|
|
206
|
+
const restored = loadFixture(json)
|
|
207
|
+
// restored.name === 'test'
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### `FixtureRecorder`
|
|
211
|
+
|
|
212
|
+
```ts
|
|
213
|
+
interface FixtureRecorder {
|
|
214
|
+
wrap(provider: MockProvider): MockProvider
|
|
215
|
+
getEntries(): FixtureEntry[]
|
|
216
|
+
toFixture(name: string): Fixture
|
|
217
|
+
clear(): void
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### `createRecorder()`
|
|
222
|
+
|
|
223
|
+
Creates a recorder that intercepts `complete()` calls and captures request/response pairs.
|
|
224
|
+
|
|
225
|
+
```ts
|
|
226
|
+
function createRecorder(): FixtureRecorder
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
**Returns:** `FixtureRecorder`
|
|
230
|
+
|
|
231
|
+
```ts
|
|
232
|
+
import { mockProvider, createRecorder } from '@elsium-ai/testing'
|
|
233
|
+
|
|
234
|
+
const recorder = createRecorder()
|
|
235
|
+
const mock = mockProvider({ responses: [{ content: 'recorded response' }] })
|
|
236
|
+
const wrapped = recorder.wrap(mock)
|
|
237
|
+
|
|
238
|
+
await wrapped.complete({
|
|
239
|
+
messages: [{ role: 'user', content: 'capture this' }],
|
|
240
|
+
})
|
|
241
|
+
|
|
242
|
+
const fixture = recorder.toFixture('my-fixture')
|
|
243
|
+
console.log(fixture.entries.length) // 1
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Eval
|
|
249
|
+
|
|
250
|
+
A structured evaluation framework for assessing LLM outputs against configurable criteria.
|
|
251
|
+
|
|
252
|
+
### `EvalCase`
|
|
253
|
+
|
|
254
|
+
```ts
|
|
255
|
+
interface EvalCase {
|
|
256
|
+
name: string
|
|
257
|
+
input: string
|
|
258
|
+
expected?: string
|
|
259
|
+
criteria?: EvalCriterion[]
|
|
260
|
+
tags?: string[]
|
|
261
|
+
}
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### `EvalCriterion`
|
|
265
|
+
|
|
266
|
+
A discriminated union of all supported criterion types:
|
|
267
|
+
|
|
268
|
+
```ts
|
|
269
|
+
type EvalCriterion =
|
|
270
|
+
| { type: 'contains'; value: string; caseSensitive?: boolean }
|
|
271
|
+
| { type: 'not_contains'; value: string; caseSensitive?: boolean }
|
|
272
|
+
| { type: 'matches'; pattern: string; flags?: string }
|
|
273
|
+
| { type: 'length_min'; value: number }
|
|
274
|
+
| { type: 'length_max'; value: number }
|
|
275
|
+
| { type: 'json_valid' }
|
|
276
|
+
| { type: 'json_matches'; schema: Record<string, unknown> }
|
|
277
|
+
| { type: 'custom'; name: string; fn: (output: string) => boolean }
|
|
278
|
+
| { type: 'llm_judge'; prompt: string; judge: LLMJudge; threshold?: number }
|
|
279
|
+
| { type: 'semantic_similarity'; reference: string; threshold?: number }
|
|
280
|
+
| { type: 'factual_accuracy'; facts: string[]; threshold?: number }
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
| Criterion | Description |
|
|
284
|
+
|---|---|
|
|
285
|
+
| `contains` | Output must contain `value` (case-insensitive by default) |
|
|
286
|
+
| `not_contains` | Output must not contain `value` |
|
|
287
|
+
| `matches` | Output must match the regex `pattern` |
|
|
288
|
+
| `length_min` | Output length must be at least `value` characters |
|
|
289
|
+
| `length_max` | Output length must be at most `value` characters |
|
|
290
|
+
| `json_valid` | Output must be valid JSON |
|
|
291
|
+
| `json_matches` | Output must be valid JSON matching `schema` (key presence + type check) |
|
|
292
|
+
| `custom` | Output is passed to `fn`; must return `true` to pass |
|
|
293
|
+
| `llm_judge` | An LLM judge scores the output; must meet `threshold` (default 0.7) |
|
|
294
|
+
| `semantic_similarity` | Word-overlap similarity against `reference`; must meet `threshold` (default 0.7) |
|
|
295
|
+
| `factual_accuracy` | Checks how many `facts` appear in the output; must meet `threshold` (default 0.7) |
|
|
296
|
+
|
|
297
|
+
### `LLMJudge`
|
|
298
|
+
|
|
299
|
+
```ts
|
|
300
|
+
type LLMJudge = (prompt: string) => Promise<{ score: number; reasoning: string }>
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### `EvalResult`
|
|
304
|
+
|
|
305
|
+
```ts
|
|
306
|
+
interface EvalResult {
|
|
307
|
+
name: string
|
|
308
|
+
passed: boolean
|
|
309
|
+
score: number
|
|
310
|
+
criteria: CriterionResult[]
|
|
311
|
+
input: string
|
|
312
|
+
output: string
|
|
313
|
+
durationMs: number
|
|
314
|
+
tags: string[]
|
|
315
|
+
}
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### `CriterionResult`
|
|
319
|
+
|
|
320
|
+
```ts
|
|
321
|
+
interface CriterionResult {
|
|
322
|
+
type: string
|
|
323
|
+
passed: boolean
|
|
324
|
+
message: string
|
|
325
|
+
}
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### `EvalSuiteConfig`
|
|
329
|
+
|
|
330
|
+
```ts
|
|
331
|
+
interface EvalSuiteConfig {
|
|
332
|
+
name: string
|
|
333
|
+
cases: EvalCase[]
|
|
334
|
+
runner: (input: string) => Promise<string>
|
|
335
|
+
concurrency?: number
|
|
336
|
+
}
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
| Field | Description |
|
|
340
|
+
|---|---|
|
|
341
|
+
| `name` | Suite name for reporting |
|
|
342
|
+
| `cases` | Array of eval cases to run |
|
|
343
|
+
| `runner` | Function that takes an input string and returns the LLM output |
|
|
344
|
+
| `concurrency` | Max parallel eval cases (default `1` for sequential execution) |
|
|
345
|
+
|
|
346
|
+
### `EvalSuiteResult`
|
|
347
|
+
|
|
348
|
+
```ts
|
|
349
|
+
interface EvalSuiteResult {
|
|
350
|
+
name: string
|
|
351
|
+
total: number
|
|
352
|
+
passed: number
|
|
353
|
+
failed: number
|
|
354
|
+
score: number
|
|
355
|
+
results: EvalResult[]
|
|
356
|
+
durationMs: number
|
|
357
|
+
}
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### `runEvalSuite()`
|
|
361
|
+
|
|
362
|
+
Runs all eval cases through the runner and evaluates each against its criteria.
|
|
363
|
+
|
|
364
|
+
```ts
|
|
365
|
+
function runEvalSuite(config: EvalSuiteConfig): Promise<EvalSuiteResult>
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
| Parameter | Type | Description |
|
|
369
|
+
|---|---|---|
|
|
370
|
+
| `config` | `EvalSuiteConfig` | Suite configuration including cases and runner |
|
|
371
|
+
|
|
372
|
+
**Returns:** `Promise<EvalSuiteResult>`
|
|
373
|
+
|
|
374
|
+
```ts
|
|
375
|
+
import { runEvalSuite, formatEvalReport } from '@elsium-ai/testing'
|
|
376
|
+
|
|
377
|
+
const result = await runEvalSuite({
|
|
378
|
+
name: 'Sentiment classifier',
|
|
379
|
+
cases: [
|
|
380
|
+
{
|
|
381
|
+
name: 'positive review',
|
|
382
|
+
input: 'This product is amazing!',
|
|
383
|
+
criteria: [
|
|
384
|
+
{ type: 'contains', value: 'positive' },
|
|
385
|
+
{ type: 'length_max', value: 50 },
|
|
386
|
+
],
|
|
387
|
+
},
|
|
388
|
+
{
|
|
389
|
+
name: 'negative review',
|
|
390
|
+
input: 'Terrible experience.',
|
|
391
|
+
expected: 'negative',
|
|
392
|
+
},
|
|
393
|
+
],
|
|
394
|
+
runner: async (input) => {
|
|
395
|
+
// Call your LLM or classifier here
|
|
396
|
+
return input.includes('amazing') ? 'positive' : 'negative'
|
|
397
|
+
},
|
|
398
|
+
concurrency: 2,
|
|
399
|
+
})
|
|
400
|
+
|
|
401
|
+
console.log(result.score) // 0..1
|
|
402
|
+
console.log(result.passed) // number of passing cases
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
### `formatEvalReport()`
|
|
406
|
+
|
|
407
|
+
Formats an `EvalSuiteResult` into a human-readable string report.
|
|
408
|
+
|
|
409
|
+
```ts
|
|
410
|
+
function formatEvalReport(result: EvalSuiteResult): string
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
| Parameter | Type | Description |
|
|
414
|
+
|---|---|---|
|
|
415
|
+
| `result` | `EvalSuiteResult` | The result object returned by `runEvalSuite` |
|
|
416
|
+
|
|
417
|
+
**Returns:** `string`
|
|
418
|
+
|
|
419
|
+
```ts
|
|
420
|
+
import { runEvalSuite, formatEvalReport } from '@elsium-ai/testing'
|
|
421
|
+
|
|
422
|
+
const result = await runEvalSuite({ /* ... */ })
|
|
423
|
+
console.log(formatEvalReport(result))
|
|
424
|
+
// Output:
|
|
425
|
+
// Eval Suite: Sentiment classifier
|
|
426
|
+
// --------------------------------------------------
|
|
427
|
+
// [PASS] positive review (3ms)
|
|
428
|
+
// [PASS] negative review (1ms)
|
|
429
|
+
// --------------------------------------------------
|
|
430
|
+
// Score: 100.0% | 2/2 passed | 4ms
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
---
|
|
434
|
+
|
|
435
|
+
## Snapshot
|
|
436
|
+
|
|
437
|
+
Hash-based snapshot testing that detects when LLM outputs change between runs.
|
|
438
|
+
|
|
439
|
+
### `PromptSnapshot`
|
|
440
|
+
|
|
441
|
+
```ts
|
|
442
|
+
interface PromptSnapshot {
|
|
443
|
+
name: string
|
|
444
|
+
request: {
|
|
445
|
+
system?: string
|
|
446
|
+
messages: Array<{ role: string; content: string }>
|
|
447
|
+
model?: string
|
|
448
|
+
}
|
|
449
|
+
outputHash: string
|
|
450
|
+
timestamp: string
|
|
451
|
+
}
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
### `SnapshotStore`
|
|
455
|
+
|
|
456
|
+
```ts
|
|
457
|
+
interface SnapshotStore {
|
|
458
|
+
get(name: string): PromptSnapshot | undefined
|
|
459
|
+
set(name: string, snapshot: PromptSnapshot): void
|
|
460
|
+
getAll(): PromptSnapshot[]
|
|
461
|
+
toJSON(): string
|
|
462
|
+
}
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
### `createSnapshotStore()`
|
|
466
|
+
|
|
467
|
+
Creates an in-memory snapshot store, optionally seeded with existing snapshots.
|
|
468
|
+
|
|
469
|
+
```ts
|
|
470
|
+
function createSnapshotStore(existing?: PromptSnapshot[]): SnapshotStore
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
| Parameter | Type | Default | Description |
|
|
474
|
+
|---|---|---|---|
|
|
475
|
+
| `existing` | `PromptSnapshot[]` | `undefined` | Previously saved snapshots to preload |
|
|
476
|
+
|
|
477
|
+
**Returns:** `SnapshotStore`
|
|
478
|
+
|
|
479
|
+
```ts
|
|
480
|
+
import { createSnapshotStore } from '@elsium-ai/testing'
|
|
481
|
+
|
|
482
|
+
const store = createSnapshotStore()
|
|
483
|
+
console.log(store.getAll().length) // 0
|
|
484
|
+
```
|
|
485
|
+
|
|
486
|
+
### `hashOutput()`
|
|
487
|
+
|
|
488
|
+
Produces a SHA-256 hex digest of the given string.
|
|
489
|
+
|
|
490
|
+
```ts
|
|
491
|
+
function hashOutput(output: string): string
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
| Parameter | Type | Description |
|
|
495
|
+
|---|---|---|
|
|
496
|
+
| `output` | `string` | The output string to hash |
|
|
497
|
+
|
|
498
|
+
**Returns:** `string` -- SHA-256 hex hash
|
|
499
|
+
|
|
500
|
+
```ts
|
|
501
|
+
import { hashOutput } from '@elsium-ai/testing'
|
|
502
|
+
|
|
503
|
+
const hash = hashOutput('Hello, world!')
|
|
504
|
+
// '315f5bdb76d078c43b8ac0064e4a0164612b1fce77c869345bfc94c75894edd3'
|
|
505
|
+
```
|
|
506
|
+
|
|
507
|
+
### `SnapshotTestResult`
|
|
508
|
+
|
|
509
|
+
```ts
|
|
510
|
+
interface SnapshotTestResult {
|
|
511
|
+
name: string
|
|
512
|
+
status: 'new' | 'match' | 'changed'
|
|
513
|
+
previousHash?: string
|
|
514
|
+
currentHash: string
|
|
515
|
+
output: string
|
|
516
|
+
}
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
### `testSnapshot()`
|
|
520
|
+
|
|
521
|
+
Runs the provided function, hashes its output, and compares the hash against the stored snapshot.
|
|
522
|
+
|
|
523
|
+
```ts
|
|
524
|
+
function testSnapshot(
|
|
525
|
+
name: string,
|
|
526
|
+
store: SnapshotStore,
|
|
527
|
+
runner: () => Promise<string>,
|
|
528
|
+
request?: Partial<CompletionRequest>,
|
|
529
|
+
): Promise<SnapshotTestResult>
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
| Parameter | Type | Description |
|
|
533
|
+
|---|---|---|
|
|
534
|
+
| `name` | `string` | Unique snapshot name |
|
|
535
|
+
| `store` | `SnapshotStore` | The store to read/write snapshots |
|
|
536
|
+
| `runner` | `() => Promise<string>` | Function that produces the output to snapshot |
|
|
537
|
+
| `request` | `Partial<CompletionRequest>` | Optional request metadata stored in the snapshot |
|
|
538
|
+
|
|
539
|
+
**Returns:** `Promise<SnapshotTestResult>` -- status is `'new'` on first run, `'match'` if the hash is unchanged, or `'changed'` if it differs.
|
|
540
|
+
|
|
541
|
+
```ts
|
|
542
|
+
import { createSnapshotStore, testSnapshot } from '@elsium-ai/testing'
|
|
543
|
+
|
|
544
|
+
const store = createSnapshotStore()
|
|
545
|
+
|
|
546
|
+
const result = await testSnapshot('greeting', store, async () => {
|
|
547
|
+
return 'Hello, world!'
|
|
548
|
+
})
|
|
549
|
+
|
|
550
|
+
console.log(result.status) // 'new' on first run
|
|
551
|
+
|
|
552
|
+
const result2 = await testSnapshot('greeting', store, async () => {
|
|
553
|
+
return 'Hello, world!'
|
|
554
|
+
})
|
|
555
|
+
|
|
556
|
+
console.log(result2.status) // 'match'
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
---
|
|
560
|
+
|
|
561
|
+
## Prompts
|
|
562
|
+
|
|
563
|
+
A versioned prompt registry for managing, rendering, and diffing prompt templates.
|
|
564
|
+
|
|
565
|
+
### `PromptDefinition`
|
|
566
|
+
|
|
567
|
+
```ts
|
|
568
|
+
interface PromptDefinition {
|
|
569
|
+
name: string
|
|
570
|
+
version: string
|
|
571
|
+
content: string
|
|
572
|
+
variables: string[]
|
|
573
|
+
metadata?: Record<string, unknown>
|
|
574
|
+
}
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
### `PromptDiff`
|
|
578
|
+
|
|
579
|
+
```ts
|
|
580
|
+
interface PromptDiff {
|
|
581
|
+
name: string
|
|
582
|
+
fromVersion: string
|
|
583
|
+
toVersion: string
|
|
584
|
+
changes: DiffLine[]
|
|
585
|
+
}
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
### `DiffLine`
|
|
589
|
+
|
|
590
|
+
```ts
|
|
591
|
+
interface DiffLine {
|
|
592
|
+
type: 'added' | 'removed' | 'unchanged'
|
|
593
|
+
lineNumber: number
|
|
594
|
+
content: string
|
|
595
|
+
}
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
### `PromptRegistry`
|
|
599
|
+
|
|
600
|
+
```ts
|
|
601
|
+
interface PromptRegistry {
|
|
602
|
+
register(name: string, prompt: PromptDefinition): void
|
|
603
|
+
get(name: string, version?: string): PromptDefinition | undefined
|
|
604
|
+
getLatest(name: string): PromptDefinition | undefined
|
|
605
|
+
list(): Array<{ name: string; versions: string[] }>
|
|
606
|
+
diff(name: string, fromVersion: string, toVersion: string): PromptDiff | null
|
|
607
|
+
render(name: string, variables: Record<string, string>, version?: string): string
|
|
608
|
+
getVersions(name: string): string[]
|
|
609
|
+
}
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
| Method | Description |
|
|
613
|
+
|---|---|
|
|
614
|
+
| `register` | Stores a prompt under its name and version |
|
|
615
|
+
| `get` | Retrieves a specific version, or the latest if `version` is omitted |
|
|
616
|
+
| `getLatest` | Returns the highest semver version for a prompt |
|
|
617
|
+
| `list` | Lists all prompt names with their available versions |
|
|
618
|
+
| `diff` | Computes a line-by-line diff between two versions |
|
|
619
|
+
| `render` | Replaces `{{variable}}` placeholders in the prompt content |
|
|
620
|
+
| `getVersions` | Returns all versions for a prompt sorted by semver |
|
|
621
|
+
|
|
622
|
+
### `definePrompt()`
|
|
623
|
+
|
|
624
|
+
A convenience function that returns a shallow copy of the given prompt definition.
|
|
625
|
+
|
|
626
|
+
```ts
|
|
627
|
+
function definePrompt(config: PromptDefinition): PromptDefinition
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
| Parameter | Type | Description |
|
|
631
|
+
|---|---|---|
|
|
632
|
+
| `config` | `PromptDefinition` | The prompt definition to copy and return (register it separately via `registry.register()`) |
|
|
633
|
+
|
|
634
|
+
**Returns:** `PromptDefinition`
|
|
635
|
+
|
|
636
|
+
```ts
|
|
637
|
+
import { definePrompt } from '@elsium-ai/testing'
|
|
638
|
+
|
|
639
|
+
const prompt = definePrompt({
|
|
640
|
+
name: 'classifier',
|
|
641
|
+
version: '1.0.0',
|
|
642
|
+
content: 'Classify the following text as {{label}}: {{text}}',
|
|
643
|
+
variables: ['label', 'text'],
|
|
644
|
+
})
|
|
645
|
+
```
|
|
646
|
+
|
|
647
|
+
### `createPromptRegistry()`
|
|
648
|
+
|
|
649
|
+
Creates an empty prompt registry.
|
|
650
|
+
|
|
651
|
+
```ts
|
|
652
|
+
function createPromptRegistry(): PromptRegistry
|
|
653
|
+
```
|
|
654
|
+
|
|
655
|
+
**Returns:** `PromptRegistry`
|
|
656
|
+
|
|
657
|
+
```ts
|
|
658
|
+
import { definePrompt, createPromptRegistry } from '@elsium-ai/testing'
|
|
659
|
+
|
|
660
|
+
const registry = createPromptRegistry()
|
|
661
|
+
|
|
662
|
+
const v1 = definePrompt({
|
|
663
|
+
name: 'summarizer',
|
|
664
|
+
version: '1.0.0',
|
|
665
|
+
content: 'Summarize: {{text}}',
|
|
666
|
+
variables: ['text'],
|
|
667
|
+
})
|
|
668
|
+
|
|
669
|
+
const v2 = definePrompt({
|
|
670
|
+
name: 'summarizer',
|
|
671
|
+
version: '2.0.0',
|
|
672
|
+
content: 'Provide a concise summary of: {{text}}',
|
|
673
|
+
variables: ['text'],
|
|
674
|
+
})
|
|
675
|
+
|
|
676
|
+
registry.register('summarizer', v1)
|
|
677
|
+
registry.register('summarizer', v2)
|
|
678
|
+
|
|
679
|
+
// Render with the latest version
|
|
680
|
+
const output = registry.render('summarizer', { text: 'A long article...' })
|
|
681
|
+
// 'Provide a concise summary of: A long article...'
|
|
682
|
+
|
|
683
|
+
// Diff between versions
|
|
684
|
+
const diff = registry.diff('summarizer', '1.0.0', '2.0.0')
|
|
685
|
+
// diff.changes includes added/removed/unchanged lines
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
---
|
|
689
|
+
|
|
690
|
+
## Regression
|
|
691
|
+
|
|
692
|
+
Baseline-driven regression detection that compares current LLM outputs to previously recorded baselines.
|
|
693
|
+
|
|
694
|
+
### `RegressionBaseline`
|
|
695
|
+
|
|
696
|
+
```ts
|
|
697
|
+
interface RegressionBaseline {
|
|
698
|
+
name: string
|
|
699
|
+
cases: Array<{
|
|
700
|
+
input: string
|
|
701
|
+
output: string
|
|
702
|
+
score: number
|
|
703
|
+
timestamp: number
|
|
704
|
+
}>
|
|
705
|
+
createdAt: number
|
|
706
|
+
updatedAt: number
|
|
707
|
+
}
|
|
708
|
+
```
|
|
709
|
+
|
|
710
|
+
### `RegressionResult`
|
|
711
|
+
|
|
712
|
+
```ts
|
|
713
|
+
interface RegressionResult {
|
|
714
|
+
name: string
|
|
715
|
+
totalCases: number
|
|
716
|
+
regressions: RegressionDetail[]
|
|
717
|
+
improvements: RegressionDetail[]
|
|
718
|
+
unchanged: number
|
|
719
|
+
overallScore: number
|
|
720
|
+
baselineScore: number
|
|
721
|
+
}
|
|
722
|
+
```
|
|
723
|
+
|
|
724
|
+
### `RegressionDetail`
|
|
725
|
+
|
|
726
|
+
```ts
|
|
727
|
+
interface RegressionDetail {
|
|
728
|
+
input: string
|
|
729
|
+
baselineOutput: string
|
|
730
|
+
currentOutput: string
|
|
731
|
+
baselineScore: number
|
|
732
|
+
currentScore: number
|
|
733
|
+
delta: number
|
|
734
|
+
}
|
|
735
|
+
```
|
|
736
|
+
|
|
737
|
+
### `RegressionSuite`
|
|
738
|
+
|
|
739
|
+
```ts
|
|
740
|
+
interface RegressionSuite {
|
|
741
|
+
load(path: string): Promise<void>
|
|
742
|
+
save(path: string): Promise<void>
|
|
743
|
+
run(
|
|
744
|
+
runner: (input: string) => Promise<string>,
|
|
745
|
+
scorer?: (input: string, output: string) => Promise<number>,
|
|
746
|
+
): Promise<RegressionResult>
|
|
747
|
+
addCase(input: string, output: string, score: number): void
|
|
748
|
+
readonly baseline: RegressionBaseline | null
|
|
749
|
+
}
|
|
750
|
+
```
|
|
751
|
+
|
|
752
|
+
| Method | Description |
|
|
753
|
+
|---|---|
|
|
754
|
+
| `load` | Reads a baseline JSON file from disk |
|
|
755
|
+
| `save` | Writes the current baseline to disk (creates directories as needed) |
|
|
756
|
+
| `run` | Runs all baseline cases through `runner`, compares scores, and classifies regressions (delta < -0.1), improvements (delta > 0.1), or unchanged |
|
|
757
|
+
| `addCase` | Adds or updates a case in the baseline |
|
|
758
|
+
| `baseline` | Read-only access to the current baseline (or `null`) |
|
|
759
|
+
|
|
760
|
+
### `createRegressionSuite()`
|
|
761
|
+
|
|
762
|
+
Creates a new regression suite with the given name.
|
|
763
|
+
|
|
764
|
+
```ts
|
|
765
|
+
function createRegressionSuite(name: string): RegressionSuite
|
|
766
|
+
```
|
|
767
|
+
|
|
768
|
+
| Parameter | Type | Description |
|
|
769
|
+
|---|---|---|
|
|
770
|
+
| `name` | `string` | Name for the regression suite |
|
|
771
|
+
|
|
772
|
+
**Returns:** `RegressionSuite`
|
|
773
|
+
|
|
774
|
+
```ts
|
|
775
|
+
import { createRegressionSuite } from '@elsium-ai/testing'
|
|
776
|
+
|
|
777
|
+
const suite = createRegressionSuite('qa-bot')
|
|
778
|
+
|
|
779
|
+
// Build baseline
|
|
780
|
+
suite.addCase('What is 2+2?', '4', 1.0)
|
|
781
|
+
suite.addCase('Capital of France?', 'Paris', 1.0)
|
|
782
|
+
await suite.save('./baselines/qa-bot.json')
|
|
783
|
+
|
|
784
|
+
// Later, run against the baseline
|
|
785
|
+
await suite.load('./baselines/qa-bot.json')
|
|
786
|
+
const result = await suite.run(async (input) => {
|
|
787
|
+
// Call your LLM here
|
|
788
|
+
return 'some answer'
|
|
789
|
+
})
|
|
790
|
+
|
|
791
|
+
console.log(result.regressions.length) // number of regressions detected
|
|
792
|
+
console.log(result.overallScore) // aggregate score across all cases
|
|
793
|
+
```
|
|
794
|
+
|
|
795
|
+
---
|
|
796
|
+
|
|
797
|
+
## Replay
|
|
798
|
+
|
|
799
|
+
Record raw `CompletionRequest` / `LLMResponse` pairs and replay them deterministically in tests.
|
|
800
|
+
|
|
801
|
+
### `ReplayEntry`
|
|
802
|
+
|
|
803
|
+
```ts
|
|
804
|
+
interface ReplayEntry {
|
|
805
|
+
request: CompletionRequest
|
|
806
|
+
response: LLMResponse
|
|
807
|
+
timestamp: number
|
|
808
|
+
}
|
|
809
|
+
```
|
|
810
|
+
|
|
811
|
+
### `ReplayRecorder`
|
|
812
|
+
|
|
813
|
+
```ts
|
|
814
|
+
interface ReplayRecorder {
|
|
815
|
+
wrap(
|
|
816
|
+
completeFn: (req: CompletionRequest) => Promise<LLMResponse>,
|
|
817
|
+
): (req: CompletionRequest) => Promise<LLMResponse>
|
|
818
|
+
getEntries(): ReplayEntry[]
|
|
819
|
+
toJSON(): string
|
|
820
|
+
clear(): void
|
|
821
|
+
}
|
|
822
|
+
```
|
|
823
|
+
|
|
824
|
+
### `createReplayRecorder()`
|
|
825
|
+
|
|
826
|
+
Creates a recorder that wraps a completion function and captures every request/response pair.
|
|
827
|
+
|
|
828
|
+
```ts
|
|
829
|
+
function createReplayRecorder(): ReplayRecorder
|
|
830
|
+
```
|
|
831
|
+
|
|
832
|
+
**Returns:** `ReplayRecorder`
|
|
833
|
+
|
|
834
|
+
```ts
|
|
835
|
+
import { createReplayRecorder } from '@elsium-ai/testing'
|
|
836
|
+
|
|
837
|
+
const recorder = createReplayRecorder()
|
|
838
|
+
const wrappedComplete = recorder.wrap(provider.complete.bind(provider))
|
|
839
|
+
|
|
840
|
+
// Use wrappedComplete in place of provider.complete — all calls are recorded
|
|
841
|
+
const response = await wrappedComplete({
|
|
842
|
+
messages: [{ role: 'user', content: 'Hello' }],
|
|
843
|
+
})
|
|
844
|
+
|
|
845
|
+
// Save for later replay
|
|
846
|
+
const json = recorder.toJSON()
|
|
847
|
+
```
|
|
848
|
+
|
|
849
|
+
### `ReplayPlayer`
|
|
850
|
+
|
|
851
|
+
```ts
|
|
852
|
+
interface ReplayPlayer {
|
|
853
|
+
complete(request: CompletionRequest): Promise<LLMResponse>
|
|
854
|
+
readonly remaining: number
|
|
855
|
+
}
|
|
856
|
+
```
|
|
857
|
+
|
|
858
|
+
### `createReplayPlayer()`
|
|
859
|
+
|
|
860
|
+
Creates a player that replays recorded responses sequentially, regardless of the incoming request.
|
|
861
|
+
|
|
862
|
+
```ts
|
|
863
|
+
function createReplayPlayer(entriesOrJson: ReplayEntry[] | string): ReplayPlayer
|
|
864
|
+
```
|
|
865
|
+
|
|
866
|
+
| Parameter | Type | Description |
|
|
867
|
+
|---|---|---|
|
|
868
|
+
| `entriesOrJson` | `ReplayEntry[] \| string` | An array of replay entries, or a JSON string produced by `recorder.toJSON()` |
|
|
869
|
+
|
|
870
|
+
**Returns:** `ReplayPlayer`
|
|
871
|
+
|
|
872
|
+
Throws an error with the message `'Replay exhausted: no more recorded responses'` if `complete()` is called after all entries have been consumed.
|
|
873
|
+
|
|
874
|
+
```ts
|
|
875
|
+
import { createReplayRecorder, createReplayPlayer } from '@elsium-ai/testing'
|
|
876
|
+
|
|
877
|
+
// Record
|
|
878
|
+
const recorder = createReplayRecorder()
|
|
879
|
+
const wrapped = recorder.wrap(provider.complete.bind(provider))
|
|
880
|
+
await wrapped({ messages: [{ role: 'user', content: 'Hi' }] })
|
|
881
|
+
|
|
882
|
+
// Replay
|
|
883
|
+
const player = createReplayPlayer(recorder.getEntries())
|
|
884
|
+
console.log(player.remaining) // 1
|
|
885
|
+
|
|
886
|
+
const replayed = await player.complete({
|
|
887
|
+
messages: [{ role: 'user', content: 'Hi' }],
|
|
888
|
+
})
|
|
889
|
+
console.log(player.remaining) // 0
|
|
890
|
+
```
|
|
891
|
+
|
|
892
|
+
---
|
|
893
|
+
|
|
894
|
+
## Pinning
|
|
895
|
+
|
|
896
|
+
Pin LLM outputs to specific prompt + config combinations and detect when outputs drift.
|
|
897
|
+
|
|
898
|
+
### `Pin`
|
|
899
|
+
|
|
900
|
+
```ts
|
|
901
|
+
interface Pin {
|
|
902
|
+
promptHash: string
|
|
903
|
+
configHash: string
|
|
904
|
+
outputHash: string
|
|
905
|
+
outputText: string
|
|
906
|
+
model?: string
|
|
907
|
+
createdAt: number
|
|
908
|
+
}
|
|
909
|
+
```
|
|
910
|
+
|
|
911
|
+
### `PinStore`
|
|
912
|
+
|
|
913
|
+
```ts
|
|
914
|
+
interface PinStore {
|
|
915
|
+
get(key: string): Pin | undefined
|
|
916
|
+
set(key: string, pin: Pin): void
|
|
917
|
+
delete(key: string): boolean
|
|
918
|
+
getAll(): Pin[]
|
|
919
|
+
toJSON(): string
|
|
920
|
+
}
|
|
921
|
+
```
|
|
922
|
+
|
|
923
|
+
### `PinResult`
|
|
924
|
+
|
|
925
|
+
```ts
|
|
926
|
+
interface PinResult {
|
|
927
|
+
status: 'new' | 'match' | 'mismatch'
|
|
928
|
+
pin: Pin
|
|
929
|
+
previousPin?: Pin
|
|
930
|
+
}
|
|
931
|
+
```
|
|
932
|
+
|
|
933
|
+
### `createPinStore()`
|
|
934
|
+
|
|
935
|
+
Creates an in-memory pin store, optionally preloaded with existing pins.
|
|
936
|
+
|
|
937
|
+
```ts
|
|
938
|
+
function createPinStore(existing?: Pin[]): PinStore
|
|
939
|
+
```
|
|
940
|
+
|
|
941
|
+
| Parameter | Type | Default | Description |
|
|
942
|
+
|---|---|---|---|
|
|
943
|
+
| `existing` | `Pin[]` | `undefined` | Previously saved pins to preload |
|
|
944
|
+
|
|
945
|
+
**Returns:** `PinStore`
|
|
946
|
+
|
|
947
|
+
```ts
|
|
948
|
+
import { createPinStore } from '@elsium-ai/testing'
|
|
949
|
+
|
|
950
|
+
const store = createPinStore()
|
|
951
|
+
console.log(store.getAll().length) // 0
|
|
952
|
+
```
|
|
953
|
+
|
|
954
|
+
### `pinOutput()`
|
|
955
|
+
|
|
956
|
+
Runs a function, hashes its output along with the prompt and config, and compares against any previously stored pin.
|
|
957
|
+
|
|
958
|
+
```ts
|
|
959
|
+
function pinOutput(
|
|
960
|
+
name: string,
|
|
961
|
+
store: PinStore,
|
|
962
|
+
runner: () => Promise<string>,
|
|
963
|
+
config: {
|
|
964
|
+
prompt: string
|
|
965
|
+
model?: string
|
|
966
|
+
temperature?: number
|
|
967
|
+
seed?: number
|
|
968
|
+
},
|
|
969
|
+
options?: { assert?: boolean },
|
|
970
|
+
): Promise<PinResult>
|
|
971
|
+
```
|
|
972
|
+
|
|
973
|
+
| Parameter | Type | Description |
|
|
974
|
+
|---|---|---|
|
|
975
|
+
| `name` | `string` | Human-readable name for the pin (used in error messages) |
|
|
976
|
+
| `store` | `PinStore` | The store to read/write pins |
|
|
977
|
+
| `runner` | `() => Promise<string>` | Function that produces the output to pin |
|
|
978
|
+
| `config` | `object` | Prompt text and model config used to generate the hash key |
|
|
979
|
+
| `options.assert` | `boolean` | When `true`, throws an `ElsiumError` on mismatch instead of returning a `'mismatch'` result |
|
|
980
|
+
|
|
981
|
+
**Returns:** `Promise<PinResult>` — `status` is `'new'` on the first run, `'match'` if the output is identical, or `'mismatch'` if the output has changed.
|
|
982
|
+
|
|
983
|
+
```ts
|
|
984
|
+
import { createPinStore, pinOutput } from '@elsium-ai/testing'
|
|
985
|
+
|
|
986
|
+
const store = createPinStore()
|
|
987
|
+
|
|
988
|
+
const result = await pinOutput(
|
|
989
|
+
'greeting-pin',
|
|
990
|
+
store,
|
|
991
|
+
async () => 'Hello, world!',
|
|
992
|
+
{ prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
|
|
993
|
+
)
|
|
994
|
+
|
|
995
|
+
console.log(result.status) // 'new'
|
|
996
|
+
|
|
997
|
+
// Run again with the same output
|
|
998
|
+
const result2 = await pinOutput(
|
|
999
|
+
'greeting-pin',
|
|
1000
|
+
store,
|
|
1001
|
+
async () => 'Hello, world!',
|
|
1002
|
+
{ prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
|
|
1003
|
+
)
|
|
1004
|
+
|
|
1005
|
+
console.log(result2.status) // 'match'
|
|
1006
|
+
|
|
1007
|
+
// Run with assert mode in CI
|
|
1008
|
+
await pinOutput(
|
|
1009
|
+
'greeting-pin',
|
|
1010
|
+
store,
|
|
1011
|
+
async () => 'Different output!',
|
|
1012
|
+
{ prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
|
|
1013
|
+
{ assert: true }, // throws ElsiumError on mismatch
|
|
1014
|
+
)
|
|
1015
|
+
```
|
|
1016
|
+
|
|
1017
|
+
---
|
|
1018
|
+
|
|
1019
|
+
## Determinism
|
|
1020
|
+
|
|
1021
|
+
Verify that an LLM function produces consistent outputs across multiple invocations.
|
|
1022
|
+
|
|
1023
|
+
### `DeterminismResult`
|
|
1024
|
+
|
|
1025
|
+
```ts
|
|
1026
|
+
interface DeterminismResult {
|
|
1027
|
+
deterministic: boolean
|
|
1028
|
+
runs: number
|
|
1029
|
+
uniqueOutputs: number
|
|
1030
|
+
outputs: string[]
|
|
1031
|
+
variance: number
|
|
1032
|
+
}
|
|
1033
|
+
```
|
|
1034
|
+
|
|
1035
|
+
### `StabilityResult`
|
|
1036
|
+
|
|
1037
|
+
```ts
|
|
1038
|
+
interface StabilityResult {
|
|
1039
|
+
stable: boolean
|
|
1040
|
+
runs: number
|
|
1041
|
+
uniqueOutputs: number
|
|
1042
|
+
outputs: Array<{ output: string; timestamp: number }>
|
|
1043
|
+
variance: number
|
|
1044
|
+
}
|
|
1045
|
+
```
|
|
1046
|
+
|
|
1047
|
+
### `assertDeterministic()`
|
|
1048
|
+
|
|
1049
|
+
Runs a function multiple times and verifies that all outputs are identical (or within the specified tolerance).
|
|
1050
|
+
|
|
1051
|
+
```ts
|
|
1052
|
+
function assertDeterministic(
|
|
1053
|
+
fn: (seed?: number) => Promise<string>,
|
|
1054
|
+
options?: {
|
|
1055
|
+
runs?: number
|
|
1056
|
+
seed?: number
|
|
1057
|
+
tolerance?: number
|
|
1058
|
+
},
|
|
1059
|
+
): Promise<DeterminismResult>
|
|
1060
|
+
```
|
|
1061
|
+
|
|
1062
|
+
| Parameter | Type | Default | Description |
|
|
1063
|
+
|---|---|---|---|
|
|
1064
|
+
| `fn` | `(seed?: number) => Promise<string>` | -- | The function to test for determinism |
|
|
1065
|
+
| `options.runs` | `number` | `5` | Number of times to invoke `fn` |
|
|
1066
|
+
| `options.seed` | `number` | `undefined` | Seed passed to `fn` on each invocation |
|
|
1067
|
+
| `options.tolerance` | `number` | `0` | Maximum allowed variance (0 = strictly deterministic) |
|
|
1068
|
+
|
|
1069
|
+
**Returns:** `Promise<DeterminismResult>`
|
|
1070
|
+
|
|
1071
|
+
Throws an `ElsiumError` when `tolerance` is `0` (the default) and outputs are not identical.
|
|
1072
|
+
|
|
1073
|
+
```ts
|
|
1074
|
+
import { assertDeterministic } from '@elsium-ai/testing'
|
|
28
1075
|
|
|
29
|
-
// Determinism check
|
|
30
1076
|
const result = await assertDeterministic(
|
|
31
|
-
(seed) =>
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
}).then(r => r.message.content),
|
|
1077
|
+
async (seed) => {
|
|
1078
|
+
// Call your LLM with temperature: 0 and the provided seed
|
|
1079
|
+
return 'consistent output'
|
|
1080
|
+
},
|
|
36
1081
|
{ runs: 5, seed: 42, tolerance: 0 },
|
|
37
1082
|
)
|
|
38
|
-
// { deterministic: true, variance: 0, uniqueOutputs: 1 }
|
|
39
1083
|
|
|
40
|
-
//
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
1084
|
+
console.log(result.deterministic) // true
|
|
1085
|
+
console.log(result.uniqueOutputs) // 1
|
|
1086
|
+
console.log(result.variance) // 0
|
|
1087
|
+
```
|
|
1088
|
+
|
|
1089
|
+
### `assertStable()`
|
|
1090
|
+
|
|
1091
|
+
Runs a function multiple times with a delay between invocations to verify temporal stability.
|
|
1092
|
+
|
|
1093
|
+
```ts
|
|
1094
|
+
function assertStable(
|
|
1095
|
+
fn: (seed?: number) => Promise<string>,
|
|
1096
|
+
options?: {
|
|
1097
|
+
intervalMs?: number
|
|
1098
|
+
runs?: number
|
|
1099
|
+
seed?: number
|
|
1100
|
+
},
|
|
1101
|
+
): Promise<StabilityResult>
|
|
1102
|
+
```
|
|
1103
|
+
|
|
1104
|
+
| Parameter | Type | Default | Description |
|
|
1105
|
+
|---|---|---|---|
|
|
1106
|
+
| `fn` | `(seed?: number) => Promise<string>` | -- | The function to test for stability |
|
|
1107
|
+
| `options.intervalMs` | `number` | `100` | Delay in milliseconds between runs |
|
|
1108
|
+
| `options.runs` | `number` | `3` | Number of times to invoke `fn` |
|
|
1109
|
+
| `options.seed` | `number` | `undefined` | Seed passed to `fn` on each invocation |
|
|
1110
|
+
|
|
1111
|
+
**Returns:** `Promise<StabilityResult>`
|
|
1112
|
+
|
|
1113
|
+
```ts
|
|
1114
|
+
import { assertStable } from '@elsium-ai/testing'
|
|
1115
|
+
|
|
1116
|
+
const result = await assertStable(
|
|
1117
|
+
async (seed) => {
|
|
1118
|
+
return 'same output every time'
|
|
1119
|
+
},
|
|
1120
|
+
{ intervalMs: 200, runs: 3, seed: 42 },
|
|
1121
|
+
)
|
|
1122
|
+
|
|
1123
|
+
console.log(result.stable) // true
|
|
1124
|
+
console.log(result.uniqueOutputs) // 1
|
|
1125
|
+
console.log(result.outputs) // [{ output: '...', timestamp: ... }, ...]
|
|
44
1126
|
```
|
|
45
1127
|
|
|
1128
|
+
---
|
|
1129
|
+
|
|
46
1130
|
## Part of ElsiumAI
|
|
47
1131
|
|
|
48
1132
|
This package is the testing layer of the [ElsiumAI](https://github.com/elsium-ai/elsium-ai) framework. See the [full documentation](https://github.com/elsium-ai/elsium-ai) for guides and examples.
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elsium-ai/testing",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.3",
|
|
4
4
|
"description": "Testing utilities, mock providers, fixtures, and eval framework for ElsiumAI",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Eric Utrera <ebutrera9103@gmail.com>",
|
|
7
7
|
"repository": {
|
|
8
8
|
"type": "git",
|
|
9
|
-
"url": "https://github.com/elsium-ai/elsium-ai",
|
|
9
|
+
"url": "git+https://github.com/elsium-ai/elsium-ai.git",
|
|
10
10
|
"directory": "packages/testing"
|
|
11
11
|
},
|
|
12
12
|
"type": "module",
|
|
@@ -26,10 +26,10 @@
|
|
|
26
26
|
"dev": "bun --watch src/index.ts"
|
|
27
27
|
},
|
|
28
28
|
"dependencies": {
|
|
29
|
-
"@elsium-ai/core": "^0.2.
|
|
30
|
-
"@elsium-ai/gateway": "^0.2.
|
|
31
|
-
"@elsium-ai/agents": "^0.2.
|
|
32
|
-
"@elsium-ai/tools": "^0.2.
|
|
29
|
+
"@elsium-ai/core": "^0.2.3",
|
|
30
|
+
"@elsium-ai/gateway": "^0.2.3",
|
|
31
|
+
"@elsium-ai/agents": "^0.2.3",
|
|
32
|
+
"@elsium-ai/tools": "^0.2.3"
|
|
33
33
|
},
|
|
34
34
|
"devDependencies": {
|
|
35
35
|
"typescript": "^5.7.0"
|