@elsium-ai/testing 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +1105 -21
  2. package/package.json +6 -6
package/README.md CHANGED
@@ -13,36 +13,1120 @@ npm install @elsium-ai/testing --save-dev
13
13
 
14
14
  ## What's Inside
15
15
 
16
- - **Mock Providers** Zero-latency providers for unit testing
17
- - **Evals** — LLM-as-judge evaluation framework
18
- - **Output Pinning** Lock expected outputs, catch regressions when models update
19
- - **Determinism Assertions** Run N times, verify all outputs match
20
- - **Prompt Versioning** Track and compare prompt versions
21
- - **Request-Matched Fixtures** Replay fixtures by content hash, not sequence order
22
- - **Regression Suites** Automated regression detection in CI
16
+ | Category | Exports | Description |
17
+ |---|---|---|
18
+ | **Mock Provider** | `mockProvider`, `MockProvider`, `MockProviderOptions`, `MockResponseConfig` | Zero-latency LLM provider for unit tests |
19
+ | **Fixtures** | `createFixture`, `loadFixture`, `createRecorder`, `Fixture`, `FixtureEntry`, `FixtureRecorder` | Record, save, and replay request/response pairs |
20
+ | **Eval** | `runEvalSuite`, `formatEvalReport`, `EvalCase`, `EvalCriterion`, `EvalResult`, `CriterionResult`, `EvalSuiteConfig`, `EvalSuiteResult`, `LLMJudge` | Evaluation framework with built-in and custom criteria |
21
+ | **Snapshot** | `createSnapshotStore`, `hashOutput`, `testSnapshot`, `PromptSnapshot`, `SnapshotStore`, `SnapshotTestResult` | Hash-based snapshot testing for LLM outputs |
22
+ | **Prompts** | `createPromptRegistry`, `definePrompt`, `PromptDefinition`, `PromptDiff`, `DiffLine`, `PromptRegistry` | Versioned prompt registry with diff and rendering |
23
+ | **Regression** | `createRegressionSuite`, `RegressionBaseline`, `RegressionResult`, `RegressionDetail`, `RegressionSuite` | Baseline-driven regression detection |
24
+ | **Replay** | `createReplayRecorder`, `createReplayPlayer`, `ReplayEntry`, `ReplayRecorder`, `ReplayPlayer` | Record and replay raw LLM completion calls |
25
+ | **Pinning** | `createPinStore`, `pinOutput`, `Pin`, `PinStore`, `PinResult` | Pin expected outputs and detect drift |
26
+ | **Determinism** | `assertDeterministic`, `assertStable`, `DeterminismResult`, `StabilityResult` | Verify output consistency across repeated runs |
23
27
 
24
- ## Usage
28
+ ---
25
29
 
26
- ```typescript
27
- import { assertDeterministic, createMockProvider, pinOutput } from '@elsium-ai/testing'
30
+ ## Mock Provider
31
+
32
+ Create a mock `LLMProvider` that returns pre-configured responses without making real API calls.
33
+
34
+ ### `MockResponseConfig`
35
+
36
+ ```ts
37
+ interface MockResponseConfig {
38
+   content?: string
39
+   toolCalls?: Array<{
40
+     id?: string
41
+     name: string
42
+     arguments: Record<string, unknown>
43
+   }>
44
+   stopReason?: 'end_turn' | 'tool_use' | 'max_tokens' | 'stop_sequence'
45
+   usage?: Partial<TokenUsage>
46
+   model?: string
47
+   delay?: number
48
+ }
49
+ ```
50
+
51
+ ### `MockProviderOptions`
52
+
53
+ ```ts
54
+ interface MockProviderOptions {
55
+   responses?: MockResponseConfig[]
56
+   defaultResponse?: MockResponseConfig
57
+   onRequest?: (request: CompletionRequest) => void
58
+ }
59
+ ```
60
+
61
+ | Field | Description |
62
+ |---|---|
63
+ | `responses` | Ordered list of responses, consumed one per call |
64
+ | `defaultResponse` | Fallback response used when `responses` is exhausted |
65
+ | `onRequest` | Callback invoked on every request (useful for assertions) |
66
+
67
+ ### `MockProvider`
68
+
69
+ ```ts
70
+ interface MockProvider extends LLMProvider {
71
+   readonly calls: CompletionRequest[]
72
+   readonly callCount: number
73
+   reset(): void
74
+ }
75
+ ```
76
+
77
+ Extends the standard `LLMProvider` interface with inspection helpers. `calls` stores every `CompletionRequest` received, `callCount` returns the total, and `reset()` clears both the call log and the response index.
78
+
79
+ ### `mockProvider()`
80
+
81
+ Creates a mock provider instance.
82
+
83
+ ```ts
84
+ function mockProvider(options?: MockProviderOptions): MockProvider
85
+ ```
86
+
87
+ | Parameter | Type | Default | Description |
88
+ |---|---|---|---|
89
+ | `options` | `MockProviderOptions` | `{}` | Configuration for responses and callbacks |
90
+
91
+ **Returns:** `MockProvider`
92
+
93
+ ```ts
94
+ import { mockProvider } from '@elsium-ai/testing'
95
+
96
+ const mock = mockProvider({
97
+   responses: [
98
+     { content: 'Hello!' },
99
+     { content: 'Goodbye!', stopReason: 'end_turn' },
100
+   ],
101
+   defaultResponse: { content: 'Default reply' },
102
+   onRequest: (req) => console.log('Model:', req.model),
103
+ })
104
+
105
+ const first = await mock.complete({ messages: [{ role: 'user', content: 'Hi' }] })
106
+ // first.message.content === 'Hello!'
107
+
108
+ console.log(mock.callCount) // 1
109
+ mock.reset()
110
+ console.log(mock.callCount) // 0
111
+ ```
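+
+ Responses can also simulate tool calls and latency. A minimal sketch (how the tool calls and stop reason surface on the response object is an assumption mirroring `MockResponseConfig`; only the `mock.calls` log is documented above):
+
+ ```ts
+ import { mockProvider } from '@elsium-ai/testing'
+
+ const mock = mockProvider({
+   responses: [
+     {
+       toolCalls: [{ name: 'get_weather', arguments: { city: 'Paris' } }],
+       stopReason: 'tool_use',
+       delay: 50, // simulate 50ms of provider latency
+     },
+   ],
+ })
+
+ const res = await mock.complete({ messages: [{ role: 'user', content: 'Weather in Paris?' }] })
+
+ // Inspect what the provider received via the documented call log
+ console.log(mock.calls[0].messages[0].content) // 'Weather in Paris?'
+ ```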
112
+
113
+ ---
114
+
115
+ ## Fixtures
116
+
117
+ Capture request/response pairs as reusable fixtures that can be serialized to JSON and replayed as mock providers.
118
+
119
+ ### `FixtureEntry`
120
+
121
+ ```ts
122
+ interface FixtureEntry {
123
+   request: {
124
+     messages: Array<{ role: string; content: string }>
125
+     model?: string
126
+     system?: string
127
+   }
128
+   response: MockResponseConfig
129
+   timestamp?: string
130
+ }
131
+ ```
132
+
133
+ ### `Fixture`
134
+
135
+ ```ts
136
+ interface Fixture {
137
+   readonly name: string
138
+   readonly entries: FixtureEntry[]
139
+   toProvider(options?: { matching?: 'sequential' | 'request-hash' }): MockProvider
140
+   toJSON(): string
141
+ }
142
+ ```
143
+
144
+ | Method | Description |
145
+ |---|---|
146
+ | `toProvider()` | Converts the fixture into a `MockProvider`. Pass `{ matching: 'request-hash' }` to match responses by message content hash instead of sequential order. |
147
+ | `toJSON()` | Serializes the fixture (with timestamps) to a JSON string. |
148
+
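+ A minimal sketch of the `request-hash` mode described above, assuming responses are matched purely on message content so call order does not matter:
+
+ ```ts
+ import { createFixture } from '@elsium-ai/testing'
+
+ const fixture = createFixture('order-independent', [
+   {
+     request: { messages: [{ role: 'user', content: 'first question' }] },
+     response: { content: 'answer A' },
+   },
+   {
+     request: { messages: [{ role: 'user', content: 'second question' }] },
+     response: { content: 'answer B' },
+   },
+ ])
+
+ const provider = fixture.toProvider({ matching: 'request-hash' })
+
+ // Entries are matched by content hash, so asking the second question first still works
+ const res = await provider.complete({
+   messages: [{ role: 'user', content: 'second question' }],
+ })
+ // res.message.content === 'answer B'
+ ```
+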
149
+ ### `createFixture()`
150
+
151
+ Creates a fixture from a name and an array of entries.
152
+
153
+ ```ts
154
+ function createFixture(name: string, entries: FixtureEntry[]): Fixture
155
+ ```
156
+
157
+ | Parameter | Type | Description |
158
+ |---|---|---|
159
+ | `name` | `string` | Human-readable fixture name |
160
+ | `entries` | `FixtureEntry[]` | Array of request/response pairs |
161
+
162
+ **Returns:** `Fixture`
163
+
164
+ ```ts
165
+ import { createFixture } from '@elsium-ai/testing'
166
+
167
+ const fixture = createFixture('greeting-flow', [
168
+   {
169
+     request: { messages: [{ role: 'user', content: 'Hello' }] },
170
+     response: { content: 'Hi there!' },
171
+   },
172
+ ])
173
+
174
+ const provider = fixture.toProvider()
175
+ const res = await provider.complete({
176
+   messages: [{ role: 'user', content: 'Hello' }],
177
+ })
178
+ // res.message.content === 'Hi there!'
179
+ ```
180
+
181
+ ### `loadFixture()`
182
+
183
+ Deserializes a JSON string back into a `Fixture`.
184
+
185
+ ```ts
186
+ function loadFixture(json: string): Fixture
187
+ ```
188
+
189
+ | Parameter | Type | Description |
190
+ |---|---|---|
191
+ | `json` | `string` | JSON string previously produced by `fixture.toJSON()` |
192
+
193
+ **Returns:** `Fixture`
194
+
195
+ ```ts
196
+ import { createFixture, loadFixture } from '@elsium-ai/testing'
197
+
198
+ const original = createFixture('test', [
199
+   {
200
+     request: { messages: [{ role: 'user', content: 'ping' }] },
201
+     response: { content: 'pong' },
202
+   },
203
+ ])
204
+
205
+ const json = original.toJSON()
206
+ const restored = loadFixture(json)
207
+ // restored.name === 'test'
208
+ ```
209
+
210
+ ### `FixtureRecorder`
211
+
212
+ ```ts
213
+ interface FixtureRecorder {
214
+   wrap(provider: MockProvider): MockProvider
215
+   getEntries(): FixtureEntry[]
216
+   toFixture(name: string): Fixture
217
+   clear(): void
218
+ }
219
+ ```
220
+
221
+ ### `createRecorder()`
222
+
223
+ Creates a recorder that intercepts `complete()` calls and captures request/response pairs.
224
+
225
+ ```ts
226
+ function createRecorder(): FixtureRecorder
227
+ ```
228
+
229
+ **Returns:** `FixtureRecorder`
230
+
231
+ ```ts
232
+ import { mockProvider, createRecorder } from '@elsium-ai/testing'
233
+
234
+ const recorder = createRecorder()
235
+ const mock = mockProvider({ responses: [{ content: 'recorded response' }] })
236
+ const wrapped = recorder.wrap(mock)
237
+
238
+ await wrapped.complete({
239
+   messages: [{ role: 'user', content: 'capture this' }],
240
+ })
241
+
242
+ const fixture = recorder.toFixture('my-fixture')
243
+ console.log(fixture.entries.length) // 1
244
+ ```
245
+
246
+ ---
247
+
248
+ ## Eval
249
+
250
+ A structured evaluation framework for assessing LLM outputs against configurable criteria.
251
+
252
+ ### `EvalCase`
253
+
254
+ ```ts
255
+ interface EvalCase {
256
+   name: string
257
+   input: string
258
+   expected?: string
259
+   criteria?: EvalCriterion[]
260
+   tags?: string[]
261
+ }
262
+ ```
263
+
264
+ ### `EvalCriterion`
265
+
266
+ A discriminated union of all supported criterion types:
267
+
268
+ ```ts
269
+ type EvalCriterion =
270
+   | { type: 'contains'; value: string; caseSensitive?: boolean }
271
+   | { type: 'not_contains'; value: string; caseSensitive?: boolean }
272
+   | { type: 'matches'; pattern: string; flags?: string }
273
+   | { type: 'length_min'; value: number }
274
+   | { type: 'length_max'; value: number }
275
+   | { type: 'json_valid' }
276
+   | { type: 'json_matches'; schema: Record<string, unknown> }
277
+   | { type: 'custom'; name: string; fn: (output: string) => boolean }
278
+   | { type: 'llm_judge'; prompt: string; judge: LLMJudge; threshold?: number }
279
+   | { type: 'semantic_similarity'; reference: string; threshold?: number }
280
+   | { type: 'factual_accuracy'; facts: string[]; threshold?: number }
281
+ ```
282
+
283
+ | Criterion | Description |
284
+ |---|---|
285
+ | `contains` | Output must contain `value` (case-insensitive by default) |
286
+ | `not_contains` | Output must not contain `value` |
287
+ | `matches` | Output must match the regex `pattern` |
288
+ | `length_min` | Output length must be at least `value` characters |
289
+ | `length_max` | Output length must be at most `value` characters |
290
+ | `json_valid` | Output must be valid JSON |
291
+ | `json_matches` | Output must be valid JSON matching `schema` (key presence + type check) |
292
+ | `custom` | Output is passed to `fn`; must return `true` to pass |
293
+ | `llm_judge` | An LLM judge scores the output; must meet `threshold` (default 0.7) |
294
+ | `semantic_similarity` | Word-overlap similarity against `reference`; must meet `threshold` (default 0.7) |
295
+ | `factual_accuracy` | Checks how many `facts` appear in the output; must meet `threshold` (default 0.7) |
296
+
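+ Several criteria can be combined on a single case. A sketch mixing structural and judge-based checks (the judge below is a hypothetical stub; a real `LLMJudge` would call a model):
+
+ ```ts
+ import { runEvalSuite, type LLMJudge } from '@elsium-ai/testing'
+
+ // Stub judge for illustration only
+ const stubJudge: LLMJudge = async () => ({ score: 0.9, reasoning: 'Well-formed object' })
+
+ const result = await runEvalSuite({
+   name: 'JSON output checks',
+   cases: [
+     {
+       name: 'returns valid user JSON',
+       input: 'Return a JSON user object',
+       criteria: [
+         { type: 'json_valid' },
+         { type: 'custom', name: 'has-id-field', fn: (output) => output.includes('"id"') },
+         { type: 'llm_judge', prompt: 'Is this a well-formed user object?', judge: stubJudge, threshold: 0.8 },
+       ],
+     },
+   ],
+   runner: async () => '{"id": 1, "name": "Ada"}',
+ })
+ ```
+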
297
+ ### `LLMJudge`
298
+
299
+ ```ts
300
+ type LLMJudge = (prompt: string) => Promise<{ score: number; reasoning: string }>
301
+ ```
302
+
303
+ ### `EvalResult`
304
+
305
+ ```ts
306
+ interface EvalResult {
307
+   name: string
308
+   passed: boolean
309
+   score: number
310
+   criteria: CriterionResult[]
311
+   input: string
312
+   output: string
313
+   durationMs: number
314
+   tags: string[]
315
+ }
316
+ ```
317
+
318
+ ### `CriterionResult`
319
+
320
+ ```ts
321
+ interface CriterionResult {
322
+   type: string
323
+   passed: boolean
324
+   message: string
325
+ }
326
+ ```
327
+
328
+ ### `EvalSuiteConfig`
329
+
330
+ ```ts
331
+ interface EvalSuiteConfig {
332
+   name: string
333
+   cases: EvalCase[]
334
+   runner: (input: string) => Promise<string>
335
+   concurrency?: number
336
+ }
337
+ ```
338
+
339
+ | Field | Description |
340
+ |---|---|
341
+ | `name` | Suite name for reporting |
342
+ | `cases` | Array of eval cases to run |
343
+ | `runner` | Function that takes an input string and returns the LLM output |
344
+ | `concurrency` | Max parallel eval cases (default `1` for sequential execution) |
345
+
346
+ ### `EvalSuiteResult`
347
+
348
+ ```ts
349
+ interface EvalSuiteResult {
350
+   name: string
351
+   total: number
352
+   passed: number
353
+   failed: number
354
+   score: number
355
+   results: EvalResult[]
356
+   durationMs: number
357
+ }
358
+ ```
359
+
360
+ ### `runEvalSuite()`
361
+
362
+ Runs all eval cases through the runner and evaluates each against its criteria.
363
+
364
+ ```ts
365
+ function runEvalSuite(config: EvalSuiteConfig): Promise<EvalSuiteResult>
366
+ ```
367
+
368
+ | Parameter | Type | Description |
369
+ |---|---|---|
370
+ | `config` | `EvalSuiteConfig` | Suite configuration including cases and runner |
371
+
372
+ **Returns:** `Promise<EvalSuiteResult>`
373
+
374
+ ```ts
375
+ import { runEvalSuite, formatEvalReport } from '@elsium-ai/testing'
376
+
377
+ const result = await runEvalSuite({
378
+   name: 'Sentiment classifier',
379
+   cases: [
380
+     {
381
+       name: 'positive review',
382
+       input: 'This product is amazing!',
383
+       criteria: [
384
+         { type: 'contains', value: 'positive' },
385
+         { type: 'length_max', value: 50 },
386
+       ],
387
+     },
388
+     {
389
+       name: 'negative review',
390
+       input: 'Terrible experience.',
391
+       expected: 'negative',
392
+     },
393
+   ],
394
+   runner: async (input) => {
395
+     // Call your LLM or classifier here
396
+     return input.includes('amazing') ? 'positive' : 'negative'
397
+   },
398
+   concurrency: 2,
399
+ })
400
+
401
+ console.log(result.score) // 0..1
402
+ console.log(result.passed) // number of passing cases
403
+ ```
404
+
405
+ ### `formatEvalReport()`
406
+
407
+ Formats an `EvalSuiteResult` into a human-readable string report.
408
+
409
+ ```ts
410
+ function formatEvalReport(result: EvalSuiteResult): string
411
+ ```
412
+
413
+ | Parameter | Type | Description |
414
+ |---|---|---|
415
+ | `result` | `EvalSuiteResult` | The result object returned by `runEvalSuite` |
416
+
417
+ **Returns:** `string`
418
+
419
+ ```ts
420
+ import { runEvalSuite, formatEvalReport } from '@elsium-ai/testing'
421
+
422
+ const result = await runEvalSuite({ /* ... */ })
423
+ console.log(formatEvalReport(result))
424
+ // Output:
425
+ // Eval Suite: Sentiment classifier
426
+ // --------------------------------------------------
427
+ // [PASS] positive review (3ms)
428
+ // [PASS] negative review (1ms)
429
+ // --------------------------------------------------
430
+ // Score: 100.0% | 2/2 passed | 4ms
431
+ ```
432
+
433
+ ---
434
+
435
+ ## Snapshot
436
+
437
+ Hash-based snapshot testing that detects when LLM outputs change between runs.
438
+
439
+ ### `PromptSnapshot`
440
+
441
+ ```ts
442
+ interface PromptSnapshot {
443
+   name: string
444
+   request: {
445
+     system?: string
446
+     messages: Array<{ role: string; content: string }>
447
+     model?: string
448
+   }
449
+   outputHash: string
450
+   timestamp: string
451
+ }
452
+ ```
453
+
454
+ ### `SnapshotStore`
455
+
456
+ ```ts
457
+ interface SnapshotStore {
458
+   get(name: string): PromptSnapshot | undefined
459
+   set(name: string, snapshot: PromptSnapshot): void
460
+   getAll(): PromptSnapshot[]
461
+   toJSON(): string
462
+ }
463
+ ```
464
+
465
+ ### `createSnapshotStore()`
466
+
467
+ Creates an in-memory snapshot store, optionally seeded with existing snapshots.
468
+
469
+ ```ts
470
+ function createSnapshotStore(existing?: PromptSnapshot[]): SnapshotStore
471
+ ```
472
+
473
+ | Parameter | Type | Default | Description |
474
+ |---|---|---|---|
475
+ | `existing` | `PromptSnapshot[]` | `undefined` | Previously saved snapshots to preload |
476
+
477
+ **Returns:** `SnapshotStore`
478
+
479
+ ```ts
480
+ import { createSnapshotStore } from '@elsium-ai/testing'
481
+
482
+ const store = createSnapshotStore()
483
+ console.log(store.getAll().length) // 0
484
+ ```
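+
+ Because the store is in-memory, persisting snapshots between test runs is up to you. A minimal sketch (assuming `store.toJSON()` serializes the snapshot array, so parsing it back yields `PromptSnapshot[]` suitable for reseeding):
+
+ ```ts
+ import { existsSync, readFileSync, writeFileSync } from 'node:fs'
+ import { createSnapshotStore } from '@elsium-ai/testing'
+
+ const PATH = './snapshots.json'
+
+ // Reload previously saved snapshots if the file exists
+ const store = existsSync(PATH)
+   ? createSnapshotStore(JSON.parse(readFileSync(PATH, 'utf8')))
+   : createSnapshotStore()
+
+ // ... run testSnapshot() calls against the store ...
+
+ // Persist the updated snapshots for the next run
+ writeFileSync(PATH, store.toJSON())
+ ```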
485
+
486
+ ### `hashOutput()`
487
+
488
+ Produces a SHA-256 hex digest of the given string.
489
+
490
+ ```ts
491
+ function hashOutput(output: string): string
492
+ ```
493
+
494
+ | Parameter | Type | Description |
495
+ |---|---|---|
496
+ | `output` | `string` | The output string to hash |
497
+
498
+ **Returns:** `string` -- SHA-256 hex hash
499
+
500
+ ```ts
501
+ import { hashOutput } from '@elsium-ai/testing'
502
+
503
+ const hash = hashOutput('Hello, world!')
504
+ // '315f5bdb76d078c43b8ac0064e4a0164612b1fce77c869345bfc94c75894edd3'
505
+ ```
506
+
507
+ ### `SnapshotTestResult`
508
+
509
+ ```ts
510
+ interface SnapshotTestResult {
511
+   name: string
512
+   status: 'new' | 'match' | 'changed'
513
+   previousHash?: string
514
+   currentHash: string
515
+   output: string
516
+ }
517
+ ```
518
+
519
+ ### `testSnapshot()`
520
+
521
+ Runs the provided function, hashes its output, and compares the hash against the stored snapshot.
522
+
523
+ ```ts
524
+ function testSnapshot(
525
+   name: string,
526
+   store: SnapshotStore,
527
+   runner: () => Promise<string>,
528
+   request?: Partial<CompletionRequest>,
529
+ ): Promise<SnapshotTestResult>
530
+ ```
531
+
532
+ | Parameter | Type | Description |
533
+ |---|---|---|
534
+ | `name` | `string` | Unique snapshot name |
535
+ | `store` | `SnapshotStore` | The store to read/write snapshots |
536
+ | `runner` | `() => Promise<string>` | Function that produces the output to snapshot |
537
+ | `request` | `Partial<CompletionRequest>` | Optional request metadata stored in the snapshot |
538
+
539
+ **Returns:** `Promise<SnapshotTestResult>` -- status is `'new'` on first run, `'match'` if the hash is unchanged, or `'changed'` if it differs.
540
+
541
+ ```ts
542
+ import { createSnapshotStore, testSnapshot } from '@elsium-ai/testing'
543
+
544
+ const store = createSnapshotStore()
545
+
546
+ const result = await testSnapshot('greeting', store, async () => {
547
+   return 'Hello, world!'
548
+ })
549
+
550
+ console.log(result.status) // 'new' on first run
551
+
552
+ const result2 = await testSnapshot('greeting', store, async () => {
553
+   return 'Hello, world!'
554
+ })
555
+
556
+ console.log(result2.status) // 'match'
557
+ ```
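+
+ In CI you will typically fail the build when a snapshot reports `'changed'`; a minimal sketch using the fields on `SnapshotTestResult`:
+
+ ```ts
+ if (result2.status === 'changed') {
+   throw new Error(
+     `Snapshot 'greeting' changed: ${result2.previousHash} -> ${result2.currentHash}`,
+   )
+ }
+ ```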
558
+
559
+ ---
560
+
561
+ ## Prompts
562
+
563
+ A versioned prompt registry for managing, rendering, and diffing prompt templates.
564
+
565
+ ### `PromptDefinition`
566
+
567
+ ```ts
568
+ interface PromptDefinition {
569
+   name: string
570
+   version: string
571
+   content: string
572
+   variables: string[]
573
+   metadata?: Record<string, unknown>
574
+ }
575
+ ```
576
+
577
+ ### `PromptDiff`
578
+
579
+ ```ts
580
+ interface PromptDiff {
581
+   name: string
582
+   fromVersion: string
583
+   toVersion: string
584
+   changes: DiffLine[]
585
+ }
586
+ ```
587
+
588
+ ### `DiffLine`
589
+
590
+ ```ts
591
+ interface DiffLine {
592
+   type: 'added' | 'removed' | 'unchanged'
593
+   lineNumber: number
594
+   content: string
595
+ }
596
+ ```
597
+
598
+ ### `PromptRegistry`
599
+
600
+ ```ts
601
+ interface PromptRegistry {
602
+   register(name: string, prompt: PromptDefinition): void
603
+   get(name: string, version?: string): PromptDefinition | undefined
604
+   getLatest(name: string): PromptDefinition | undefined
605
+   list(): Array<{ name: string; versions: string[] }>
606
+   diff(name: string, fromVersion: string, toVersion: string): PromptDiff | null
607
+   render(name: string, variables: Record<string, string>, version?: string): string
608
+   getVersions(name: string): string[]
609
+ }
610
+ ```
611
+
612
+ | Method | Description |
613
+ |---|---|
614
+ | `register` | Stores a prompt under its name and version |
615
+ | `get` | Retrieves a specific version, or the latest if `version` is omitted |
616
+ | `getLatest` | Returns the highest semver version for a prompt |
617
+ | `list` | Lists all prompt names with their available versions |
618
+ | `diff` | Computes a line-by-line diff between two versions |
619
+ | `render` | Replaces `{{variable}}` placeholders in the prompt content |
620
+ | `getVersions` | Returns all versions for a prompt sorted by semver |
621
+
622
+ ### `definePrompt()`
623
+
624
+ A convenience function that returns a shallow copy of the given prompt definition.
625
+
626
+ ```ts
627
+ function definePrompt(config: PromptDefinition): PromptDefinition
628
+ ```
629
+
630
+ | Parameter | Type | Description |
631
+ |---|---|---|
632
+ | `config` | `PromptDefinition` | The prompt definition to copy and return |
633
+
634
+ **Returns:** `PromptDefinition`
635
+
636
+ ```ts
637
+ import { definePrompt } from '@elsium-ai/testing'
638
+
639
+ const prompt = definePrompt({
640
+   name: 'classifier',
641
+   version: '1.0.0',
642
+   content: 'Classify the following text as {{label}}: {{text}}',
643
+   variables: ['label', 'text'],
644
+ })
645
+ ```
646
+
647
+ ### `createPromptRegistry()`
648
+
649
+ Creates an empty prompt registry.
650
+
651
+ ```ts
652
+ function createPromptRegistry(): PromptRegistry
653
+ ```
654
+
655
+ **Returns:** `PromptRegistry`
656
+
657
+ ```ts
658
+ import { definePrompt, createPromptRegistry } from '@elsium-ai/testing'
659
+
660
+ const registry = createPromptRegistry()
661
+
662
+ const v1 = definePrompt({
663
+   name: 'summarizer',
664
+   version: '1.0.0',
665
+   content: 'Summarize: {{text}}',
666
+   variables: ['text'],
667
+ })
668
+
669
+ const v2 = definePrompt({
670
+   name: 'summarizer',
671
+   version: '2.0.0',
672
+   content: 'Provide a concise summary of: {{text}}',
673
+   variables: ['text'],
674
+ })
675
+
676
+ registry.register('summarizer', v1)
677
+ registry.register('summarizer', v2)
678
+
679
+ // Render with the latest version
680
+ const output = registry.render('summarizer', { text: 'A long article...' })
681
+ // 'Provide a concise summary of: A long article...'
682
+
683
+ // Diff between versions
684
+ const diff = registry.diff('summarizer', '1.0.0', '2.0.0')
685
+ // diff.changes includes added/removed/unchanged lines
686
+ ```
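+
+ Both `get()` and `render()` also accept an explicit version when you need something other than the latest:
+
+ ```ts
+ // Pin rendering to a specific version
+ const v1Output = registry.render('summarizer', { text: 'A long article...' }, '1.0.0')
+ // 'Summarize: A long article...'
+
+ console.log(registry.getVersions('summarizer')) // ['1.0.0', '2.0.0']
+ ```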
687
+
688
+ ---
689
+
690
+ ## Regression
691
+
692
+ Baseline-driven regression detection that compares current LLM outputs to previously recorded baselines.
693
+
694
+ ### `RegressionBaseline`
695
+
696
+ ```ts
697
+ interface RegressionBaseline {
698
+   name: string
699
+   cases: Array<{
700
+     input: string
701
+     output: string
702
+     score: number
703
+     timestamp: number
704
+   }>
705
+   createdAt: number
706
+   updatedAt: number
707
+ }
708
+ ```
709
+
710
+ ### `RegressionResult`
711
+
712
+ ```ts
713
+ interface RegressionResult {
714
+   name: string
715
+   totalCases: number
716
+   regressions: RegressionDetail[]
717
+   improvements: RegressionDetail[]
718
+   unchanged: number
719
+   overallScore: number
720
+   baselineScore: number
721
+ }
722
+ ```
723
+
724
+ ### `RegressionDetail`
725
+
726
+ ```ts
727
+ interface RegressionDetail {
728
+   input: string
729
+   baselineOutput: string
730
+   currentOutput: string
731
+   baselineScore: number
732
+   currentScore: number
733
+   delta: number
734
+ }
735
+ ```
736
+
737
+ ### `RegressionSuite`
738
+
739
+ ```ts
740
+ interface RegressionSuite {
741
+   load(path: string): Promise<void>
742
+   save(path: string): Promise<void>
743
+   run(
744
+     runner: (input: string) => Promise<string>,
745
+     scorer?: (input: string, output: string) => Promise<number>,
746
+   ): Promise<RegressionResult>
747
+   addCase(input: string, output: string, score: number): void
748
+   readonly baseline: RegressionBaseline | null
749
+ }
750
+ ```
751
+
752
+ | Method | Description |
753
+ |---|---|
754
+ | `load` | Reads a baseline JSON file from disk |
755
+ | `save` | Writes the current baseline to disk (creates directories as needed) |
756
+ | `run` | Runs all baseline cases through `runner`, compares each new score to the baseline, and classifies the case as a regression (delta < -0.1), an improvement (delta > 0.1), or unchanged |
757
+ | `addCase` | Adds or updates a case in the baseline |
758
+ | `baseline` | Read-only access to the current baseline (or `null`) |
759
+
760
+ ### `createRegressionSuite()`
761
+
762
+ Creates a new regression suite with the given name.
763
+
764
+ ```ts
765
+ function createRegressionSuite(name: string): RegressionSuite
766
+ ```
767
+
768
+ | Parameter | Type | Description |
769
+ |---|---|---|
770
+ | `name` | `string` | Name for the regression suite |
771
+
772
+ **Returns:** `RegressionSuite`
773
+
774
+ ```ts
775
+ import { createRegressionSuite } from '@elsium-ai/testing'
776
+
777
+ const suite = createRegressionSuite('qa-bot')
778
+
779
+ // Build baseline
780
+ suite.addCase('What is 2+2?', '4', 1.0)
781
+ suite.addCase('Capital of France?', 'Paris', 1.0)
782
+ await suite.save('./baselines/qa-bot.json')
783
+
784
+ // Later, run against the baseline
785
+ await suite.load('./baselines/qa-bot.json')
786
+ const result = await suite.run(async (input) => {
787
+   // Call your LLM here
788
+   return 'some answer'
789
+ })
790
+
791
+ console.log(result.regressions.length) // number of regressions detected
792
+ console.log(result.overallScore) // aggregate score across all cases
793
+ ```
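+
+ The optional `scorer` replaces the default scoring when comparing runs. A sketch with a hypothetical scorer (any `0..1` scoring scheme consistent with your baseline scores works):
+
+ ```ts
+ const scored = await suite.run(
+   async (input) => {
+     // Call your LLM here
+     return 'Paris'
+   },
+   async (input, output) => {
+     // Illustrative scorer: full credit for a non-empty answer
+     return output.trim().length > 0 ? 1.0 : 0.0
+   },
+ )
+ ```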
794
+
795
+ ---
796
+
797
+ ## Replay
798
+
799
+ Record raw `CompletionRequest` / `LLMResponse` pairs and replay them deterministically in tests.
800
+
801
+ ### `ReplayEntry`
802
+
803
+ ```ts
804
+ interface ReplayEntry {
805
+   request: CompletionRequest
806
+   response: LLMResponse
807
+   timestamp: number
808
+ }
809
+ ```
810
+
811
+ ### `ReplayRecorder`
812
+
813
+ ```ts
814
+ interface ReplayRecorder {
815
+   wrap(
816
+     completeFn: (req: CompletionRequest) => Promise<LLMResponse>,
817
+   ): (req: CompletionRequest) => Promise<LLMResponse>
818
+   getEntries(): ReplayEntry[]
819
+   toJSON(): string
820
+   clear(): void
821
+ }
822
+ ```
823
+
824
+ ### `createReplayRecorder()`
825
+
826
+ Creates a recorder that wraps a completion function and captures every request/response pair.
827
+
828
+ ```ts
829
+ function createReplayRecorder(): ReplayRecorder
830
+ ```
831
+
832
+ **Returns:** `ReplayRecorder`
833
+
834
+ ```ts
835
+ import { createReplayRecorder } from '@elsium-ai/testing'
836
+
837
+ const recorder = createReplayRecorder()
838
+ const wrappedComplete = recorder.wrap(provider.complete.bind(provider))
839
+
840
+ // Use wrappedComplete in place of provider.complete — all calls are recorded
841
+ const response = await wrappedComplete({
842
+   messages: [{ role: 'user', content: 'Hello' }],
843
+ })
844
+
845
+ // Save for later replay
846
+ const json = recorder.toJSON()
847
+ ```
848
+
849
+ ### `ReplayPlayer`
850
+
851
+ ```ts
852
+ interface ReplayPlayer {
853
+   complete(request: CompletionRequest): Promise<LLMResponse>
854
+   readonly remaining: number
855
+ }
856
+ ```
857
+
858
+ ### `createReplayPlayer()`
859
+
860
+ Creates a player that replays recorded responses sequentially, regardless of the incoming request.
861
+
862
+ ```ts
863
+ function createReplayPlayer(entriesOrJson: ReplayEntry[] | string): ReplayPlayer
864
+ ```
865
+
866
+ | Parameter | Type | Description |
867
+ |---|---|---|
868
+ | `entriesOrJson` | `ReplayEntry[] \| string` | An array of replay entries, or a JSON string produced by `recorder.toJSON()` |
869
+
870
+ **Returns:** `ReplayPlayer`
871
+
872
+ Throws an error with the message `'Replay exhausted: no more recorded responses'` if `complete()` is called after all entries have been consumed.
873
+
874
+ ```ts
875
+ import { createReplayRecorder, createReplayPlayer } from '@elsium-ai/testing'
876
+
877
+ // Record
878
+ const recorder = createReplayRecorder()
879
+ const wrapped = recorder.wrap(provider.complete.bind(provider))
880
+ await wrapped({ messages: [{ role: 'user', content: 'Hi' }] })
881
+
882
+ // Replay
883
+ const player = createReplayPlayer(recorder.getEntries())
884
+ console.log(player.remaining) // 1
885
+
886
+ const replayed = await player.complete({
887
+   messages: [{ role: 'user', content: 'Hi' }],
888
+ })
889
+ console.log(player.remaining) // 0
890
+ ```
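+
+ Because `createReplayPlayer()` also accepts the JSON string from `recorder.toJSON()`, a common pattern is to record once against a real provider, commit the JSON, and replay it in CI. A sketch (the file path is illustrative):
+
+ ```ts
+ import { readFileSync } from 'node:fs'
+ import { createReplayPlayer } from '@elsium-ai/testing'
+
+ const player = createReplayPlayer(readFileSync('./recordings/chat.json', 'utf8'))
+
+ const response = await player.complete({
+   messages: [{ role: 'user', content: 'Hi' }],
+ })
+ ```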
891
+
892
+ ---
893
+
894
+ ## Pinning
895
+
896
+ Pin LLM outputs to specific prompt + config combinations and detect when outputs drift.
897
+
898
+ ### `Pin`
899
+
900
+ ```ts
901
+ interface Pin {
902
+   promptHash: string
903
+   configHash: string
904
+   outputHash: string
905
+   outputText: string
906
+   model?: string
907
+   createdAt: number
908
+ }
909
+ ```
910
+
911
+ ### `PinStore`
912
+
913
+ ```ts
914
+ interface PinStore {
915
+   get(key: string): Pin | undefined
916
+   set(key: string, pin: Pin): void
917
+   delete(key: string): boolean
918
+   getAll(): Pin[]
919
+   toJSON(): string
920
+ }
921
+ ```
922
+
923
+ ### `PinResult`
924
+
925
+ ```ts
926
+ interface PinResult {
927
+   status: 'new' | 'match' | 'mismatch'
928
+   pin: Pin
929
+   previousPin?: Pin
930
+ }
931
+ ```
932
+
933
+ ### `createPinStore()`
934
+
935
+ Creates an in-memory pin store, optionally preloaded with existing pins.
936
+
937
+ ```ts
938
+ function createPinStore(existing?: Pin[]): PinStore
939
+ ```
940
+
941
+ | Parameter | Type | Default | Description |
942
+ |---|---|---|---|
943
+ | `existing` | `Pin[]` | `undefined` | Previously saved pins to preload |
944
+
945
+ **Returns:** `PinStore`
946
+
947
+ ```ts
948
+ import { createPinStore } from '@elsium-ai/testing'
949
+
950
+ const store = createPinStore()
951
+ console.log(store.getAll().length) // 0
952
+ ```
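+
+ As with snapshot stores, pins live in memory, so persisting them between runs is up to you. A short sketch (assuming `store.toJSON()` serializes the pin array so it parses back into `Pin[]`):
+
+ ```ts
+ import { readFileSync, writeFileSync } from 'node:fs'
+ import { createPinStore } from '@elsium-ai/testing'
+
+ // Preload pins committed to the repo (illustrative path)
+ const store = createPinStore(JSON.parse(readFileSync('./pins.json', 'utf8')))
+
+ // ... run pinOutput() calls against the store ...
+
+ writeFileSync('./pins.json', store.toJSON())
+ ```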
953
+
954
+ ### `pinOutput()`
955
+
956
+ Runs a function, hashes its output along with the prompt and config, and compares against any previously stored pin.
957
+
958
+ ```ts
959
+ function pinOutput(
960
+   name: string,
961
+   store: PinStore,
962
+   runner: () => Promise<string>,
963
+   config: {
964
+     prompt: string
965
+     model?: string
966
+     temperature?: number
967
+     seed?: number
968
+   },
969
+   options?: { assert?: boolean },
970
+ ): Promise<PinResult>
971
+ ```
972
+
973
+ | Parameter | Type | Description |
974
+ |---|---|---|
975
+ | `name` | `string` | Human-readable name for the pin (used in error messages) |
976
+ | `store` | `PinStore` | The store to read/write pins |
977
+ | `runner` | `() => Promise<string>` | Function that produces the output to pin |
978
+ | `config` | `object` | Prompt text and model config used to generate the hash key |
979
+ | `options.assert` | `boolean` | When `true`, throws an `ElsiumError` on mismatch instead of returning a `'mismatch'` result |
980
+
981
+ **Returns:** `Promise<PinResult>` -- status is `'new'` on first run, `'match'` if output is identical, or `'mismatch'` if the output has changed.
982
+
983
+ ```ts
984
+ import { createPinStore, pinOutput } from '@elsium-ai/testing'
985
+
986
+ const store = createPinStore()
987
+
988
+ const result = await pinOutput(
989
+   'greeting-pin',
990
+   store,
991
+   async () => 'Hello, world!',
992
+   { prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
993
+ )
994
+
995
+ console.log(result.status) // 'new'
996
+
997
+ // Run again with the same output
998
+ const result2 = await pinOutput(
999
+   'greeting-pin',
1000
+   store,
1001
+   async () => 'Hello, world!',
1002
+   { prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
1003
+ )
1004
+
1005
+ console.log(result2.status) // 'match'
1006
+
1007
+ // Run with assert mode in CI
1008
+ await pinOutput(
1009
+   'greeting-pin',
1010
+   store,
1011
+   async () => 'Different output!',
1012
+   { prompt: 'Say hello', model: 'gpt-4', temperature: 0 },
1013
+   { assert: true }, // throws ElsiumError on mismatch
1014
+ )
1015
+ ```
1016
+
1017
+ ---
1018
+
1019
+ ## Determinism
1020
+
1021
+ Verify that an LLM function produces consistent outputs across multiple invocations.
1022
+
1023
+ ### `DeterminismResult`
1024
+
1025
+ ```ts
1026
+ interface DeterminismResult {
1027
+   deterministic: boolean
1028
+   runs: number
1029
+   uniqueOutputs: number
1030
+   outputs: string[]
1031
+   variance: number
1032
+ }
1033
+ ```
1034
+
1035
+ ### `StabilityResult`
1036
+
1037
+ ```ts
1038
+ interface StabilityResult {
1039
+   stable: boolean
1040
+   runs: number
1041
+   uniqueOutputs: number
1042
+   outputs: Array<{ output: string; timestamp: number }>
1043
+   variance: number
1044
+ }
1045
+ ```
1046
+
1047
+ ### `assertDeterministic()`
1048
+
1049
+ Runs a function multiple times and verifies that all outputs are identical (or within the specified tolerance).
1050
+
1051
+ ```ts
1052
+ function assertDeterministic(
1053
+   fn: (seed?: number) => Promise<string>,
1054
+   options?: {
1055
+     runs?: number
1056
+     seed?: number
1057
+     tolerance?: number
1058
+   },
1059
+ ): Promise<DeterminismResult>
1060
+ ```
1061
+
1062
+ | Parameter | Type | Default | Description |
1063
+ |---|---|---|---|
1064
+ | `fn` | `(seed?: number) => Promise<string>` | -- | The function to test for determinism |
1065
+ | `options.runs` | `number` | `5` | Number of times to invoke `fn` |
1066
+ | `options.seed` | `number` | `undefined` | Seed passed to `fn` on each invocation |
1067
+ | `options.tolerance` | `number` | `0` | Maximum allowed variance (0 = strictly deterministic) |
1068
+
1069
+ **Returns:** `Promise<DeterminismResult>`
1070
+
1071
+ Throws an `ElsiumError` when `tolerance` is `0` (the default) and outputs are not identical.
1072
+
1073
+ ```ts
1074
+ import { assertDeterministic } from '@elsium-ai/testing'
28
1075
 
29
- // Determinism check
30
1076
  const result = await assertDeterministic(
31
-   (seed) => llm.complete({
32
-     messages: [{ role: 'user', content: 'Classify: spam' }],
33
-     temperature: 0,
34
-     seed,
35
-   }).then(r => r.message.content),
1077
+   async (seed) => {
1078
+     // Call your LLM with temperature: 0 and the provided seed
1079
+     return 'consistent output'
1080
+   },
36
1081
  { runs: 5, seed: 42, tolerance: 0 },
37
1082
  )
38
- // { deterministic: true, variance: 0, uniqueOutputs: 1 }
39
1083
 
40
- // Mock provider for tests
41
- const mock = createMockProvider({
42
- responses: [{ content: 'Mocked response' }],
43
- })
1084
+ console.log(result.deterministic) // true
1085
+ console.log(result.uniqueOutputs) // 1
1086
+ console.log(result.variance) // 0
1087
+ ```
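+
+ With a non-zero `tolerance` the assertion does not throw; some variation is allowed and you inspect the result instead. A sketch with a deliberately flaky function (how `variance` is computed is not specified above, so the comments only hedge):
+
+ ```ts
+ import { assertDeterministic } from '@elsium-ai/testing'
+
+ const flaky = await assertDeterministic(
+   async () => (Math.random() < 0.5 ? 'A' : 'B'), // intentionally nondeterministic
+   { runs: 10, tolerance: 0.5 },
+ )
+
+ console.log(flaky.uniqueOutputs) // most likely 2
+ console.log(flaky.deterministic) // true only if the measured variance is within 0.5
+ ```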
1088
+
1089
+ ### `assertStable()`
1090
+
1091
+ Runs a function multiple times with a delay between invocations to verify temporal stability.
1092
+
1093
+ ```ts
1094
+ function assertStable(
1095
+ fn: (seed?: number) => Promise<string>,
1096
+ options?: {
1097
+ intervalMs?: number
1098
+ runs?: number
1099
+ seed?: number
1100
+ },
1101
+ ): Promise<StabilityResult>
1102
+ ```
1103
+
1104
+ | Parameter | Type | Default | Description |
1105
+ |---|---|---|---|
1106
+ | `fn` | `(seed?: number) => Promise<string>` | -- | The function to test for stability |
1107
+ | `options.intervalMs` | `number` | `100` | Delay in milliseconds between runs |
1108
+ | `options.runs` | `number` | `3` | Number of times to invoke `fn` |
1109
+ | `options.seed` | `number` | `undefined` | Seed passed to `fn` on each invocation |
1110
+
1111
+ **Returns:** `Promise<StabilityResult>`
1112
+
1113
+ ```ts
1114
+ import { assertStable } from '@elsium-ai/testing'
1115
+
1116
+ const result = await assertStable(
1117
+   async (seed) => {
1118
+     return 'same output every time'
1119
+   },
1120
+   { intervalMs: 200, runs: 3, seed: 42 },
1121
+ )
1122
+
1123
+ console.log(result.stable) // true
1124
+ console.log(result.uniqueOutputs) // 1
1125
+ console.log(result.outputs) // [{ output: '...', timestamp: ... }, ...]
44
1126
  ```
45
1127
 
1128
+ ---
1129
+
46
1130
  ## Part of ElsiumAI
47
1131
 
48
1132
  This package is the testing layer of the [ElsiumAI](https://github.com/elsium-ai/elsium-ai) framework. See the [full documentation](https://github.com/elsium-ai/elsium-ai) for guides and examples.
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "@elsium-ai/testing",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "Testing utilities, mock providers, fixtures, and eval framework for ElsiumAI",
5
5
  "license": "MIT",
6
6
  "author": "Eric Utrera <ebutrera9103@gmail.com>",
7
7
  "repository": {
8
8
  "type": "git",
9
- "url": "https://github.com/elsium-ai/elsium-ai",
9
+ "url": "git+https://github.com/elsium-ai/elsium-ai.git",
10
10
  "directory": "packages/testing"
11
11
  },
12
12
  "type": "module",
@@ -26,10 +26,10 @@
26
26
  "dev": "bun --watch src/index.ts"
27
27
  },
28
28
  "dependencies": {
29
- "@elsium-ai/core": "^0.2.1",
30
- "@elsium-ai/gateway": "^0.2.1",
31
- "@elsium-ai/agents": "^0.2.1",
32
- "@elsium-ai/tools": "^0.2.1"
29
+ "@elsium-ai/core": "^0.2.3",
30
+ "@elsium-ai/gateway": "^0.2.3",
31
+ "@elsium-ai/agents": "^0.2.3",
32
+ "@elsium-ai/tools": "^0.2.3"
33
33
  },
34
34
  "devDependencies": {
35
35
  "typescript": "^5.7.0"