@elsium-ai/testing 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1263 @@
1
+ // @bun
2
+ // ../core/src/errors.ts
3
/**
 * Error subclass that carries structured, serialisable failure details.
 *
 * Besides the standard `message`, an instance copies `code`, `provider`,
 * `model`, `statusCode`, `retryable`, `retryAfterMs`, `cause` and `metadata`
 * from the `details` object handed to the constructor. The static factories
 * build the common error shapes with fixed codes.
 */
class ElsiumError extends Error {
  code;
  provider;
  model;
  statusCode;
  retryable;
  retryAfterMs;
  cause;
  metadata;

  /**
   * @param {object} details - Must contain `message`; every other field is
   *   copied onto the instance as-is (missing fields become `undefined`).
   */
  constructor(details) {
    super(details.message);
    this.name = "ElsiumError";
    const { code, provider, model, statusCode, retryable, retryAfterMs, cause, metadata } = details;
    Object.assign(this, {
      code,
      provider,
      model,
      statusCode,
      retryable,
      retryAfterMs,
      cause,
      metadata
    });
  }

  /**
   * Plain-object view of the error. `cause` is intentionally not part of it.
   * @returns {object}
   */
  toJSON() {
    const { name, code, message, provider, model, statusCode, retryable, retryAfterMs, metadata } = this;
    return { name, code, message, provider, model, statusCode, retryable, retryAfterMs, metadata };
  }

  /** Builds a `PROVIDER_ERROR`; `retryable` defaults to `false` when not given. */
  static providerError(message, opts) {
    const details = {
      code: "PROVIDER_ERROR",
      message,
      provider: opts.provider,
      statusCode: opts.statusCode,
      retryable: opts.retryable ?? false,
      cause: opts.cause
    };
    return new ElsiumError(details);
  }

  /** Builds a retryable `RATE_LIMIT` error with status code 429. */
  static rateLimit(provider, retryAfterMs) {
    const message = `Rate limited by ${provider}`;
    return new ElsiumError({
      code: "RATE_LIMIT",
      message,
      provider,
      statusCode: 429,
      retryable: true,
      retryAfterMs
    });
  }

  /** Builds a non-retryable `AUTH_ERROR` with status code 401. */
  static authError(provider) {
    const message = `Authentication failed for ${provider}. Check your API key.`;
    return new ElsiumError({
      code: "AUTH_ERROR",
      message,
      provider,
      statusCode: 401,
      retryable: false
    });
  }

  /** Builds a retryable `TIMEOUT` error mentioning the elapsed budget. */
  static timeout(provider, timeoutMs) {
    const message = `Request to ${provider} timed out after ${timeoutMs}ms`;
    return new ElsiumError({
      code: "TIMEOUT",
      message,
      provider,
      retryable: true
    });
  }

  /** Builds a non-retryable `VALIDATION_ERROR` with optional metadata. */
  static validation(message, metadata) {
    return new ElsiumError({
      code: "VALIDATION_ERROR",
      message,
      retryable: false,
      metadata
    });
  }

  /** Builds a non-retryable `BUDGET_EXCEEDED` error; both numbers go into metadata. */
  static budgetExceeded(spent, budget) {
    const message = `Token budget exceeded: spent ${spent}, budget ${budget}`;
    return new ElsiumError({
      code: "BUDGET_EXCEEDED",
      message,
      retryable: false,
      metadata: { spent, budget }
    });
  }
}
91
+ // ../core/src/utils.ts
92
+ import { randomBytes } from "crypto";
93
/**
 * Returns `bytes` random bytes from `crypto.randomBytes`, hex-encoded
 * (two characters per byte).
 */
function cryptoHex(bytes) {
  const raw = randomBytes(bytes);
  return raw.toString("hex");
}
96
/**
 * Builds an identifier of the form `<prefix>_<base36 timestamp>_<8 hex chars>`.
 * @param {string} [prefix="els"]
 * @returns {string}
 */
function generateId(prefix = "els") {
  const stamp = Date.now().toString(36);
  // 4 random bytes -> 8 hex characters.
  const suffix = randomBytes(4).toString("hex");
  return [`${prefix}`, stamp, suffix].join("_");
}
101
/**
 * Builds a trace identifier of the form `trc_<base36 timestamp>_<12 hex chars>`.
 * @returns {string}
 */
function generateTraceId() {
  const stamp = Date.now().toString(36);
  // 6 random bytes -> 12 hex characters.
  const suffix = randomBytes(6).toString("hex");
  return ["trc", stamp, suffix].join("_");
}
106
/**
 * Resolves (with `undefined`) once a `setTimeout` of `ms` milliseconds fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
109
+
110
+ // ../core/src/stream.ts
111
/**
 * Decides whether a checkpoint is due: some text must have been accumulated
 * and at least `intervalMs` must have passed since `lastCheckpointTime`.
 * @returns {boolean}
 */
function shouldEmitCheckpoint(lastCheckpointTime, intervalMs, textLength) {
  if (!(textLength > 0)) {
    return false;
  }
  return Date.now() - lastCheckpointTime >= intervalMs;
}
115
/**
 * Builds a checkpoint record for the text streamed so far.
 * `tokensSoFar` is a rough estimate: text length divided by 1.5, rounded up.
 */
function createCheckpoint(textAccumulator, eventIndex, now) {
  const id = generateId("ckpt");
  const tokensSoFar = Math.ceil(textAccumulator.length / 1.5);
  return { id, timestamp: now, text: textAccumulator, tokensSoFar, eventIndex };
}
124
/**
 * Returns `err` unchanged when it already is an `Error`; otherwise wraps its
 * string form in a new `Error`.
 */
function toError(err) {
  if (err instanceof Error) {
    return err;
  }
  return new Error(String(err));
}
127
/**
 * Yields exactly one event describing a failure.
 *
 * With accumulated text the optional `onPartialRecovery(text, error)` callback
 * is invoked and a `recovery` event is yielded; without text an `error` event
 * is yielded.
 */
function* emitErrorEvent(err, textAccumulator, onPartialRecovery) {
  const error = toError(err);
  const hasPartialText = textAccumulator.length > 0;
  if (!hasPartialText) {
    yield { type: "error", error };
    return;
  }
  if (onPartialRecovery !== undefined && onPartialRecovery !== null) {
    onPartialRecovery(textAccumulator, error);
  }
  yield { type: "recovery", partialText: textAccumulator, error };
}
136
+
137
/**
 * Thin wrapper around an async-iterable source of stream events.
 *
 * Events are plain objects with a `type` field; the helpers below react to
 * `text_delta` (reads `event.text`) and `message_end` (reads `event.usage`
 * and `event.stopReason`) and pass every other event through untouched.
 */
class ElsiumStream {
  source;
  iterating = false;

  /** @param {AsyncIterable<object>} source - Underlying event source. */
  constructor(source) {
    this.source = source;
  }

  /**
   * Iterates the raw events. Only one iteration through this method is
   * allowed per instance; a second one throws.
   */
  async *[Symbol.asyncIterator]() {
    if (this.iterating) {
      throw new Error("ElsiumStream supports only a single consumer");
    }
    this.iterating = true;
    yield* this.source;
  }

  /**
   * Returns an async iterable yielding only the `text` of `text_delta` events.
   * Reads `this.source` directly (it does not go through the single-consumer guard).
   */
  text() {
    const source = this.source;
    return {
      async *[Symbol.asyncIterator]() {
        for await (const event of source) {
          if (event.type === "text_delta") {
            yield event.text;
          }
        }
      }
    };
  }

  /** Drains the stream and returns all text deltas concatenated. */
  async toText() {
    const parts = [];
    for await (const text of this.text()) {
      parts.push(text);
    }
    return parts.join("");
  }

  /**
   * Like `toText`, but gives up after `timeoutMs` and returns whatever text
   * has been collected so far.
   *
   * An error from the source is rethrown only when no text was collected;
   * otherwise the partial text is returned.
   *
   * @param {number} timeoutMs - Overall time budget in milliseconds.
   * @returns {Promise<string>}
   */
  async toTextWithTimeout(timeoutMs) {
    const TIMED_OUT = Symbol("timeout");
    const parts = [];
    const deadline = Date.now() + timeoutMs;
    const iterator = this.source[Symbol.asyncIterator]();
    // True when we stopped while a next() call was still outstanding.
    let nextStillPending = false;
    try {
      while (true) {
        const remaining = deadline - Date.now();
        if (remaining <= 0) {
          break;
        }
        let timer;
        const timeoutPromise = new Promise((resolve) => {
          timer = setTimeout(() => resolve(TIMED_OUT), remaining);
        });
        const pendingNext = Promise.resolve(iterator.next());
        // If the timer wins, a later rejection of this promise must not
        // surface as an unhandled rejection.
        pendingNext.catch(() => {});
        let result;
        try {
          result = await Promise.race([pendingNext, timeoutPromise]);
        } finally {
          // Also runs when next() rejects, so the timer never lingers.
          clearTimeout(timer);
        }
        if (result === TIMED_OUT) {
          nextStillPending = true;
          break;
        }
        if (result.done) {
          break;
        }
        const event = result.value;
        if (event.type === "text_delta") {
          parts.push(event.text);
        }
      }
    } catch (err) {
      if (parts.length === 0) {
        throw err;
      }
    } finally {
      if (nextStillPending) {
        // For async generators return() is queued behind the outstanding
        // next(); awaiting it would block for as long as the source stalls
        // and defeat the timeout. Request cleanup without waiting for it.
        try {
          Promise.resolve(iterator.return?.()).catch(() => {});
        } catch {
          // Best-effort cleanup only.
        }
      } else {
        await iterator.return?.();
      }
    }
    return parts.join("");
  }

  /**
   * Drains the stream and returns `{ text, usage, stopReason }`.
   * `usage` and `stopReason` stay `null` when no `message_end` event arrives.
   */
  async toResponse() {
    const parts = [];
    let usage = null;
    let stopReason = null;
    for await (const event of this.source) {
      switch (event.type) {
        case "text_delta":
          parts.push(event.text);
          break;
        case "message_end":
          usage = event.usage;
          stopReason = event.stopReason;
          break;
      }
    }
    return { text: parts.join(""), usage, stopReason };
  }

  /**
   * Wraps `transform(this.source)` in a new stream.
   * @param {(source: AsyncIterable<object>) => AsyncIterable<object>} transform
   */
  pipe(transform) {
    return new ElsiumStream(transform(this.source));
  }

  /**
   * Returns a stream that forwards every event, periodically adds
   * `checkpoint` events, and turns a source failure into a final `recovery`
   * or `error` event instead of throwing.
   *
   * @param {object} [options]
   * @param {number} [options.checkpointIntervalMs=1000] - Minimum time between checkpoints.
   * @param {(checkpoint: object) => void} [options.onCheckpoint] - Called before each checkpoint event is yielded.
   * @param {(text: string, error: Error) => void} [options.onPartialRecovery] - Called when the source fails after some text arrived.
   */
  resilient(options = {}) {
    const { checkpointIntervalMs = 1000, onCheckpoint, onPartialRecovery } = options;
    const source = this.source;
    const resilientSource = {
      async *[Symbol.asyncIterator]() {
        let lastCheckpointTime = Date.now();
        let textAccumulator = "";
        let eventIndex = 0;
        try {
          for await (const event of source) {
            eventIndex++;
            if (event.type === "text_delta") {
              textAccumulator += event.text;
            }
            yield event;
            if (shouldEmitCheckpoint(lastCheckpointTime, checkpointIntervalMs, textAccumulator.length)) {
              const now = Date.now();
              const checkpoint = createCheckpoint(textAccumulator, eventIndex, now);
              onCheckpoint?.(checkpoint);
              yield { type: "checkpoint", checkpoint };
              lastCheckpointTime = now;
            }
          }
        } catch (err) {
          yield* emitErrorEvent(err, textAccumulator, onPartialRecovery);
        }
      }
    };
    return new ElsiumStream(resilientSource);
  }
}
251
// Maximum number of events buffered while no consumer is waiting.
var MAX_BUFFER_SIZE = 1e4;

/**
 * Creates an `ElsiumStream` fed by a push-style producer.
 *
 * `executor(emit)` is invoked immediately; every `emit(event)` call either
 * hands the event to a consumer waiting in `next()` or buffers it. Once the
 * buffer holds `MAX_BUFFER_SIZE` events further events are dropped and
 * counted; when the executor finishes, a single `error` event reporting the
 * number of dropped events is appended.
 *
 * If the executor rejects (or throws synchronously) the failure is delivered
 * to a waiting consumer as an `error` event, and later `next()` calls reject
 * with the error once the buffer is drained.
 *
 * @param {(emit: (event: object) => void) => Promise<void>} executor
 * @returns {ElsiumStream}
 */
function createStream(executor) {
  let resolve = null;
  const buffer = [];
  let done = false;
  let error = null;
  let dropped = 0;

  const source = {
    [Symbol.asyncIterator]() {
      return {
        next() {
          if (buffer.length > 0) {
            const value = buffer.shift();
            return Promise.resolve({ value, done: false });
          }
          if (done) {
            return Promise.resolve({ value: undefined, done: true });
          }
          if (error) {
            return Promise.reject(error);
          }
          return new Promise((r) => {
            resolve = r;
          });
        }
      };
    }
  };

  // Hands an event to the waiting consumer or buffers it. `force` bypasses
  // the buffer cap; it is used only for the overflow notification itself.
  const deliver = (event, force) => {
    if (resolve) {
      const r = resolve;
      resolve = null;
      r({ value: event, done: false });
      return;
    }
    if (force || buffer.length < MAX_BUFFER_SIZE) {
      buffer.push(event);
    } else {
      dropped++;
    }
  };
  const emit = (event) => deliver(event, false);

  // Run the executor synchronously, but route a synchronous throw through
  // the same rejection path as an asynchronous failure.
  new Promise((settle) => settle(executor(emit))).then(() => {
    if (dropped > 0) {
      // Must bypass the cap: the buffer is typically still full at this
      // point, which would otherwise drop the notification as well.
      deliver({
        type: "error",
        error: new Error(`Stream buffer overflow: ${dropped} events dropped`)
      }, true);
    }
    done = true;
    if (resolve) {
      const r = resolve;
      resolve = null;
      r({ value: undefined, done: true });
    }
  }).catch((e) => {
    error = e instanceof Error ? e : new Error(String(e));
    if (resolve) {
      const r = resolve;
      resolve = null;
      r({ value: { type: "error", error }, done: false });
    }
  });

  return new ElsiumStream(source);
}
314
+ // src/mock-provider.ts
315
/**
 * Creates an in-memory provider double that replays canned responses.
 *
 * Responses are consumed in order from `options.responses`; once exhausted,
 * `options.defaultResponse` is used, and failing that an empty-content
 * response. Every request passed to `complete` or `stream` is recorded in
 * `calls` and forwarded to `options.onRequest`.
 *
 * @param {object} [options]
 * @param {object[]} [options.responses=[]] - Response configs (`content`, `model`, `usage`, `toolCalls`, `stopReason`, `delay`).
 * @param {object} [options.defaultResponse] - Used after `responses` runs out.
 * @param {(request: object) => void} [options.onRequest] - Observer invoked per request.
 * @returns {object} Provider with `complete`, `stream`, `listModels`, `reset`, `calls`, `callCount`.
 */
function mockProvider(options = {}) {
  const { responses = [], defaultResponse, onRequest } = options;
  const calls = [];
  let callIndex = 0;

  // Picks the config for the current call and advances the cursor.
  function getNextResponse() {
    const index = callIndex++;
    if (index < responses.length) {
      return responses[index];
    }
    return defaultResponse ? defaultResponse : { content: "" };
  }

  // Emits message_start, one text_delta per space-separated word, then message_end.
  async function emitStreamEvents(emit, config, request) {
    if (config.delay) {
      await new Promise((r) => setTimeout(r, config.delay));
    }
    emit({
      type: "message_start",
      id: generateId("msg"),
      // Same model fallback chain as complete().
      model: config.model ?? request?.model ?? "mock-model"
    });
    const content = config.content ?? "";
    if (content) {
      content.split(" ").forEach((word, index) => {
        // The separator goes in front of every word but the first so the
        // concatenated deltas reproduce `content` exactly (no trailing space).
        emit({ type: "text_delta", text: index === 0 ? word : ` ${word}` });
      });
    }
    emit({
      type: "message_end",
      usage: {
        inputTokens: config.usage?.inputTokens ?? 10,
        outputTokens: config.usage?.outputTokens ?? 5,
        totalTokens: config.usage?.totalTokens ?? 15
      },
      stopReason: config.stopReason ?? "end_turn"
    });
  }

  // Builds the full (non-streaming) response object for a config.
  function buildResponse(config, request) {
    const model = config.model ?? request.model ?? "mock-model";
    const content = config.content ?? "";
    const toolCalls = config.toolCalls?.map((tc) => ({
      id: tc.id ?? generateId("tc"),
      name: tc.name,
      arguments: tc.arguments
    }));
    // Token counts default to a length/4 estimate of the content.
    const usage = {
      inputTokens: config.usage?.inputTokens ?? Math.ceil(content.length / 4),
      outputTokens: config.usage?.outputTokens ?? Math.ceil(content.length / 4),
      totalTokens: 0,
      ...config.usage
    };
    usage.totalTokens = usage.inputTokens + usage.outputTokens;
    const message = {
      role: "assistant",
      content,
      ...toolCalls?.length ? { toolCalls } : {}
    };
    return {
      id: generateId("msg"),
      message,
      usage,
      cost: { inputCost: 0, outputCost: 0, totalCost: 0, currency: "USD" },
      model,
      provider: "mock",
      stopReason: config.stopReason ?? (toolCalls?.length ? "tool_use" : "end_turn"),
      latencyMs: config.delay ?? 0,
      traceId: generateTraceId()
    };
  }

  return {
    name: "mock",
    defaultModel: "mock-model",
    /** Live array of every request received so far. */
    get calls() {
      return calls;
    },
    get callCount() {
      return calls.length;
    },
    /** Records the request and resolves with the next canned response (after `delay`, if set). */
    async complete(request) {
      calls.push(request);
      onRequest?.(request);
      const config = getNextResponse();
      if (config.delay) {
        await new Promise((r) => setTimeout(r, config.delay));
      }
      return buildResponse(config, request);
    },
    /** Records the request and returns a stream replaying the next canned response. */
    stream(request) {
      calls.push(request);
      onRequest?.(request);
      const config = getNextResponse();
      return createStream((emit) => emitStreamEvents(emit, config, request));
    },
    async listModels() {
      return ["mock-model"];
    },
    /** Forgets recorded calls and rewinds the response cursor. */
    reset() {
      calls.length = 0;
      callIndex = 0;
    }
  };
}
421
+ // src/fixtures.ts
422
+ import { createHash } from "crypto";
423
/**
 * Fingerprints a message list: each message is rendered as `role:content`,
 * the pieces are joined with `|`, and the first 16 hex characters of the
 * SHA-256 digest are returned.
 */
function hashMessages(messages) {
  const describe = (m) => `${m.role}:${m.content}`;
  const serialized = messages.map(describe).join("|");
  const hasher = createHash("sha256");
  hasher.update(serialized);
  return hasher.digest("hex").slice(0, 16);
}
427
/**
 * Bundles recorded request/response pairs into a replayable fixture.
 *
 * @param {string} name - Fixture name.
 * @param {object[]} entries - Items shaped `{ request: { messages, ... }, response, timestamp? }`.
 * @returns {object} `{ name, entries, toProvider(options?), toJSON() }`.
 *   - `toProvider()` returns a mock provider replaying the responses in order.
 *   - `toProvider({ matching: "request-hash" })` answers `complete` requests
 *     whose messages hash equals a recorded entry with that entry's response
 *     and falls back to in-order replay otherwise.
 *   - `toJSON()` returns a pretty-printed JSON string (entries without a
 *     timestamp get the current time).
 */
function createFixture(name, entries) {
  // Non-string content is collapsed to "[complex]" so both sides of the
  // hash comparison are rendered the same way.
  const normalizeMessages = (messages) => messages.map((m) => ({
    role: m.role,
    content: typeof m.content === "string" ? m.content : "[complex]"
  }));
  const recordedResponses = () => entries.map((e) => e.response);

  return {
    name,
    entries,
    toProvider(options) {
      const provider = mockProvider({ responses: recordedResponses() });
      if (options?.matching !== "request-hash") {
        return provider;
      }
      const hashMap = new Map();
      for (const entry of entries) {
        hashMap.set(hashMessages(normalizeMessages(entry.request.messages)), entry.response);
      }
      const originalComplete = provider.complete.bind(provider);
      const wrapped = Object.create(provider);
      wrapped.complete = async (request) => {
        const matched = hashMap.get(hashMessages(normalizeMessages(request.messages)));
        if (matched) {
          // The matched response is produced by a throwaway provider, so
          // record the request here to keep calls/callCount accurate.
          provider.calls.push(request);
          return mockProvider({ responses: [matched] }).complete(request);
        }
        return originalComplete(request);
      };
      return wrapped;
    },
    toJSON() {
      return JSON.stringify({
        name,
        entries: entries.map((e) => ({
          ...e,
          timestamp: e.timestamp ?? new Date().toISOString()
        }))
      }, null, 2);
    }
  };
}
472
/**
 * Parses a JSON string and rebuilds a fixture from its `name` and `entries`
 * fields.
 */
function loadFixture(json) {
  const { name, entries } = JSON.parse(json);
  return createFixture(name, entries);
}
476
/**
 * Creates a recorder that captures `complete` round-trips of a provider.
 *
 * `wrap(provider)` returns an object inheriting from `provider` whose
 * `complete` forwards to the original and stores a simplified copy of the
 * request and response. Only `complete` is intercepted.
 *
 * @returns {object} `{ wrap, getEntries, toFixture, clear }`.
 */
function createRecorder() {
  const entries = [];

  // Keeps string content; anything else is stored as "[complex]".
  const simplifyMessage = (m) => ({
    role: m.role,
    content: typeof m.content === "string" ? m.content : "[complex]"
  });

  const toEntry = (request, response) => ({
    request: {
      messages: request.messages.map(simplifyMessage),
      model: request.model,
      system: request.system
    },
    response: {
      content: typeof response.message.content === "string" ? response.message.content : "",
      toolCalls: response.message.toolCalls,
      stopReason: response.stopReason,
      usage: response.usage,
      model: response.model
    },
    timestamp: new Date().toISOString()
  });

  return {
    wrap(provider) {
      const originalComplete = provider.complete.bind(provider);
      const wrapped = Object.create(provider);
      wrapped.complete = async (request) => {
        const response = await originalComplete(request);
        entries.push(toEntry(request, response));
        return response;
      };
      return wrapped;
    },
    /** Returns a shallow copy of the recorded entries. */
    getEntries() {
      return entries.slice();
    },
    /** Builds a fixture from a copy of the recorded entries. */
    toFixture(name) {
      return createFixture(name, entries.slice());
    },
    /** Drops all recorded entries. */
    clear() {
      entries.splice(0, entries.length);
    }
  };
}
517
+ // src/eval.ts
518
/**
 * Passes when `output` contains `criterion.value`. The comparison is
 * case-insensitive unless `criterion.caseSensitive` is truthy.
 */
function evaluateContains(output, criterion) {
  const fold = (text) => (criterion.caseSensitive ? text : text.toLowerCase());
  const needle = fold(criterion.value);
  const passed = fold(output).includes(needle);
  const message = passed
    ? `Contains "${criterion.value}"`
    : `Does not contain "${criterion.value}"`;
  return { type: "contains", passed, message };
}
528
/**
 * Passes when `output` does NOT contain `criterion.value`. The comparison is
 * case-insensitive unless `criterion.caseSensitive` is truthy.
 */
function evaluateNotContains(output, criterion) {
  const fold = (text) => (criterion.caseSensitive ? text : text.toLowerCase());
  const needle = fold(criterion.value);
  const found = fold(output).includes(needle);
  const message = found
    ? `Contains "${criterion.value}" (should not)`
    : `Does not contain "${criterion.value}"`;
  return { type: "not_contains", passed: !found, message };
}
538
/**
 * Passes when `output` matches the regular expression built from
 * `criterion.pattern` and `criterion.flags`.
 *
 * An invalid pattern or flag string is reported as a failed criterion rather
 * than thrown, so one malformed criterion cannot abort the surrounding
 * evaluation run.
 *
 * @param {string} output
 * @param {{ pattern: string, flags?: string }} criterion
 * @returns {{ type: "matches", passed: boolean, message: string }}
 */
function evaluateMatches(output, criterion) {
  let regex;
  try {
    regex = new RegExp(criterion.pattern, criterion.flags);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return {
      type: "matches",
      passed: false,
      message: `Invalid pattern /${criterion.pattern}/: ${reason}`
    };
  }
  const passed = regex.test(output);
  return {
    type: "matches",
    passed,
    message: passed ? `Matches /${criterion.pattern}/` : `Does not match /${criterion.pattern}/`
  };
}
547
/**
 * Passes when `output.length` is at least `criterion.value`.
 */
function evaluateLengthMin(output, criterion) {
  const actual = output.length;
  const minimum = criterion.value;
  const passed = actual >= minimum;
  const relation = passed ? ">=" : "<";
  return {
    type: "length_min",
    passed,
    message: `Length ${actual} ${relation} ${minimum}`
  };
}
555
/**
 * Passes when `output.length` is at most `criterion.value`.
 */
function evaluateLengthMax(output, criterion) {
  const actual = output.length;
  const maximum = criterion.value;
  const passed = actual <= maximum;
  const relation = passed ? "<=" : ">";
  return {
    type: "length_max",
    passed,
    message: `Length ${actual} ${relation} ${maximum}`
  };
}
563
/**
 * Passes when `output` can be parsed by `JSON.parse`.
 */
function evaluateJsonValid(output) {
  let parses = true;
  try {
    JSON.parse(output);
  } catch {
    parses = false;
  }
  return {
    type: "json_valid",
    passed: parses,
    message: parses ? "Valid JSON" : "Invalid JSON"
  };
}
571
/**
 * Parses `output` as JSON and checks it against `criterion.schema` via
 * `matchesSchema`. Any exception raised while parsing or matching is
 * reported as "Invalid JSON".
 */
function evaluateJsonMatches(output, criterion) {
  let passed;
  try {
    passed = matchesSchema(JSON.parse(output), criterion.schema);
  } catch {
    return { type: "json_matches", passed: false, message: "Invalid JSON" };
  }
  const message = passed ? "JSON matches schema" : "JSON does not match schema";
  return { type: "json_matches", passed, message };
}
584
/**
 * Runs the user-supplied `criterion.fn(output)` and reports its result under
 * the type `custom:<criterion.name>`. The function's return value is stored
 * in `passed` as-is.
 */
function evaluateCustom(output, criterion) {
  const passed = criterion.fn(output);
  const verdict = passed ? "passed" : "failed";
  return {
    type: `custom:${criterion.name}`,
    passed,
    message: `Custom check "${criterion.name}" ${verdict}`
  };
}
592
/**
 * Word-overlap similarity check (not an embedding-based comparison).
 *
 * Both texts are lower-cased, split on whitespace, and reduced to the set of
 * distinct words longer than 3 characters. The score is the fraction of
 * reference words that also occur in the output, so it always lies in
 * [0, 1]; repeated output words count once. A reference without any word
 * longer than 3 characters scores 0.
 *
 * @param {string} output
 * @param {{ reference: string, threshold?: number }} criterion - `threshold` defaults to 0.7.
 * @returns {{ type: "semantic_similarity", passed: boolean, message: string }}
 */
function evaluateSemanticSimilarity(output, criterion) {
  const significantWords = (text) =>
    new Set(text.toLowerCase().split(/\s+/).filter((w) => w.length > 3));
  const refWords = significantWords(criterion.reference);
  const outWords = significantWords(output);
  let overlap = 0;
  for (const word of refWords) {
    if (outWords.has(word)) {
      overlap++;
    }
  }
  const score = refWords.size > 0 ? overlap / refWords.size : 0;
  const threshold = criterion.threshold ?? 0.7;
  const passed = score >= threshold;
  const scorePct = (score * 100).toFixed(0);
  const thresholdPct = (threshold * 100).toFixed(0);
  return {
    type: "semantic_similarity",
    passed,
    message: passed
      ? `Semantic similarity ${scorePct}% >= ${thresholdPct}%`
      : `Semantic similarity ${scorePct}% < ${thresholdPct}%`
  };
}
605
/**
 * Keyword-based fact check (substring heuristics, not real verification).
 *
 * A fact counts as found when more than half of its words longer than
 * 3 characters occur as substrings of the lower-cased output. A fact made
 * up only of short words has no such keywords; it counts as found when the
 * whole (trimmed, lower-cased) fact occurs in the output. The score is the
 * share of facts found; an empty fact list scores 1.
 *
 * @param {string} output
 * @param {{ facts: string[], threshold?: number }} criterion - `threshold` defaults to 0.7.
 * @returns {{ type: "factual_accuracy", passed: boolean, message: string }}
 */
function evaluateFactualAccuracy(output, criterion) {
  const facts = criterion.facts;
  const outputLower = output.toLowerCase();

  const isFactPresent = (fact) => {
    const factLower = fact.toLowerCase();
    const keywords = factLower.split(/\s+/).filter((w) => w.length > 3);
    if (keywords.length === 0) {
      // No keyword survives the length filter: fall back to a literal
      // phrase match so short facts are not unverifiable by construction.
      const phrase = factLower.trim();
      return phrase !== "" && outputLower.includes(phrase);
    }
    const hits = keywords.filter((w) => outputLower.includes(w)).length;
    return hits / keywords.length > 0.5;
  };

  let matchedFacts = 0;
  for (const fact of facts) {
    if (isFactPresent(fact)) {
      matchedFacts++;
    }
  }
  const score = facts.length > 0 ? matchedFacts / facts.length : 1;
  const threshold = criterion.threshold ?? 0.7;
  const passed = score >= threshold;
  return {
    type: "factual_accuracy",
    passed,
    message: passed
      ? `Factual accuracy: ${matchedFacts}/${facts.length} facts verified`
      : `Factual accuracy: only ${matchedFacts}/${facts.length} facts found`
  };
}
625
/**
 * Dispatches a synchronous criterion to its evaluator based on
 * `criterion.type`.
 *
 * `llm_judge` cannot be evaluated synchronously and yields a failed result
 * here. An unrecognised type also yields a failed result (instead of
 * `undefined`), so callers can always read `.passed` and `.message`.
 *
 * @param {string} output
 * @param {{ type: string }} criterion
 * @returns {{ type: string, passed: boolean, message: string }}
 */
function evaluateCriterion(output, criterion) {
  switch (criterion.type) {
    case "contains":
      return evaluateContains(output, criterion);
    case "not_contains":
      return evaluateNotContains(output, criterion);
    case "matches":
      return evaluateMatches(output, criterion);
    case "length_min":
      return evaluateLengthMin(output, criterion);
    case "length_max":
      return evaluateLengthMax(output, criterion);
    case "json_valid":
      return evaluateJsonValid(output);
    case "json_matches":
      return evaluateJsonMatches(output, criterion);
    case "custom":
      return evaluateCustom(output, criterion);
    case "llm_judge":
      return { type: "llm_judge", passed: false, message: "LLM judge requires async evaluation" };
    case "semantic_similarity":
      return evaluateSemanticSimilarity(output, criterion);
    case "factual_accuracy":
      return evaluateFactualAccuracy(output, criterion);
    default:
      return {
        type: String(criterion.type),
        passed: false,
        message: `Unknown criterion type "${criterion.type}"`
      };
  }
}
651
/**
 * Shallow shape check: `value` must be a non-null object containing every
 * key of `schema`. When a schema entry is a string, `typeof value[key]`
 * must equal it; non-string schema entries only require the key to exist.
 */
function matchesSchema(value, schema) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return Object.keys(schema).every((key) => {
    if (!(key in value)) {
      return false;
    }
    const expectedType = schema[key];
    return typeof expectedType !== "string" || typeof value[key] === expectedType;
  });
}
667
/**
 * Builds the failed case result used when the runner itself threw:
 * score 0, empty output, and a single `error` criterion carrying the
 * failure message. `durationMs` is measured from `startTime`
 * (a `performance.now()` reading).
 */
function makeRunnerErrorResult(evalCase, error, startTime) {
  const reason = error instanceof Error ? error.message : String(error);
  const failure = {
    type: "error",
    passed: false,
    message: `Runner error: ${reason}`
  };
  return {
    name: evalCase.name,
    passed: false,
    score: 0,
    criteria: [failure],
    input: evalCase.input,
    output: "",
    durationMs: Math.round(performance.now() - startTime),
    tags: evalCase.tags ?? []
  };
}
685
/**
 * Passes when `output` contains the `expected` text (case-sensitive
 * substring check).
 */
function checkExpected(output, expected) {
  const passed = output.includes(expected);
  let message = "Output contains expected text";
  if (!passed) {
    message = `Output does not contain expected "${expected}"`;
  }
  return { type: "expected", passed, message };
}
693
/**
 * Asks `criterion.judge` to grade the output.
 *
 * The judge receives `criterion.prompt`, a blank line, the line
 * "Output to evaluate:" and the output. It must resolve to
 * `{ score, reasoning }`; the criterion passes when `score` reaches
 * `criterion.threshold` (default 0.7). Any exception is converted into a
 * failed result with an "LLM judge error" message.
 */
async function evaluateLlmJudge(output, criterion) {
  try {
    const fullPrompt = `${criterion.prompt}\n\nOutput to evaluate:\n${output}`;
    const { score, reasoning } = await criterion.judge(fullPrompt);
    const threshold = criterion.threshold ?? 0.7;
    const passed = score >= threshold;
    const shownScore = score.toFixed(2);
    const message = passed
      ? `LLM judge score: ${shownScore} (${reasoning})`
      : `LLM judge score: ${shownScore} < ${threshold} (${reasoning})`;
    return { type: "llm_judge", passed, message };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return {
      type: "llm_judge",
      passed: false,
      message: `LLM judge error: ${reason}`
    };
  }
}
715
/**
 * Evaluates one case's checks in order and returns their results:
 * first the `expected` substring check (when `evalCase.expected` is
 * defined), then each entry of `evalCase.criteria`. `llm_judge` criteria
 * are awaited one at a time; all others are evaluated synchronously.
 */
async function evaluateAllCriteria(output, evalCase) {
  const outcomes = evalCase.expected !== undefined
    ? [checkExpected(output, evalCase.expected)]
    : [];
  for (const criterion of evalCase.criteria ?? []) {
    const outcome = criterion.type === "llm_judge"
      ? await evaluateLlmJudge(output, criterion)
      : evaluateCriterion(output, criterion);
    outcomes.push(outcome);
  }
  return outcomes;
}
729
/**
 * Runs a single eval case: calls `runner(evalCase.input)`, evaluates all
 * criteria against the output and summarises the outcome.
 *
 * A runner failure produces the dedicated runner-error result. With no
 * criteria at all the case passes with score 1; otherwise the score is the
 * fraction of passed criteria and the case passes only when all did.
 */
async function runCase(evalCase, runner) {
  const startedAt = performance.now();
  let output;
  try {
    output = await runner(evalCase.input);
  } catch (error) {
    return makeRunnerErrorResult(evalCase, error, startedAt);
  }
  const criteria = await evaluateAllCriteria(output, evalCase);
  const total = criteria.length;
  const passedCount = criteria.reduce((count, c) => (c.passed ? count + 1 : count), 0);
  const hasNoCriteria = total === 0;
  return {
    name: evalCase.name,
    passed: hasNoCriteria || passedCount === total,
    score: hasNoCriteria ? 1 : passedCount / total,
    criteria,
    input: evalCase.input,
    output,
    durationMs: Math.round(performance.now() - startedAt),
    tags: evalCase.tags ?? []
  };
}
753
/**
 * Runs every case of a suite and aggregates the results.
 *
 * With `config.concurrency` of 1 or less (the default is 1) cases run one
 * after another; otherwise they run in consecutive batches of that size,
 * each batch awaited as a whole. The suite score is the fraction of passed
 * cases (0 for an empty suite).
 */
async function runEvalSuite(config) {
  const startedAt = performance.now();
  const batchSize = config.concurrency ?? 1;
  const results = [];
  if (batchSize <= 1) {
    for (const evalCase of config.cases) {
      const outcome = await runCase(evalCase, config.runner);
      results.push(outcome);
    }
  } else {
    let offset = 0;
    while (offset < config.cases.length) {
      const pending = config.cases
        .slice(offset, offset + batchSize)
        .map((c) => runCase(c, config.runner));
      for (const outcome of await Promise.all(pending)) {
        results.push(outcome);
      }
      offset += batchSize;
    }
  }
  const total = results.length;
  const passed = results.reduce((count, r) => (r.passed ? count + 1 : count), 0);
  return {
    name: config.name,
    total,
    passed,
    failed: total - passed,
    score: total > 0 ? passed / total : 0,
    results,
    durationMs: Math.round(performance.now() - startedAt)
  };
}
780
/**
 * Renders a suite result as a plain-text report: a header, one line per
 * case (followed by the messages of its failed criteria when the case
 * failed), and a summary line with score, pass count and duration.
 */
function formatEvalReport(result) {
  const rule = ` ${"\u2500".repeat(50)}`;
  const caseLines = result.results.flatMap((r) => {
    const status = r.passed ? "PASS" : "FAIL";
    const headline = ` [${status}] ${r.name} (${r.durationMs}ms)`;
    if (r.passed) {
      return [headline];
    }
    const failures = r.criteria.filter((c) => !c.passed).map((c) => ` ${c.message}`);
    return [headline, ...failures];
  });
  const percent = (result.score * 100).toFixed(1);
  const summary = ` Score: ${percent}% | ${result.passed}/${result.total} passed | ${result.durationMs}ms`;
  return [`\nEval Suite: ${result.name}`, rule, ...caseLines, rule, summary, ""].join("\n");
}
802
+ // src/snapshot.ts
803
+ import { createHash as createHash2 } from "crypto";
804
/**
 * In-memory snapshot store keyed by snapshot name.
 *
 * @param {Iterable<object>} [existing] - Snapshots to preload; each is
 *   indexed under its own `name` (later duplicates replace earlier ones).
 * @returns {object} `{ get, set, getAll, toJSON }`; `toJSON` returns a
 *   pretty-printed JSON array string.
 */
function createSnapshotStore(existing) {
  const seed = existing ? Array.from(existing, (s) => [s.name, s]) : [];
  const byName = new Map(seed);
  const all = () => [...byName.values()];
  return {
    get: (name) => byName.get(name),
    set(name, snapshot) {
      byName.set(name, snapshot);
    },
    getAll: all,
    toJSON: () => JSON.stringify(all(), null, 2)
  };
}
826
/**
 * Returns the full hex-encoded SHA-256 digest of `output`.
 */
function hashOutput(output) {
  const hasher = createHash2("sha256");
  hasher.update(output);
  return hasher.digest("hex");
}
829
/**
 * Runs `runner()`, hashes its output and compares the hash with the
 * snapshot stored under `name`.
 *
 * - No stored snapshot: the new one is stored, status `"new"`.
 * - Same hash: nothing is written, status `"match"`.
 * - Different hash: the stored snapshot is replaced, status `"changed"`.
 *
 * The stored snapshot keeps a simplified copy of the optional `request`
 * (non-string message content becomes "[complex]").
 */
async function testSnapshot(name, store, runner, request) {
  const output = await runner();
  const currentHash = hashOutput(output);
  const previous = store.get(name);
  const messages = request?.messages?.map((m) => ({
    role: m.role,
    content: typeof m.content === "string" ? m.content : "[complex]"
  })) ?? [];
  const snapshot = {
    name,
    request: { system: request?.system, messages, model: request?.model },
    outputHash: currentHash,
    timestamp: new Date().toISOString()
  };
  if (previous && previous.outputHash === currentHash) {
    return { name, status: "match", previousHash: previous.outputHash, currentHash, output };
  }
  store.set(name, snapshot);
  if (!previous) {
    return { name, status: "new", currentHash, output };
  }
  return { name, status: "changed", previousHash: previous.outputHash, currentHash, output };
}
868
+ // src/prompts.ts
869
/**
 * Returns a shallow copy of the given prompt config.
 */
function definePrompt(config) {
  return Object.assign({}, config);
}
872
/**
 * Compares the lines found at one position in two texts and returns the
 * resulting change records: `added` when only the new side has a line,
 * `removed` when only the old side has one, `removed` followed by `added`
 * when they differ, and `unchanged` otherwise.
 */
function compareLine(fromLine, toLine, lineNumber) {
  const record = (type, content) => ({ type, lineNumber, content });
  if (fromLine === undefined) {
    return [record("added", toLine)];
  }
  if (toLine === undefined) {
    return [record("removed", fromLine)];
  }
  return fromLine === toLine
    ? [record("unchanged", fromLine)]
    : [record("removed", fromLine), record("added", toLine)];
}
887
/**
 * Position-by-position comparison of two line arrays (line N of one side is
 * compared with line N of the other; no alignment is attempted). Line
 * numbers in the result are 1-based.
 */
function computeLineChanges(fromLines, toLines) {
  const longest = Math.max(fromLines.length, toLines.length);
  const perLine = Array.from({ length: longest }, (_, index) =>
    compareLine(fromLines[index], toLines[index], index + 1)
  );
  return perLine.flat();
}
895
/**
 * Creates an in-memory registry of versioned prompts.
 *
 * Prompts are stored per name and per `prompt.version`. Versions are ordered
 * by comparing their dot-separated numeric parts (missing parts count as 0).
 *
 * The returned methods are closures and do not depend on `this`, so they can
 * be destructured or passed around as callbacks.
 *
 * @returns {object} `{ register, get, getLatest, list, getVersions, diff, render }`.
 */
function createPromptRegistry() {
  const store = new Map();

  // Numeric comparison of dot-separated version strings.
  function compareVersions(a, b) {
    const aParts = a.split(".").map(Number);
    const bParts = b.split(".").map(Number);
    const length = Math.max(aParts.length, bParts.length);
    for (let i = 0; i < length; i++) {
      const aVal = aParts[i] ?? 0;
      const bVal = bParts[i] ?? 0;
      if (aVal !== bVal) {
        return aVal - bVal;
      }
    }
    return 0;
  }

  const sortedVersions = (versions) => [...versions.keys()].sort(compareVersions);

  /** Returns the highest registered version of `name`, or `undefined`. */
  function getLatest(name) {
    const versions = store.get(name);
    if (!versions || versions.size === 0) {
      return undefined;
    }
    return versions.get(sortedVersions(versions).at(-1));
  }

  /** Returns the given version of `name`, or the latest one when `version` is omitted. */
  function get(name, version) {
    const versions = store.get(name);
    if (!versions) {
      return undefined;
    }
    return version ? versions.get(version) : getLatest(name);
  }

  return {
    /** Stores `prompt` under `name`, replacing an entry with the same version. */
    register(name, prompt) {
      let versions = store.get(name);
      if (!versions) {
        versions = new Map();
        store.set(name, versions);
      }
      versions.set(prompt.version, prompt);
    },
    get,
    getLatest,
    /** Lists every prompt name with its versions in ascending order. */
    list() {
      return [...store].map(([name, versions]) => ({
        name,
        versions: sortedVersions(versions)
      }));
    },
    /** Ascending versions of `name`; empty when the name is unknown. */
    getVersions(name) {
      const versions = store.get(name);
      return versions ? sortedVersions(versions) : [];
    },
    /**
     * Line-by-line comparison of two versions' `content`.
     * Returns `null` when the name or either version is unknown.
     */
    diff(name, fromVersion, toVersion) {
      const versions = store.get(name);
      if (!versions) {
        return null;
      }
      const from = versions.get(fromVersion);
      const to = versions.get(toVersion);
      if (!from || !to) {
        return null;
      }
      const changes = computeLineChanges(from.content.split("\n"), to.content.split("\n"));
      return { name, fromVersion, toVersion, changes };
    },
    /**
     * Replaces every `{{key}}` placeholder in the prompt content with the
     * matching entry of `variables`.
     * @throws {Error} When the prompt (or requested version) does not exist.
     */
    render(name, variables, version) {
      const prompt = get(name, version);
      if (!prompt) {
        throw new Error(`Prompt "${name}" not found${version ? ` (version ${version})` : ""}`);
      }
      let rendered = prompt.content;
      for (const [key, value] of Object.entries(variables)) {
        // A replacer function inserts the value literally; a replacement
        // string would interpret "$$", "$&", "$1"... inside the value.
        rendered = rendered.replaceAll(`{{${key}}}`, () => String(value));
      }
      return rendered;
    }
  };
}
977
+ // src/regression.ts
978
+ import { mkdirSync, readFileSync, writeFileSync } from "fs";
979
+ import { dirname } from "path";
980
/**
 * Builds the regression result reported when there is nothing to compare:
 * zero cases, empty regression/improvement lists and scores of 0.
 */
function makeEmptyResult(name) {
  const zeroed = { totalCases: 0, unchanged: 0, overallScore: 0, baselineScore: 0 };
  return {
    name,
    totalCases: zeroed.totalCases,
    regressions: [],
    improvements: [],
    unchanged: zeroed.unchanged,
    overallScore: zeroed.overallScore,
    baselineScore: zeroed.baselineScore
  };
}
991
/**
 * Scores the current output of a case. With a custom `scorer` its result
 * for `(input, currentOutput)` is used; without one, an output identical to
 * the baseline scores 1 and anything else scores 0.5.
 */
async function scoreCase(input, currentOutput, baselineOutput, scorer) {
  if (!scorer) {
    const identical = currentOutput === baselineOutput;
    return identical ? 1 : 0.5;
  }
  return scorer(input, currentOutput);
}
996
/**
 * Files a comparison detail by its score delta: below -0.1 it is appended
 * to `regressions`, above 0.1 to `improvements`. Returns `true` only when
 * the detail fell into neither list (i.e. counts as unchanged).
 */
function classifyDetail(detail, regressions, improvements) {
  const { delta } = detail;
  let bucket = null;
  if (delta < -0.1) {
    bucket = regressions;
  } else if (delta > 0.1) {
    bucket = improvements;
  }
  if (bucket === null) {
    return true;
  }
  bucket.push(detail);
  return false;
}
1007
/**
 * Re-runs every baseline case (one after another) and compares the new
 * scores with the recorded ones.
 *
 * Returns the regressions, improvements and unchanged count together with
 * the average baseline score and the average current score. Both averages
 * divide by the number of baseline cases, so the caller is expected to pass
 * a non-empty baseline.
 */
async function compareWithBaseline(name, baseline, runner, scorer) {
  const regressions = [];
  const improvements = [];
  let unchanged = 0;
  let currentSum = 0;
  let baselineSum = 0;
  for (const recorded of baseline.cases) {
    baselineSum += recorded.score;
  }
  const baselineScore = baselineSum / baseline.cases.length;
  for (const recorded of baseline.cases) {
    const { input, output: baselineOutput, score: recordedScore } = recorded;
    const currentOutput = await runner(input);
    const currentScore = await scoreCase(input, currentOutput, baselineOutput, scorer);
    currentSum += currentScore;
    const detail = {
      input,
      baselineOutput,
      currentOutput,
      baselineScore: recordedScore,
      currentScore,
      delta: currentScore - recordedScore
    };
    const isUnchanged = classifyDetail(detail, regressions, improvements);
    if (isUnchanged) {
      unchanged += 1;
    }
  }
  return {
    name,
    totalCases: baseline.cases.length,
    regressions,
    improvements,
    unchanged,
    overallScore: currentSum / baseline.cases.length,
    baselineScore
  };
}
1039
/**
 * Create a regression suite that keeps a baseline of scored cases in memory
 * and can persist it to / restore it from a JSON file.
 *
 * @param {string} name - Suite name, stored in newly created baselines and
 *   in run results.
 * @returns {object} Suite with a `baseline` getter and `load`, `save`,
 *   `addCase` and `run` methods.
 */
function createRegressionSuite(name) {
  let baseline = null;

  // Create the empty baseline on first use so `save` and `addCase` share one
  // initialisation path.
  function ensureBaseline() {
    if (!baseline) {
      const now = Date.now();
      baseline = { name, cases: [], createdAt: now, updatedAt: now };
    }
    return baseline;
  }

  // A usable baseline is an object with a `cases` array; the other methods
  // dereference `baseline.cases` unconditionally.
  function isBaselineShape(value) {
    return value !== null && typeof value === "object" && Array.isArray(value.cases);
  }

  return {
    /** Current in-memory baseline, or `null` when none is loaded/created. */
    get baseline() {
      return baseline;
    },

    /**
     * Load the baseline from a JSON file. Best effort: an unreadable file,
     * invalid JSON, or JSON without a `cases` array all leave the baseline
     * as `null` instead of throwing.
     *
     * @param {string} path - File to read.
     */
    async load(path) {
      try {
        const parsed = JSON.parse(readFileSync(path, "utf-8"));
        // Reject parseable-but-malformed content (e.g. `{}` or `[]`), which
        // would otherwise crash later in `addCase`/`run`.
        baseline = isBaselineShape(parsed) ? parsed : null;
      } catch {
        baseline = null;
      }
    },

    /**
     * Write the baseline as pretty-printed JSON, creating parent directories
     * as needed. An empty baseline is created first when none exists.
     *
     * @param {string} path - File to write.
     */
    async save(path) {
      const current = ensureBaseline();
      mkdirSync(dirname(path), { recursive: true });
      writeFileSync(path, JSON.stringify(current, null, 2));
    },

    /**
     * Add a case, or overwrite the existing case whose `input` is strictly
     * equal to the given one, and refresh `updatedAt`.
     *
     * @param {*} input - Case input (also the lookup key).
     * @param {*} output - Baseline output.
     * @param {number} score - Baseline score.
     */
    addCase(input, output, score) {
      const current = ensureBaseline();
      const entry = { input, output, score, timestamp: Date.now() };
      const existing = current.cases.findIndex((c) => c.input === input);
      if (existing >= 0) {
        current.cases[existing] = entry;
      } else {
        current.cases.push(entry);
      }
      current.updatedAt = Date.now();
    },

    /**
     * Compare the current behaviour with the baseline.
     *
     * @param {Function} runner - Produces the current output for an input.
     * @param {Function} [scorer] - Optional scorer.
     * @returns {Promise<object>} An empty result when there is no baseline or
     *   it has no cases; otherwise the comparison result.
     */
    async run(runner, scorer) {
      if (!baseline || baseline.cases.length === 0) {
        return makeEmptyResult(name);
      }
      return compareWithBaseline(name, baseline, runner, scorer);
    }
  };
}
1090
+ // src/replay.ts
1091
/**
 * Create a recorder that captures request/response pairs of a completion
 * function.
 *
 * @returns {object} Recorder exposing:
 *   - `wrap(completeFn)`: returns an async function that awaits
 *     `completeFn(request)`, records `{ request, response, timestamp }` and
 *     returns the response. Nothing is recorded when `completeFn` rejects.
 *   - `getEntries()`: a shallow copy of the recorded entries.
 *   - `toJSON()`: the entries serialised as a 2-space indented JSON string.
 *   - `clear()`: removes all recorded entries.
 */
function createReplayRecorder() {
  const recorded = [];

  const wrap = (completeFn) => async (request) => {
    const response = await completeFn(request);
    const timestamp = Date.now();
    recorded.push({ request, response, timestamp });
    return response;
  };

  return {
    wrap,
    getEntries: () => recorded.slice(),
    toJSON: () => JSON.stringify(recorded, null, 2),
    clear: () => {
      recorded.length = 0;
    }
  };
}
1116
/**
 * Create a player that replays recorded responses in order.
 *
 * @param {string|Iterable} entriesOrJson - Either a JSON string (parsed with
 *   `JSON.parse`) or an iterable of entries, which is copied.
 * @returns {object} Player with a `remaining` getter (entries not yet
 *   replayed) and an async `complete(_request)` that ignores its argument
 *   and resolves to the next entry's `response`.
 *   `complete` rejects with `Error("Replay exhausted: no more recorded
 *   responses")` once every entry has been consumed.
 */
function createReplayPlayer(entriesOrJson) {
  let queue;
  if (typeof entriesOrJson === "string") {
    queue = JSON.parse(entriesOrJson);
  } else {
    queue = [...entriesOrJson];
  }
  let cursor = 0;

  return {
    get remaining() {
      return queue.length - cursor;
    },
    async complete(_request) {
      const exhausted = cursor >= queue.length;
      if (exhausted) {
        throw new Error("Replay exhausted: no more recorded responses");
      }
      const { response } = queue[cursor];
      cursor += 1;
      return response;
    }
  };
}
1133
+ // src/pinning.ts
1134
+ import { createHash as createHash3 } from "crypto";
1135
/**
 * Compute the SHA-256 digest of `input` as a hex string.
 *
 * @param {*} input - Data passed to the hash's `update` call.
 * @returns {string} Hex-encoded digest.
 */
function sha256(input) {
  const hasher = createHash3("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
1138
/**
 * Create an in-memory pin store backed by a `Map`.
 *
 * @param {Iterable} [existing] - Optional pins to preload; each is keyed as
 *   `"<promptHash>:<configHash>"`.
 * @returns {object} Store exposing `get(key)`, `set(key, pin)`,
 *   `delete(key)` (returns whether a pin was removed), `getAll()` (array of
 *   pins in insertion order) and `toJSON()` (the pins as a 2-space indented
 *   JSON string).
 */
function createPinStore(existing) {
  const pinsByKey = new Map();
  if (existing) {
    for (const pin of existing) {
      const { promptHash, configHash } = pin;
      pinsByKey.set(`${promptHash}:${configHash}`, pin);
    }
  }

  const listPins = () => [...pinsByKey.values()];

  return {
    get: (key) => pinsByKey.get(key),
    set: (key, pin) => {
      pinsByKey.set(key, pin);
    },
    delete: (key) => pinsByKey.delete(key),
    getAll: () => listPins(),
    toJSON: () => JSON.stringify(listPins(), null, 2)
  };
}
1164
/**
 * Run `runner`, hash its output, and compare it with the pin stored for the
 * same prompt and configuration.
 *
 * The store key is `"<sha256(prompt)>:<sha256(JSON of model, temperature,
 * seed)>"`.
 *
 * @param {string} name - Label used only in the mismatch error message.
 * @param {object} store - Pin store providing `get(key)` and `set(key, pin)`.
 * @param {Function} runner - Called with no arguments; its awaited result is
 *   the output to pin.
 * @param {object} config - Supplies `prompt`, `model`, `temperature`, `seed`.
 * @param {{assert?: boolean}} [options] - When `assert` is truthy a mismatch
 *   throws instead of being returned.
 * @returns {Promise<object>} `{ status: "new", pin }` when nothing was stored
 *   (the pin is stored); `{ status: "match", pin, previousPin }` when the
 *   output hash is unchanged; `{ status: "mismatch", pin, previousPin }` when
 *   it differs (the stored pin is replaced).
 * @throws `ElsiumError.validation(...)` on a mismatch when `options.assert`
 *   is truthy; the store is left untouched in that case.
 */
async function pinOutput(name, store, runner, config, options) {
  const promptHash = sha256(config.prompt);
  const hashedConfig = {
    model: config.model,
    temperature: config.temperature,
    seed: config.seed
  };
  const configHash = sha256(JSON.stringify(hashedConfig));
  const key = `${promptHash}:${configHash}`;

  const output = await runner();
  const outputHash = sha256(output);
  const pin = {
    promptHash,
    configHash,
    outputHash,
    outputText: output,
    model: config.model,
    createdAt: Date.now()
  };

  const previousPin = store.get(key);
  if (previousPin) {
    if (previousPin.outputHash === outputHash) {
      return { status: "match", pin, previousPin };
    }
    if (options?.assert) {
      throw ElsiumError.validation(`Pin mismatch for "${name}": expected hash ${previousPin.outputHash}, got ${outputHash}`);
    }
    // Non-asserting mismatch: the new output replaces the stored pin.
    store.set(key, pin);
    return { status: "mismatch", pin, previousPin };
  }

  store.set(key, pin);
  return { status: "new", pin };
}
1196
+ // src/determinism.ts
1197
/**
 * Call `fn` several times, one call after another, and measure how many
 * distinct outputs it produces.
 *
 * Variance is `(uniqueOutputs - 1) / (runs - 1)`, or 0 when `runs` is not
 * greater than 1. Outputs are deduplicated with a `Set`.
 *
 * @param {Function} fn - Called with `options.seed`; its awaited result is
 *   one output.
 * @param {{runs?: number, seed?: *, tolerance?: number}} [options] - `runs`
 *   defaults to 5 and `tolerance` to 0.
 * @returns {Promise<object>} `{ deterministic, runs, uniqueOutputs, outputs,
 *   variance }`, where `deterministic` is `variance <= tolerance`.
 * @throws `ElsiumError.validation(...)` when the result is not deterministic
 *   and `tolerance` is exactly 0.
 */
async function assertDeterministic(fn, options) {
  const runs = options?.runs ?? 5;
  const seed = options?.seed;
  const tolerance = options?.tolerance ?? 0;

  const outputs = [];
  let iteration = 0;
  while (iteration < runs) {
    outputs.push(await fn(seed));
    iteration += 1;
  }

  const uniqueOutputs = new Set(outputs).size;
  let variance = 0;
  if (runs > 1) {
    variance = (uniqueOutputs - 1) / (runs - 1);
  }
  const deterministic = variance <= tolerance;

  if (tolerance === 0 && !deterministic) {
    throw ElsiumError.validation(`Non-deterministic output: ${uniqueOutputs} unique outputs across ${runs} runs (variance: ${variance.toFixed(3)})`);
  }
  return { deterministic, runs, uniqueOutputs, outputs, variance };
}
1221
/**
 * Call `fn` several times with a pause between calls and report whether
 * every call produced the same output.
 *
 * Before every call except the first, `sleep(intervalMs)` is awaited.
 * Variance is `(uniqueOutputs - 1) / (runs - 1)`, or 0 when `runs` is not
 * greater than 1. This function only reports; it does not throw on
 * unstable output.
 *
 * @param {Function} fn - Called with `options.seed`; its awaited result is
 *   one output.
 * @param {{intervalMs?: number, runs?: number, seed?: *}} [options] -
 *   `intervalMs` defaults to 100 and `runs` to 3.
 * @returns {Promise<object>} `{ stable, runs, uniqueOutputs, outputs,
 *   variance }`, where `outputs` holds `{ output, timestamp }` records and
 *   `stable` is true only when exactly one distinct output was seen.
 */
async function assertStable(fn, options) {
  const intervalMs = options?.intervalMs ?? 100;
  const runs = options?.runs ?? 3;
  const seed = options?.seed;

  const outputs = [];
  let iteration = 0;
  while (iteration < runs) {
    if (iteration !== 0) {
      await sleep(intervalMs);
    }
    const output = await fn(seed);
    const timestamp = Date.now();
    outputs.push({ output, timestamp });
    iteration += 1;
  }

  const distinct = new Set();
  for (const sample of outputs) {
    distinct.add(sample.output);
  }
  const uniqueOutputs = distinct.size;

  let variance = 0;
  if (runs > 1) {
    variance = (uniqueOutputs - 1) / (runs - 1);
  }
  return { stable: uniqueOutputs === 1, runs, uniqueOutputs, outputs, variance };
}
1244
+ export {
1245
+ testSnapshot,
1246
+ runEvalSuite,
1247
+ pinOutput,
1248
+ mockProvider,
1249
+ loadFixture,
1250
+ hashOutput,
1251
+ formatEvalReport,
1252
+ definePrompt,
1253
+ createSnapshotStore,
1254
+ createReplayRecorder,
1255
+ createReplayPlayer,
1256
+ createRegressionSuite,
1257
+ createRecorder,
1258
+ createPromptRegistry,
1259
+ createPinStore,
1260
+ createFixture,
1261
+ assertStable,
1262
+ assertDeterministic
1263
+ };