@elsium-ai/testing 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ import type { EvalCase } from './eval';
2
/** A named set of eval cases, as returned by the dataset loaders. */
+ export interface EvalDataset {
3
/** Dataset name; the loaders fall back to "" when the source file supplies none. */
+ name: string;
4
/** Optional version; only the JSON loader fills it, from an object-form file's `version` property. */
+ version?: string;
5
/** Eval cases built from the records found in the file. */
+ cases: EvalCase[];
6
+ }
7
/** Names of the record properties (JSON keys or CSV columns) the loaders read for each case. */
+ export interface DatasetLoaderOptions {
8
/** Property holding the case input. Defaults to "input". */
+ inputField?: string;
9
/** Property holding the expected output. Defaults to "expected". */
+ expectedField?: string;
10
/** Property holding the case name. Defaults to "name". */
+ nameField?: string;
11
/** Property holding the tags, given as a comma-separated string or an array. Defaults to "tags". */
+ tagsField?: string;
12
+ }
13
/**
 * Reads a UTF-8 JSON file and converts it to a dataset.
 *
 * A top-level array is treated as the list of case records (dataset name "");
 * otherwise `name`, `version` and `cases` are read from the top-level object.
 *
 * @param path File to read.
 * @param options Overrides for the record property names.
 * @returns The loaded dataset. The promise rejects when the file cannot be read or parsed.
 */
+ export declare function loadDatasetFromJSON(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;
14
/**
 * Reads a UTF-8 CSV file whose first non-blank row is the header.
 *
 * Every following row becomes one case; a row with fewer values than headers
 * gets "" for the missing columns. Fields are comma-separated, double quotes
 * may wrap a field that contains commas, and `""` inside quotes is a literal quote.
 * A file with fewer than two non-blank rows yields an empty dataset.
 *
 * @param path File to read.
 * @param options Overrides for the column names.
 * @returns The loaded dataset (name is always ""). The promise rejects when the file cannot be read.
 */
+ export declare function loadDatasetFromCSV(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;
15
/**
 * Loads a dataset, choosing the loader from the lower-cased file extension:
 * `.json`, `.csv`, or `.jsonl` (one JSON record per non-blank line).
 *
 * @param path File to read.
 * @param options Overrides for the record property names, passed to the chosen loader.
 * @returns The loaded dataset. The promise rejects with
 *   `Error("Unsupported dataset format: <ext>")` for any other extension.
 */
+ export declare function loadDataset(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;
16
+ //# sourceMappingURL=dataset.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../src/dataset.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAA;AAEtC,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,EAAE,QAAQ,EAAE,CAAA;CACjB;AAED,MAAM,WAAW,oBAAoB;IACpC,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AA8BD,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAkBtB;AAgCD,wBAAsB,kBAAkB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAqBtB;AAYD,wBAAsB,WAAW,CAChC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAatB"}
@@ -0,0 +1,34 @@
1
+ import type { EvalSuiteResult } from './eval';
2
/** Snapshot of an eval suite run, as written to and read from a `.baseline.json` file. */
+ export interface EvalBaseline {
3
/** Name of the suite the snapshot was taken from. */
+ name: string;
4
/** `Date.now()` value captured when the baseline was saved. */
+ timestamp: number;
5
/** Overall suite score at save time. */
+ score: number;
6
/** Per-case outcome, reduced to name, pass flag and score. */
+ results: Array<{
7
+ name: string;
8
+ passed: boolean;
9
+ score: number;
10
+ }>;
11
+ }
12
/** Outcome of comparing a current eval run with a saved baseline. */
+ export interface EvalComparison {
13
+ baselineName: string;
14
+ currentName: string;
15
+ baselineScore: number;
16
+ currentScore: number;
17
/** `currentScore - baselineScore`; negative when the overall score dropped. */
+ delta: number;
18
/** Cases present in both runs whose score is lower than in the baseline. */
+ regressions: Array<{
19
+ name: string;
20
+ baselineScore: number;
21
+ currentScore: number;
22
+ }>;
23
/** Cases present in both runs whose score is higher than in the baseline. */
+ improvements: Array<{
24
+ name: string;
25
+ baselineScore: number;
26
+ currentScore: number;
27
+ }>;
28
/** True when `delta` is negative or a case that passed in the baseline no longer passes. */
+ regression: boolean;
29
+ }
30
/**
 * Writes a baseline snapshot of `result` to `<dir>/<result.name>.baseline.json`
 * as tab-indented JSON, creating `dir` (recursively) when it does not exist.
 *
 * @param result Suite result to snapshot; only its name, score and per-case name/passed/score are stored.
 * @param dir Directory that holds baseline files.
 * @returns The path of the file that was written.
 */
+ export declare function saveBaseline(result: EvalSuiteResult, dir: string): Promise<string>;
31
/**
 * Reads the baseline stored at `<dir>/<name>.baseline.json`.
 *
 * @param name Suite name used when the baseline was saved.
 * @param dir Directory that holds baseline files.
 * @returns The parsed baseline, or `null` when the file cannot be read or parsed
 *   (the promise does not reject for those failures).
 */
+ export declare function loadBaseline(name: string, dir: string): Promise<EvalBaseline | null>;
32
/**
 * Compares a current run with a baseline, matching cases by name.
 * Cases in `current` that have no baseline entry are left out of the per-case lists.
 *
 * @param baseline Previously saved snapshot.
 * @param current Result of the run being checked.
 * @returns Overall scores and delta, the per-case regressions and improvements,
 *   and the overall `regression` verdict.
 */
+ export declare function compareResults(baseline: EvalBaseline, current: EvalSuiteResult): EvalComparison;
33
/**
 * Renders a comparison as a multi-line text report: a header with both names,
 * the overall scores and delta as percentages, optional lists of regressed and
 * improved cases, and a final `REGRESSION DETECTED` or `OK` result line.
 *
 * @param comparison Comparison to render.
 * @returns The report text.
 */
+ export declare function formatComparison(comparison: EvalComparison): string;
34
+ //# sourceMappingURL=eval-compare.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-compare.d.ts","sourceRoot":"","sources":["../src/eval-compare.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,QAAQ,CAAA;AAE7C,MAAM,WAAW,YAAY;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,SAAS,EAAE,MAAM,CAAA;IACjB,KAAK,EAAE,MAAM,CAAA;IACb,OAAO,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAChE;AAED,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,MAAM,CAAA;IACpB,WAAW,EAAE,MAAM,CAAA;IACnB,aAAa,EAAE,MAAM,CAAA;IACrB,YAAY,EAAE,MAAM,CAAA;IACpB,KAAK,EAAE,MAAM,CAAA;IACb,WAAW,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IACjF,YAAY,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAClF,UAAU,EAAE,OAAO,CAAA;CACnB;AAED,wBAAsB,YAAY,CAAC,MAAM,EAAE,eAAe,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAiBxF;AAED,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CAQ1F;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,YAAY,EAAE,OAAO,EAAE,eAAe,GAAG,cAAc,CAyC/F;AAED,wBAAgB,gBAAgB,CAAC,UAAU,EAAE,cAAc,GAAG,MAAM,CAkCnE"}
package/dist/index.d.ts CHANGED
@@ -16,4 +16,8 @@ export { createPinStore, pinOutput } from './pinning';
16
16
  export type { Pin, PinStore, PinResult } from './pinning';
17
17
  export { assertDeterministic, assertStable } from './determinism';
18
18
  export type { DeterminismResult, StabilityResult } from './determinism';
19
+ export { loadDataset, loadDatasetFromJSON, loadDatasetFromCSV } from './dataset';
20
+ export type { EvalDataset, DatasetLoaderOptions } from './dataset';
21
+ export { saveBaseline, loadBaseline, compareResults, formatComparison } from './eval-compare';
22
+ export type { EvalBaseline, EvalComparison } from './eval-compare';
19
23
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAA;AAC9C,YAAY,EAAE,YAAY,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAG5F,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AACvE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGxE,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,QAAQ,CAAA;AACvD,YAAY,EACX,QAAQ,EACR,aAAa,EACb,UAAU,EACV,eAAe,EACf,eAAe,EACf,eAAe,EACf,QAAQ,GACR,MAAM,QAAQ,CAAA;AAGf,OAAO,EAAE,mBAAmB,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAC1E,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAA;AAGnF,OAAO,EAAE,oBAAoB,EAAE,YAAY,EAAE,MAAM,WAAW,CAAA;AAC9D,YAAY,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,WAAW,CAAA;AAGvF,OAAO,EAAE,qBAAqB,EAAE,MAAM,cAAc,CAAA;AACpD,YAAY,EACX,kBAAkB,EAClB,gBAAgB,EAChB,gBAAgB,EAChB,eAAe,GACf,MAAM,cAAc,CAAA;AAGrB,OAAO,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,UAAU,CAAA;AACnE,YAAY,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,UAAU,CAAA;AAGzE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrD,YAAY,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGzD,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,eAAe,CAAA;AACjE,YAAY,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAA;AAC9C,YAAY,EAAE,YAAY,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAG5F,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AACvE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGxE,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,QAAQ,CAAA;AACvD,YAAY,EACX,QAAQ,EACR,aAAa,EACb,UAAU,EACV,eAAe,EACf,eAAe,EACf,eAAe,EACf,QAAQ,GACR,MAAM,QAAQ,CAAA;AAGf,OAAO,EAAE,mBAAmB,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAC1E,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAA;AAGnF,OAAO,EAAE,oBAAoB,EAAE,YAAY,EAAE,MAAM,WAAW,CAAA;AAC9D,YAAY,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,WAAW,CAAA;AAGvF,OAAO,EAAE,qBAAqB,EAAE,MAAM,cAAc,CAAA;AACpD,YAAY,EACX,kBAAkB,EAClB,gBAAgB,EAChB,gBAAgB,EAChB,eAAe,GACf,MAAM,cAAc,CAAA;AAGrB,OAAO,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,UAAU,CAAA;AACnE,YAAY,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,UAAU,CAAA;AAGzE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrD,YAAY,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGzD,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,eAAe,CAAA;AACjE,YAAY,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AAGvE,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAA;AAChF,YAAY,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,WAAW,CAAA;AAGlE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAA;AAC7F,YAAY,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA"}
package/dist/index.js CHANGED
@@ -1288,14 +1288,217 @@ async function assertStable(fn, options) {
1288
1288
  variance
1289
1289
  };
1290
1290
  }
1291
+ // src/dataset.ts
1292
+ import { readFile } from "node:fs/promises";
1293
+ import { extname } from "node:path";
1294
/**
 * Converts one raw dataset record (a parsed JSON object or a CSV row) into an eval case.
 *
 * Field names come from `options` and default to "input", "expected", "name" and "tags".
 * Values are converted to text: primitives via `String`, objects and arrays via
 * `JSON.stringify` so structured inputs do not collapse to "[object Object]".
 *
 * @param record Raw record to convert.
 * @param options Optional overrides for the property names to read.
 * @returns `{ name, input, expected, tags }`. `name` and `input` fall back to "";
 *   `expected` is `undefined` when the field is missing or null; `tags` is
 *   `undefined` unless the field is a string or an array.
 * @throws {TypeError} When `record` is null or undefined.
 */
function mapRecordToCase(record, options) {
  if (record === null || record === undefined) {
    throw new TypeError(`Dataset record must be an object, received ${String(record)}`);
  }

  const inputField = options?.inputField ?? "input";
  const expectedField = options?.expectedField ?? "expected";
  const nameField = options?.nameField ?? "name";
  const tagsField = options?.tagsField ?? "tags";

  // Objects/arrays are serialized as JSON; String() would yield "[object Object]".
  const toText = (value) => {
    if (typeof value === "object" && value !== null) {
      return JSON.stringify(value) ?? String(value);
    }
    return String(value);
  };

  const tags = record[tagsField];
  let parsedTags;
  if (typeof tags === "string") {
    // Comma-separated list; blank entries (e.g. "a,,b" or "") are dropped.
    parsedTags = tags.split(",").map((t) => t.trim()).filter(Boolean);
  } else if (Array.isArray(tags)) {
    parsedTags = tags.map(String);
  }

  const expected = record[expectedField];
  return {
    name: toText(record[nameField] ?? ""),
    input: toText(record[inputField] ?? ""),
    // `!= null` treats JSON null like a missing value instead of producing the string "null".
    expected: expected != null ? toText(expected) : undefined,
    tags: parsedTags
  };
}
1313
/**
 * Loads a dataset from a UTF-8 JSON file.
 *
 * Two layouts are accepted:
 *  - a top-level array of case records (dataset name is ""), or
 *  - a top-level object with optional `name`, `version` and `cases` properties.
 *
 * @param path File to read.
 * @param options Optional overrides for the record property names.
 * @returns The dataset with every record converted by `mapRecordToCase`.
 * @throws {TypeError} When the document is neither an array nor an object, or
 *   when `cases` is present but is not an array. Read and JSON syntax errors
 *   propagate unchanged.
 */
async function loadDatasetFromJSON(path, options) {
  const content = await readFile(path, "utf-8");
  const parsed = JSON.parse(content);

  if (Array.isArray(parsed)) {
    return {
      name: "",
      cases: parsed.map((record) => mapRecordToCase(record, options))
    };
  }

  // JSON.parse can also return null or a primitive; report that instead of
  // failing on a property access or silently producing an empty dataset.
  if (parsed === null || typeof parsed !== "object") {
    throw new TypeError(`Invalid dataset in ${path}: expected a JSON array or object`);
  }

  const records = parsed.cases ?? [];
  if (!Array.isArray(records)) {
    throw new TypeError(`Invalid dataset in ${path}: "cases" must be an array`);
  }

  return {
    name: parsed.name ?? "",
    version: parsed.version,
    cases: records.map((record) => mapRecordToCase(record, options))
  };
}
1328
/**
 * Splits one CSV record into its fields.
 *
 * Fields are separated by commas. A double quote opens a quoted section in
 * which commas are literal and `""` stands for one quote character.
 * Whitespace outside quotes at the start or end of a field is dropped (this
 * also removes a trailing "\r" from CRLF files); whitespace inside quotes is
 * kept, so `" padded "` yields " padded " rather than "padded".
 *
 * @param line Text of a single record.
 * @returns The field values in order; an empty line yields [""].
 */
function parseCSVLine(line) {
  const fields = [];
  const whitespace = /\s/;
  let current = "";
  let inQuotes = false;
  // True once the current field has contained a quoted section.
  let sawQuote = false;
  // Number of unquoted whitespace characters currently at the end of `current`.
  let trailingBlank = 0;

  const endField = () => {
    fields.push(trailingBlank > 0 ? current.slice(0, current.length - trailingBlank) : current);
    current = "";
    sawQuote = false;
    trailingBlank = 0;
  };

  for (let i = 0; i < line.length; i++) {
    const char = line[i];
    if (inQuotes) {
      if (char === '"' && line[i + 1] === '"') {
        // Escaped quote: emit one quote and skip the second character.
        current += '"';
        i++;
      } else if (char === '"') {
        inQuotes = false;
      } else {
        current += char;
      }
    } else if (char === '"') {
      inQuotes = true;
      sawQuote = true;
      // Whitespace before a quoted section is interior to the field, not trailing.
      trailingBlank = 0;
    } else if (char === ",") {
      endField();
    } else if (whitespace.test(char)) {
      // Leading unquoted whitespace is skipped; later whitespace is kept
      // provisionally and removed in endField() if nothing follows it.
      if (current !== "" || sawQuote) {
        current += char;
        trailingBlank++;
      }
    } else {
      current += char;
      trailingBlank = 0;
    }
  }
  endField();
  return fields;
}
1355
/**
 * Splits CSV text into records, breaking only on newlines that fall outside
 * double-quoted fields so a quoted value may span several lines.
 * Blank records are dropped.
 *
 * An escaped quote (`""`) toggles the quote state twice and therefore leaves
 * it unchanged. NOTE: an unbalanced quote makes the rest of the text one record.
 */
function splitCSVRecords(content) {
  const records = [];
  let start = 0;
  let inQuotes = false;
  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    if (char === '"') {
      inQuotes = !inQuotes;
    } else if (char === "\n" && !inQuotes) {
      records.push(content.slice(start, i));
      start = i + 1;
    }
  }
  records.push(content.slice(start));
  return records.filter((record) => record.trim().length > 0);
}

/**
 * Loads a dataset from a UTF-8 CSV file.
 *
 * The first non-blank record is the header row; each later record becomes one
 * case whose properties are keyed by the header names. Rows shorter than the
 * header get "" for the missing columns; extra values are ignored. Quoted
 * fields may contain commas and line breaks.
 *
 * @param path File to read.
 * @param options Optional overrides for the column names to read.
 * @returns The dataset (name is always ""); empty when the file has no data rows.
 */
async function loadDatasetFromCSV(path, options) {
  const content = await readFile(path, "utf-8");
  const records = splitCSVRecords(content);
  if (records.length < 2) {
    return { name: "", cases: [] };
  }

  const [headerRecord, ...rowRecords] = records;
  const headers = parseCSVLine(headerRecord);
  const cases = rowRecords.map((row) => {
    const values = parseCSVLine(row);
    const record = {};
    headers.forEach((header, column) => {
      record[header] = values[column] ?? "";
    });
    return mapRecordToCase(record, options);
  });
  return { name: "", cases };
}
1374
/**
 * Loads a dataset from a UTF-8 JSON Lines file: one JSON record per line,
 * blank lines ignored.
 *
 * @param path File to read.
 * @param options Optional overrides for the record property names.
 * @returns The dataset (name is always "").
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and the 1-based line number so the bad record can be located.
 */
async function loadDatasetFromJSONL(path, options) {
  const content = await readFile(path, "utf-8");
  const lines = content.split("\n");
  const cases = [];
  for (let index = 0; index < lines.length; index++) {
    const line = lines[index];
    if (line.trim().length === 0) {
      continue;
    }
    let record;
    try {
      record = JSON.parse(line);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`Invalid JSON on line ${index + 1} of ${path}: ${reason}`);
    }
    cases.push(mapRecordToCase(record, options));
  }
  return { name: "", cases };
}
1381
/**
 * Loads a dataset with the loader that matches the file extension
 * (compared case-insensitively): ".json", ".csv" or ".jsonl".
 *
 * @param path File to read.
 * @param options Optional overrides for the record property names.
 * @returns The dataset produced by the selected loader.
 * @throws {Error} "Unsupported dataset format: <ext>" for any other extension.
 */
async function loadDataset(path, options) {
  const extension = extname(path).toLowerCase();
  if (extension === ".json") {
    return loadDatasetFromJSON(path, options);
  }
  if (extension === ".csv") {
    return loadDatasetFromCSV(path, options);
  }
  if (extension === ".jsonl") {
    return loadDatasetFromJSONL(path, options);
  }
  throw new Error(`Unsupported dataset format: ${extension}`);
}
1394
+ // src/eval-compare.ts
1395
+ import { mkdir, readFile as readFile2, writeFile } from "node:fs/promises";
1396
+ import { join } from "node:path";
1397
/**
 * Saves a baseline snapshot of an eval suite result as tab-indented JSON at
 * `<dir>/<result.name>.baseline.json`.
 *
 * The parent directory of the target file is created recursively, so a suite
 * name that contains a path separator (e.g. "auth/login") is stored in a
 * nested folder instead of failing with ENOENT.
 * NOTE(review): `result.name` is used in the path as-is; a name containing
 * ".." resolves outside `dir`.
 *
 * @param result Suite result; its name, score and per-case name/passed/score are stored.
 * @param dir Directory that holds baseline files.
 * @returns The path of the file that was written.
 */
async function saveBaseline(result, dir) {
  const filePath = join(dir, `${result.name}.baseline.json`);
  // join(filePath, "..") normalizes to the directory that will contain the file.
  await mkdir(join(filePath, ".."), { recursive: true });

  const baseline = {
    name: result.name,
    timestamp: Date.now(),
    score: result.score,
    // Keep only the fields needed for later comparison.
    results: result.results.map((r) => ({
      name: r.name,
      passed: r.passed,
      score: r.score
    }))
  };
  await writeFile(filePath, JSON.stringify(baseline, null, "\t"), "utf-8");
  return filePath;
}
1413
/**
 * Loads the baseline stored at `<dir>/<name>.baseline.json`.
 *
 * Missing, unreadable or unparsable files are deliberately reported as
 * "no baseline" rather than as errors. The same applies to a file whose JSON
 * is not an object with a `results` array, so callers never receive a value
 * that would crash a later comparison.
 *
 * @param name Suite name used when the baseline was saved.
 * @param dir Directory that holds baseline files.
 * @returns The parsed baseline, or `null` when no usable baseline exists.
 */
async function loadBaseline(name, dir) {
  const filePath = join(dir, `${name}.baseline.json`);
  let parsed;
  try {
    parsed = JSON.parse(await readFile2(filePath, "utf-8"));
  } catch {
    return null;
  }
  const usable =
    parsed !== null && typeof parsed === "object" && Array.isArray(parsed.results);
  return usable ? parsed : null;
}
1422
/**
 * Compares a current eval run with a saved baseline.
 *
 * Cases are matched by name; a current case with no baseline entry is skipped.
 * A matched case is listed under `regressions` when its score dropped and under
 * `improvements` when it rose. The overall `regression` flag is set when the
 * suite score dropped or when any case that passed in the baseline now fails.
 *
 * @param baseline Previously saved snapshot.
 * @param current Result of the run being checked.
 * @returns The comparison summary.
 */
function compareResults(baseline, current) {
  // Later duplicates overwrite earlier ones, as with a Map built from pairs.
  const baselineByName = new Map();
  for (const entry of baseline.results) {
    baselineByName.set(entry.name, entry);
  }

  const regressions = [];
  const improvements = [];
  let passedThenFailed = false;

  for (const now of current.results) {
    const before = baselineByName.get(now.name);
    if (!before) {
      continue;
    }
    const change = {
      name: now.name,
      baselineScore: before.score,
      currentScore: now.score
    };
    if (now.score < before.score) {
      regressions.push(change);
    } else if (now.score > before.score) {
      improvements.push(change);
    }
    if (before.passed && !now.passed) {
      passedThenFailed = true;
    }
  }

  const scoreDelta = current.score - baseline.score;
  return {
    baselineName: baseline.name,
    currentName: current.name,
    baselineScore: baseline.score,
    currentScore: current.score,
    delta: scoreDelta,
    regressions,
    improvements,
    regression: scoreDelta < 0 || passedThenFailed
  };
}
1460
/**
 * Renders a comparison as a multi-line text report.
 *
 * The report holds a header with the baseline and current names, a summary
 * line with both overall scores and the delta, an optional list of regressed
 * cases, an optional list of improved cases, and a closing result line that
 * reads "REGRESSION DETECTED" or "OK". Scores are multiplied by 100 and shown
 * with one decimal place.
 *
 * @param comparison Comparison to render.
 * @returns The report lines joined with newlines.
 */
+ function formatComparison(comparison) {
1461
+ const lines = [];
1462
// Non-negative deltas get an explicit "+"; negative ones already carry "-".
+ const deltaSign = comparison.delta >= 0 ? "+" : "";
1463
+ const deltaPercent = `${deltaSign}${(comparison.delta * 100).toFixed(1)}%`;
1464
+ lines.push(`
1465
+ Comparison: ${comparison.baselineName} -> ${comparison.currentName}`);
1466
+ lines.push(` ${"─".repeat(50)}`);
1467
+ lines.push(` Baseline: ${(comparison.baselineScore * 100).toFixed(1)}% | Current: ${(comparison.currentScore * 100).toFixed(1)}% | Delta: ${deltaPercent}`);
1468
// The per-case sections are emitted only when they have entries.
+ if (comparison.regressions.length > 0) {
1469
+ lines.push(`
1470
+ Regressions (${comparison.regressions.length}):`);
1471
+ for (const r of comparison.regressions) {
1472
+ lines.push(` - ${r.name}: ${(r.baselineScore * 100).toFixed(1)}% -> ${(r.currentScore * 100).toFixed(1)}%`);
1473
+ }
1474
+ }
1475
+ if (comparison.improvements.length > 0) {
1476
+ lines.push(`
1477
+ Improvements (${comparison.improvements.length}):`);
1478
+ for (const imp of comparison.improvements) {
1479
+ lines.push(` + ${imp.name}: ${(imp.baselineScore * 100).toFixed(1)}% -> ${(imp.currentScore * 100).toFixed(1)}%`);
1480
+ }
1481
+ }
1482
+ lines.push(` ${"─".repeat(50)}`);
1483
+ lines.push(` Result: ${comparison.regression ? "REGRESSION DETECTED" : "OK"}`);
1484
// The trailing empty entry makes the joined report end with a newline.
+ lines.push("");
1485
+ return lines.join(`
1486
+ `);
1487
+ }
1291
1488
  export {
1292
1489
  testSnapshot,
1490
+ saveBaseline,
1293
1491
  runEvalSuite,
1294
1492
  pinOutput,
1295
1493
  mockProvider,
1296
1494
  loadFixture,
1495
+ loadDatasetFromJSON,
1496
+ loadDatasetFromCSV,
1497
+ loadDataset,
1498
+ loadBaseline,
1297
1499
  hashOutput,
1298
1500
  formatEvalReport,
1501
+ formatComparison,
1299
1502
  definePrompt,
1300
1503
  createSnapshotStore,
1301
1504
  createReplayRecorder,
@@ -1305,6 +1508,7 @@ export {
1305
1508
  createPromptRegistry,
1306
1509
  createPinStore,
1307
1510
  createFixture,
1511
+ compareResults,
1308
1512
  assertStable,
1309
1513
  assertDeterministic
1310
1514
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@elsium-ai/testing",
3
- "version": "0.8.0",
3
+ "version": "0.9.0",
4
4
  "description": "Testing utilities, mock providers, fixtures, and eval framework for ElsiumAI",
5
5
  "license": "MIT",
6
6
  "author": "Eric Utrera <ebutrera9103@gmail.com>",
@@ -26,10 +26,10 @@
26
26
  "dev": "bun --watch src/index.ts"
27
27
  },
28
28
  "dependencies": {
29
- "@elsium-ai/core": "^0.8.0",
30
- "@elsium-ai/gateway": "^0.8.0",
31
- "@elsium-ai/agents": "^0.8.0",
32
- "@elsium-ai/tools": "^0.8.0"
29
+ "@elsium-ai/core": "^0.9.0",
30
+ "@elsium-ai/gateway": "^0.9.0",
31
+ "@elsium-ai/agents": "^0.9.0",
32
+ "@elsium-ai/tools": "^0.9.0"
33
33
  },
34
34
  "devDependencies": {
35
35
  "typescript": "^5.7.0"