@elsium-ai/testing 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dataset.d.ts +16 -0
- package/dist/dataset.d.ts.map +1 -0
- package/dist/eval-compare.d.ts +34 -0
- package/dist/eval-compare.d.ts.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +204 -0
- package/package.json +5 -5
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { EvalCase } from './eval';
|
|
2
|
+
/**
 * A set of evaluation cases loaded from a dataset file.
 */
export interface EvalDataset {
    /**
     * Dataset name. The loaders in this package fall back to an empty string
     * when the file does not provide one (CSV, JSONL, and top-level JSON arrays
     * never do).
     */
    name: string;
    /** Version label copied from an object-shaped JSON dataset, when present. */
    version?: string;
    /** The evaluation cases, in file order. */
    cases: Array<EvalCase>;
}
|
|
7
|
+
/**
 * Overrides for the record keys (JSON / JSONL) or column headers (CSV) that the
 * dataset loaders read when building each eval case.
 */
export interface DatasetLoaderOptions {
    /** Key holding the case input. Defaults to `"input"`. */
    inputField?: string;
    /** Key holding the expected output. Defaults to `"expected"`. */
    expectedField?: string;
    /** Key holding the case name. Defaults to `"name"`. */
    nameField?: string;
    /**
     * Key holding the case tags, either an array or a comma-separated string.
     * Defaults to `"tags"`.
     */
    tagsField?: string;
}
|
|
13
|
+
/**
 * Loads a dataset from a UTF-8 JSON file.
 *
 * The document may be a top-level array of case records (the dataset name is
 * then empty) or an object carrying `name`, `version` and a `cases` array.
 *
 * @param path - Location of the JSON file.
 * @param options - Optional overrides for the record keys to read.
 * @returns The parsed dataset.
 */
export declare function loadDatasetFromJSON(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;

/**
 * Loads a dataset from a UTF-8 CSV file whose first non-blank line is the
 * header row. A file with fewer than two non-blank lines yields an empty
 * dataset. The dataset name is always empty.
 *
 * @param path - Location of the CSV file.
 * @param options - Optional overrides for the column headers to read.
 * @returns The parsed dataset.
 */
export declare function loadDatasetFromCSV(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;

/**
 * Loads a dataset, choosing the parser from the file extension
 * (case-insensitive): `.json`, `.csv` or `.jsonl`.
 *
 * @param path - Location of the dataset file.
 * @param options - Optional field-name overrides passed to the chosen loader.
 * @returns The parsed dataset.
 * @throws Error when the extension is not one of the supported formats.
 */
export declare function loadDataset(path: string, options?: DatasetLoaderOptions): Promise<EvalDataset>;
|
|
16
|
+
//# sourceMappingURL=dataset.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../src/dataset.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,QAAQ,CAAA;AAEtC,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,EAAE,QAAQ,EAAE,CAAA;CACjB;AAED,MAAM,WAAW,oBAAoB;IACpC,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AA8BD,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAkBtB;AAgCD,wBAAsB,kBAAkB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAqBtB;AAYD,wBAAsB,WAAW,CAChC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,oBAAoB,GAC5B,OAAO,CAAC,WAAW,CAAC,CAatB"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { EvalSuiteResult } from './eval';
|
|
2
|
+
/**
 * Snapshot of an eval suite run, as persisted by `saveBaseline` and read back
 * by `loadBaseline`.
 */
export interface EvalBaseline {
    /** Name of the suite the snapshot was taken from. */
    name: string;
    /** Time the snapshot was saved, as returned by `Date.now()`. */
    timestamp: number;
    /** Overall suite score at the time of the snapshot. */
    score: number;
    /** Per-case outcome recorded in the snapshot. */
    results: Array<{
        name: string;
        passed: boolean;
        score: number;
    }>;
}
|
|
12
|
+
/**
 * Outcome of comparing a current eval suite result against a stored baseline.
 */
export interface EvalComparison {
    /** Name stored in the baseline. */
    baselineName: string;
    /** Name of the current suite result. */
    currentName: string;
    /** Overall score stored in the baseline. */
    baselineScore: number;
    /** Overall score of the current run. */
    currentScore: number;
    /** `currentScore - baselineScore`; negative when the overall score dropped. */
    delta: number;
    /** Cases present in both runs whose score is lower than in the baseline. */
    regressions: Array<{
        name: string;
        baselineScore: number;
        currentScore: number;
    }>;
    /** Cases present in both runs whose score is higher than in the baseline. */
    improvements: Array<{
        name: string;
        baselineScore: number;
        currentScore: number;
    }>;
    /**
     * True when the overall score dropped, or when any case that passed in the
     * baseline no longer passes.
     */
    regression: boolean;
}
|
|
30
|
+
/**
 * Writes a baseline snapshot of `result` to `<dir>/<result.name>.baseline.json`,
 * creating `dir` first if needed.
 *
 * @param result - Suite result to snapshot.
 * @param dir - Directory that holds baseline files.
 * @returns The path of the file that was written.
 */
export declare function saveBaseline(result: EvalSuiteResult, dir: string): Promise<string>;

/**
 * Reads the baseline stored at `<dir>/<name>.baseline.json`.
 *
 * @param name - Suite name the baseline was saved under.
 * @param dir - Directory that holds baseline files.
 * @returns The stored baseline, or `null` when the file cannot be read or parsed.
 */
export declare function loadBaseline(name: string, dir: string): Promise<EvalBaseline | null>;

/**
 * Compares a current suite result with a baseline, matching cases by name.
 * Cases that exist on only one side are ignored in the per-case lists.
 *
 * @param baseline - Previously stored snapshot.
 * @param current - Result of the run being checked.
 * @returns Score delta, per-case regressions and improvements, and an overall regression flag.
 */
export declare function compareResults(baseline: EvalBaseline, current: EvalSuiteResult): EvalComparison;

/**
 * Renders a comparison as a multi-line, human-readable text report.
 *
 * @param comparison - Comparison produced by `compareResults`.
 * @returns The formatted report.
 */
export declare function formatComparison(comparison: EvalComparison): string;
|
|
34
|
+
//# sourceMappingURL=eval-compare.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-compare.d.ts","sourceRoot":"","sources":["../src/eval-compare.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,QAAQ,CAAA;AAE7C,MAAM,WAAW,YAAY;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,SAAS,EAAE,MAAM,CAAA;IACjB,KAAK,EAAE,MAAM,CAAA;IACb,OAAO,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,OAAO,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAChE;AAED,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,MAAM,CAAA;IACpB,WAAW,EAAE,MAAM,CAAA;IACnB,aAAa,EAAE,MAAM,CAAA;IACrB,YAAY,EAAE,MAAM,CAAA;IACpB,KAAK,EAAE,MAAM,CAAA;IACb,WAAW,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IACjF,YAAY,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,aAAa,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;IAClF,UAAU,EAAE,OAAO,CAAA;CACnB;AAED,wBAAsB,YAAY,CAAC,MAAM,EAAE,eAAe,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAiBxF;AAED,wBAAsB,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CAQ1F;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,YAAY,EAAE,OAAO,EAAE,eAAe,GAAG,cAAc,CAyC/F;AAED,wBAAgB,gBAAgB,CAAC,UAAU,EAAE,cAAc,GAAG,MAAM,CAkCnE"}
|
package/dist/index.d.ts
CHANGED
|
@@ -16,4 +16,8 @@ export { createPinStore, pinOutput } from './pinning';
|
|
|
16
16
|
export type { Pin, PinStore, PinResult } from './pinning';
|
|
17
17
|
export { assertDeterministic, assertStable } from './determinism';
|
|
18
18
|
export type { DeterminismResult, StabilityResult } from './determinism';
|
|
19
|
+
export { loadDataset, loadDatasetFromJSON, loadDatasetFromCSV } from './dataset';
|
|
20
|
+
export type { EvalDataset, DatasetLoaderOptions } from './dataset';
|
|
21
|
+
export { saveBaseline, loadBaseline, compareResults, formatComparison } from './eval-compare';
|
|
22
|
+
export type { EvalBaseline, EvalComparison } from './eval-compare';
|
|
19
23
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAA;AAC9C,YAAY,EAAE,YAAY,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAG5F,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AACvE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGxE,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,QAAQ,CAAA;AACvD,YAAY,EACX,QAAQ,EACR,aAAa,EACb,UAAU,EACV,eAAe,EACf,eAAe,EACf,eAAe,EACf,QAAQ,GACR,MAAM,QAAQ,CAAA;AAGf,OAAO,EAAE,mBAAmB,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAC1E,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAA;AAGnF,OAAO,EAAE,oBAAoB,EAAE,YAAY,EAAE,MAAM,WAAW,CAAA;AAC9D,YAAY,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,WAAW,CAAA;AAGvF,OAAO,EAAE,qBAAqB,EAAE,MAAM,cAAc,CAAA;AACpD,YAAY,EACX,kBAAkB,EAClB,gBAAgB,EAChB,gBAAgB,EAChB,eAAe,GACf,MAAM,cAAc,CAAA;AAGrB,OAAO,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,UAAU,CAAA;AACnE,YAAY,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,UAAU,CAAA;AAGzE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrD,YAAY,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGzD,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,eAAe,CAAA;AACjE,YAAY,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAA;AAC9C,YAAY,EAAE,YAAY,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAG5F,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,YAAY,CAAA;AACvE,YAAY,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,YAAY,CAAA;AAGxE,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,QAAQ,CAAA;AACvD,YAAY,EACX,QAAQ,EACR,aAAa,EACb,UAAU,EACV,eAAe,EACf,eAAe,EACf,eAAe,EACf,QAAQ,GACR,MAAM,QAAQ,CAAA;AAGf,OAAO,EAAE,mBAAmB,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAC1E,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAA;AAGnF,OAAO,EAAE,oBAAoB,EAAE,YAAY,EAAE,MAAM,WAAW,CAAA;AAC9D,YAAY,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,WAAW,CAAA;AAGvF,OAAO,EAAE,qBAAqB,EAAE,MAAM,cAAc,CAAA;AACpD,YAAY,EACX,kBAAkB,EAClB,gBAAgB,EAChB,gBAAgB,EAChB,eAAe,GACf,MAAM,cAAc,CAAA;AAGrB,OAAO,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,MAAM,UAAU,CAAA;AACnE,YAAY,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,UAAU,CAAA;AAGzE,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AACrD,YAAY,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,WAAW,CAAA;AAGzD,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,eAAe,CAAA;AACjE,YAAY,EAAE,iBAAiB,EAAE,eAAe,EAAE,MAAM,eAAe,CAAA;AAGvE,OAAO,EAAE,WAAW,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,WAAW,CAAA;AAChF,YAAY,EAAE,WAAW,EAAE,oBAAoB,EAAE,MAAM,WAAW,CAAA;AAGlE,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAA;AAC7F,YAAY,EAAE,YAAY,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA"}
|
package/dist/index.js
CHANGED
|
@@ -1288,14 +1288,217 @@ async function assertStable(fn, options) {
|
|
|
1288
1288
|
variance
|
|
1289
1289
|
};
|
|
1290
1290
|
}
|
|
1291
|
+
// src/dataset.ts
|
|
1292
|
+
import { readFile } from "node:fs/promises";
|
|
1293
|
+
import { extname } from "node:path";
|
|
1294
|
+
/**
 * Converts one raw dataset record into an eval case.
 *
 * @param record - Raw record: an object parsed from JSON / JSONL, or a
 *   header-keyed object built from a CSV row. `null` and non-object values are
 *   treated as an empty record instead of throwing.
 * @param options - Optional field-name overrides; the defaults are "input",
 *   "expected", "name" and "tags".
 * @returns `{ name, input, expected, tags }`. `name` and `input` are always
 *   strings ("" when missing); `expected` and `tags` are `undefined` when the
 *   record does not provide them.
 */
function mapRecordToCase(record, options) {
  const inputField = options?.inputField ?? "input";
  const expectedField = options?.expectedField ?? "expected";
  const nameField = options?.nameField ?? "name";
  const tagsField = options?.tagsField ?? "tags";

  // A JSON array may contain null or primitive entries; reading fields off
  // them would throw (null) or be meaningless, so fall back to an empty record.
  const source = record !== null && typeof record === "object" ? record : {};

  const rawTags = source[tagsField];
  let tags;
  if (typeof rawTags === "string") {
    // CSV-style tags: "a, b,c" -> ["a", "b", "c"], dropping empty entries.
    tags = rawTags.split(",").map((t) => t.trim()).filter(Boolean);
  } else if (Array.isArray(rawTags)) {
    tags = rawTags.map(String);
  }

  // JSON `null` means "no expectation"; stringifying it would produce the
  // literal text "null" and make the case compare against that.
  const rawExpected = source[expectedField];
  const hasExpected = rawExpected !== undefined && rawExpected !== null;

  return {
    name: String(source[nameField] ?? ""),
    input: String(source[inputField] ?? ""),
    expected: hasExpected ? String(rawExpected) : undefined,
    tags
  };
}
|
|
1313
|
+
/**
 * Loads a dataset from a UTF-8 JSON file.
 *
 * Two document shapes are accepted:
 *  - a top-level array of case records (the dataset name is then ""), or
 *  - an object with optional `name`, `version` and `cases` (array) properties.
 *
 * @param path - Location of the JSON file.
 * @param options - Optional field-name overrides passed to `mapRecordToCase`.
 * @returns The dataset with every record mapped to an eval case.
 * @throws SyntaxError when the file is not valid JSON.
 * @throws TypeError when the document is `null` or its `cases` is not an array.
 */
async function loadDatasetFromJSON(path, options) {
  const content = await readFile(path, "utf-8");
  const parsed = JSON.parse(content);

  if (Array.isArray(parsed)) {
    return {
      name: "",
      cases: parsed.map((record) => mapRecordToCase(record, options))
    };
  }

  // These inputs already failed with an opaque TypeError ("cannot read
  // properties of null" / "map is not a function"); keep the exception type
  // but say which file is wrong and why.
  if (parsed === null) {
    throw new TypeError(`Invalid dataset in ${path}: expected an array of cases or an object, got null`);
  }
  const rawCases = parsed.cases ?? [];
  if (!Array.isArray(rawCases)) {
    throw new TypeError(`Invalid dataset in ${path}: "cases" must be an array`);
  }

  return {
    name: parsed.name ?? "",
    version: parsed.version,
    cases: rawCases.map((record) => mapRecordToCase(record, options))
  };
}
|
|
1328
|
+
/**
 * Splits one CSV record into its fields.
 *
 * Rules:
 *  - `,` separates fields unless it appears inside double quotes;
 *  - inside quotes, `""` stands for a literal `"`;
 *  - whitespace at the edges of a field is dropped when it is outside quotes
 *    (this also removes a trailing `\r` from CRLF files);
 *  - whitespace inside quotes is kept verbatim, so `" a "` yields ` a `.
 *
 * @param line - Text of a single CSV record.
 * @returns The field values in order; an empty line yields `[""]`.
 */
function parseCSVLine(line) {
  const fields = [];
  let current = "";
  // Unquoted whitespace seen since the last committed character. It is only
  // committed once more content follows, which trims unquoted edges without
  // touching whitespace that was deliberately quoted.
  let pendingSpace = "";
  let inQuotes = false;

  for (let i = 0; i < line.length; i++) {
    const char = line[i];
    if (inQuotes) {
      if (char === '"' && line[i + 1] === '"') {
        // Escaped quote: emit one `"` and skip its partner.
        current += '"';
        i++;
      } else if (char === '"') {
        inQuotes = false;
      } else {
        current += char;
      }
    } else if (char === '"') {
      // Whitespace before the quote is interior only if content precedes it.
      if (current !== "") {
        current += pendingSpace;
      }
      pendingSpace = "";
      inQuotes = true;
    } else if (char === ",") {
      // Any pending whitespace is trailing and therefore discarded.
      fields.push(current);
      current = "";
      pendingSpace = "";
    } else if (char.trim() === "") {
      // Same definition of whitespace as String.prototype.trim.
      pendingSpace += char;
    } else {
      if (current !== "") {
        current += pendingSpace;
      }
      pendingSpace = "";
      current += char;
    }
  }

  fields.push(current);
  return fields;
}
|
|
1355
|
+
/**
 * Splits CSV text into records on line feeds that are outside double quotes,
 * so a quoted field may span several physical lines.
 */
function splitCSVRecords(content) {
  const records = [];
  let current = "";
  let inQuotes = false;
  for (const char of content) {
    if (char === '"') {
      // An escaped quote (`""`) toggles twice and leaves the state unchanged.
      inQuotes = !inQuotes;
      current += char;
    } else if (char === "\n" && !inQuotes) {
      records.push(current);
      current = "";
    } else {
      current += char;
    }
  }
  records.push(current);
  return records;
}

/**
 * Loads a dataset from a UTF-8 CSV file.
 *
 * The first non-blank record is the header row; every following non-blank
 * record becomes one case keyed by those headers. Missing trailing values are
 * filled with "". Quoted fields may contain commas and line breaks.
 *
 * @param path - Location of the CSV file.
 * @param options - Optional column-name overrides passed to `mapRecordToCase`.
 * @returns A dataset with an empty name; `cases` is empty when the file has
 *   no data rows.
 */
async function loadDatasetFromCSV(path, options) {
  const content = await readFile(path, "utf-8");
  const records = splitCSVRecords(content).filter((record) => record.trim().length > 0);
  if (records.length < 2) {
    return { name: "", cases: [] };
  }

  const headers = parseCSVLine(records[0]);
  const cases = records.slice(1).map((row) => {
    const values = parseCSVLine(row);
    const record = {};
    headers.forEach((header, column) => {
      // Short rows are padded so every header is present on the record.
      record[header] = values[column] ?? "";
    });
    return mapRecordToCase(record, options);
  });

  return { name: "", cases };
}
|
|
1374
|
+
/**
 * Loads a dataset from a UTF-8 JSON Lines file: one JSON record per line,
 * blank lines ignored.
 *
 * @param path - Location of the JSONL file.
 * @param options - Optional field-name overrides passed to `mapRecordToCase`.
 * @returns A dataset with an empty name and one case per non-blank line.
 * @throws SyntaxError when a non-blank line is not valid JSON.
 */
async function loadDatasetFromJSONL(path, options) {
  const text = await readFile(path, "utf-8");
  const cases = [];
  for (const rawLine of text.split("\n")) {
    if (rawLine.trim().length === 0) {
      continue;
    }
    cases.push(mapRecordToCase(JSON.parse(rawLine), options));
  }
  return { name: "", cases };
}
|
|
1381
|
+
/**
 * Loads a dataset, picking the parser from the file extension
 * (compared case-insensitively).
 *
 * @param path - Location of the dataset file (`.json`, `.csv` or `.jsonl`).
 * @param options - Optional field-name overrides forwarded to the loader.
 * @returns The dataset produced by the matching loader.
 * @throws Error when the extension is not supported.
 */
async function loadDataset(path, options) {
  const extension = extname(path).toLowerCase();
  if (extension === ".json") {
    return loadDatasetFromJSON(path, options);
  }
  if (extension === ".csv") {
    return loadDatasetFromCSV(path, options);
  }
  if (extension === ".jsonl") {
    return loadDatasetFromJSONL(path, options);
  }
  throw new Error(`Unsupported dataset format: ${extension}`);
}
|
|
1394
|
+
// src/eval-compare.ts
|
|
1395
|
+
import { mkdir, readFile as readFile2, writeFile } from "node:fs/promises";
|
|
1396
|
+
import { join } from "node:path";
|
|
1397
|
+
/**
 * Persists a baseline snapshot of a suite result as tab-indented JSON.
 *
 * The file is written to `<dir>/<result.name>.baseline.json`; `dir` is created
 * (recursively) if it does not exist. Only each case's name, pass flag and
 * score are stored, together with the suite name, overall score and the
 * current `Date.now()` timestamp.
 *
 * @param result - Suite result to snapshot.
 * @param dir - Directory that holds baseline files.
 * @returns The path of the file that was written.
 */
async function saveBaseline(result, dir) {
  await mkdir(dir, { recursive: true });

  const snapshot = {
    name: result.name,
    timestamp: Date.now(),
    score: result.score,
    results: result.results.map((entry) => {
      return { name: entry.name, passed: entry.passed, score: entry.score };
    })
  };

  const target = join(dir, `${result.name}.baseline.json`);
  const serialized = JSON.stringify(snapshot, null, "\t");
  await writeFile(target, serialized, "utf-8");
  return target;
}
|
|
1413
|
+
/**
 * Reads the baseline stored at `<dir>/<name>.baseline.json`.
 *
 * This is deliberately best-effort: a missing, unreadable or malformed file
 * resolves to `null` rather than rejecting, so callers can treat "no usable
 * baseline" uniformly.
 *
 * @param name - Suite name the baseline was saved under.
 * @param dir - Directory that holds baseline files.
 * @returns The parsed baseline, or `null` when no usable baseline exists.
 */
async function loadBaseline(name, dir) {
  const filePath = join(dir, `${name}.baseline.json`);
  try {
    const parsed = JSON.parse(await readFile2(filePath, "utf-8"));
    // Valid JSON is not necessarily a baseline: `null`, a primitive or an
    // object without a `results` array would otherwise be returned as-is and
    // crash later when the comparison iterates `baseline.results`.
    const usable =
      parsed !== null && typeof parsed === "object" && Array.isArray(parsed.results);
    return usable ? parsed : null;
  } catch {
    return null;
  }
}
|
|
1422
|
+
/**
 * Compares a current suite result against a stored baseline.
 *
 * Cases are matched by name; a case that exists only in the current run is
 * skipped. A matched case is listed as a regression when its score went down
 * and as an improvement when it went up.
 *
 * @param baseline - Previously stored snapshot.
 * @param current - Result of the run being checked.
 * @returns The comparison. `regression` is true when the overall score
 *   dropped or when any matched case passed in the baseline and fails now.
 */
function compareResults(baseline, current) {
  const priorByName = new Map();
  for (const entry of baseline.results) {
    priorByName.set(entry.name, entry);
  }

  const regressions = [];
  const improvements = [];
  let passedThenFailed = false;

  for (const outcome of current.results) {
    const prior = priorByName.get(outcome.name);
    if (!prior) {
      continue;
    }

    const change = {
      name: outcome.name,
      baselineScore: prior.score,
      currentScore: outcome.score
    };
    if (outcome.score < prior.score) {
      regressions.push(change);
    } else if (outcome.score > prior.score) {
      improvements.push(change);
    }

    // A pass -> fail flip counts as a regression regardless of the scores.
    if (prior.passed && !outcome.passed) {
      passedThenFailed = true;
    }
  }

  const delta = current.score - baseline.score;
  return {
    baselineName: baseline.name,
    currentName: current.name,
    baselineScore: baseline.score,
    currentScore: current.score,
    delta,
    regressions,
    improvements,
    regression: delta < 0 || passedThenFailed
  };
}
|
|
1460
|
+
/**
 * Renders a comparison as a multi-line text report: a header, a summary line
 * with baseline / current / delta percentages, optional "Regressions" and
 * "Improvements" sections, and a final verdict line.
 *
 * Scores are multiplied by 100 and printed with one decimal place.
 *
 * @param comparison - Comparison produced by `compareResults`.
 * @returns The report text, ending with a newline.
 */
function formatComparison(comparison) {
  const asPercent = (value) => (value * 100).toFixed(1);
  const rule = ` ${"─".repeat(50)}`;
  // Negative numbers already carry their own "-" sign.
  const sign = comparison.delta >= 0 ? "+" : "";
  const deltaPercent = `${sign}${asPercent(comparison.delta)}%`;

  const report = [
    `\nComparison: ${comparison.baselineName} -> ${comparison.currentName}`,
    rule,
    ` Baseline: ${asPercent(comparison.baselineScore)}% | Current: ${asPercent(comparison.currentScore)}% | Delta: ${deltaPercent}`
  ];

  if (comparison.regressions.length > 0) {
    report.push(`\nRegressions (${comparison.regressions.length}):`);
    comparison.regressions.forEach((entry) => {
      report.push(` - ${entry.name}: ${asPercent(entry.baselineScore)}% -> ${asPercent(entry.currentScore)}%`);
    });
  }

  if (comparison.improvements.length > 0) {
    report.push(`\nImprovements (${comparison.improvements.length}):`);
    comparison.improvements.forEach((entry) => {
      report.push(` + ${entry.name}: ${asPercent(entry.baselineScore)}% -> ${asPercent(entry.currentScore)}%`);
    });
  }

  report.push(rule);
  report.push(` Result: ${comparison.regression ? "REGRESSION DETECTED" : "OK"}`);
  report.push("");
  return report.join("\n");
}
|
|
1291
1488
|
export {
|
|
1292
1489
|
testSnapshot,
|
|
1490
|
+
saveBaseline,
|
|
1293
1491
|
runEvalSuite,
|
|
1294
1492
|
pinOutput,
|
|
1295
1493
|
mockProvider,
|
|
1296
1494
|
loadFixture,
|
|
1495
|
+
loadDatasetFromJSON,
|
|
1496
|
+
loadDatasetFromCSV,
|
|
1497
|
+
loadDataset,
|
|
1498
|
+
loadBaseline,
|
|
1297
1499
|
hashOutput,
|
|
1298
1500
|
formatEvalReport,
|
|
1501
|
+
formatComparison,
|
|
1299
1502
|
definePrompt,
|
|
1300
1503
|
createSnapshotStore,
|
|
1301
1504
|
createReplayRecorder,
|
|
@@ -1305,6 +1508,7 @@ export {
|
|
|
1305
1508
|
createPromptRegistry,
|
|
1306
1509
|
createPinStore,
|
|
1307
1510
|
createFixture,
|
|
1511
|
+
compareResults,
|
|
1308
1512
|
assertStable,
|
|
1309
1513
|
assertDeterministic
|
|
1310
1514
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elsium-ai/testing",
|
|
3
|
-
"version": "0.8.0",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Testing utilities, mock providers, fixtures, and eval framework for ElsiumAI",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Eric Utrera <ebutrera9103@gmail.com>",
|
|
@@ -26,10 +26,10 @@
|
|
|
26
26
|
"dev": "bun --watch src/index.ts"
|
|
27
27
|
},
|
|
28
28
|
"dependencies": {
|
|
29
|
-
"@elsium-ai/core": "^0.
|
|
30
|
-
"@elsium-ai/gateway": "^0.
|
|
31
|
-
"@elsium-ai/agents": "^0.
|
|
32
|
-
"@elsium-ai/tools": "^0.
|
|
29
|
+
"@elsium-ai/core": "^0.9.0",
|
|
30
|
+
"@elsium-ai/gateway": "^0.9.0",
|
|
31
|
+
"@elsium-ai/agents": "^0.9.0",
|
|
32
|
+
"@elsium-ai/tools": "^0.9.0"
|
|
33
33
|
},
|
|
34
34
|
"devDependencies": {
|
|
35
35
|
"typescript": "^5.7.0"
|