@evalgate/sdk 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/CHANGELOG.md +638 -0
  2. package/README.md +398 -0
  3. package/dist/assertions.d.ts +189 -0
  4. package/dist/assertions.js +662 -0
  5. package/dist/batch.d.ts +68 -0
  6. package/dist/batch.js +179 -0
  7. package/dist/cache.d.ts +65 -0
  8. package/dist/cache.js +131 -0
  9. package/dist/cli/api.d.ts +108 -0
  10. package/dist/cli/api.js +132 -0
  11. package/dist/cli/baseline.d.ts +10 -0
  12. package/dist/cli/baseline.js +172 -0
  13. package/dist/cli/check.d.ts +73 -0
  14. package/dist/cli/check.js +355 -0
  15. package/dist/cli/ci-context.d.ts +6 -0
  16. package/dist/cli/ci-context.js +112 -0
  17. package/dist/cli/ci.d.ts +45 -0
  18. package/dist/cli/ci.js +192 -0
  19. package/dist/cli/config.d.ts +30 -0
  20. package/dist/cli/config.js +230 -0
  21. package/dist/cli/constants.d.ts +15 -0
  22. package/dist/cli/constants.js +18 -0
  23. package/dist/cli/diff.d.ts +173 -0
  24. package/dist/cli/diff.js +685 -0
  25. package/dist/cli/discover.d.ts +84 -0
  26. package/dist/cli/discover.js +419 -0
  27. package/dist/cli/doctor.d.ts +88 -0
  28. package/dist/cli/doctor.js +675 -0
  29. package/dist/cli/env.d.ts +21 -0
  30. package/dist/cli/env.js +42 -0
  31. package/dist/cli/explain.d.ts +58 -0
  32. package/dist/cli/explain.js +561 -0
  33. package/dist/cli/formatters/github.d.ts +8 -0
  34. package/dist/cli/formatters/github.js +135 -0
  35. package/dist/cli/formatters/human.d.ts +6 -0
  36. package/dist/cli/formatters/human.js +110 -0
  37. package/dist/cli/formatters/json.d.ts +6 -0
  38. package/dist/cli/formatters/json.js +10 -0
  39. package/dist/cli/formatters/pr-comment.d.ts +12 -0
  40. package/dist/cli/formatters/pr-comment.js +103 -0
  41. package/dist/cli/formatters/types.d.ts +103 -0
  42. package/dist/cli/formatters/types.js +8 -0
  43. package/dist/cli/gate.d.ts +21 -0
  44. package/dist/cli/gate.js +179 -0
  45. package/dist/cli/impact-analysis.d.ts +63 -0
  46. package/dist/cli/impact-analysis.js +252 -0
  47. package/dist/cli/index.d.ts +9 -0
  48. package/dist/cli/index.js +332 -0
  49. package/dist/cli/init.d.ts +16 -0
  50. package/dist/cli/init.js +292 -0
  51. package/dist/cli/manifest.d.ts +103 -0
  52. package/dist/cli/manifest.js +282 -0
  53. package/dist/cli/migrate.d.ts +41 -0
  54. package/dist/cli/migrate.js +349 -0
  55. package/dist/cli/policy-packs.d.ts +23 -0
  56. package/dist/cli/policy-packs.js +89 -0
  57. package/dist/cli/print-config.d.ts +29 -0
  58. package/dist/cli/print-config.js +270 -0
  59. package/dist/cli/profiles.d.ts +28 -0
  60. package/dist/cli/profiles.js +30 -0
  61. package/dist/cli/reason-codes.d.ts +17 -0
  62. package/dist/cli/reason-codes.js +19 -0
  63. package/dist/cli/regression-gate.d.ts +15 -0
  64. package/dist/cli/regression-gate.js +341 -0
  65. package/dist/cli/render/snippet.d.ts +5 -0
  66. package/dist/cli/render/snippet.js +15 -0
  67. package/dist/cli/render/sort.d.ts +10 -0
  68. package/dist/cli/render/sort.js +24 -0
  69. package/dist/cli/report/build-check-report.d.ts +19 -0
  70. package/dist/cli/report/build-check-report.js +132 -0
  71. package/dist/cli/run.d.ts +101 -0
  72. package/dist/cli/run.js +395 -0
  73. package/dist/cli/share.d.ts +17 -0
  74. package/dist/cli/share.js +91 -0
  75. package/dist/cli/upgrade.d.ts +15 -0
  76. package/dist/cli/upgrade.js +492 -0
  77. package/dist/cli/workspace.d.ts +31 -0
  78. package/dist/cli/workspace.js +68 -0
  79. package/dist/client.d.ts +368 -0
  80. package/dist/client.js +893 -0
  81. package/dist/client.request.test.d.ts +1 -0
  82. package/dist/client.request.test.js +232 -0
  83. package/dist/context.d.ts +134 -0
  84. package/dist/context.js +215 -0
  85. package/dist/errors.d.ts +82 -0
  86. package/dist/errors.js +298 -0
  87. package/dist/export.d.ts +195 -0
  88. package/dist/export.js +344 -0
  89. package/dist/index.d.ts +44 -0
  90. package/dist/index.js +153 -0
  91. package/dist/integrations/anthropic.d.ts +91 -0
  92. package/dist/integrations/anthropic.js +163 -0
  93. package/dist/integrations/openai-eval.d.ts +57 -0
  94. package/dist/integrations/openai-eval.js +232 -0
  95. package/dist/integrations/openai.d.ts +92 -0
  96. package/dist/integrations/openai.js +160 -0
  97. package/dist/local.d.ts +39 -0
  98. package/dist/local.js +148 -0
  99. package/dist/logger.d.ts +128 -0
  100. package/dist/logger.js +227 -0
  101. package/dist/matchers/index.d.ts +1 -0
  102. package/dist/matchers/index.js +6 -0
  103. package/dist/matchers/to-pass-gate.d.ts +29 -0
  104. package/dist/matchers/to-pass-gate.js +35 -0
  105. package/dist/pagination.d.ts +74 -0
  106. package/dist/pagination.js +139 -0
  107. package/dist/regression.d.ts +100 -0
  108. package/dist/regression.js +44 -0
  109. package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
  110. package/dist/runtime/adapters/config-to-dsl.js +400 -0
  111. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
  112. package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
  113. package/dist/runtime/context.d.ts +26 -0
  114. package/dist/runtime/context.js +74 -0
  115. package/dist/runtime/eval.d.ts +46 -0
  116. package/dist/runtime/eval.js +244 -0
  117. package/dist/runtime/execution-mode.d.ts +80 -0
  118. package/dist/runtime/execution-mode.js +357 -0
  119. package/dist/runtime/executor.d.ts +16 -0
  120. package/dist/runtime/executor.js +152 -0
  121. package/dist/runtime/registry.d.ts +78 -0
  122. package/dist/runtime/registry.js +403 -0
  123. package/dist/runtime/run-report.d.ts +200 -0
  124. package/dist/runtime/run-report.js +222 -0
  125. package/dist/runtime/types.d.ts +356 -0
  126. package/dist/runtime/types.js +76 -0
  127. package/dist/snapshot.d.ts +176 -0
  128. package/dist/snapshot.js +322 -0
  129. package/dist/streaming.d.ts +173 -0
  130. package/dist/streaming.js +268 -0
  131. package/dist/testing.d.ts +273 -0
  132. package/dist/testing.js +317 -0
  133. package/dist/types.d.ts +754 -0
  134. package/dist/types.js +54 -0
  135. package/dist/utils/input-hash.d.ts +8 -0
  136. package/dist/utils/input-hash.js +41 -0
  137. package/dist/version.d.ts +7 -0
  138. package/dist/version.js +10 -0
  139. package/dist/workflows.d.ts +389 -0
  140. package/dist/workflows.js +671 -0
  141. package/package.json +117 -0
@@ -0,0 +1,685 @@
1
+ "use strict";
2
+ /**
3
+ * TICKET 5 — Behavioral Diff CLI (EVAL-401)
4
+ *
5
+ * Goal: "Git diff for AI behavior" from two RunReports
6
+ *
7
+ * Command:
8
+ * evalgate diff --base main (default uses git to find baseline run)
9
+ * evalgate diff --a <runReportPath> --b <runReportPath>
10
+ * evalgate diff main..feature (nice-to-have alias)
11
+ */
12
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
13
+ if (k2 === undefined) k2 = k;
14
+ var desc = Object.getOwnPropertyDescriptor(m, k);
15
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
16
+ desc = { enumerable: true, get: function() { return m[k]; } };
17
+ }
18
+ Object.defineProperty(o, k2, desc);
19
+ }) : (function(o, m, k, k2) {
20
+ if (k2 === undefined) k2 = k;
21
+ o[k2] = m[k];
22
+ }));
23
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
24
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
25
+ }) : function(o, v) {
26
+ o["default"] = v;
27
+ });
28
+ var __importStar = (this && this.__importStar) || (function () {
29
+ var ownKeys = function(o) {
30
+ ownKeys = Object.getOwnPropertyNames || function (o) {
31
+ var ar = [];
32
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
33
+ return ar;
34
+ };
35
+ return ownKeys(o);
36
+ };
37
+ return function (mod) {
38
+ if (mod && mod.__esModule) return mod;
39
+ var result = {};
40
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
41
+ __setModuleDefault(result, mod);
42
+ return result;
43
+ };
44
+ })();
45
+ Object.defineProperty(exports, "__esModule", { value: true });
46
+ exports.diffCore = exports.SUPPORTED_SCHEMA_VERSIONS = exports.DIFF_SCHEMA_VERSION = void 0;
47
+ exports.round = round;
48
+ exports.roundPct = roundPct;
49
+ exports.validateSchemaVersion = validateSchemaVersion;
50
+ exports.runDiff = runDiff;
51
+ exports.compareReports = compareReports;
52
+ exports.calculateDiffSummary = calculateDiffSummary;
53
+ exports.printHumanResults = printHumanResults;
54
+ exports.printJsonResults = printJsonResults;
55
+ exports.writeGitHubStepSummary = writeGitHubStepSummary;
56
+ exports.runDiffCLI = runDiffCLI;
57
+ exports.classifyDiff = classifyDiff;
58
+ exports.calculateDeltas = calculateDeltas;
59
+ const node_child_process_1 = require("node:child_process");
60
+ const fs = __importStar(require("node:fs/promises"));
61
+ const path = __importStar(require("node:path"));
62
+ const env_1 = require("./env");
63
+ const workspace_1 = require("./workspace");
64
+ /**
65
+ * Diff schema version
66
+ */
67
+ exports.DIFF_SCHEMA_VERSION = 1;
68
+ /**
69
+ * Supported RunReport schema versions
70
+ */
71
+ exports.SUPPORTED_SCHEMA_VERSIONS = [1];
72
/**
 * Round a number to a fixed count of decimal places.
 *
 * Used to normalise floating point noise before values are compared or
 * printed.
 *
 * @param {number} value - Number to round.
 * @param {number} [precision=4] - Number of decimal places to keep.
 * @returns {number} `value` rounded with `Math.round` at the given precision.
 */
function round(value, precision = 4) {
    const scale = 10 ** precision;
    return Math.round(value * scale) / scale;
}
78
/**
 * Convert a fraction to a percentage and round it.
 *
 * @param {number} value - Fraction to convert (it is multiplied by 100).
 * @param {number} [precision=1] - Decimal places kept in the percentage.
 * @returns {number} The rounded percentage.
 */
function roundPct(value, precision = 1) {
    const percentage = value * 100;
    return round(percentage, precision);
}
81
/**
 * Ensure a RunReport declares a schema version this module can read.
 *
 * @param {object} report - Parsed RunReport; only `schemaVersion` is inspected.
 * @throws {Error} When `schemaVersion` is missing/falsy, or is not listed in
 *   `SUPPORTED_SCHEMA_VERSIONS`.
 */
function validateSchemaVersion(report) {
    const version = report.schemaVersion;
    if (!version) {
        const missingMessage = [
            `RunReport missing schemaVersion. This report was generated by an older version of EvalGate.`,
            `Please regenerate the run report or upgrade to a compatible version.`,
        ].join("\n");
        throw new Error(missingMessage);
    }
    const supported = exports.SUPPORTED_SCHEMA_VERSIONS;
    if (supported.includes(version)) {
        return;
    }
    const unsupportedMessage = [
        `Unsupported RunReport schema version: ${version}`,
        `Supported versions: ${supported.join(", ")}`,
        `This report was generated by a newer version of EvalGate. Please upgrade your EvalGate CLI.`,
    ].join("\n");
    throw new Error(unsupportedMessage);
}
95
/**
 * Report whether the process is running under CI.
 *
 * Thin wrapper that delegates to `isCI` from the `./env` module.
 *
 * @returns {*} Whatever `isCI()` returns.
 */
function isCIEnvironment() {
    const detectCI = env_1.isCI;
    return detectCI();
}
101
/**
 * Build the multi-line guidance shown when the base run report cannot be
 * found while running in CI.
 *
 * @param {string} baseRef - Base reference, interpolated into step 1.
 * @returns {string} The message, with sections separated by blank lines.
 */
function generateCIBaseErrorMessage(baseRef) {
    const remediationSteps = [
        `1. Download run artifact from ${baseRef} branch workflow`,
        "2. Save it as .evalgate/base-run.json",
        "3. Re-run: evalgate diff --base .evalgate/base-run.json --head .evalgate/last-run.json",
    ];
    const sections = [
        ["🚫 Base run report not found in CI environment"],
        ["To fix this, download the base run artifact from your base branch workflow:"],
        remediationSteps,
        [
            "Expected artifact patterns:",
            " - .evalgate/runs/run-*.json",
            " - .evalgate/last-run.json",
        ],
        [
            "Or set a baseline pointer:",
            " evalgate diff --base <runId> --head last --set-baseline",
        ],
    ];
    // Lines within a section are newline-separated; sections get a blank line between them.
    return sections.map((section) => section.join("\n")).join("\n\n");
}
123
/**
 * Load the base and head run reports, validate them, and diff them.
 *
 * @param {object} options - Diff options; `base` and `head` are read here.
 * @returns {Promise<object>} The result of `compareReports(base, head)`.
 * @throws {Error} When either report cannot be resolved, or when a report
 *   fails schema validation.
 */
async function runDiff(options) {
    const baseReport = await resolveBaseReport(options.base);
    const headReport = await resolveHeadReport(options.head);
    if (!baseReport) {
        // A git ref given in CI gets the longer artifact-download guidance.
        const gitRefInCI = isCIEnvironment() && options.base && (0, env_1.isGitRef)(options.base);
        const message = gitRefInCI
            ? generateCIBaseErrorMessage(options.base)
            : "Base run report not found. Use --base to specify a report or branch.";
        throw new Error(message);
    }
    if (!headReport) {
        throw new Error("Head run report not found. Use --head to specify a report path.");
    }
    for (const report of [baseReport, headReport]) {
        validateSchemaVersion(report);
    }
    return compareReports(baseReport, headReport);
}
147
/**
 * Resolve the `--base` option to a run report.
 *
 * Resolution order:
 *   1. no value     -> last run for the "main" branch
 *   2. "last"       -> previous run from the run index
 *   3. "baseline"   -> baseline pointer / index
 *   4. "a..b" range -> last run for branch "a"
 *   5. a git ref    -> last run for that branch
 *   6. "run-*"      -> `.evalgate/runs/<id>.json`
 *   7. anything else is treated as a file path
 *
 * @param {string} [base] - Raw `--base` value.
 * @returns {Promise<object|null>} The report, or null when it cannot be loaded.
 */
async function resolveBaseReport(base) {
    if (!base) {
        return await findLastRunForBranch("main");
    }
    if (base === "last") {
        return await findPreviousRun();
    }
    if (base === "baseline") {
        return await findBaselineRun();
    }
    const rangeBase = extractRangeBase(base);
    if (rangeBase !== null) {
        return await findLastRunForBranch(rangeBase);
    }
    if (await isBranchName(base)) {
        return await findLastRunForBranch(base);
    }
    if (base.startsWith("run-")) {
        return await loadRunReport(`.evalgate/runs/${base}.json`);
    }
    return await loadRunReport(base);
}
/**
 * Extract the left-hand side of a branch range ("main..feature" or
 * "main...feature"), or return null when `ref` is not a range.
 *
 * A bare `includes("..")` check also matches relative file paths such as
 * "../reports/base.json" or "artifacts/../base.json"; those must fall through
 * to the file-path branch instead of being resolved as an empty branch name.
 */
function extractRangeBase(ref) {
    const separatorAt = ref.indexOf("..");
    if (separatorAt <= 0) {
        return null; // no separator, or a leading ".." (parent-directory path)
    }
    const left = ref.slice(0, separatorAt);
    // Tolerate the three-dot form by dropping one extra leading dot.
    const right = ref.slice(separatorAt + 2).replace(/^\./, "");
    // ".." used as a path segment is surrounded by path separators.
    if (/[\\/]$/.test(left) || /^[\\/]/.test(right)) {
        return null;
    }
    return left;
}
179
/**
 * Locate the baseline run report.
 *
 * Tries the workspace's baseline file first; if that cannot be read or
 * parsed, falls back to the last entry of the run index (the index is
 * treated as newest-first, so the last entry is the oldest run).
 *
 * @returns {Promise<object|null>} The parsed baseline, or whatever
 *   `loadRunReport` yields for the oldest indexed run.
 * @throws {Error} When neither source produces a result.
 */
async function findBaselineRun() {
    const workspace = (0, workspace_1.resolveEvalWorkspace)();
    const readJson = async (file) => JSON.parse(await fs.readFile(file, "utf-8"));
    try {
        return await readJson(workspace.baselinePath);
    }
    catch (_error) {
        // No usable baseline file; fall through to the run index.
    }
    try {
        const runs = await readJson(workspace.indexPath);
        if (runs.length > 0) {
            const oldest = runs[runs.length - 1];
            return await loadRunReport(`.evalgate/runs/${oldest.runId}.json`);
        }
    }
    catch (_error) {
        // No usable index either.
    }
    throw new Error("No baseline run found. Set a baseline with 'evalgate diff --base <runId> --head last --set-baseline' or create .evalgate/baseline-run.json.");
}
208
/**
 * Resolve the `--head` option to a run report.
 *
 * - no value or "last" -> `.evalgate/last-run.json`
 * - "run-*"            -> `.evalgate/runs/<id>.json`
 * - anything else is used as a file path
 *
 * @param {string} [head] - Raw `--head` value.
 * @returns {Promise<object|null>} Result of `loadRunReport` for that path.
 */
async function resolveHeadReport(head) {
    let reportPath;
    if (!head || head === "last") {
        reportPath = ".evalgate/last-run.json";
    }
    else if (head.startsWith("run-")) {
        reportPath = `.evalgate/runs/${head}.json`;
    }
    else {
        reportPath = head;
    }
    return await loadRunReport(reportPath);
}
226
/**
 * Load the run that precedes the most recent one.
 *
 * Reads `.evalgate/runs/index.json` under the current working directory and
 * loads the report for the second entry (entry 0 is the most recent run).
 *
 * @returns {Promise<object|null>} Result of `loadRunReport` for that run.
 * @throws {Error} "Need at least 2 runs..." when the index has fewer than two
 *   entries; "No run history found..." for any other failure.
 */
async function findPreviousRun() {
    const historyFile = path.join(process.cwd(), ".evalgate", "runs", "index.json");
    try {
        const history = JSON.parse(await fs.readFile(historyFile, "utf-8"));
        if (history.length < 2) {
            throw new Error("Need at least 2 runs to use 'last' shortcut. Run 'evalgate run --write-results' at least twice.");
        }
        const { runId } = history[1];
        return await loadRunReport(`.evalgate/runs/${runId}.json`);
    }
    catch (error) {
        // The "too few runs" error raised above passes through unchanged;
        // every other failure is reported as missing history.
        const tooFewRuns = error instanceof Error && error.message.includes("Need at least 2 runs");
        if (tooFewRuns) {
            throw error;
        }
        throw new Error("No run history found. Run 'evalgate run --write-results' first.");
    }
}
249
/**
 * Check whether `name` resolves to a git revision in the current repository
 * by running `git rev-parse --verify <name>`.
 *
 * @param {string} name - Candidate branch/ref name.
 * @returns {Promise<boolean>} True only when git exits with code 0. Resolves
 *   to false (never rejects) when the name is unusable or git cannot be
 *   started.
 */
async function isBranchName(name) {
    // A value starting with "-" would be parsed by git as an option rather
    // than a revision, so it is never handed to the subprocess.
    if (typeof name !== "string" || name === "" || name.startsWith("-")) {
        return false;
    }
    return new Promise((resolve) => {
        let git;
        try {
            git = (0, node_child_process_1.spawn)("git", ["rev-parse", "--verify", name], {
                // Only the exit code matters; nothing is read from the pipes.
                stdio: "ignore",
            });
        }
        catch (_error) {
            // spawn() throws synchronously on invalid arguments.
            resolve(false);
            return;
        }
        // Without an 'error' listener a failed spawn (e.g. git not installed)
        // surfaces as an unhandled 'error' event and crashes the process.
        git.on("error", () => {
            resolve(false);
        });
        git.on("close", (code) => {
            resolve(code === 0);
        });
    });
}
262
/**
 * Find the most recent run report for a branch.
 *
 * The branch argument is currently ignored: this always reads
 * `.evalgate/last-run.json` under the current working directory. Per-branch
 * lookup (CI artifacts, branch-specific run files) is not implemented yet.
 *
 * @param {string} _branch - Branch name (unused).
 * @returns {Promise<object|null>} Parsed report, or null when the file cannot
 *   be read or parsed.
 */
async function findLastRunForBranch(_branch) {
    const candidate = path.join(process.cwd(), ".evalgate", "last-run.json");
    return fs
        .readFile(candidate, "utf-8")
        .then((raw) => JSON.parse(raw))
        .catch(() => null);
}
280
/**
 * Read and parse a run report from disk.
 *
 * @param {string} filePath - Path to the JSON report; resolved with
 *   `path.resolve` before reading.
 * @returns {Promise<object|null>} Parsed report, or null on any failure
 *   (bad path, unreadable file, invalid JSON).
 */
async function loadRunReport(filePath) {
    let report = null;
    try {
        const absolutePath = path.resolve(filePath);
        const raw = await fs.readFile(absolutePath, "utf-8");
        report = JSON.parse(raw);
    }
    catch (_error) {
        report = null;
    }
    return report;
}
292
/**
 * Diff two run reports spec by spec.
 *
 * Specs are matched on `specId` across both reports. Each pair is passed to
 * `analyzeSpecDiff`, and the non-null results are sorted by severity, then by
 * spec id.
 *
 * @param {object} base - Base RunReport (reads `results`, `summary`).
 * @param {object} head - Head RunReport (reads `results`, `summary`).
 * @returns {object} Diff result: `schemaVersion`, both reports, `summary`,
 *   `changedSpecs` and generation `metadata`.
 */
function compareReports(base, head) {
    const indexBySpecId = (report) => new Map(report.results.map((entry) => [entry.specId, entry]));
    const baseSpecs = indexBySpecId(base);
    const headSpecs = indexBySpecId(head);
    // Union of ids, base ids first, so specs present on only one side are included.
    const specIds = new Set([...baseSpecs.keys(), ...headSpecs.keys()]);
    const changedSpecs = [];
    for (const specId of specIds) {
        const diff = analyzeSpecDiff(specId, baseSpecs.get(specId), headSpecs.get(specId));
        if (diff) {
            changedSpecs.push(diff);
        }
    }
    changedSpecs.sort((left, right) => {
        const bySeverity = getSeverityOrder(left.classification) - getSeverityOrder(right.classification);
        return bySeverity !== 0 ? bySeverity : left.specId.localeCompare(right.specId);
    });
    return {
        schemaVersion: exports.DIFF_SCHEMA_VERSION,
        base,
        head,
        summary: calculateDiffSummary(base, head, changedSpecs),
        changedSpecs,
        metadata: {
            generatedAt: Date.now(),
            baseSource: "local",
            headSource: "local",
        },
    };
}
332
/**
 * Build the diff entry for one spec, or return null when nothing meaningful
 * changed.
 *
 * `classifyDiff` falls back to "execution_error" when no specific change
 * applies, so that fallback is re-checked here. A spec present on both sides
 * is dropped when its status and error are unchanged and its score moved by
 * no more than the classifier's 0.05 threshold. Duration is deliberately NOT
 * part of this check: timings differ on practically every run, and comparing
 * them would report every unchanged spec as an "execution_error", which is
 * counted as a regression.
 *
 * @param {string} specId - Spec identifier.
 * @param {object} [base] - Spec entry from the base report, if present.
 * @param {object} [head] - Spec entry from the head report, if present.
 * @returns {object|null} Diff entry (`specId`, `name`, `filePath`,
 *   `classification`, `base`, `head`, `deltas`) or null when unchanged.
 */
function analyzeSpecDiff(specId, base, head) {
    const name = head?.name || base?.name || specId;
    const filePath = head?.filePath || base?.filePath || "";
    const classification = classifyDiff(base, head);
    if (classification === "execution_error" &&
        base &&
        head &&
        isEffectivelyUnchanged(base.result, head.result)) {
        return null;
    }
    const snapshot = (spec) => spec
        ? {
            status: spec.result.status,
            score: spec.result.score,
            duration: spec.result.duration,
            error: spec.result.error,
        }
        : undefined;
    return {
        specId,
        name,
        filePath,
        classification,
        base: snapshot(base),
        head: snapshot(head),
        deltas: calculateDeltas(base, head),
    };
}
/**
 * True when two spec results differ only by timing noise or by a score change
 * within the 0.05 significance threshold used by `classifyDiff`.
 */
function isEffectivelyUnchanged(baseResult, headResult) {
    const scoreShift = Math.abs((headResult.score || 0) - (baseResult.score || 0));
    return (baseResult.status === headResult.status &&
        baseResult.error === headResult.error &&
        scoreShift <= 0.05);
}
376
/**
 * Classify how a spec changed between the base and head runs.
 *
 * @param {object} [base] - Spec entry from the base report, if present.
 * @param {object} [head] - Spec entry from the head report, if present.
 * @returns {string} One of "added", "removed", "new_failure",
 *   "fixed_failure", "skipped_change", "score_drop", "score_improve", or
 *   "execution_error". The last is also the fallback when no other
 *   classification applies.
 */
function classifyDiff(base, head) {
    if (!base && head) {
        return "added";
    }
    if (base && !head) {
        return "removed";
    }
    if (!base || !head) {
        return "execution_error";
    }
    const baseStatus = base.result.status;
    const headStatus = head.result.status;
    if (baseStatus === "passed" && headStatus === "failed") {
        return "new_failure";
    }
    if (baseStatus === "failed" && headStatus === "passed") {
        return "fixed_failure";
    }
    // Exactly one side being skipped counts as a skip change.
    if ((baseStatus === "skipped") !== (headStatus === "skipped")) {
        return "skipped_change";
    }
    const baseScore = base.result.score;
    const headScore = head.result.score;
    // Compare by type, not truthiness: 0 is a legitimate score, and a drop to
    // (or rise from) 0 is the largest change possible.
    if (typeof baseScore === "number" && typeof headScore === "number") {
        const delta = headScore - baseScore;
        if (delta < -0.05) {
            return "score_drop";
        }
        if (delta > 0.05) {
            return "score_improve";
        }
    }
    return "execution_error";
}
413
/**
 * Compute numeric and status deltas between the base and head results of one
 * spec.
 *
 * @param {object} [base] - Spec entry from the base report.
 * @param {object} [head] - Spec entry from the head report.
 * @returns {{scoreDelta?: number, durationDelta?: number, statusChange?: string}}
 *   Empty when either side is missing. Each field is present only when both
 *   sides supply the data needed to compute it.
 */
function calculateDeltas(base, head) {
    const deltas = {};
    if (!base || !head) {
        return deltas;
    }
    const before = base.result;
    const after = head.result;
    // Type check rather than truthiness so that a score of 0 still yields a delta.
    if (typeof before.score === "number" && typeof after.score === "number") {
        deltas.scoreDelta = round(after.score - before.score, 4);
    }
    // Skip the duration delta when a side has no duration, instead of emitting NaN.
    if (Number.isFinite(before.duration) && Number.isFinite(after.duration)) {
        deltas.durationDelta = after.duration - before.duration;
    }
    if (before.status !== after.status) {
        deltas.statusChange = `${before.status} → ${after.status}`;
    }
    return deltas;
}
429
/**
 * Map a classification to its sort rank (1 = most severe).
 *
 * @param {string} classification - Diff classification.
 * @returns {number} Rank 1-8 for known classifications, 9 for anything else.
 */
function getSeverityOrder(classification) {
    // Ordered most to least severe; rank is position + 1. A list lookup is
    // used instead of a plain-object lookup so that names inherited from
    // Object.prototype ("constructor", "toString", ...) get the fallback rank
    // rather than a function, which would turn the sort comparator into NaN.
    const ranking = [
        "new_failure",
        "score_drop",
        "execution_error",
        "skipped_change",
        "removed",
        "added",
        "fixed_failure",
        "score_improve",
    ];
    const position = ranking.indexOf(classification);
    return position === -1 ? 9 : position + 1;
}
445
/**
 * Compute aggregate statistics for a diff.
 *
 * @param {object} base - Base RunReport (reads `results`, `summary.passRate`).
 * @param {object} head - Head RunReport (reads `results`, `summary.passRate`).
 * @param {Array<{classification: string}>} changedSpecs - Per-spec diff entries.
 * @returns {{baseTotal: number, headTotal: number, passRateDelta: number,
 *   scoreDelta: number, regressions: number, improvements: number,
 *   added: number, removed: number}} Summary. `scoreDelta` is the difference
 *   of the average scores; an average is 0 when a report has no scored
 *   results.
 */
function calculateDiffSummary(base, head, changedSpecs) {
    // Average only real numeric scores. A `!== undefined` filter would let
    // `null` through, and null is summed as 0 and drags the average down.
    const averageScore = (results) => {
        const scores = results
            .map((entry) => entry.result.score)
            .filter((score) => typeof score === "number" && !Number.isNaN(score));
        if (scores.length === 0) {
            return 0;
        }
        return scores.reduce((sum, score) => sum + score, 0) / scores.length;
    };
    const counts = { regressions: 0, improvements: 0, added: 0, removed: 0 };
    for (const { classification } of changedSpecs) {
        switch (classification) {
            case "new_failure":
            case "score_drop":
            case "execution_error":
                counts.regressions += 1;
                break;
            case "fixed_failure":
            case "score_improve":
                counts.improvements += 1;
                break;
            case "added":
                counts.added += 1;
                break;
            case "removed":
                counts.removed += 1;
                break;
            default:
                // Other classifications (e.g. "skipped_change") are not counted.
                break;
        }
    }
    return {
        baseTotal: base.results.length,
        headTotal: head.results.length,
        passRateDelta: round(head.summary.passRate - base.summary.passRate, 4),
        scoreDelta: round(averageScore(head.results) - averageScore(base.results), 4),
        regressions: counts.regressions,
        improvements: counts.improvements,
        added: counts.added,
        removed: counts.removed,
    };
}
484
/**
 * Print a diff result to stdout in human-readable form.
 *
 * Writes a header, the summary counters, and then either one line per changed
 * spec (plus the head error, when there is one) or a "no changes" notice.
 *
 * @param {object} result - Diff result (reads `metadata`, `summary`,
 *   `changedSpecs`).
 * @returns {void}
 */
function printHumanResults(result) {
    const formatPct = (fraction) => roundPct(fraction, 1).toFixed(1);
    const signOf = (delta) => (delta > 0 ? "+" : "");
    console.log("\n🔄 Behavioral Diff Results");
    console.log(`📊 Base: ${result.metadata.baseSource} (${result.summary.baseTotal} specs)`);
    console.log(`📈 Head: ${result.metadata.headSource} (${result.summary.headTotal} specs)`);
    console.log("\n📈 Summary:");
    console.log(` 📊 Pass Rate Delta: ${formatPct(result.summary.passRateDelta)}%`);
    console.log(` 🎯 Score Delta: ${formatPct(result.summary.scoreDelta)}%`);
    console.log(` 📉 Regressions: ${result.summary.regressions}`);
    console.log(` 📈 Improvements: ${result.summary.improvements}`);
    console.log(` ➕ Added: ${result.summary.added}`);
    console.log(` ➖ Removed: ${result.summary.removed}`);
    if (result.changedSpecs.length === 0) {
        console.log("\n✅ No changes detected");
        return;
    }
    console.log("\n🔍 Changed Specifications:");
    for (const spec of result.changedSpecs) {
        const icon = getClassificationIcon(spec.classification);
        const { scoreDelta, durationDelta } = spec.deltas;
        // Zero or missing deltas are left out of the line.
        let scoreInfo = "";
        if (scoreDelta) {
            scoreInfo = ` (${signOf(scoreDelta)}${formatPct(scoreDelta)}%)`;
        }
        let durationInfo = "";
        if (durationDelta) {
            durationInfo = ` (${signOf(durationDelta)}${durationDelta}ms)`;
        }
        console.log(` ${icon} ${spec.name}${scoreInfo}${durationInfo}`);
        if (spec.head?.error) {
            console.log(` ❌ ${spec.head.error}`);
        }
    }
}
518
/**
 * Return the display icon for a diff classification.
 *
 * @param {string} classification - Diff classification.
 * @returns {string} The matching icon, or "❓" for unknown classifications.
 */
function getClassificationIcon(classification) {
    // A Map is used so that only the listed keys match. A plain-object lookup
    // would also return members inherited from Object.prototype (for example
    // "constructor"), which would print a function instead of an icon.
    const icons = new Map([
        ["new_failure", "🆘"],
        ["fixed_failure", "✅"],
        ["score_drop", "📉"],
        ["score_improve", "📈"],
        ["execution_error", "❌"],
        ["skipped_change", "⏭️"],
        ["added", "➕"],
        ["removed", "➖"],
    ]);
    return icons.get(classification) || "❓";
}
534
/**
 * Print a diff result to stdout as pretty-printed JSON (2-space indent).
 *
 * @param {object} result - Diff result to serialize.
 * @returns {void}
 */
function printJsonResults(result) {
    const serialized = JSON.stringify(result, null, 2);
    console.log(serialized);
}
540
/**
 * Append the diff summary to the GitHub Actions step summary file.
 *
 * Does nothing when the `GITHUB_STEP_SUMMARY` environment variable is unset
 * or empty. Failures while generating or writing the summary are reported
 * with `console.warn` and otherwise ignored.
 *
 * @param {object} result - Diff result passed to `generateGitHubSummary`.
 * @returns {Promise<void>}
 */
async function writeGitHubStepSummary(result) {
    const { GITHUB_STEP_SUMMARY: summaryPath } = process.env;
    if (!summaryPath) {
        return;
    }
    try {
        const markdown = generateGitHubSummary(result);
        await fs.appendFile(summaryPath, `${markdown}\n`, "utf-8");
    }
    catch (failure) {
        console.warn("Warning: Could not write GitHub Step Summary:", failure);
    }
}
556
/**
 * Render a diff result as Markdown for the GitHub step summary.
 *
 * Sections, in order: summary metrics, a regression status banner, up to five
 * regressed specs, artifact paths, and a collapsible JSON block with the raw
 * summary numbers.
 *
 * @param {object} result - Diff result (reads `summary`, `changedSpecs`,
 *   `base.runId`, `head.runId`).
 * @returns {string} Markdown document, lines joined with "\n".
 */
function generateGitHubSummary(result) {
    const { summary } = result;
    const formatPct = (fraction) => roundPct(fraction, 1).toFixed(1);
    const lines = [
        "## 🤖 EvalGate Diff Results\n",
        "### 📊 Summary Metrics",
        `- **Pass Rate Delta**: ${formatPct(summary.passRateDelta)}%`,
        `- **Score Delta**: ${formatPct(summary.scoreDelta)}%`,
        `- **🚨 Regressions**: ${summary.regressions}`,
        `- **📈 Improvements**: ${summary.improvements}`,
        `- **➕ Added**: ${summary.added}`,
        `- **➖ Removed**: ${summary.removed}`,
        "",
    ];
    // Status banner
    const banner = summary.regressions > 0
        ? [
            "### 🚨 Regressions Detected\n",
            "**⚠️ This PR contains regressions that should be reviewed.**\n",
        ]
        : [
            "### ✅ No Regressions Detected\n",
            "**🎉 All tests passed! No regressions found.**\n",
        ];
    lines.push(...banner);
    // Regressed specs, capped at five entries
    const regressionKinds = ["new_failure", "execution_error", "score_drop"];
    const regressions = result.changedSpecs.filter((spec) => regressionKinds.includes(spec.classification));
    if (regressions.length > 0) {
        lines.push("### 🔍 Top Regressions\n");
        for (const spec of regressions.slice(0, 5)) {
            const icon = getClassificationIcon(spec.classification);
            const delta = spec.deltas.scoreDelta;
            let scoreInfo = "";
            if (delta) {
                scoreInfo = ` (${delta > 0 ? "+" : ""}${formatPct(delta)}%)`;
            }
            lines.push(`${icon} **${spec.name}**${scoreInfo}`);
            lines.push(` Classification: \`${spec.classification}\``);
            if (spec.head?.error) {
                lines.push(` Error: \`${spec.head.error}\``);
            }
            lines.push("");
        }
        if (regressions.length > 5) {
            lines.push(`... and ${regressions.length - 5} more regressions\n`);
        }
    }
    // Artifact locations
    lines.push("### 📁 Artifacts\n", `- **Run Index**: \`.evalgate/runs/index.json\``, `- **Latest Run**: \`.evalgate/runs/${result.head.runId}.json\``, `- **Last Run**: \`.evalgate/last-run.json\``, "");
    // Collapsible raw numbers
    const technicalDetails = {
        baseRunId: result.base.runId,
        headRunId: result.head.runId,
        baseTotal: summary.baseTotal,
        headTotal: summary.headTotal,
        passRateDelta: summary.passRateDelta,
        scoreDelta: summary.scoreDelta,
        regressions: summary.regressions,
        improvements: summary.improvements,
        added: summary.added,
        removed: summary.removed,
    };
    lines.push("<details>", "<summary>🔧 Technical Details</summary>", "", "```json", JSON.stringify(technicalDetails, null, 2), "```", "", "</details>");
    return lines.join("\n");
}
630
/**
 * CLI entry point for `evalgate diff`.
 *
 * Runs the diff, prints it (JSON when `options.format === "json"`, otherwise
 * human-readable), writes the GitHub step summary, and terminates the
 * process.
 *
 * Exit codes:
 *   0 - no regressions
 *   1 - at least one regression
 *   2 - the diff could not be produced (any error)
 *
 * @param {object} options - CLI options (`base`, `head`, `format`).
 * @returns {Promise<void>} Does not return in practice; the process exits.
 */
async function runDiffCLI(options) {
    let exitCode;
    try {
        const result = await runDiff(options);
        if (options.format === "json") {
            printJsonResults(result);
        }
        else {
            printHumanResults(result);
        }
        // Write GitHub Step Summary if in CI
        await writeGitHubStepSummary(result);
        exitCode = result.summary.regressions > 0 ? 1 : 0;
    }
    catch (error) {
        console.error(`EvalGate ERROR: ${error instanceof Error ? error.message : String(error)}`);
        // Every failure maps to exit code 2, whether it is the CI
        // "base run report not found" case or a general error, so no
        // further classification of the error is needed here.
        exitCode = 2;
    }
    // Exit outside the try block so that an exception thrown by process.exit
    // itself (for example when it is mocked in tests) is not caught above and
    // misreported as a diff error.
    process.exit(exitCode);
}
667
+ // Public diff core API surface
668
+ exports.diffCore = {
669
+ /**
670
+ * Compare two run reports and return diff result
671
+ */
672
+ diffRunReports: compareReports,
673
+ /**
674
+ * Classify the type of change between two specs
675
+ */
676
+ classifyChange: classifyDiff,
677
+ /**
678
+ * Calculate summary statistics for a diff
679
+ */
680
+ summarizeDiff: calculateDiffSummary,
681
+ /**
682
+ * Calculate deltas between two spec results
683
+ */
684
+ calculateDeltas,
685
+ };