@pauly4010/evalai-sdk 1.8.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +54 -0
  2. package/dist/cli/ci.d.ts +45 -0
  3. package/dist/cli/ci.js +192 -0
  4. package/dist/cli/diff.d.ts +173 -0
  5. package/dist/cli/diff.js +680 -0
  6. package/dist/cli/discover.d.ts +84 -0
  7. package/dist/cli/discover.js +408 -0
  8. package/dist/cli/doctor.js +19 -10
  9. package/dist/cli/env.d.ts +21 -0
  10. package/dist/cli/env.js +42 -0
  11. package/dist/cli/explain.js +143 -37
  12. package/dist/cli/impact-analysis.d.ts +63 -0
  13. package/dist/cli/impact-analysis.js +251 -0
  14. package/dist/cli/index.js +173 -0
  15. package/dist/cli/manifest.d.ts +105 -0
  16. package/dist/cli/manifest.js +275 -0
  17. package/dist/cli/migrate.d.ts +41 -0
  18. package/dist/cli/migrate.js +349 -0
  19. package/dist/cli/print-config.js +18 -14
  20. package/dist/cli/run.d.ts +101 -0
  21. package/dist/cli/run.js +389 -0
  22. package/dist/cli/workspace.d.ts +28 -0
  23. package/dist/cli/workspace.js +58 -0
  24. package/dist/index.d.ts +6 -0
  25. package/dist/index.js +30 -5
  26. package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
  27. package/dist/runtime/adapters/config-to-dsl.js +391 -0
  28. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
  29. package/dist/runtime/adapters/testsuite-to-dsl.js +271 -0
  30. package/dist/runtime/context.d.ts +26 -0
  31. package/dist/runtime/context.js +74 -0
  32. package/dist/runtime/eval.d.ts +46 -0
  33. package/dist/runtime/eval.js +237 -0
  34. package/dist/runtime/execution-mode.d.ts +80 -0
  35. package/dist/runtime/execution-mode.js +353 -0
  36. package/dist/runtime/executor.d.ts +16 -0
  37. package/dist/runtime/executor.js +152 -0
  38. package/dist/runtime/registry.d.ts +78 -0
  39. package/dist/runtime/registry.js +416 -0
  40. package/dist/runtime/run-report.d.ts +202 -0
  41. package/dist/runtime/run-report.js +220 -0
  42. package/dist/runtime/types.d.ts +356 -0
  43. package/dist/runtime/types.js +76 -0
  44. package/dist/testing.d.ts +65 -0
  45. package/dist/testing.js +42 -0
  46. package/dist/version.d.ts +1 -1
  47. package/dist/version.js +1 -1
  48. package/package.json +4 -3
@@ -0,0 +1,680 @@
1
+ "use strict";
2
+ /**
3
+ * TICKET 5 — Behavioral Diff CLI (EVAL-401)
4
+ *
5
+ * Goal: "Git diff for AI behavior" from two RunReports
6
+ *
7
+ * Command:
8
+ * evalai diff --base main (default uses git to find baseline run)
9
+ * evalai diff --a <runReportPath> --b <runReportPath>
10
+ * evalai diff main..feature (nice-to-have alias)
11
+ */
12
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
13
+ if (k2 === undefined) k2 = k;
14
+ var desc = Object.getOwnPropertyDescriptor(m, k);
15
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
16
+ desc = { enumerable: true, get: function() { return m[k]; } };
17
+ }
18
+ Object.defineProperty(o, k2, desc);
19
+ }) : (function(o, m, k, k2) {
20
+ if (k2 === undefined) k2 = k;
21
+ o[k2] = m[k];
22
+ }));
23
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
24
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
25
+ }) : function(o, v) {
26
+ o["default"] = v;
27
+ });
28
+ var __importStar = (this && this.__importStar) || (function () {
29
+ var ownKeys = function(o) {
30
+ ownKeys = Object.getOwnPropertyNames || function (o) {
31
+ var ar = [];
32
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
33
+ return ar;
34
+ };
35
+ return ownKeys(o);
36
+ };
37
+ return function (mod) {
38
+ if (mod && mod.__esModule) return mod;
39
+ var result = {};
40
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
41
+ __setModuleDefault(result, mod);
42
+ return result;
43
+ };
44
+ })();
45
+ Object.defineProperty(exports, "__esModule", { value: true });
46
+ exports.diffCore = exports.SUPPORTED_SCHEMA_VERSIONS = exports.DIFF_SCHEMA_VERSION = void 0;
47
+ exports.round = round;
48
+ exports.roundPct = roundPct;
49
+ exports.validateSchemaVersion = validateSchemaVersion;
50
+ exports.runDiff = runDiff;
51
+ exports.compareReports = compareReports;
52
+ exports.calculateDiffSummary = calculateDiffSummary;
53
+ exports.printHumanResults = printHumanResults;
54
+ exports.printJsonResults = printJsonResults;
55
+ exports.writeGitHubStepSummary = writeGitHubStepSummary;
56
+ exports.runDiffCLI = runDiffCLI;
57
+ exports.classifyDiff = classifyDiff;
58
+ exports.calculateDeltas = calculateDeltas;
59
+ const node_child_process_1 = require("node:child_process");
60
+ const fs = __importStar(require("node:fs/promises"));
61
+ const path = __importStar(require("node:path"));
62
+ const env_1 = require("./env");
63
+ const workspace_1 = require("./workspace");
64
/**
 * Schema version stamped on every diff result produced by this module
 * (see the `schemaVersion` field returned by `compareReports`).
 */
exports.DIFF_SCHEMA_VERSION = 1;
/**
 * RunReport `schemaVersion` values that `validateSchemaVersion` accepts.
 */
exports.SUPPORTED_SCHEMA_VERSIONS = [1];
72
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param {number} value - Number to round.
 * @param {number} [precision=4] - Decimal places to keep.
 * @returns {number} `value` rounded via `Math.round` at the given precision.
 */
function round(value, precision = 4) {
    const scaled = Math.round(value * 10 ** precision);
    return scaled / 10 ** precision;
}
78
/**
 * Convert a fraction to a percentage and round it.
 *
 * @param {number} value - Fraction (e.g. 0.123 for 12.3%).
 * @param {number} [precision=1] - Decimal places to keep in the percentage.
 * @returns {number} `value * 100`, rounded with `round`.
 */
function roundPct(value, precision = 1) {
    const percentage = value * 100;
    return round(percentage, precision);
}
81
/**
 * Ensure a RunReport declares a schema version this module can read.
 *
 * @param {object} report - Parsed RunReport; only `schemaVersion` is inspected.
 * @throws {Error} When `schemaVersion` is missing/falsy, or is not listed in
 *   `SUPPORTED_SCHEMA_VERSIONS`.
 */
function validateSchemaVersion(report) {
    const version = report.schemaVersion;
    if (!version) {
        const missingMessage = [
            "RunReport missing schemaVersion. This report was generated by an older version of EvalAI.",
            "Please regenerate the run report or upgrade to a compatible version.",
        ].join("\n");
        throw new Error(missingMessage);
    }
    const supported = exports.SUPPORTED_SCHEMA_VERSIONS;
    if (supported.includes(version)) {
        return;
    }
    const unsupportedMessage = [
        `Unsupported RunReport schema version: ${version}`,
        `Supported versions: ${supported.join(", ")}`,
        "This report was generated by a newer version of EvalAI. Please upgrade your EvalAI CLI.",
    ].join("\n");
    throw new Error(unsupportedMessage);
}
95
/**
 * Report whether the process is running under CI.
 *
 * Thin wrapper that delegates to `isCI` from `./env`.
 *
 * @returns {*} Whatever `isCI()` returns.
 */
function isCIEnvironment() {
    const { isCI } = env_1;
    return isCI();
}
101
/**
 * Build the multi-line guidance shown when the base run report cannot be
 * found while running in CI.
 *
 * @param {string} baseRef - Base reference interpolated into step 1.
 * @returns {string} Newline-joined help text.
 */
function generateCIBaseErrorMessage(baseRef) {
    const problem = [
        "🚫 Base run report not found in CI environment",
        "",
        "To fix this, download the base run artifact from your base branch workflow:",
        "",
    ];
    const steps = [
        `1. Download run artifact from ${baseRef} branch workflow`,
        "2. Save it as .evalai/base-run.json",
        "3. Re-run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json",
        "",
    ];
    const artifactPatterns = [
        "Expected artifact patterns:",
        " - .evalai/runs/run-*.json",
        " - .evalai/last-run.json",
        "",
    ];
    const baselineHint = [
        "Or set a baseline pointer:",
        " evalai diff --base <runId> --head last --set-baseline",
    ];
    return [...problem, ...steps, ...artifactPatterns, ...baselineHint].join("\n");
}
123
/**
 * Resolve the base and head run reports, validate them, and diff them.
 *
 * @param {object} options - Diff options; `base` and `head` are read here.
 * @returns {Promise<object>} The result of `compareReports(base, head)`.
 * @throws {Error} When either report cannot be resolved or fails
 *   `validateSchemaVersion`.
 */
async function runDiff(options) {
    // Base is resolved before head, one after the other.
    const baseReport = await resolveBaseReport(options.base);
    const headReport = await resolveHeadReport(options.head);
    if (!baseReport) {
        // A git ref given as base in CI gets the longer, artifact-oriented guidance.
        const wantsCIGuidance = isCIEnvironment() && options.base && (0, env_1.isGitRef)(options.base);
        const message = wantsCIGuidance
            ? generateCIBaseErrorMessage(options.base)
            : "Base run report not found. Use --base to specify a report or branch.";
        throw new Error(message);
    }
    if (!headReport) {
        throw new Error("Head run report not found. Use --head to specify a report path.");
    }
    for (const report of [baseReport, headReport]) {
        validateSchemaVersion(report);
    }
    return compareReports(baseReport, headReport);
}
147
/**
 * Turn the `base` option into a run report.
 *
 * Resolution order:
 *   1. empty            -> last run for branch "main"
 *   2. "last"           -> previous run from the run index
 *   3. "baseline"       -> baseline pointer / index fallback
 *   4. contains ".."    -> last run for the branch left of the ".."
 *   5. resolves in git  -> last run for that branch
 *   6. starts "run-"    -> `.evalai/runs/<base>.json`
 *   7. anything else    -> treated as a file path
 *
 * @param {string} [base] - Value of the base option.
 * @returns {Promise<object|null>} The resolved report, or null when not found.
 */
async function resolveBaseReport(base) {
    if (!base) {
        return findLastRunForBranch("main");
    }
    switch (base) {
        case "last":
            return findPreviousRun();
        case "baseline":
            return findBaselineRun();
        default:
            break;
    }
    if (base.includes("..")) {
        // Range syntax such as "main..feature": only the left-hand side is used.
        const [baseBranch] = base.split("..");
        return findLastRunForBranch(baseBranch);
    }
    // The git check runs before the "run-" prefix check.
    const resolvesInGit = await isBranchName(base);
    if (resolvesInGit) {
        return findLastRunForBranch(base);
    }
    const reportPath = base.startsWith("run-") ? `.evalai/runs/${base}.json` : base;
    return loadRunReport(reportPath);
}
179
/**
 * Locate the baseline run report.
 *
 * First tries the workspace baseline file; if that cannot be read or parsed,
 * falls back to the last entry of the workspace run index and loads that
 * run from `.evalai/runs/<runId>.json`.
 *
 * @returns {Promise<object|null>} Parsed baseline report (null when the
 *   indexed run file cannot be loaded).
 * @throws {Error} When neither the baseline file nor a non-empty index yields
 *   a report.
 */
async function findBaselineRun() {
    const workspace = (0, workspace_1.resolveEvalWorkspace)();
    // Failures are reported as `undefined`; JSON.parse never produces that
    // value, so it cannot be confused with a successfully parsed document.
    const readJson = async (file) => {
        try {
            return JSON.parse(await fs.readFile(file, "utf-8"));
        }
        catch {
            return undefined;
        }
    };
    const pointer = await readJson(workspace.baselinePath);
    if (pointer !== undefined) {
        return pointer;
    }
    try {
        const runIndex = JSON.parse(await fs.readFile(workspace.indexPath, "utf-8"));
        if (runIndex.length > 0) {
            // No run is flagged as the baseline yet, so the last index entry is used.
            const lastEntry = runIndex[runIndex.length - 1];
            return await loadRunReport(`.evalai/runs/${lastEntry.runId}.json`);
        }
    }
    catch {
        // Missing or unusable index: fall through to the error below.
    }
    throw new Error("No baseline run found. Set a baseline with 'evalai diff --base <runId> --head last --set-baseline' or create .evalai/baseline-run.json.");
}
208
/**
 * Turn the `head` option into a run report.
 *
 * An empty value and the literal "last" both load `.evalai/last-run.json`;
 * a value starting with "run-" loads `.evalai/runs/<head>.json`; anything
 * else is treated as a file path.
 *
 * @param {string} [head] - Value of the head option.
 * @returns {Promise<object|null>} The loaded report, or null when not found.
 */
async function resolveHeadReport(head) {
    const lastRunFile = ".evalai/last-run.json";
    if (!head || head === "last") {
        return loadRunReport(lastRunFile);
    }
    const reportPath = head.startsWith("run-") ? `.evalai/runs/${head}.json` : head;
    return loadRunReport(reportPath);
}
226
/**
 * Load the second most recent run listed in `.evalai/runs/index.json`
 * (entry 0 of the index is the most recent run).
 *
 * @returns {Promise<object|null>} The previous run report (null when its
 *   file cannot be loaded).
 * @throws {Error} "Need at least 2 runs..." when the index has fewer than two
 *   entries; "No run history found..." when the index cannot be read, parsed,
 *   or used.
 */
async function findPreviousRun() {
    const indexFile = path.join(process.cwd(), ".evalai", "runs", "index.json");
    try {
        const raw = await fs.readFile(indexFile, "utf-8");
        const runIndex = JSON.parse(raw);
        const hasTooFewRuns = runIndex.length < 2;
        if (!hasTooFewRuns) {
            const previous = runIndex[1];
            return await loadRunReport(`.evalai/runs/${previous.runId}.json`);
        }
    }
    catch {
        throw new Error("No run history found. Run 'evalai run --write-results' first.");
    }
    // Only reached when the index was read successfully but is too short.
    throw new Error("Need at least 2 runs to use 'last' shortcut. Run 'evalai run --write-results' at least twice.");
}
248
/**
 * Check whether `name` resolves in the current git repository.
 *
 * Runs `git rev-parse --verify <name>` and resolves to true when git exits
 * with code 0. Note that `rev-parse --verify` accepts any revision it can
 * resolve, not only branch names.
 *
 * @param {string} name - Candidate branch name / revision.
 * @returns {Promise<boolean>} True when git exits 0; false on a non-zero
 *   exit or when git cannot be started at all.
 */
async function isBranchName(name) {
    return new Promise((resolve) => {
        const git = (0, node_child_process_1.spawn)("git", ["rev-parse", "--verify", name], {
            // The output is never read, so do not open pipes for it.
            stdio: "ignore",
        });
        // Without this listener a missing git binary (ENOENT) raises an
        // unhandled 'error' event and crashes the process.
        git.on("error", () => {
            resolve(false);
        });
        git.on("close", (code) => {
            resolve(code === 0);
        });
    });
}
261
/**
 * Find the most recent run report for a branch.
 *
 * The `branch` argument is currently not consulted: the function always
 * reads `.evalai/last-run.json` under the current working directory.
 * Branch-aware lookup (CI artifacts, per-branch run files) is not
 * implemented here.
 *
 * @param {string} branch - Branch name (unused for now).
 * @returns {Promise<object|null>} Parsed report, or null when the file cannot
 *   be read or parsed.
 */
async function findLastRunForBranch(branch) {
    const lastRunFile = path.join(process.cwd(), ".evalai", "last-run.json");
    let report = null;
    try {
        const raw = await fs.readFile(lastRunFile, "utf-8");
        report = JSON.parse(raw);
    }
    catch {
        report = null;
    }
    return report;
}
279
/**
 * Load and parse a run report from disk.
 *
 * A file that cannot be read (missing, unreadable, bad path) yields null so
 * callers can report "not found". A file that exists but is not valid JSON
 * throws instead, so a corrupt report is no longer reported as a missing one.
 *
 * @param {string} filePath - Path to the report; resolved against the current
 *   working directory.
 * @returns {Promise<object|null>} Parsed report, or null when the file cannot
 *   be read.
 * @throws {Error} When the file content is not valid JSON (the original
 *   parse error is attached as `cause`).
 */
async function loadRunReport(filePath) {
    let absolutePath;
    let raw;
    try {
        absolutePath = path.resolve(filePath);
        raw = await fs.readFile(absolutePath, "utf-8");
    }
    catch {
        return null;
    }
    try {
        return JSON.parse(raw);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Run report at ${absolutePath} is not valid JSON: ${reason}`, { cause: error });
    }
}
291
/**
 * Diff two run reports spec by spec.
 *
 * Specs are matched on `specId` across both reports. Each spec that
 * `analyzeSpecDiff` reports as changed is collected, then the list is sorted
 * by severity rank and, within a rank, by `specId`.
 *
 * @param {object} base - Base run report (`results`, `summary`).
 * @param {object} head - Head run report (`results`, `summary`).
 * @returns {object} Diff result: `schemaVersion`, both reports, `summary`,
 *   `changedSpecs` and `metadata`.
 */
function compareReports(base, head) {
    const indexBySpecId = (report) => new Map(report.results.map((entry) => [entry.specId, entry]));
    const baseSpecs = indexBySpecId(base);
    const headSpecs = indexBySpecId(head);
    // Union of ids, base ids first, so added and removed specs are both visited.
    const specIds = new Set([...baseSpecs.keys(), ...headSpecs.keys()]);
    const changedSpecs = [...specIds]
        .map((specId) => analyzeSpecDiff(specId, baseSpecs.get(specId), headSpecs.get(specId)))
        .filter((specDiff) => Boolean(specDiff));
    changedSpecs.sort((left, right) => {
        const rankGap = getSeverityOrder(left.classification) - getSeverityOrder(right.classification);
        return rankGap !== 0 ? rankGap : left.specId.localeCompare(right.specId);
    });
    return {
        schemaVersion: exports.DIFF_SCHEMA_VERSION,
        base,
        head,
        summary: calculateDiffSummary(base, head, changedSpecs),
        changedSpecs,
        metadata: {
            generatedAt: Date.now(),
            baseSource: "local",
            headSource: "local",
        },
    };
}
331
/**
 * Build the diff entry for one spec, or return null when it did not change.
 *
 * `classifyDiff` returns "execution_error" as its fallback when no specific
 * transition matched, so that classification alone does not prove a change.
 * In that case the spec counts as unchanged when status, score and error
 * match. Duration is deliberately NOT part of that check: wall-clock time
 * differs on practically every run, and comparing it made otherwise
 * identical specs show up as "execution_error" entries (which the summary
 * counts as regressions).
 *
 * @param {string} specId - Spec identifier; also the fallback display name.
 * @param {object} [base] - Spec entry from the base report, if present.
 * @param {object} [head] - Spec entry from the head report, if present.
 * @returns {object|null} Diff entry with `classification`, `base`/`head`
 *   snapshots and `deltas`, or null when nothing meaningful changed.
 */
function analyzeSpecDiff(specId, base, head) {
    const name = head?.name || base?.name || specId;
    const filePath = head?.filePath || base?.filePath || "";
    const classification = classifyDiff(base, head);
    if (classification === "execution_error" && base && head) {
        const isUnchanged = base.result.status === head.result.status &&
            (base.result.score || 0) === (head.result.score || 0) &&
            base.result.error === head.result.error;
        if (isUnchanged) {
            return null;
        }
    }
    // Snapshot of the fields that are reported for each side of the diff.
    const snapshot = (spec) => spec
        ? {
            status: spec.result.status,
            score: spec.result.score,
            duration: spec.result.duration,
            error: spec.result.error,
        }
        : undefined;
    return {
        specId,
        name,
        filePath,
        classification,
        base: snapshot(base),
        head: snapshot(head),
        deltas: calculateDeltas(base, head),
    };
}
375
/**
 * Classify how a spec changed between the base and head runs.
 *
 * Scores are compared whenever both are numbers, so a score of exactly 0 is
 * honored (a truthiness check would silently skip it and miss a drop to 0).
 *
 * @param {object} [base] - Spec entry from the base report, if present.
 * @param {object} [head] - Spec entry from the head report, if present.
 * @returns {string} One of "added", "removed", "new_failure",
 *   "fixed_failure", "skipped_change", "score_drop", "score_improve", or
 *   "execution_error" (also the fallback when no other rule matches).
 */
function classifyDiff(base, head) {
    if (!base && head) {
        return "added";
    }
    if (base && !head) {
        return "removed";
    }
    if (!base || !head) {
        return "execution_error";
    }
    const baseStatus = base.result.status;
    const headStatus = head.result.status;
    if (baseStatus === "passed" && headStatus === "failed") {
        return "new_failure";
    }
    if (baseStatus === "failed" && headStatus === "passed") {
        return "fixed_failure";
    }
    // Exactly one side skipped.
    if ((baseStatus === "skipped") !== (headStatus === "skipped")) {
        return "skipped_change";
    }
    const baseScore = base.result.score;
    const headScore = head.result.score;
    if (typeof baseScore === "number" && typeof headScore === "number") {
        const delta = headScore - baseScore;
        // Movements within +/-0.05 are treated as not significant.
        if (delta < -0.05) {
            return "score_drop";
        }
        if (delta > 0.05) {
            return "score_improve";
        }
    }
    // Fallback when no specific transition matched.
    return "execution_error";
}
412
/**
 * Compute numeric and status deltas between the base and head result of a spec.
 *
 * Deltas are only produced when both sides exist. `scoreDelta` is set when
 * both scores are numbers (a score of 0 counts), `durationDelta` when both
 * durations are numbers (avoiding a NaN delta when one is missing), and
 * `statusChange` when the statuses differ.
 *
 * @param {object} [base] - Spec entry from the base report, if present.
 * @param {object} [head] - Spec entry from the head report, if present.
 * @returns {{scoreDelta?: number, durationDelta?: number, statusChange?: string}}
 *   Deltas that could be computed; empty when either side is missing.
 */
function calculateDeltas(base, head) {
    const deltas = {};
    if (!base || !head) {
        return deltas;
    }
    const before = base.result;
    const after = head.result;
    if (typeof before.score === "number" && typeof after.score === "number") {
        deltas.scoreDelta = round(after.score - before.score, 4);
    }
    if (typeof before.duration === "number" && typeof after.duration === "number") {
        deltas.durationDelta = after.duration - before.duration;
    }
    if (before.status !== after.status) {
        deltas.statusChange = `${before.status} → ${after.status}`;
    }
    return deltas;
}
428
/**
 * Rank a classification for sorting; lower numbers sort first.
 *
 * @param {string} classification - Classification label.
 * @returns {number} Rank 1-8 for known labels, 9 for anything else.
 */
function getSeverityOrder(classification) {
    // Listed from most to least severe; rank = position + 1.
    const mostSevereFirst = [
        "new_failure",
        "score_drop",
        "execution_error",
        "skipped_change",
        "removed",
        "added",
        "fixed_failure",
        "score_improve",
    ];
    const rankByLabel = Object.fromEntries(mostSevereFirst.map((label, position) => [label, position + 1]));
    const rank = rankByLabel[classification];
    return rank ? rank : 9;
}
444
/**
 * Compute the headline statistics for a diff.
 *
 * Average scores only consider results whose score is an actual number.
 * Reports loaded from JSON can carry `score: null`; counting those as 0
 * would drag the average down and distort `scoreDelta`.
 *
 * @param {object} base - Base run report (`results`, `summary.passRate`).
 * @param {object} head - Head run report (`results`, `summary.passRate`).
 * @param {Array<object>} changedSpecs - Diff entries carrying `classification`.
 * @returns {object} `baseTotal`, `headTotal`, `passRateDelta`, `scoreDelta`,
 *   `regressions`, `improvements`, `added`, `removed`.
 */
function calculateDiffSummary(base, head, changedSpecs) {
    // Mean of the numeric scores in a report; 0 when there are none.
    const averageScore = (report) => {
        const scores = report.results
            .map((entry) => entry.result.score)
            .filter((score) => typeof score === "number" && !Number.isNaN(score));
        if (scores.length === 0) {
            return 0;
        }
        return scores.reduce((sum, score) => sum + score, 0) / scores.length;
    };
    // Number of changed specs whose classification is one of `labels`.
    const countOf = (...labels) => changedSpecs.filter((spec) => labels.includes(spec.classification)).length;
    return {
        baseTotal: base.results.length,
        headTotal: head.results.length,
        passRateDelta: round(head.summary.passRate - base.summary.passRate, 4),
        scoreDelta: round(averageScore(head) - averageScore(base), 4),
        regressions: countOf("new_failure", "score_drop", "execution_error"),
        improvements: countOf("fixed_failure", "score_improve"),
        added: countOf("added"),
        removed: countOf("removed"),
    };
}
479
/**
 * Print a diff result to stdout in human-readable form: a header, the
 * summary metrics, then one line per changed spec (plus the head error, if
 * any), or a "no changes" notice.
 *
 * @param {object} result - Diff result (`metadata`, `summary`, `changedSpecs`).
 * @returns {void}
 */
function printHumanResults(result) {
    const asPercent = (fraction) => roundPct(fraction, 1).toFixed(1);
    const plusSign = (delta) => (delta > 0 ? "+" : "");
    console.log("\n🔄 Behavioral Diff Results");
    const { metadata, summary } = result;
    console.log(`📊 Base: ${metadata.baseSource} (${summary.baseTotal} specs)`);
    console.log(`📈 Head: ${metadata.headSource} (${summary.headTotal} specs)`);
    console.log("\n📈 Summary:");
    console.log(` 📊 Pass Rate Delta: ${asPercent(summary.passRateDelta)}%`);
    console.log(` 🎯 Score Delta: ${asPercent(summary.scoreDelta)}%`);
    console.log(` 📉 Regressions: ${summary.regressions}`);
    console.log(` 📈 Improvements: ${summary.improvements}`);
    console.log(` ➕ Added: ${summary.added}`);
    console.log(` ➖ Removed: ${summary.removed}`);
    if (!(result.changedSpecs.length > 0)) {
        console.log("\n✅ No changes detected");
        return;
    }
    console.log("\n🔍 Changed Specifications:");
    for (const spec of result.changedSpecs) {
        const icon = getClassificationIcon(spec.classification);
        const { scoreDelta, durationDelta } = spec.deltas;
        // Zero or missing deltas are omitted from the line.
        let scoreInfo = "";
        if (scoreDelta) {
            scoreInfo = ` (${plusSign(scoreDelta)}${asPercent(scoreDelta)}%)`;
        }
        let durationInfo = "";
        if (durationDelta) {
            durationInfo = ` (${plusSign(durationDelta)}${durationDelta}ms)`;
        }
        console.log(` ${icon} ${spec.name}${scoreInfo}${durationInfo}`);
        if (spec.head?.error) {
            console.log(` ❌ ${spec.head.error}`);
        }
    }
}
513
/**
 * Map a classification label to the emoji shown next to a spec.
 *
 * @param {string} classification - Classification label.
 * @returns {string} The emoji for known labels, "❓" for anything else.
 */
function getClassificationIcon(classification) {
    const icons = Object.fromEntries([
        ["new_failure", "🆘"],
        ["fixed_failure", "✅"],
        ["score_drop", "📉"],
        ["score_improve", "📈"],
        ["execution_error", "❌"],
        ["skipped_change", "⏭️"],
        ["added", "➕"],
        ["removed", "➖"],
    ]);
    const icon = icons[classification];
    return icon ? icon : "❓";
}
529
/**
 * Print a diff result to stdout as JSON, pretty-printed with 2-space
 * indentation.
 *
 * @param {object} result - Diff result to serialize.
 * @returns {void}
 */
function printJsonResults(result) {
    const serialized = JSON.stringify(result, null, 2);
    console.log(serialized);
}
535
/**
 * Append the markdown diff summary to the file named by the
 * `GITHUB_STEP_SUMMARY` environment variable.
 *
 * Does nothing when the variable is unset or empty. Failures while
 * generating or writing the summary are logged as a warning and never
 * propagate to the caller.
 *
 * @param {object} result - Diff result passed to `generateGitHubSummary`.
 * @returns {Promise<void>}
 */
async function writeGitHubStepSummary(result) {
    const summaryPath = process.env.GITHUB_STEP_SUMMARY;
    if (!summaryPath) {
        return; // Variable not set: nothing to write.
    }
    try {
        const markdown = generateGitHubSummary(result);
        await fs.appendFile(summaryPath, markdown + "\n", "utf-8");
    }
    catch (error) {
        console.warn("Warning: Could not write GitHub Step Summary:", error);
    }
}
551
/**
 * Render a diff result as markdown for the GitHub Actions step summary.
 *
 * Sections, in order: title, summary metrics, regression verdict, up to five
 * regressed specs (with a "... and N more" line beyond that), artifact paths,
 * and a collapsible JSON block with the raw numbers.
 *
 * @param {object} result - Diff result (`summary`, `changedSpecs`, `base`, `head`).
 * @returns {string} Markdown text, lines joined with "\n".
 */
function generateGitHubSummary(result) {
    const { summary } = result;
    const asPercent = (fraction) => roundPct(fraction, 1).toFixed(1);
    const maxListed = 5;
    const metrics = [
        "## 🤖 EvalAI Diff Results\n",
        "### 📊 Summary Metrics",
        `- **Pass Rate Delta**: ${asPercent(summary.passRateDelta)}%`,
        `- **Score Delta**: ${asPercent(summary.scoreDelta)}%`,
        `- **🚨 Regressions**: ${summary.regressions}`,
        `- **📈 Improvements**: ${summary.improvements}`,
        `- **➕ Added**: ${summary.added}`,
        `- **➖ Removed**: ${summary.removed}`,
        "",
    ];
    const verdict = summary.regressions > 0
        ? [
            "### 🚨 Regressions Detected\n",
            "**⚠️ This PR contains regressions that should be reviewed.**\n",
        ]
        : [
            "### ✅ No Regressions Detected\n",
            "**🎉 All tests passed! No regressions found.**\n",
        ];
    const regressionLabels = ["new_failure", "execution_error", "score_drop"];
    const regressed = result.changedSpecs.filter((spec) => regressionLabels.includes(spec.classification));
    const regressionSection = [];
    if (regressed.length > 0) {
        regressionSection.push("### 🔍 Top Regressions\n");
        regressed.slice(0, maxListed).forEach((spec) => {
            const icon = getClassificationIcon(spec.classification);
            const { scoreDelta } = spec.deltas;
            let scoreInfo = "";
            if (scoreDelta) {
                scoreInfo = ` (${scoreDelta > 0 ? "+" : ""}${asPercent(scoreDelta)}%)`;
            }
            regressionSection.push(`${icon} **${spec.name}**${scoreInfo}`);
            regressionSection.push(` Classification: \`${spec.classification}\``);
            if (spec.head?.error) {
                regressionSection.push(` Error: \`${spec.head.error}\``);
            }
            regressionSection.push("");
        });
        if (regressed.length > maxListed) {
            regressionSection.push(`... and ${regressed.length - maxListed} more regressions\n`);
        }
    }
    const artifacts = [
        "### 📁 Artifacts\n",
        `- **Run Index**: \`.evalai/runs/index.json\``,
        `- **Latest Run**: \`.evalai/runs/${result.head.runId}.json\``,
        `- **Last Run**: \`.evalai/last-run.json\``,
        "",
    ];
    const rawNumbers = {
        baseRunId: result.base.runId,
        headRunId: result.head.runId,
        baseTotal: summary.baseTotal,
        headTotal: summary.headTotal,
        passRateDelta: summary.passRateDelta,
        scoreDelta: summary.scoreDelta,
        regressions: summary.regressions,
        improvements: summary.improvements,
        added: summary.added,
        removed: summary.removed,
    };
    const technicalDetails = [
        "<details>",
        "<summary>🔧 Technical Details</summary>",
        "",
        "```json",
        JSON.stringify(rawNumbers, null, 2),
        "```",
        "",
        "</details>",
    ];
    return [
        ...metrics,
        ...verdict,
        ...regressionSection,
        ...artifacts,
        ...technicalDetails,
    ].join("\n");
}
625
/**
 * CLI entry point for `evalai diff`.
 *
 * Runs the diff, prints it (JSON when `options.format === "json"`, otherwise
 * human-readable), writes the GitHub step summary, then terminates the
 * process.
 *
 * Exit codes:
 *   0 - no regressions
 *   1 - at least one regression
 *   2 - any error (missing reports, unsupported schema, ...)
 *
 * @param {object} options - CLI options (`base`, `head`, `format`).
 * @returns {Promise<void>} In practice the process exits before this resolves.
 */
async function runDiffCLI(options) {
    try {
        const result = await runDiff(options);
        if (options.format === "json") {
            printJsonResults(result);
        }
        else {
            printHumanResults(result);
        }
        // No-op unless GITHUB_STEP_SUMMARY is set.
        await writeGitHubStepSummary(result);
        process.exit(result.summary.regressions > 0 ? 1 : 0);
    }
    catch (error) {
        console.error(`EvalAI ERROR: ${error instanceof Error ? error.message : String(error)}`);
        // Every failure exits with code 2. The previous CI/git-ref special
        // case exited with the same code in both branches, so it was dead
        // logic and has been collapsed.
        process.exit(2);
    }
}
662
+ // Public diff core API surface
663
+ exports.diffCore = {
664
+ /**
665
+ * Compare two run reports and return diff result
666
+ */
667
+ diffRunReports: compareReports,
668
+ /**
669
+ * Classify the type of change between two specs
670
+ */
671
+ classifyChange: classifyDiff,
672
+ /**
673
+ * Calculate summary statistics for a diff
674
+ */
675
+ summarizeDiff: calculateDiffSummary,
676
+ /**
677
+ * Calculate deltas between two spec results
678
+ */
679
+ calculateDeltas,
680
+ };