@pauly4010/evalai-sdk 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/README.md +136 -23
  2. package/dist/assertions.js +51 -18
  3. package/dist/batch.js +8 -2
  4. package/dist/cli/api.js +3 -1
  5. package/dist/cli/check.js +19 -6
  6. package/dist/cli/ci-context.js +3 -1
  7. package/dist/cli/config.js +28 -8
  8. package/dist/cli/diff.js +14 -9
  9. package/dist/cli/discover.js +18 -7
  10. package/dist/cli/doctor.js +43 -9
  11. package/dist/cli/explain.js +37 -11
  12. package/dist/cli/formatters/human.js +4 -1
  13. package/dist/cli/formatters/pr-comment.js +3 -1
  14. package/dist/cli/gate.js +6 -2
  15. package/dist/cli/impact-analysis.js +6 -5
  16. package/dist/cli/index.js +18 -6
  17. package/dist/cli/manifest.d.ts +3 -5
  18. package/dist/cli/manifest.js +21 -14
  19. package/dist/cli/migrate.js +4 -4
  20. package/dist/cli/policy-packs.js +8 -2
  21. package/dist/cli/print-config.js +19 -4
  22. package/dist/cli/regression-gate.js +8 -2
  23. package/dist/cli/report/build-check-report.js +8 -2
  24. package/dist/cli/run.js +11 -5
  25. package/dist/cli/share.js +3 -1
  26. package/dist/cli/upgrade.js +2 -1
  27. package/dist/client.d.ts +16 -19
  28. package/dist/client.js +60 -43
  29. package/dist/client.request.test.d.ts +1 -1
  30. package/dist/client.request.test.js +222 -147
  31. package/dist/context.js +3 -1
  32. package/dist/errors.js +11 -4
  33. package/dist/export.js +3 -1
  34. package/dist/index.d.ts +8 -8
  35. package/dist/index.js +19 -19
  36. package/dist/integrations/anthropic.d.ts +20 -1
  37. package/dist/integrations/openai-eval.js +4 -2
  38. package/dist/integrations/openai.d.ts +24 -1
  39. package/dist/local.js +3 -1
  40. package/dist/logger.js +6 -2
  41. package/dist/pagination.js +6 -2
  42. package/dist/runtime/adapters/config-to-dsl.js +12 -9
  43. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +1 -1
  44. package/dist/runtime/adapters/testsuite-to-dsl.js +11 -6
  45. package/dist/runtime/eval.d.ts +1 -1
  46. package/dist/runtime/eval.js +12 -5
  47. package/dist/runtime/execution-mode.js +13 -9
  48. package/dist/runtime/registry.js +8 -21
  49. package/dist/runtime/run-report.d.ts +0 -2
  50. package/dist/runtime/run-report.js +12 -10
  51. package/dist/testing.js +7 -2
  52. package/dist/types.d.ts +100 -69
  53. package/dist/utils/input-hash.js +4 -1
  54. package/dist/version.d.ts +1 -1
  55. package/dist/version.js +1 -1
  56. package/dist/workflows.js +62 -14
  57. package/package.json +115 -111
package/README.md CHANGED
@@ -7,41 +7,150 @@
7
7
  [![Contract Version](https://img.shields.io/badge/report%20schema-v1-blue.svg)](#)
8
8
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
9
 
10
- **Stop LLM regressions in CI in minutes.**
10
+ **One-command CI for AI evaluation. Complete pipeline: discover → manifest → impact → run → diff → PR summary.**
11
11
 
12
- Zero to gate in under 5 minutes. No infra. No lock-in. Remove anytime.
12
+ Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
13
13
 
14
14
  ---
15
15
 
16
- ## Quick Start (2 minutes)
16
+ ## Quick Start (60 seconds)
17
+
18
+ Add this to your `.github/workflows/evalai.yml`:
19
+
20
+ ```yaml
21
+ name: EvalAI CI
22
+ on: [push, pull_request]
23
+ jobs:
24
+   evalai:
25
+     runs-on: ubuntu-latest
26
+     steps:
27
+       - uses: actions/checkout@v4
28
+       - uses: actions/setup-node@v4
29
+       - run: npm ci
30
+       - run: npx @pauly4010/evalai-sdk ci --format github --write-results --base main
31
+       - uses: actions/upload-artifact@v4
32
+         if: always()
33
+         with:
34
+           name: evalai-results
35
+           path: .evalai/
36
+ ```
37
+
38
+ Create `eval/your-spec.spec.ts`:
39
+
40
+ ```typescript
41
+ import { defineEval } from "@pauly4010/evalai-sdk";
42
+
43
+ defineEval({
44
+ name: "Basic Math Operations",
45
+ description: "Test fundamental arithmetic",
46
+ prompt: "Test: 1+1=2, string concatenation, array includes",
47
+ expected: "All tests should pass",
48
+ tags: ["basic", "math"],
49
+ category: "unit-test"
50
+ });
51
+ ```
17
52
 
18
53
  ```bash
19
- npx @pauly4010/evalai-sdk init
20
- git add evals/ .github/workflows/evalai-gate.yml evalai.config.json
21
- git commit -m "chore: add EvalAI regression gate"
54
+ git add .github/workflows/evalai.yml eval/
55
+ git commit -m "feat: add EvalAI CI pipeline"
22
56
  git push
23
57
  ```
24
58
 
25
- That's it. Open a PR and CI blocks regressions automatically.
26
-
27
- `evalai init` detects your project, creates a baseline from your current tests, and installs a GitHub Actions workflow. No manual config needed.
59
+ That's it! Your CI now:
60
+ - ✅ Discovers evaluation specs automatically
61
+ - ✅ Runs only impacted specs (smart caching)
62
+ - ✅ Compares results against base branch
63
+ - ✅ Posts a rich summary in the PR, including regressions
64
+ - ✅ Exits with proper codes (0=clean, 1=regressions, 2=config error)
28
65
 
29
66
  ---
30
67
 
31
- ## What `evalai init` does
68
+ ## 🚀 New in v1.9.0: One-Command CI
69
+
70
+ ### `evalai ci` - Complete CI Pipeline
71
+
72
+ ```bash
73
+ npx @pauly4010/evalai-sdk ci --format github --write-results --base main
74
+ ```
32
75
 
33
- 1. **Detects** your Node repo and package manager (npm/yarn/pnpm)
34
- 2. **Runs your tests** to capture a real baseline (pass/fail + test count)
35
- 3. **Creates `evals/baseline.json`** with provenance metadata
36
- 4. **Installs `.github/workflows/evalai-gate.yml`** (package-manager aware)
37
- 5. **Creates `evalai.config.json`**
38
- 6. **Prints next steps** — just commit and push
76
+ **What it does:**
77
+ 1. **Discover** - Finds all evaluation specs automatically
78
+ 2. **Manifest** - Builds a stable manifest if missing
79
+ 3. **Impact Analysis** - Runs only specs impacted by changes (optional)
80
+ 4. **Run** - Executes evaluations with artifact retention
81
+ 5. **Diff** - Compares results against base branch
82
+ 6. **PR Summary** - Posts rich markdown summary to GitHub
83
+ 7. **Debug Flow** - Prints a copy/paste next step on failure
84
+
85
+ **Advanced Options:**
86
+ ```bash
87
+ npx @pauly4010/evalai-sdk ci --base main --impacted-only # Run only impacted specs
88
+ npx @pauly4010/evalai-sdk ci --format json --write-results # JSON output for automation
89
+ npx @pauly4010/evalai-sdk ci --base develop # Custom base branch
90
+ ```
91
+
92
+ ### Smart Diffing & GitHub Integration
93
+
94
+ ```bash
95
+ npx @pauly4010/evalai-sdk diff --base main --head last --format github
96
+ ```
97
+
98
+ **Features:**
99
+ - 📊 Pass rate delta and score changes
100
+ - 🚨 Regression detection with classifications
101
+ - 📈 Improvements and new specs
102
+ - 📁 Artifact links and technical details
103
+ - 🎯 Exit codes: 0=clean, 1=regressions, 2=config error
104
+
105
+ ### Self-Documenting Failures
106
+
107
+ Every failure prints a clear next step:
108
+
109
+ ```
110
+ 🔧 Next step for debugging:
111
+ Download base artifact and run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json
112
+ Artifacts: .evalai/runs/
113
+ ```
39
114
 
40
115
  ---
41
116
 
42
117
  ## CLI Commands
43
118
 
44
- ### Regression Gate (local, no account needed)
119
+ ### 🚀 One-Command CI (v1.9.0)
120
+
121
+ | Command | Description |
122
+ |---------|-------------|
123
+ | `npx evalai ci` | Complete CI pipeline: discover → manifest → impact → run → diff → PR summary |
124
+ | `npx evalai ci --base main` | Run CI with diff against main branch |
125
+ | `npx evalai ci --impacted-only` | Run only specs impacted by changes |
126
+ | `npx evalai ci --format github` | GitHub Step Summary with rich markdown |
127
+ | `npx evalai ci --format json` | JSON output for automation |
128
+
129
+ ### Discovery & Manifest
130
+
131
+ | Command | Description |
132
+ |---------|-------------|
133
+ | `npx evalai discover` | Find and analyze evaluation specs |
134
+ | `npx evalai discover --manifest` | Generate stable manifest for incremental analysis |
135
+
136
+ ### Impact Analysis
137
+
138
+ | Command | Description |
139
+ |---------|-------------|
140
+ | `npx evalai impact-analysis --base main` | Analyze impact of changes |
141
+ | `npx evalai impact-analysis --changed-files file1.ts,file2.ts` | Analyze specific changed files |
142
+
143
+ ### Run & Diff
144
+
145
+ | Command | Description |
146
+ |---------|-------------|
147
+ | `npx evalai run` | Run evaluation specifications |
148
+ | `npx evalai run --write-results` | Run with artifact retention |
149
+ | `npx evalai diff --base main` | Compare results against base branch |
150
+ | `npx evalai diff --base last --head last` | Compare last two runs |
151
+ | `npx evalai diff --format github` | GitHub Step Summary with regressions |
152
+
153
+ ### Legacy Regression Gate (local, no account needed)
45
154
 
46
155
  | Command | Description |
47
156
  |---------|-------------|
@@ -68,25 +177,29 @@ That's it. Open a PR and CI blocks regressions automatically.
68
177
  | `npx evalai explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
69
178
  | `npx evalai print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
70
179
 
180
+ ### Migration Tools
181
+
182
+ | Command | Description |
183
+ |---------|-------------|
184
+ | `npx evalai migrate config --in evalai.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
185
+
71
186
  **Guided failure flow:**
72
187
 
73
188
  ```
74
- evalai check → fails → "Next: evalai explain"
189
+ evalai ci → fails → "Next: evalai explain --report .evalai/last-run.json"
75
190
 
76
191
  evalai explain → root causes + fixes
77
192
  ```
78
193
 
79
- **GitHub Actions step summary** — gate result at a glance:
194
+ **GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
80
195
 
81
- ![GitHub Actions step summary showing gate pass/fail with delta table](../../docs/images/evalai-gate-step-summary.svg)
196
+ ![GitHub Actions step summary showing CI pass/fail with delta table](../../docs/images/evalai-gate-step-summary.svg)
82
197
 
83
198
  **`evalai explain` terminal output** — root causes + fix commands:
84
199
 
85
200
  ![Terminal output of evalai explain showing top failures and suggested fixes](../../docs/images/evalai-explain-terminal.svg)
86
201
 
87
- `check` automatically writes `.evalai/last-report.json` so `explain` works with zero flags.
88
-
89
- `doctor` uses exit codes: **0** = ready, **2** = not ready, **3** = infra error. Use `--report` for a JSON diagnostic bundle.
202
+ All commands automatically write artifacts so `explain` works with zero flags.
90
203
 
91
204
  ### Gate Exit Codes
92
205
 
@@ -86,7 +86,9 @@ class Expectation {
86
86
  expected: substring,
87
87
  actual: text,
88
88
  message: message ||
89
- (passed ? `Text contains "${substring}"` : `Text does not contain "${substring}"`),
89
+ (passed
90
+ ? `Text contains "${substring}"`
91
+ : `Text does not contain "${substring}"`),
90
92
  };
91
93
  }
92
94
  /**
@@ -103,7 +105,9 @@ class Expectation {
103
105
  expected: keywords,
104
106
  actual: text,
105
107
  message: message ||
106
- (passed ? `Contains all keywords` : `Missing keywords: ${missingKeywords.join(", ")}`),
108
+ (passed
109
+ ? `Contains all keywords`
110
+ : `Missing keywords: ${missingKeywords.join(", ")}`),
107
111
  };
108
112
  }
109
113
  /**
@@ -119,7 +123,9 @@ class Expectation {
119
123
  expected: `not containing "${substring}"`,
120
124
  actual: text,
121
125
  message: message ||
122
- (passed ? `Text does not contain "${substring}"` : `Text contains "${substring}"`),
126
+ (passed
127
+ ? `Text does not contain "${substring}"`
128
+ : `Text contains "${substring}"`),
123
129
  };
124
130
  }
125
131
  /**
@@ -144,7 +150,8 @@ class Expectation {
144
150
  passed,
145
151
  expected: "no PII",
146
152
  actual: foundPII.length > 0 ? `Found: ${foundPII.join(", ")}` : "no PII",
147
- message: message || (passed ? "No PII detected" : `PII detected: ${foundPII.join(", ")}`),
153
+ message: message ||
154
+ (passed ? "No PII detected" : `PII detected: ${foundPII.join(", ")}`),
148
155
  };
149
156
  }
150
157
  /**
@@ -159,7 +166,10 @@ class Expectation {
159
166
  passed,
160
167
  expected: pattern.toString(),
161
168
  actual: text,
162
- message: message || (passed ? `Matches pattern ${pattern}` : `Does not match pattern ${pattern}`),
169
+ message: message ||
170
+ (passed
171
+ ? `Matches pattern ${pattern}`
172
+ : `Does not match pattern ${pattern}`),
163
173
  };
164
174
  }
165
175
  /**
@@ -205,7 +215,8 @@ class Expectation {
205
215
  passed,
206
216
  expected: schema,
207
217
  actual: parsedJson,
208
- message: message || (passed ? "JSON matches schema" : "JSON does not match schema"),
218
+ message: message ||
219
+ (passed ? "JSON matches schema" : "JSON does not match schema"),
209
220
  };
210
221
  }
211
222
  /**
@@ -253,7 +264,10 @@ class Expectation {
253
264
  passed,
254
265
  expected,
255
266
  actual,
256
- message: message || (passed ? `Sentiment is ${expected}` : `Expected ${expected}, got ${actual}`),
267
+ message: message ||
268
+ (passed
269
+ ? `Sentiment is ${expected}`
270
+ : `Expected ${expected}, got ${actual}`),
257
271
  };
258
272
  }
259
273
  /**
@@ -269,7 +283,10 @@ class Expectation {
269
283
  passed,
270
284
  expected: range,
271
285
  actual: length,
272
- message: message || (passed ? `Length ${length} is within range` : `Length ${length} not in range`),
286
+ message: message ||
287
+ (passed
288
+ ? `Length ${length} is within range`
289
+ : `Length ${length} not in range`),
273
290
  };
274
291
  }
275
292
  /**
@@ -284,9 +301,13 @@ class Expectation {
284
301
  name: "toNotHallucinate",
285
302
  passed,
286
303
  expected: "all ground truth facts",
287
- actual: missingFacts.length > 0 ? `Missing: ${missingFacts.join(", ")}` : "all facts present",
304
+ actual: missingFacts.length > 0
305
+ ? `Missing: ${missingFacts.join(", ")}`
306
+ : "all facts present",
288
307
  message: message ||
289
- (passed ? "No hallucinations detected" : `Missing facts: ${missingFacts.join(", ")}`),
308
+ (passed
309
+ ? "No hallucinations detected"
310
+ : `Missing facts: ${missingFacts.join(", ")}`),
290
311
  };
291
312
  }
292
313
  /**
@@ -301,7 +322,10 @@ class Expectation {
301
322
  passed,
302
323
  expected: `<= ${maxMs}ms`,
303
324
  actual: `${duration}ms`,
304
- message: message || (passed ? `${duration}ms within limit` : `${duration}ms exceeds ${maxMs}ms`),
325
+ message: message ||
326
+ (passed
327
+ ? `${duration}ms within limit`
328
+ : `${duration}ms exceeds ${maxMs}ms`),
305
329
  };
306
330
  }
307
331
  /**
@@ -344,7 +368,8 @@ class Expectation {
344
368
  passed,
345
369
  expected: `> ${expected}`,
346
370
  actual: value,
347
- message: message || (passed ? `${value} > ${expected}` : `${value} <= ${expected}`),
371
+ message: message ||
372
+ (passed ? `${value} > ${expected}` : `${value} <= ${expected}`),
348
373
  };
349
374
  }
350
375
  /**
@@ -359,7 +384,8 @@ class Expectation {
359
384
  passed,
360
385
  expected: `< ${expected}`,
361
386
  actual: value,
362
- message: message || (passed ? `${value} < ${expected}` : `${value} >= ${expected}`),
387
+ message: message ||
388
+ (passed ? `${value} < ${expected}` : `${value} >= ${expected}`),
363
389
  };
364
390
  }
365
391
  /**
@@ -374,7 +400,8 @@ class Expectation {
374
400
  passed,
375
401
  expected: `between ${min} and ${max}`,
376
402
  actual: value,
377
- message: message || (passed ? `${value} is within range` : `${value} is outside range`),
403
+ message: message ||
404
+ (passed ? `${value} is within range` : `${value} is outside range`),
378
405
  };
379
406
  }
380
407
  /**
@@ -389,7 +416,8 @@ class Expectation {
389
416
  passed: hasCodeBlock,
390
417
  expected: "code block",
391
418
  actual: text,
392
- message: message || (hasCodeBlock ? "Contains code block" : "No code block found"),
419
+ message: message ||
420
+ (hasCodeBlock ? "Contains code block" : "No code block found"),
393
421
  };
394
422
  }
395
423
  /**
@@ -405,9 +433,13 @@ class Expectation {
405
433
  name: "toBeProfessional",
406
434
  passed,
407
435
  expected: "professional tone",
408
- actual: foundProfanity.length > 0 ? `Found: ${foundProfanity.join(", ")}` : "professional",
436
+ actual: foundProfanity.length > 0
437
+ ? `Found: ${foundProfanity.join(", ")}`
438
+ : "professional",
409
439
  message: message ||
410
- (passed ? "Professional tone" : `Unprofessional language: ${foundProfanity.join(", ")}`),
440
+ (passed
441
+ ? "Professional tone"
442
+ : `Unprofessional language: ${foundProfanity.join(", ")}`),
411
443
  };
412
444
  }
413
445
  /**
@@ -432,7 +464,8 @@ class Expectation {
432
464
  passed,
433
465
  expected: "proper grammar",
434
466
  actual: issues.length > 0 ? `Issues: ${issues.join(", ")}` : "proper grammar",
435
- message: message || (passed ? "Proper grammar" : `Grammar issues: ${issues.join(", ")}`),
467
+ message: message ||
468
+ (passed ? "Proper grammar" : `Grammar issues: ${issues.join(", ")}`),
436
469
  };
437
470
  }
438
471
  }
package/dist/batch.js CHANGED
@@ -81,7 +81,8 @@ class RequestBatcher {
81
81
  pendingRequest.resolve(response.data);
82
82
  }
83
83
  else {
84
- pendingRequest.reject(new Error(response.error || `Request failed with status ${response.status}`));
84
+ pendingRequest.reject(new Error(response.error ||
85
+ `Request failed with status ${response.status}`));
85
86
  }
86
87
  }
87
88
  }
@@ -149,7 +150,12 @@ function canBatch(method, endpoint) {
149
150
  if (method !== "GET") {
150
151
  return false;
151
152
  }
152
- const batchableEndpoints = ["/traces", "/evaluations", "/annotations", "/results"];
153
+ const batchableEndpoints = [
154
+ "/traces",
155
+ "/evaluations",
156
+ "/annotations",
157
+ "/results",
158
+ ];
153
159
  return batchableEndpoints.some((pattern) => endpoint.includes(pattern));
154
160
  }
155
161
  /**
package/dist/cli/api.js CHANGED
@@ -73,7 +73,9 @@ async function publishShare(baseUrl, apiKey, evaluationId, exportData, evaluatio
73
73
  exportData,
74
74
  shareScope: "run",
75
75
  evaluationRunId,
76
- ...(options?.expiresInDays != null && { expiresInDays: options.expiresInDays }),
76
+ ...(options?.expiresInDays != null && {
77
+ expiresInDays: options.expiresInDays,
78
+ }),
77
79
  };
78
80
  const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evaluationId}/publish`;
79
81
  try {
package/dist/cli/check.js CHANGED
@@ -183,7 +183,11 @@ function parseArgs(argv) {
183
183
  };
184
184
  }
185
185
  if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
186
- return { ok: false, exitCode: constants_1.EXIT.BAD_ARGS, message: "Error: --minScore must be 0-100" };
186
+ return {
187
+ ok: false,
188
+ exitCode: constants_1.EXIT.BAD_ARGS,
189
+ message: "Error: --minScore must be 0-100",
190
+ };
187
191
  }
188
192
  if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
189
193
  return {
@@ -210,9 +214,15 @@ function parseArgs(argv) {
210
214
  onFail,
211
215
  share,
212
216
  prCommentOut,
213
- maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd) ? maxCostUsd : undefined,
214
- maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs) ? maxLatencyMs : undefined,
215
- maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd) ? maxCostDeltaUsd : undefined,
217
+ maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
218
+ ? maxCostUsd
219
+ : undefined,
220
+ maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs)
221
+ ? maxLatencyMs
222
+ : undefined,
223
+ maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
224
+ ? maxCostDeltaUsd
225
+ : undefined,
216
226
  },
217
227
  };
218
228
  }
@@ -297,7 +307,8 @@ async function runCheck(args) {
297
307
  runDetails?.results &&
298
308
  quality?.evaluationRunId) {
299
309
  const importResults = runDetails.results
300
- .filter((r) => r.testCaseId != null && (r.status === "passed" || r.status === "failed"))
310
+ .filter((r) => r.testCaseId != null &&
311
+ (r.status === "passed" || r.status === "failed"))
301
312
  .map((r) => ({
302
313
  testCaseId: r.testCaseId,
303
314
  status: r.status,
@@ -306,7 +317,9 @@ async function runCheck(args) {
306
317
  assertionsJson: r.assertionsJson,
307
318
  }));
308
319
  if (importResults.length > 0) {
309
- const idempotencyKey = ci ? (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci) : undefined;
320
+ const idempotencyKey = ci
321
+ ? (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci)
322
+ : undefined;
310
323
  const importRes = await (0, api_1.importRunOnFail)(args.baseUrl, args.apiKey, args.evaluationId, importResults, {
311
324
  idempotencyKey,
312
325
  ci,
@@ -89,7 +89,9 @@ function captureCiContext() {
89
89
  provider,
90
90
  repo,
91
91
  sha,
92
- branch: ref?.startsWith("refs/heads/") ? ref.slice("refs/heads/".length) : ref,
92
+ branch: ref?.startsWith("refs/heads/")
93
+ ? ref.slice("refs/heads/".length)
94
+ : ref,
93
95
  runUrl,
94
96
  actor,
95
97
  pr,
@@ -43,7 +43,11 @@ exports.mergeConfigWithArgs = mergeConfigWithArgs;
43
43
  const fs = __importStar(require("node:fs"));
44
44
  const path = __importStar(require("node:path"));
45
45
  const profiles_1 = require("./profiles");
46
- const CONFIG_FILES = ["evalai.config.json", "evalai.config.js", "evalai.config.cjs"];
46
+ const CONFIG_FILES = [
47
+ "evalai.config.json",
48
+ "evalai.config.js",
49
+ "evalai.config.cjs",
50
+ ];
47
51
  /**
48
52
  * Find config file path in directory, walking up to root
49
53
  */
@@ -113,7 +117,11 @@ function loadConfig(cwd = process.cwd()) {
113
117
  }
114
118
  for (const key of Object.keys(config.packages)) {
115
119
  if (relNorm === key || relNorm.startsWith(`${key}/`)) {
116
- return { ...config, ...config.packages[key], packages: config.packages };
120
+ return {
121
+ ...config,
122
+ ...config.packages[key],
123
+ packages: config.packages,
124
+ };
117
125
  }
118
126
  }
119
127
  }
@@ -156,11 +164,14 @@ function mergeConfigWithArgs(config, args) {
156
164
  merged.minScore = profile.minScore;
157
165
  if (merged.maxDrop === undefined && args.maxDrop === undefined)
158
166
  merged.maxDrop = profile.maxDrop;
159
- if (merged.warnDrop === undefined && args.warnDrop === undefined && "warnDrop" in profile)
167
+ if (merged.warnDrop === undefined &&
168
+ args.warnDrop === undefined &&
169
+ "warnDrop" in profile)
160
170
  merged.warnDrop = profile.warnDrop;
161
171
  if (merged.minN === undefined && args.minN === undefined)
162
172
  merged.minN = profile.minN;
163
- if (merged.allowWeakEvidence === undefined && args.allowWeakEvidence === undefined)
173
+ if (merged.allowWeakEvidence === undefined &&
174
+ args.allowWeakEvidence === undefined)
164
175
  merged.allowWeakEvidence = profile.allowWeakEvidence;
165
176
  }
166
177
  // Args override
@@ -172,18 +183,27 @@ function mergeConfigWithArgs(config, args) {
172
183
  }
173
184
  if (args.minScore !== undefined) {
174
185
  merged.minScore =
175
- typeof args.minScore === "number" ? args.minScore : parseInt(String(args.minScore), 10);
186
+ typeof args.minScore === "number"
187
+ ? args.minScore
188
+ : parseInt(String(args.minScore), 10);
176
189
  }
177
190
  if (args.maxDrop !== undefined) {
178
191
  merged.maxDrop =
179
- typeof args.maxDrop === "number" ? args.maxDrop : parseInt(String(args.maxDrop), 10);
192
+ typeof args.maxDrop === "number"
193
+ ? args.maxDrop
194
+ : parseInt(String(args.maxDrop), 10);
180
195
  }
181
196
  if (args.warnDrop !== undefined) {
182
197
  merged.warnDrop =
183
- typeof args.warnDrop === "number" ? args.warnDrop : parseInt(String(args.warnDrop), 10);
198
+ typeof args.warnDrop === "number"
199
+ ? args.warnDrop
200
+ : parseInt(String(args.warnDrop), 10);
184
201
  }
185
202
  if (args.minN !== undefined) {
186
- merged.minN = typeof args.minN === "number" ? args.minN : parseInt(String(args.minN), 10);
203
+ merged.minN =
204
+ typeof args.minN === "number"
205
+ ? args.minN
206
+ : parseInt(String(args.minN), 10);
187
207
  }
188
208
  if (args.allowWeakEvidence !== undefined) {
189
209
  merged.allowWeakEvidence =
package/dist/cli/diff.js CHANGED
@@ -186,7 +186,7 @@ async function findBaselineRun() {
186
186
  const content = await fs.readFile(workspace.baselinePath, "utf-8");
187
187
  return JSON.parse(content);
188
188
  }
189
- catch (error) {
189
+ catch (_error) {
190
190
  // Baseline file doesn't exist, try index
191
191
  }
192
192
  // Check index for baseline runId
@@ -200,7 +200,7 @@ async function findBaselineRun() {
200
200
  return await loadRunReport(`.evalai/runs/${oldestRunId}.json`);
201
201
  }
202
202
  }
203
- catch (error) {
203
+ catch (_error) {
204
204
  // Index doesn't exist
205
205
  }
206
206
  throw new Error("No baseline run found. Set a baseline with 'evalai diff --base <runId> --head last --set-baseline' or create .evalai/baseline-run.json.");
@@ -239,7 +239,8 @@ async function findPreviousRun() {
239
239
  return await loadRunReport(`.evalai/runs/${previousRunId}.json`);
240
240
  }
241
241
  catch (error) {
242
- if (error instanceof Error && error.message.includes("Need at least 2 runs")) {
242
+ if (error instanceof Error &&
243
+ error.message.includes("Need at least 2 runs")) {
243
244
  throw error;
244
245
  }
245
246
  throw new Error("No run history found. Run 'evalai run --write-results' first.");
@@ -261,7 +262,7 @@ async function isBranchName(name) {
261
262
  /**
262
263
  * Find last run for a branch
263
264
  */
264
- async function findLastRunForBranch(branch) {
265
+ async function findLastRunForBranch(_branch) {
265
266
  // For now, just look for .evalai/last-run.json
266
267
  // In a real implementation, this would:
267
268
  // 1. Check CI artifacts for the branch
@@ -272,7 +273,7 @@ async function findLastRunForBranch(branch) {
272
273
  const content = await fs.readFile(lastRunPath, "utf-8");
273
274
  return JSON.parse(content);
274
275
  }
275
- catch (error) {
276
+ catch (_error) {
276
277
  return null;
277
278
  }
278
279
  }
@@ -284,7 +285,7 @@ async function loadRunReport(filePath) {
284
285
  const content = await fs.readFile(path.resolve(filePath), "utf-8");
285
286
  return JSON.parse(content);
286
287
  }
287
- catch (error) {
288
+ catch (_error) {
288
289
  return null;
289
290
  }
290
291
  }
@@ -457,8 +458,12 @@ function calculateDiffSummary(base, head, changedSpecs) {
457
458
  const headScores = head.results
458
459
  .filter((r) => r.result.score !== undefined)
459
460
  .map((r) => r.result.score);
460
- const baseAvgScore = baseScores.length > 0 ? baseScores.reduce((a, b) => a + b, 0) / baseScores.length : 0;
461
- const headAvgScore = headScores.length > 0 ? headScores.reduce((a, b) => a + b, 0) / headScores.length : 0;
461
+ const baseAvgScore = baseScores.length > 0
462
+ ? baseScores.reduce((a, b) => a + b, 0) / baseScores.length
463
+ : 0;
464
+ const headAvgScore = headScores.length > 0
465
+ ? headScores.reduce((a, b) => a + b, 0) / headScores.length
466
+ : 0;
462
467
  const scoreDelta = round(headAvgScore - baseAvgScore, 4);
463
468
  // Count classifications
464
469
  const regressions = changedSpecs.filter((s) => ["new_failure", "score_drop", "execution_error"].includes(s.classification)).length;
@@ -542,7 +547,7 @@ async function writeGitHubStepSummary(result) {
542
547
  const summaryPath = process.env.GITHUB_STEP_SUMMARY;
543
548
  try {
544
549
  const summary = generateGitHubSummary(result);
545
- await fs.appendFile(summaryPath, summary + "\n", "utf-8");
550
+ await fs.appendFile(summaryPath, `${summary}\n`, "utf-8");
546
551
  }
547
552
  catch (error) {
548
553
  console.warn("Warning: Could not write GitHub Step Summary:", error);