@pauly4010/evalai-sdk 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.js +14 -9
- package/dist/cli/discover.js +18 -7
- package/dist/cli/doctor.js +43 -9
- package/dist/cli/explain.js +37 -11
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.js +6 -5
- package/dist/cli/index.js +18 -6
- package/dist/cli/manifest.d.ts +3 -5
- package/dist/cli/manifest.js +21 -14
- package/dist/cli/migrate.js +4 -4
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +19 -4
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.js +11 -5
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -8
- package/dist/index.js +19 -19
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.js +12 -9
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +1 -1
- package/dist/runtime/adapters/testsuite-to-dsl.js +11 -6
- package/dist/runtime/eval.d.ts +1 -1
- package/dist/runtime/eval.js +12 -5
- package/dist/runtime/execution-mode.js +13 -9
- package/dist/runtime/registry.js +8 -21
- package/dist/runtime/run-report.d.ts +0 -2
- package/dist/runtime/run-report.js +12 -10
- package/dist/testing.js +7 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -111
package/README.md
CHANGED
|
@@ -7,41 +7,150 @@
|
|
|
7
7
|
[](#)
|
|
8
8
|
[](https://opensource.org/licenses/MIT)
|
|
9
9
|
|
|
10
|
-
**
|
|
10
|
+
**One-command CI for AI evaluation. Complete pipeline: discover → manifest → impact → run → diff → PR summary.**
|
|
11
11
|
|
|
12
|
-
Zero to
|
|
12
|
+
Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
|
|
13
13
|
|
|
14
14
|
---
|
|
15
15
|
|
|
16
|
-
## Quick Start (
|
|
16
|
+
## Quick Start (60 seconds)
|
|
17
|
+
|
|
18
|
+
Add this to your `.github/workflows/evalai.yml`:
|
|
19
|
+
|
|
20
|
+
```yaml
|
|
21
|
+
name: EvalAI CI
|
|
22
|
+
on: [push, pull_request]
|
|
23
|
+
jobs:
|
|
24
|
+
evalai:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: actions/setup-node@v4
|
|
29
|
+
- run: npm ci
|
|
30
|
+
- run: npx @pauly4010/evalai-sdk ci --format github --write-results --base main
|
|
31
|
+
- uses: actions/upload-artifact@v4
|
|
32
|
+
if: always()
|
|
33
|
+
with:
|
|
34
|
+
name: evalai-results
|
|
35
|
+
path: .evalai/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Create `eval/your-spec.spec.ts`:
|
|
39
|
+
|
|
40
|
+
```typescript
|
|
41
|
+
import { defineEval } from "@pauly4010/evalai-sdk";
|
|
42
|
+
|
|
43
|
+
defineEval({
|
|
44
|
+
name: "Basic Math Operations",
|
|
45
|
+
description: "Test fundamental arithmetic",
|
|
46
|
+
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
47
|
+
expected: "All tests should pass",
|
|
48
|
+
tags: ["basic", "math"],
|
|
49
|
+
category: "unit-test"
|
|
50
|
+
});
|
|
51
|
+
```
|
|
17
52
|
|
|
18
53
|
```bash
|
|
19
|
-
|
|
20
|
-
git add
|
|
21
|
-
git commit -m "chore: add EvalAI regression gate"
|
|
54
|
+
git add .github/workflows/evalai.yml eval/
|
|
55
|
+
git commit -m "feat: add EvalAI CI pipeline"
|
|
22
56
|
git push
|
|
23
57
|
```
|
|
24
58
|
|
|
25
|
-
That's it
|
|
26
|
-
|
|
27
|
-
|
|
59
|
+
That's it! Your CI now:
|
|
60
|
+
- ✅ Discovers evaluation specs automatically
|
|
61
|
+
- ✅ Runs only impacted specs (smart caching)
|
|
62
|
+
- ✅ Compares results against base branch
|
|
63
|
+
- ✅ Posts rich summary in PR with regressions
|
|
64
|
+
- ✅ Exits with proper codes (0=clean, 1=regressions, 2=config)
|
|
28
65
|
|
|
29
66
|
---
|
|
30
67
|
|
|
31
|
-
##
|
|
68
|
+
## 🚀 New in v1.9.0: One-Command CI
|
|
69
|
+
|
|
70
|
+
### `evalai ci` - Complete CI Pipeline
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
npx @pauly4010/evalai-sdk ci --format github --write-results --base main
|
|
74
|
+
```
|
|
32
75
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
76
|
+
**What it does:**
|
|
77
|
+
1. **Discover** - Finds all evaluation specs automatically
|
|
78
|
+
2. **Manifest** - Builds stable manifest if missing
|
|
79
|
+
3. **Impact Analysis** - Runs only specs impacted by changes (optional)
|
|
80
|
+
4. **Run** - Executes evaluations with artifact retention
|
|
81
|
+
5. **Diff** - Compares results against base branch
|
|
82
|
+
6. **PR Summary** - Posts rich markdown summary to GitHub
|
|
83
|
+
7. **Debug Flow** - Prints copy/paste next step on failure
|
|
84
|
+
|
|
85
|
+
**Advanced Options:**
|
|
86
|
+
```bash
|
|
87
|
+
npx @pauly4010/evalai-sdk ci --base main --impacted-only # Run only impacted specs
|
|
88
|
+
npx @pauly4010/evalai-sdk ci --format json --write-results # JSON output for automation
|
|
89
|
+
npx @pauly4010/evalai-sdk ci --base develop # Custom base branch
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Smart Diffing & GitHub Integration
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
npx @pauly4010/evalai-sdk diff --base main --head last --format github
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Features:**
|
|
99
|
+
- 📊 Pass rate delta and score changes
|
|
100
|
+
- 🚨 Regression detection with classifications
|
|
101
|
+
- 📈 Improvements and new specs
|
|
102
|
+
- 📁 Artifact links and technical details
|
|
103
|
+
- 🎯 Exit codes: 0=clean, 1=regressions, 2=config
|
|
104
|
+
|
|
105
|
+
### Self-Documenting Failures
|
|
106
|
+
|
|
107
|
+
Every failure prints a clear next step:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
🔧 Next step for debugging:
|
|
111
|
+
Download base artifact and run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json
|
|
112
|
+
Artifacts: .evalai/runs/
|
|
113
|
+
```
|
|
39
114
|
|
|
40
115
|
---
|
|
41
116
|
|
|
42
117
|
## CLI Commands
|
|
43
118
|
|
|
44
|
-
###
|
|
119
|
+
### 🚀 One-Command CI (v1.9.0)
|
|
120
|
+
|
|
121
|
+
| Command | Description |
|
|
122
|
+
|---------|-------------|
|
|
123
|
+
| `npx evalai ci` | Complete CI pipeline: discover → manifest → impact → run → diff → PR summary |
|
|
124
|
+
| `npx evalai ci --base main` | Run CI with diff against main branch |
|
|
125
|
+
| `npx evalai ci --impacted-only` | Run only specs impacted by changes |
|
|
126
|
+
| `npx evalai ci --format github` | GitHub Step Summary with rich markdown |
|
|
127
|
+
| `npx evalai ci --format json` | JSON output for automation |
|
|
128
|
+
|
|
129
|
+
### Discovery & Manifest
|
|
130
|
+
|
|
131
|
+
| Command | Description |
|
|
132
|
+
|---------|-------------|
|
|
133
|
+
| `npx evalai discover` | Find and analyze evaluation specs |
|
|
134
|
+
| `npx evalai discover --manifest` | Generate stable manifest for incremental analysis |
|
|
135
|
+
|
|
136
|
+
### Impact Analysis
|
|
137
|
+
|
|
138
|
+
| Command | Description |
|
|
139
|
+
|---------|-------------|
|
|
140
|
+
| `npx evalai impact-analysis --base main` | Analyze impact of changes |
|
|
141
|
+
| `npx evalai impact-analysis --changed-files file1.ts,file2.ts` | Analyze specific changed files |
|
|
142
|
+
|
|
143
|
+
### Run & Diff
|
|
144
|
+
|
|
145
|
+
| Command | Description |
|
|
146
|
+
|---------|-------------|
|
|
147
|
+
| `npx evalai run` | Run evaluation specifications |
|
|
148
|
+
| `npx evalai run --write-results` | Run with artifact retention |
|
|
149
|
+
| `npx evalai diff --base main` | Compare results against base branch |
|
|
150
|
+
| `npx evalai diff --base last --head last` | Compare last two runs |
|
|
151
|
+
| `npx evalai diff --format github` | GitHub Step Summary with regressions |
|
|
152
|
+
|
|
153
|
+
### Legacy Regression Gate (local, no account needed)
|
|
45
154
|
|
|
46
155
|
| Command | Description |
|
|
47
156
|
|---------|-------------|
|
|
@@ -68,25 +177,29 @@ That's it. Open a PR and CI blocks regressions automatically.
|
|
|
68
177
|
| `npx evalai explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
|
|
69
178
|
| `npx evalai print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
|
|
70
179
|
|
|
180
|
+
### Migration Tools
|
|
181
|
+
|
|
182
|
+
| Command | Description |
|
|
183
|
+
|---------|-------------|
|
|
184
|
+
| `npx evalai migrate config --in evalai.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
|
|
185
|
+
|
|
71
186
|
**Guided failure flow:**
|
|
72
187
|
|
|
73
188
|
```
|
|
74
|
-
evalai
|
|
189
|
+
evalai ci → fails → "Next: evalai explain --report .evalai/last-run.json"
|
|
75
190
|
↓
|
|
76
191
|
evalai explain → root causes + fixes
|
|
77
192
|
```
|
|
78
193
|
|
|
79
|
-
**GitHub Actions step summary** —
|
|
194
|
+
**GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
|
|
80
195
|
|
|
81
|
-

|
|
82
197
|
|
|
83
198
|
**`evalai explain` terminal output** — root causes + fix commands:
|
|
84
199
|
|
|
85
200
|

|
|
86
201
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
`doctor` uses exit codes: **0** = ready, **2** = not ready, **3** = infra error. Use `--report` for a JSON diagnostic bundle.
|
|
202
|
+
All commands automatically write artifacts so `explain` works with zero flags.
|
|
90
203
|
|
|
91
204
|
### Gate Exit Codes
|
|
92
205
|
|
package/dist/assertions.js
CHANGED
|
@@ -86,7 +86,9 @@ class Expectation {
|
|
|
86
86
|
expected: substring,
|
|
87
87
|
actual: text,
|
|
88
88
|
message: message ||
|
|
89
|
-
(passed
|
|
89
|
+
(passed
|
|
90
|
+
? `Text contains "${substring}"`
|
|
91
|
+
: `Text does not contain "${substring}"`),
|
|
90
92
|
};
|
|
91
93
|
}
|
|
92
94
|
/**
|
|
@@ -103,7 +105,9 @@ class Expectation {
|
|
|
103
105
|
expected: keywords,
|
|
104
106
|
actual: text,
|
|
105
107
|
message: message ||
|
|
106
|
-
(passed
|
|
108
|
+
(passed
|
|
109
|
+
? `Contains all keywords`
|
|
110
|
+
: `Missing keywords: ${missingKeywords.join(", ")}`),
|
|
107
111
|
};
|
|
108
112
|
}
|
|
109
113
|
/**
|
|
@@ -119,7 +123,9 @@ class Expectation {
|
|
|
119
123
|
expected: `not containing "${substring}"`,
|
|
120
124
|
actual: text,
|
|
121
125
|
message: message ||
|
|
122
|
-
(passed
|
|
126
|
+
(passed
|
|
127
|
+
? `Text does not contain "${substring}"`
|
|
128
|
+
: `Text contains "${substring}"`),
|
|
123
129
|
};
|
|
124
130
|
}
|
|
125
131
|
/**
|
|
@@ -144,7 +150,8 @@ class Expectation {
|
|
|
144
150
|
passed,
|
|
145
151
|
expected: "no PII",
|
|
146
152
|
actual: foundPII.length > 0 ? `Found: ${foundPII.join(", ")}` : "no PII",
|
|
147
|
-
message: message ||
|
|
153
|
+
message: message ||
|
|
154
|
+
(passed ? "No PII detected" : `PII detected: ${foundPII.join(", ")}`),
|
|
148
155
|
};
|
|
149
156
|
}
|
|
150
157
|
/**
|
|
@@ -159,7 +166,10 @@ class Expectation {
|
|
|
159
166
|
passed,
|
|
160
167
|
expected: pattern.toString(),
|
|
161
168
|
actual: text,
|
|
162
|
-
message: message ||
|
|
169
|
+
message: message ||
|
|
170
|
+
(passed
|
|
171
|
+
? `Matches pattern ${pattern}`
|
|
172
|
+
: `Does not match pattern ${pattern}`),
|
|
163
173
|
};
|
|
164
174
|
}
|
|
165
175
|
/**
|
|
@@ -205,7 +215,8 @@ class Expectation {
|
|
|
205
215
|
passed,
|
|
206
216
|
expected: schema,
|
|
207
217
|
actual: parsedJson,
|
|
208
|
-
message: message ||
|
|
218
|
+
message: message ||
|
|
219
|
+
(passed ? "JSON matches schema" : "JSON does not match schema"),
|
|
209
220
|
};
|
|
210
221
|
}
|
|
211
222
|
/**
|
|
@@ -253,7 +264,10 @@ class Expectation {
|
|
|
253
264
|
passed,
|
|
254
265
|
expected,
|
|
255
266
|
actual,
|
|
256
|
-
message: message ||
|
|
267
|
+
message: message ||
|
|
268
|
+
(passed
|
|
269
|
+
? `Sentiment is ${expected}`
|
|
270
|
+
: `Expected ${expected}, got ${actual}`),
|
|
257
271
|
};
|
|
258
272
|
}
|
|
259
273
|
/**
|
|
@@ -269,7 +283,10 @@ class Expectation {
|
|
|
269
283
|
passed,
|
|
270
284
|
expected: range,
|
|
271
285
|
actual: length,
|
|
272
|
-
message: message ||
|
|
286
|
+
message: message ||
|
|
287
|
+
(passed
|
|
288
|
+
? `Length ${length} is within range`
|
|
289
|
+
: `Length ${length} not in range`),
|
|
273
290
|
};
|
|
274
291
|
}
|
|
275
292
|
/**
|
|
@@ -284,9 +301,13 @@ class Expectation {
|
|
|
284
301
|
name: "toNotHallucinate",
|
|
285
302
|
passed,
|
|
286
303
|
expected: "all ground truth facts",
|
|
287
|
-
actual: missingFacts.length > 0
|
|
304
|
+
actual: missingFacts.length > 0
|
|
305
|
+
? `Missing: ${missingFacts.join(", ")}`
|
|
306
|
+
: "all facts present",
|
|
288
307
|
message: message ||
|
|
289
|
-
(passed
|
|
308
|
+
(passed
|
|
309
|
+
? "No hallucinations detected"
|
|
310
|
+
: `Missing facts: ${missingFacts.join(", ")}`),
|
|
290
311
|
};
|
|
291
312
|
}
|
|
292
313
|
/**
|
|
@@ -301,7 +322,10 @@ class Expectation {
|
|
|
301
322
|
passed,
|
|
302
323
|
expected: `<= ${maxMs}ms`,
|
|
303
324
|
actual: `${duration}ms`,
|
|
304
|
-
message: message ||
|
|
325
|
+
message: message ||
|
|
326
|
+
(passed
|
|
327
|
+
? `${duration}ms within limit`
|
|
328
|
+
: `${duration}ms exceeds ${maxMs}ms`),
|
|
305
329
|
};
|
|
306
330
|
}
|
|
307
331
|
/**
|
|
@@ -344,7 +368,8 @@ class Expectation {
|
|
|
344
368
|
passed,
|
|
345
369
|
expected: `> ${expected}`,
|
|
346
370
|
actual: value,
|
|
347
|
-
message: message ||
|
|
371
|
+
message: message ||
|
|
372
|
+
(passed ? `${value} > ${expected}` : `${value} <= ${expected}`),
|
|
348
373
|
};
|
|
349
374
|
}
|
|
350
375
|
/**
|
|
@@ -359,7 +384,8 @@ class Expectation {
|
|
|
359
384
|
passed,
|
|
360
385
|
expected: `< ${expected}`,
|
|
361
386
|
actual: value,
|
|
362
|
-
message: message ||
|
|
387
|
+
message: message ||
|
|
388
|
+
(passed ? `${value} < ${expected}` : `${value} >= ${expected}`),
|
|
363
389
|
};
|
|
364
390
|
}
|
|
365
391
|
/**
|
|
@@ -374,7 +400,8 @@ class Expectation {
|
|
|
374
400
|
passed,
|
|
375
401
|
expected: `between ${min} and ${max}`,
|
|
376
402
|
actual: value,
|
|
377
|
-
message: message ||
|
|
403
|
+
message: message ||
|
|
404
|
+
(passed ? `${value} is within range` : `${value} is outside range`),
|
|
378
405
|
};
|
|
379
406
|
}
|
|
380
407
|
/**
|
|
@@ -389,7 +416,8 @@ class Expectation {
|
|
|
389
416
|
passed: hasCodeBlock,
|
|
390
417
|
expected: "code block",
|
|
391
418
|
actual: text,
|
|
392
|
-
message: message ||
|
|
419
|
+
message: message ||
|
|
420
|
+
(hasCodeBlock ? "Contains code block" : "No code block found"),
|
|
393
421
|
};
|
|
394
422
|
}
|
|
395
423
|
/**
|
|
@@ -405,9 +433,13 @@ class Expectation {
|
|
|
405
433
|
name: "toBeProfessional",
|
|
406
434
|
passed,
|
|
407
435
|
expected: "professional tone",
|
|
408
|
-
actual: foundProfanity.length > 0
|
|
436
|
+
actual: foundProfanity.length > 0
|
|
437
|
+
? `Found: ${foundProfanity.join(", ")}`
|
|
438
|
+
: "professional",
|
|
409
439
|
message: message ||
|
|
410
|
-
(passed
|
|
440
|
+
(passed
|
|
441
|
+
? "Professional tone"
|
|
442
|
+
: `Unprofessional language: ${foundProfanity.join(", ")}`),
|
|
411
443
|
};
|
|
412
444
|
}
|
|
413
445
|
/**
|
|
@@ -432,7 +464,8 @@ class Expectation {
|
|
|
432
464
|
passed,
|
|
433
465
|
expected: "proper grammar",
|
|
434
466
|
actual: issues.length > 0 ? `Issues: ${issues.join(", ")}` : "proper grammar",
|
|
435
|
-
message: message ||
|
|
467
|
+
message: message ||
|
|
468
|
+
(passed ? "Proper grammar" : `Grammar issues: ${issues.join(", ")}`),
|
|
436
469
|
};
|
|
437
470
|
}
|
|
438
471
|
}
|
package/dist/batch.js
CHANGED
|
@@ -81,7 +81,8 @@ class RequestBatcher {
|
|
|
81
81
|
pendingRequest.resolve(response.data);
|
|
82
82
|
}
|
|
83
83
|
else {
|
|
84
|
-
pendingRequest.reject(new Error(response.error ||
|
|
84
|
+
pendingRequest.reject(new Error(response.error ||
|
|
85
|
+
`Request failed with status ${response.status}`));
|
|
85
86
|
}
|
|
86
87
|
}
|
|
87
88
|
}
|
|
@@ -149,7 +150,12 @@ function canBatch(method, endpoint) {
|
|
|
149
150
|
if (method !== "GET") {
|
|
150
151
|
return false;
|
|
151
152
|
}
|
|
152
|
-
const batchableEndpoints = [
|
|
153
|
+
const batchableEndpoints = [
|
|
154
|
+
"/traces",
|
|
155
|
+
"/evaluations",
|
|
156
|
+
"/annotations",
|
|
157
|
+
"/results",
|
|
158
|
+
];
|
|
153
159
|
return batchableEndpoints.some((pattern) => endpoint.includes(pattern));
|
|
154
160
|
}
|
|
155
161
|
/**
|
package/dist/cli/api.js
CHANGED
|
@@ -73,7 +73,9 @@ async function publishShare(baseUrl, apiKey, evaluationId, exportData, evaluatio
|
|
|
73
73
|
exportData,
|
|
74
74
|
shareScope: "run",
|
|
75
75
|
evaluationRunId,
|
|
76
|
-
...(options?.expiresInDays != null && {
|
|
76
|
+
...(options?.expiresInDays != null && {
|
|
77
|
+
expiresInDays: options.expiresInDays,
|
|
78
|
+
}),
|
|
77
79
|
};
|
|
78
80
|
const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evaluationId}/publish`;
|
|
79
81
|
try {
|
package/dist/cli/check.js
CHANGED
|
@@ -183,7 +183,11 @@ function parseArgs(argv) {
|
|
|
183
183
|
};
|
|
184
184
|
}
|
|
185
185
|
if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
|
|
186
|
-
return {
|
|
186
|
+
return {
|
|
187
|
+
ok: false,
|
|
188
|
+
exitCode: constants_1.EXIT.BAD_ARGS,
|
|
189
|
+
message: "Error: --minScore must be 0-100",
|
|
190
|
+
};
|
|
187
191
|
}
|
|
188
192
|
if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
|
|
189
193
|
return {
|
|
@@ -210,9 +214,15 @@ function parseArgs(argv) {
|
|
|
210
214
|
onFail,
|
|
211
215
|
share,
|
|
212
216
|
prCommentOut,
|
|
213
|
-
maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
|
|
214
|
-
|
|
215
|
-
|
|
217
|
+
maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
|
|
218
|
+
? maxCostUsd
|
|
219
|
+
: undefined,
|
|
220
|
+
maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs)
|
|
221
|
+
? maxLatencyMs
|
|
222
|
+
: undefined,
|
|
223
|
+
maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
|
|
224
|
+
? maxCostDeltaUsd
|
|
225
|
+
: undefined,
|
|
216
226
|
},
|
|
217
227
|
};
|
|
218
228
|
}
|
|
@@ -297,7 +307,8 @@ async function runCheck(args) {
|
|
|
297
307
|
runDetails?.results &&
|
|
298
308
|
quality?.evaluationRunId) {
|
|
299
309
|
const importResults = runDetails.results
|
|
300
|
-
.filter((r) => r.testCaseId != null &&
|
|
310
|
+
.filter((r) => r.testCaseId != null &&
|
|
311
|
+
(r.status === "passed" || r.status === "failed"))
|
|
301
312
|
.map((r) => ({
|
|
302
313
|
testCaseId: r.testCaseId,
|
|
303
314
|
status: r.status,
|
|
@@ -306,7 +317,9 @@ async function runCheck(args) {
|
|
|
306
317
|
assertionsJson: r.assertionsJson,
|
|
307
318
|
}));
|
|
308
319
|
if (importResults.length > 0) {
|
|
309
|
-
const idempotencyKey = ci
|
|
320
|
+
const idempotencyKey = ci
|
|
321
|
+
? (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci)
|
|
322
|
+
: undefined;
|
|
310
323
|
const importRes = await (0, api_1.importRunOnFail)(args.baseUrl, args.apiKey, args.evaluationId, importResults, {
|
|
311
324
|
idempotencyKey,
|
|
312
325
|
ci,
|
package/dist/cli/ci-context.js
CHANGED
|
@@ -89,7 +89,9 @@ function captureCiContext() {
|
|
|
89
89
|
provider,
|
|
90
90
|
repo,
|
|
91
91
|
sha,
|
|
92
|
-
branch: ref?.startsWith("refs/heads/")
|
|
92
|
+
branch: ref?.startsWith("refs/heads/")
|
|
93
|
+
? ref.slice("refs/heads/".length)
|
|
94
|
+
: ref,
|
|
93
95
|
runUrl,
|
|
94
96
|
actor,
|
|
95
97
|
pr,
|
package/dist/cli/config.js
CHANGED
|
@@ -43,7 +43,11 @@ exports.mergeConfigWithArgs = mergeConfigWithArgs;
|
|
|
43
43
|
const fs = __importStar(require("node:fs"));
|
|
44
44
|
const path = __importStar(require("node:path"));
|
|
45
45
|
const profiles_1 = require("./profiles");
|
|
46
|
-
const CONFIG_FILES = [
|
|
46
|
+
const CONFIG_FILES = [
|
|
47
|
+
"evalai.config.json",
|
|
48
|
+
"evalai.config.js",
|
|
49
|
+
"evalai.config.cjs",
|
|
50
|
+
];
|
|
47
51
|
/**
|
|
48
52
|
* Find config file path in directory, walking up to root
|
|
49
53
|
*/
|
|
@@ -113,7 +117,11 @@ function loadConfig(cwd = process.cwd()) {
|
|
|
113
117
|
}
|
|
114
118
|
for (const key of Object.keys(config.packages)) {
|
|
115
119
|
if (relNorm === key || relNorm.startsWith(`${key}/`)) {
|
|
116
|
-
return {
|
|
120
|
+
return {
|
|
121
|
+
...config,
|
|
122
|
+
...config.packages[key],
|
|
123
|
+
packages: config.packages,
|
|
124
|
+
};
|
|
117
125
|
}
|
|
118
126
|
}
|
|
119
127
|
}
|
|
@@ -156,11 +164,14 @@ function mergeConfigWithArgs(config, args) {
|
|
|
156
164
|
merged.minScore = profile.minScore;
|
|
157
165
|
if (merged.maxDrop === undefined && args.maxDrop === undefined)
|
|
158
166
|
merged.maxDrop = profile.maxDrop;
|
|
159
|
-
if (merged.warnDrop === undefined &&
|
|
167
|
+
if (merged.warnDrop === undefined &&
|
|
168
|
+
args.warnDrop === undefined &&
|
|
169
|
+
"warnDrop" in profile)
|
|
160
170
|
merged.warnDrop = profile.warnDrop;
|
|
161
171
|
if (merged.minN === undefined && args.minN === undefined)
|
|
162
172
|
merged.minN = profile.minN;
|
|
163
|
-
if (merged.allowWeakEvidence === undefined &&
|
|
173
|
+
if (merged.allowWeakEvidence === undefined &&
|
|
174
|
+
args.allowWeakEvidence === undefined)
|
|
164
175
|
merged.allowWeakEvidence = profile.allowWeakEvidence;
|
|
165
176
|
}
|
|
166
177
|
// Args override
|
|
@@ -172,18 +183,27 @@ function mergeConfigWithArgs(config, args) {
|
|
|
172
183
|
}
|
|
173
184
|
if (args.minScore !== undefined) {
|
|
174
185
|
merged.minScore =
|
|
175
|
-
typeof args.minScore === "number"
|
|
186
|
+
typeof args.minScore === "number"
|
|
187
|
+
? args.minScore
|
|
188
|
+
: parseInt(String(args.minScore), 10);
|
|
176
189
|
}
|
|
177
190
|
if (args.maxDrop !== undefined) {
|
|
178
191
|
merged.maxDrop =
|
|
179
|
-
typeof args.maxDrop === "number"
|
|
192
|
+
typeof args.maxDrop === "number"
|
|
193
|
+
? args.maxDrop
|
|
194
|
+
: parseInt(String(args.maxDrop), 10);
|
|
180
195
|
}
|
|
181
196
|
if (args.warnDrop !== undefined) {
|
|
182
197
|
merged.warnDrop =
|
|
183
|
-
typeof args.warnDrop === "number"
|
|
198
|
+
typeof args.warnDrop === "number"
|
|
199
|
+
? args.warnDrop
|
|
200
|
+
: parseInt(String(args.warnDrop), 10);
|
|
184
201
|
}
|
|
185
202
|
if (args.minN !== undefined) {
|
|
186
|
-
merged.minN =
|
|
203
|
+
merged.minN =
|
|
204
|
+
typeof args.minN === "number"
|
|
205
|
+
? args.minN
|
|
206
|
+
: parseInt(String(args.minN), 10);
|
|
187
207
|
}
|
|
188
208
|
if (args.allowWeakEvidence !== undefined) {
|
|
189
209
|
merged.allowWeakEvidence =
|
package/dist/cli/diff.js
CHANGED
|
@@ -186,7 +186,7 @@ async function findBaselineRun() {
|
|
|
186
186
|
const content = await fs.readFile(workspace.baselinePath, "utf-8");
|
|
187
187
|
return JSON.parse(content);
|
|
188
188
|
}
|
|
189
|
-
catch (
|
|
189
|
+
catch (_error) {
|
|
190
190
|
// Baseline file doesn't exist, try index
|
|
191
191
|
}
|
|
192
192
|
// Check index for baseline runId
|
|
@@ -200,7 +200,7 @@ async function findBaselineRun() {
|
|
|
200
200
|
return await loadRunReport(`.evalai/runs/${oldestRunId}.json`);
|
|
201
201
|
}
|
|
202
202
|
}
|
|
203
|
-
catch (
|
|
203
|
+
catch (_error) {
|
|
204
204
|
// Index doesn't exist
|
|
205
205
|
}
|
|
206
206
|
throw new Error("No baseline run found. Set a baseline with 'evalai diff --base <runId> --head last --set-baseline' or create .evalai/baseline-run.json.");
|
|
@@ -239,7 +239,8 @@ async function findPreviousRun() {
|
|
|
239
239
|
return await loadRunReport(`.evalai/runs/${previousRunId}.json`);
|
|
240
240
|
}
|
|
241
241
|
catch (error) {
|
|
242
|
-
if (error instanceof Error &&
|
|
242
|
+
if (error instanceof Error &&
|
|
243
|
+
error.message.includes("Need at least 2 runs")) {
|
|
243
244
|
throw error;
|
|
244
245
|
}
|
|
245
246
|
throw new Error("No run history found. Run 'evalai run --write-results' first.");
|
|
@@ -261,7 +262,7 @@ async function isBranchName(name) {
|
|
|
261
262
|
/**
|
|
262
263
|
* Find last run for a branch
|
|
263
264
|
*/
|
|
264
|
-
async function findLastRunForBranch(
|
|
265
|
+
async function findLastRunForBranch(_branch) {
|
|
265
266
|
// For now, just look for .evalai/last-run.json
|
|
266
267
|
// In a real implementation, this would:
|
|
267
268
|
// 1. Check CI artifacts for the branch
|
|
@@ -272,7 +273,7 @@ async function findLastRunForBranch(branch) {
|
|
|
272
273
|
const content = await fs.readFile(lastRunPath, "utf-8");
|
|
273
274
|
return JSON.parse(content);
|
|
274
275
|
}
|
|
275
|
-
catch (
|
|
276
|
+
catch (_error) {
|
|
276
277
|
return null;
|
|
277
278
|
}
|
|
278
279
|
}
|
|
@@ -284,7 +285,7 @@ async function loadRunReport(filePath) {
|
|
|
284
285
|
const content = await fs.readFile(path.resolve(filePath), "utf-8");
|
|
285
286
|
return JSON.parse(content);
|
|
286
287
|
}
|
|
287
|
-
catch (
|
|
288
|
+
catch (_error) {
|
|
288
289
|
return null;
|
|
289
290
|
}
|
|
290
291
|
}
|
|
@@ -457,8 +458,12 @@ function calculateDiffSummary(base, head, changedSpecs) {
|
|
|
457
458
|
const headScores = head.results
|
|
458
459
|
.filter((r) => r.result.score !== undefined)
|
|
459
460
|
.map((r) => r.result.score);
|
|
460
|
-
const baseAvgScore = baseScores.length > 0
|
|
461
|
-
|
|
461
|
+
const baseAvgScore = baseScores.length > 0
|
|
462
|
+
? baseScores.reduce((a, b) => a + b, 0) / baseScores.length
|
|
463
|
+
: 0;
|
|
464
|
+
const headAvgScore = headScores.length > 0
|
|
465
|
+
? headScores.reduce((a, b) => a + b, 0) / headScores.length
|
|
466
|
+
: 0;
|
|
462
467
|
const scoreDelta = round(headAvgScore - baseAvgScore, 4);
|
|
463
468
|
// Count classifications
|
|
464
469
|
const regressions = changedSpecs.filter((s) => ["new_failure", "score_drop", "execution_error"].includes(s.classification)).length;
|
|
@@ -542,7 +547,7 @@ async function writeGitHubStepSummary(result) {
|
|
|
542
547
|
const summaryPath = process.env.GITHUB_STEP_SUMMARY;
|
|
543
548
|
try {
|
|
544
549
|
const summary = generateGitHubSummary(result);
|
|
545
|
-
await fs.appendFile(summaryPath, summary
|
|
550
|
+
await fs.appendFile(summaryPath, `${summary}\n`, "utf-8");
|
|
546
551
|
}
|
|
547
552
|
catch (error) {
|
|
548
553
|
console.warn("Warning: Could not write GitHub Step Summary:", error);
|