@pauly4010/evalai-sdk 1.8.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.js +62 -19
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.js +168 -36
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.js +185 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +33 -14
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/cli/workspace.d.ts +28 -0
- package/dist/cli/workspace.js +58 -0
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -2
- package/dist/index.js +30 -5
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +394 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/testing.d.ts +65 -0
- package/dist/testing.js +49 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -110
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,60 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.9.0] - 2026-02-27
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
#### CLI — One-Command CI Loop (`evalai ci`)
|
|
13
|
+
|
|
14
|
+
- **`evalai ci`** — A single command that teams add to their GitHub workflows and never have to think about again
|
|
15
|
+
- **Complete CI pipeline**: discover → manifest → impact → run → diff → PR summary → safe failure → "next step"
|
|
16
|
+
- **Automatic manifest building**: Builds manifest if missing, no manual steps required
|
|
17
|
+
- **Impact analysis integration**: `--impacted-only` flag for targeted testing
|
|
18
|
+
- **Smart exit codes**: 0=clean, 1=regressions, 2=config/infra issues
|
|
19
|
+
- **Self-documenting failures**: Always prints copy/paste next step for debugging
|
|
20
|
+
- **GitHub Step Summary integration**: Automatic PR summaries with regressions and artifacts
|
|
21
|
+
|
|
22
|
+
#### CLI — Durable Run History & Diff System
|
|
23
|
+
|
|
24
|
+
- **Run artifact retention**: Timestamped artifacts in `.evalai/runs/run-<runId>.json`
|
|
25
|
+
- **Run index file**: `.evalai/runs/index.json` tracks all runs with metadata
|
|
26
|
+
- **Schema versioning**: `RunResult` and `DiffResult` include `schemaVersion` for compatibility
|
|
27
|
+
- **Base/head shortcuts**: `--base baseline`, `--base last`, `--head last` for common cases
|
|
28
|
+
- **Floating point normalization**: Consistent score/delta calculations across runs
|
|
29
|
+
- **Comprehensive diff comparison**: Classifies regressions, improvements, added, removed specs
|
|
30
|
+
|
|
31
|
+
#### CLI — Centralized Architecture
|
|
32
|
+
|
|
33
|
+
- **Environment detection**: `isCI()`, `isGitHubActions()`, `getGitHubStepSummaryPath()` unified
|
|
34
|
+
- **Workspace resolution**: `resolveEvalWorkspace()` provides all `.evalai` paths
|
|
35
|
+
- **Git reference detection**: Comprehensive patterns for branches, tags, and ranges
|
|
36
|
+
- **No more duplication**: All commands use shared utilities for consistency
|
|
37
|
+
|
|
38
|
+
#### CLI — CI Friendliness
|
|
39
|
+
|
|
40
|
+
- **Fail-safe base resolution**: Clear error messages when base artifacts are missing in CI
|
|
41
|
+
- **GitHub Step Summary**: Rich markdown summaries with metrics, regressions, and artifact links
|
|
42
|
+
- **CI-specific error handling**: Exit code 2 for config issues with helpful guidance
|
|
43
|
+
- **Artifact download instructions**: Exact commands for manual base artifact setup
|
|
44
|
+
|
|
45
|
+
### 🔧 Changed
|
|
46
|
+
|
|
47
|
+
- **Exit codes standardized**: 0=clean, 1=regressions, 2=config/infra issues across all commands
|
|
48
|
+
- **Schema compatibility**: Added `schemaVersion` validation for future-proofing
|
|
49
|
+
- **Path resolution**: All commands use centralized workspace helpers
|
|
50
|
+
- **Error messages**: More actionable and context-aware guidance
|
|
51
|
+
|
|
52
|
+
### 📊 New Features Summary
|
|
53
|
+
|
|
54
|
+
- **One-command CI**: `evalai ci` replaces multi-step workflows
|
|
55
|
+
- **Durable history**: Run artifacts preserved with smart indexing
|
|
56
|
+
- **Smart diffing**: Automated regression detection with GitHub integration
|
|
57
|
+
- **Centralized utilities**: Environment detection and workspace resolution unified
|
|
58
|
+
- **Self-documenting**: Clear next steps for any failure scenario
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
8
62
|
## [1.8.0] - 2026-02-26
|
|
9
63
|
|
|
10
64
|
### ✨ Added
|
package/README.md
CHANGED
|
@@ -7,41 +7,150 @@
|
|
|
7
7
|
[](#)
|
|
8
8
|
[](https://opensource.org/licenses/MIT)
|
|
9
9
|
|
|
10
|
-
**
|
|
10
|
+
**One-command CI for AI evaluation. Complete pipeline: discover → manifest → impact → run → diff → PR summary.**
|
|
11
11
|
|
|
12
|
-
Zero to
|
|
12
|
+
Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
|
|
13
13
|
|
|
14
14
|
---
|
|
15
15
|
|
|
16
|
-
## Quick Start (
|
|
16
|
+
## Quick Start (60 seconds)
|
|
17
|
+
|
|
18
|
+
Add this to your `.github/workflows/evalai.yml`:
|
|
19
|
+
|
|
20
|
+
```yaml
|
|
21
|
+
name: EvalAI CI
|
|
22
|
+
on: [push, pull_request]
|
|
23
|
+
jobs:
|
|
24
|
+
evalai:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: actions/setup-node@v4
|
|
29
|
+
- run: npm ci
|
|
30
|
+
- run: npx @pauly4010/evalai-sdk ci --format github --write-results --base main
|
|
31
|
+
- uses: actions/upload-artifact@v4
|
|
32
|
+
if: always()
|
|
33
|
+
with:
|
|
34
|
+
name: evalai-results
|
|
35
|
+
path: .evalai/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Create `eval/your-spec.spec.ts`:
|
|
39
|
+
|
|
40
|
+
```typescript
|
|
41
|
+
import { defineEval } from "@pauly4010/evalai-sdk";
|
|
42
|
+
|
|
43
|
+
defineEval({
|
|
44
|
+
name: "Basic Math Operations",
|
|
45
|
+
description: "Test fundamental arithmetic",
|
|
46
|
+
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
47
|
+
expected: "All tests should pass",
|
|
48
|
+
tags: ["basic", "math"],
|
|
49
|
+
category: "unit-test"
|
|
50
|
+
});
|
|
51
|
+
```
|
|
17
52
|
|
|
18
53
|
```bash
|
|
19
|
-
|
|
20
|
-
git add
|
|
21
|
-
git commit -m "chore: add EvalAI regression gate"
|
|
54
|
+
git add .github/workflows/evalai.yml eval/
|
|
55
|
+
git commit -m "feat: add EvalAI CI pipeline"
|
|
22
56
|
git push
|
|
23
57
|
```
|
|
24
58
|
|
|
25
|
-
That's it
|
|
26
|
-
|
|
27
|
-
|
|
59
|
+
That's it! Your CI now:
|
|
60
|
+
- ✅ Discovers evaluation specs automatically
|
|
61
|
+
- ✅ Can run only impacted specs when `--impacted-only` is passed (smart caching)
|
|
62
|
+
- ✅ Compares results against base branch
|
|
63
|
+
- ✅ Posts rich summary in PR with regressions
|
|
64
|
+
- ✅ Exits with proper codes (0=clean, 1=regressions, 2=config/infra issues)
|
|
28
65
|
|
|
29
66
|
---
|
|
30
67
|
|
|
31
|
-
##
|
|
68
|
+
## 🚀 New in v1.9.0: One-Command CI
|
|
69
|
+
|
|
70
|
+
### `evalai ci` - Complete CI Pipeline
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
npx @pauly4010/evalai-sdk ci --format github --write-results --base main
|
|
74
|
+
```
|
|
32
75
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
76
|
+
**What it does:**
|
|
77
|
+
1. **Discover** - Finds all evaluation specs automatically
|
|
78
|
+
2. **Manifest** - Builds stable manifest if missing
|
|
79
|
+
3. **Impact Analysis** - Runs only specs impacted by changes (optional)
|
|
80
|
+
4. **Run** - Executes evaluations with artifact retention
|
|
81
|
+
5. **Diff** - Compares results against base branch
|
|
82
|
+
6. **PR Summary** - Posts rich markdown summary to GitHub
|
|
83
|
+
7. **Debug Flow** - Prints copy/paste next step on failure
|
|
84
|
+
|
|
85
|
+
**Advanced Options:**
|
|
86
|
+
```bash
|
|
87
|
+
npx @pauly4010/evalai-sdk ci --base main --impacted-only # Run only impacted specs
|
|
88
|
+
npx @pauly4010/evalai-sdk ci --format json --write-results # JSON output for automation
|
|
89
|
+
npx @pauly4010/evalai-sdk ci --base develop # Custom base branch
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Smart Diffing & GitHub Integration
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
npx @pauly4010/evalai-sdk diff --base main --head last --format github
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Features:**
|
|
99
|
+
- 📊 Pass rate delta and score changes
|
|
100
|
+
- 🚨 Regression detection with classifications
|
|
101
|
+
- 📈 Improvements and new specs
|
|
102
|
+
- 📁 Artifact links and technical details
|
|
103
|
+
- 🎯 Exit codes: 0=clean, 1=regressions, 2=config
|
|
104
|
+
|
|
105
|
+
### Self-Documenting Failures
|
|
106
|
+
|
|
107
|
+
Every failure prints a clear next step:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
🔧 Next step for debugging:
|
|
111
|
+
Download base artifact and run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json
|
|
112
|
+
Artifacts: .evalai/runs/
|
|
113
|
+
```
|
|
39
114
|
|
|
40
115
|
---
|
|
41
116
|
|
|
42
117
|
## CLI Commands
|
|
43
118
|
|
|
44
|
-
###
|
|
119
|
+
### 🚀 One-Command CI (v1.9.0)
|
|
120
|
+
|
|
121
|
+
| Command | Description |
|
|
122
|
+
|---------|-------------|
|
|
123
|
+
| `npx evalai ci` | Complete CI pipeline: discover → manifest → impact → run → diff → PR summary |
|
|
124
|
+
| `npx evalai ci --base main` | Run CI with diff against main branch |
|
|
125
|
+
| `npx evalai ci --impacted-only` | Run only specs impacted by changes |
|
|
126
|
+
| `npx evalai ci --format github` | GitHub Step Summary with rich markdown |
|
|
127
|
+
| `npx evalai ci --format json` | JSON output for automation |
|
|
128
|
+
|
|
129
|
+
### Discovery & Manifest
|
|
130
|
+
|
|
131
|
+
| Command | Description |
|
|
132
|
+
|---------|-------------|
|
|
133
|
+
| `npx evalai discover` | Find and analyze evaluation specs |
|
|
134
|
+
| `npx evalai discover --manifest` | Generate stable manifest for incremental analysis |
|
|
135
|
+
|
|
136
|
+
### Impact Analysis
|
|
137
|
+
|
|
138
|
+
| Command | Description |
|
|
139
|
+
|---------|-------------|
|
|
140
|
+
| `npx evalai impact-analysis --base main` | Analyze impact of changes |
|
|
141
|
+
| `npx evalai impact-analysis --changed-files file1.ts,file2.ts` | Analyze specific changed files |
|
|
142
|
+
|
|
143
|
+
### Run & Diff
|
|
144
|
+
|
|
145
|
+
| Command | Description |
|
|
146
|
+
|---------|-------------|
|
|
147
|
+
| `npx evalai run` | Run evaluation specifications |
|
|
148
|
+
| `npx evalai run --write-results` | Run with artifact retention |
|
|
149
|
+
| `npx evalai diff --base main` | Compare results against base branch |
|
|
150
|
+
| `npx evalai diff --base last --head last` | Compare last two runs |
|
|
151
|
+
| `npx evalai diff --format github` | GitHub Step Summary with regressions |
|
|
152
|
+
|
|
153
|
+
### Legacy Regression Gate (local, no account needed)
|
|
45
154
|
|
|
46
155
|
| Command | Description |
|
|
47
156
|
|---------|-------------|
|
|
@@ -68,25 +177,29 @@ That's it. Open a PR and CI blocks regressions automatically.
|
|
|
68
177
|
| `npx evalai explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
|
|
69
178
|
| `npx evalai print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
|
|
70
179
|
|
|
180
|
+
### Migration Tools
|
|
181
|
+
|
|
182
|
+
| Command | Description |
|
|
183
|
+
|---------|-------------|
|
|
184
|
+
| `npx evalai migrate config --in evalai.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
|
|
185
|
+
|
|
71
186
|
**Guided failure flow:**
|
|
72
187
|
|
|
73
188
|
```
|
|
74
|
-
evalai
|
|
189
|
+
evalai ci → fails → "Next: evalai explain --report .evalai/last-run.json"
|
|
75
190
|
↓
|
|
76
191
|
evalai explain → root causes + fixes
|
|
77
192
|
```
|
|
78
193
|
|
|
79
|
-
**GitHub Actions step summary** —
|
|
194
|
+
**GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
|
|
80
195
|
|
|
81
|
-

|
|
82
197
|
|
|
83
198
|
**`evalai explain` terminal output** — root causes + fix commands:
|
|
84
199
|
|
|
85
200
|

|
|
86
201
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
`doctor` uses exit codes: **0** = ready, **2** = not ready, **3** = infra error. Use `--report` for a JSON diagnostic bundle.
|
|
202
|
+
All commands automatically write artifacts so `explain` works with zero flags.
|
|
90
203
|
|
|
91
204
|
### Gate Exit Codes
|
|
92
205
|
|
package/dist/assertions.js
CHANGED
|
@@ -86,7 +86,9 @@ class Expectation {
|
|
|
86
86
|
expected: substring,
|
|
87
87
|
actual: text,
|
|
88
88
|
message: message ||
|
|
89
|
-
(passed
|
|
89
|
+
(passed
|
|
90
|
+
? `Text contains "${substring}"`
|
|
91
|
+
: `Text does not contain "${substring}"`),
|
|
90
92
|
};
|
|
91
93
|
}
|
|
92
94
|
/**
|
|
@@ -103,7 +105,9 @@ class Expectation {
|
|
|
103
105
|
expected: keywords,
|
|
104
106
|
actual: text,
|
|
105
107
|
message: message ||
|
|
106
|
-
(passed
|
|
108
|
+
(passed
|
|
109
|
+
? `Contains all keywords`
|
|
110
|
+
: `Missing keywords: ${missingKeywords.join(", ")}`),
|
|
107
111
|
};
|
|
108
112
|
}
|
|
109
113
|
/**
|
|
@@ -119,7 +123,9 @@ class Expectation {
|
|
|
119
123
|
expected: `not containing "${substring}"`,
|
|
120
124
|
actual: text,
|
|
121
125
|
message: message ||
|
|
122
|
-
(passed
|
|
126
|
+
(passed
|
|
127
|
+
? `Text does not contain "${substring}"`
|
|
128
|
+
: `Text contains "${substring}"`),
|
|
123
129
|
};
|
|
124
130
|
}
|
|
125
131
|
/**
|
|
@@ -144,7 +150,8 @@ class Expectation {
|
|
|
144
150
|
passed,
|
|
145
151
|
expected: "no PII",
|
|
146
152
|
actual: foundPII.length > 0 ? `Found: ${foundPII.join(", ")}` : "no PII",
|
|
147
|
-
message: message ||
|
|
153
|
+
message: message ||
|
|
154
|
+
(passed ? "No PII detected" : `PII detected: ${foundPII.join(", ")}`),
|
|
148
155
|
};
|
|
149
156
|
}
|
|
150
157
|
/**
|
|
@@ -159,7 +166,10 @@ class Expectation {
|
|
|
159
166
|
passed,
|
|
160
167
|
expected: pattern.toString(),
|
|
161
168
|
actual: text,
|
|
162
|
-
message: message ||
|
|
169
|
+
message: message ||
|
|
170
|
+
(passed
|
|
171
|
+
? `Matches pattern ${pattern}`
|
|
172
|
+
: `Does not match pattern ${pattern}`),
|
|
163
173
|
};
|
|
164
174
|
}
|
|
165
175
|
/**
|
|
@@ -205,7 +215,8 @@ class Expectation {
|
|
|
205
215
|
passed,
|
|
206
216
|
expected: schema,
|
|
207
217
|
actual: parsedJson,
|
|
208
|
-
message: message ||
|
|
218
|
+
message: message ||
|
|
219
|
+
(passed ? "JSON matches schema" : "JSON does not match schema"),
|
|
209
220
|
};
|
|
210
221
|
}
|
|
211
222
|
/**
|
|
@@ -253,7 +264,10 @@ class Expectation {
|
|
|
253
264
|
passed,
|
|
254
265
|
expected,
|
|
255
266
|
actual,
|
|
256
|
-
message: message ||
|
|
267
|
+
message: message ||
|
|
268
|
+
(passed
|
|
269
|
+
? `Sentiment is ${expected}`
|
|
270
|
+
: `Expected ${expected}, got ${actual}`),
|
|
257
271
|
};
|
|
258
272
|
}
|
|
259
273
|
/**
|
|
@@ -269,7 +283,10 @@ class Expectation {
|
|
|
269
283
|
passed,
|
|
270
284
|
expected: range,
|
|
271
285
|
actual: length,
|
|
272
|
-
message: message ||
|
|
286
|
+
message: message ||
|
|
287
|
+
(passed
|
|
288
|
+
? `Length ${length} is within range`
|
|
289
|
+
: `Length ${length} not in range`),
|
|
273
290
|
};
|
|
274
291
|
}
|
|
275
292
|
/**
|
|
@@ -284,9 +301,13 @@ class Expectation {
|
|
|
284
301
|
name: "toNotHallucinate",
|
|
285
302
|
passed,
|
|
286
303
|
expected: "all ground truth facts",
|
|
287
|
-
actual: missingFacts.length > 0
|
|
304
|
+
actual: missingFacts.length > 0
|
|
305
|
+
? `Missing: ${missingFacts.join(", ")}`
|
|
306
|
+
: "all facts present",
|
|
288
307
|
message: message ||
|
|
289
|
-
(passed
|
|
308
|
+
(passed
|
|
309
|
+
? "No hallucinations detected"
|
|
310
|
+
: `Missing facts: ${missingFacts.join(", ")}`),
|
|
290
311
|
};
|
|
291
312
|
}
|
|
292
313
|
/**
|
|
@@ -301,7 +322,10 @@ class Expectation {
|
|
|
301
322
|
passed,
|
|
302
323
|
expected: `<= ${maxMs}ms`,
|
|
303
324
|
actual: `${duration}ms`,
|
|
304
|
-
message: message ||
|
|
325
|
+
message: message ||
|
|
326
|
+
(passed
|
|
327
|
+
? `${duration}ms within limit`
|
|
328
|
+
: `${duration}ms exceeds ${maxMs}ms`),
|
|
305
329
|
};
|
|
306
330
|
}
|
|
307
331
|
/**
|
|
@@ -344,7 +368,8 @@ class Expectation {
|
|
|
344
368
|
passed,
|
|
345
369
|
expected: `> ${expected}`,
|
|
346
370
|
actual: value,
|
|
347
|
-
message: message ||
|
|
371
|
+
message: message ||
|
|
372
|
+
(passed ? `${value} > ${expected}` : `${value} <= ${expected}`),
|
|
348
373
|
};
|
|
349
374
|
}
|
|
350
375
|
/**
|
|
@@ -359,7 +384,8 @@ class Expectation {
|
|
|
359
384
|
passed,
|
|
360
385
|
expected: `< ${expected}`,
|
|
361
386
|
actual: value,
|
|
362
|
-
message: message ||
|
|
387
|
+
message: message ||
|
|
388
|
+
(passed ? `${value} < ${expected}` : `${value} >= ${expected}`),
|
|
363
389
|
};
|
|
364
390
|
}
|
|
365
391
|
/**
|
|
@@ -374,7 +400,8 @@ class Expectation {
|
|
|
374
400
|
passed,
|
|
375
401
|
expected: `between ${min} and ${max}`,
|
|
376
402
|
actual: value,
|
|
377
|
-
message: message ||
|
|
403
|
+
message: message ||
|
|
404
|
+
(passed ? `${value} is within range` : `${value} is outside range`),
|
|
378
405
|
};
|
|
379
406
|
}
|
|
380
407
|
/**
|
|
@@ -389,7 +416,8 @@ class Expectation {
|
|
|
389
416
|
passed: hasCodeBlock,
|
|
390
417
|
expected: "code block",
|
|
391
418
|
actual: text,
|
|
392
|
-
message: message ||
|
|
419
|
+
message: message ||
|
|
420
|
+
(hasCodeBlock ? "Contains code block" : "No code block found"),
|
|
393
421
|
};
|
|
394
422
|
}
|
|
395
423
|
/**
|
|
@@ -405,9 +433,13 @@ class Expectation {
|
|
|
405
433
|
name: "toBeProfessional",
|
|
406
434
|
passed,
|
|
407
435
|
expected: "professional tone",
|
|
408
|
-
actual: foundProfanity.length > 0
|
|
436
|
+
actual: foundProfanity.length > 0
|
|
437
|
+
? `Found: ${foundProfanity.join(", ")}`
|
|
438
|
+
: "professional",
|
|
409
439
|
message: message ||
|
|
410
|
-
(passed
|
|
440
|
+
(passed
|
|
441
|
+
? "Professional tone"
|
|
442
|
+
: `Unprofessional language: ${foundProfanity.join(", ")}`),
|
|
411
443
|
};
|
|
412
444
|
}
|
|
413
445
|
/**
|
|
@@ -432,7 +464,8 @@ class Expectation {
|
|
|
432
464
|
passed,
|
|
433
465
|
expected: "proper grammar",
|
|
434
466
|
actual: issues.length > 0 ? `Issues: ${issues.join(", ")}` : "proper grammar",
|
|
435
|
-
message: message ||
|
|
467
|
+
message: message ||
|
|
468
|
+
(passed ? "Proper grammar" : `Grammar issues: ${issues.join(", ")}`),
|
|
436
469
|
};
|
|
437
470
|
}
|
|
438
471
|
}
|
package/dist/batch.js
CHANGED
|
@@ -81,7 +81,8 @@ class RequestBatcher {
|
|
|
81
81
|
pendingRequest.resolve(response.data);
|
|
82
82
|
}
|
|
83
83
|
else {
|
|
84
|
-
pendingRequest.reject(new Error(response.error ||
|
|
84
|
+
pendingRequest.reject(new Error(response.error ||
|
|
85
|
+
`Request failed with status ${response.status}`));
|
|
85
86
|
}
|
|
86
87
|
}
|
|
87
88
|
}
|
|
@@ -149,7 +150,12 @@ function canBatch(method, endpoint) {
|
|
|
149
150
|
if (method !== "GET") {
|
|
150
151
|
return false;
|
|
151
152
|
}
|
|
152
|
-
const batchableEndpoints = [
|
|
153
|
+
const batchableEndpoints = [
|
|
154
|
+
"/traces",
|
|
155
|
+
"/evaluations",
|
|
156
|
+
"/annotations",
|
|
157
|
+
"/results",
|
|
158
|
+
];
|
|
153
159
|
return batchableEndpoints.some((pattern) => endpoint.includes(pattern));
|
|
154
160
|
}
|
|
155
161
|
/**
|
package/dist/cli/api.js
CHANGED
|
@@ -73,7 +73,9 @@ async function publishShare(baseUrl, apiKey, evaluationId, exportData, evaluatio
|
|
|
73
73
|
exportData,
|
|
74
74
|
shareScope: "run",
|
|
75
75
|
evaluationRunId,
|
|
76
|
-
...(options?.expiresInDays != null && {
|
|
76
|
+
...(options?.expiresInDays != null && {
|
|
77
|
+
expiresInDays: options.expiresInDays,
|
|
78
|
+
}),
|
|
77
79
|
};
|
|
78
80
|
const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evaluationId}/publish`;
|
|
79
81
|
try {
|
package/dist/cli/check.js
CHANGED
|
@@ -183,7 +183,11 @@ function parseArgs(argv) {
|
|
|
183
183
|
};
|
|
184
184
|
}
|
|
185
185
|
if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
|
|
186
|
-
return {
|
|
186
|
+
return {
|
|
187
|
+
ok: false,
|
|
188
|
+
exitCode: constants_1.EXIT.BAD_ARGS,
|
|
189
|
+
message: "Error: --minScore must be 0-100",
|
|
190
|
+
};
|
|
187
191
|
}
|
|
188
192
|
if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
|
|
189
193
|
return {
|
|
@@ -210,9 +214,15 @@ function parseArgs(argv) {
|
|
|
210
214
|
onFail,
|
|
211
215
|
share,
|
|
212
216
|
prCommentOut,
|
|
213
|
-
maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
|
|
214
|
-
|
|
215
|
-
|
|
217
|
+
maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
|
|
218
|
+
? maxCostUsd
|
|
219
|
+
: undefined,
|
|
220
|
+
maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs)
|
|
221
|
+
? maxLatencyMs
|
|
222
|
+
: undefined,
|
|
223
|
+
maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
|
|
224
|
+
? maxCostDeltaUsd
|
|
225
|
+
: undefined,
|
|
216
226
|
},
|
|
217
227
|
};
|
|
218
228
|
}
|
|
@@ -297,7 +307,8 @@ async function runCheck(args) {
|
|
|
297
307
|
runDetails?.results &&
|
|
298
308
|
quality?.evaluationRunId) {
|
|
299
309
|
const importResults = runDetails.results
|
|
300
|
-
.filter((r) => r.testCaseId != null &&
|
|
310
|
+
.filter((r) => r.testCaseId != null &&
|
|
311
|
+
(r.status === "passed" || r.status === "failed"))
|
|
301
312
|
.map((r) => ({
|
|
302
313
|
testCaseId: r.testCaseId,
|
|
303
314
|
status: r.status,
|
|
@@ -306,7 +317,9 @@ async function runCheck(args) {
|
|
|
306
317
|
assertionsJson: r.assertionsJson,
|
|
307
318
|
}));
|
|
308
319
|
if (importResults.length > 0) {
|
|
309
|
-
const idempotencyKey = ci
|
|
320
|
+
const idempotencyKey = ci
|
|
321
|
+
? (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci)
|
|
322
|
+
: undefined;
|
|
310
323
|
const importRes = await (0, api_1.importRunOnFail)(args.baseUrl, args.apiKey, args.evaluationId, importResults, {
|
|
311
324
|
idempotencyKey,
|
|
312
325
|
ci,
|
package/dist/cli/ci-context.js
CHANGED
|
@@ -89,7 +89,9 @@ function captureCiContext() {
|
|
|
89
89
|
provider,
|
|
90
90
|
repo,
|
|
91
91
|
sha,
|
|
92
|
-
branch: ref?.startsWith("refs/heads/")
|
|
92
|
+
branch: ref?.startsWith("refs/heads/")
|
|
93
|
+
? ref.slice("refs/heads/".length)
|
|
94
|
+
: ref,
|
|
93
95
|
runUrl,
|
|
94
96
|
actor,
|
|
95
97
|
pr,
|
package/dist/cli/ci.d.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UX-401: One-command CI loop (evalai ci)
|
|
3
|
+
*
|
|
4
|
+
* Provides a single command teams put in .github/workflows/* and never think about again.
|
|
5
|
+
*/
|
|
6
|
+
import type { DiffResult } from "./diff";
|
|
7
|
+
import type { RunResult } from "./run";
|
|
8
|
+
/**
|
|
9
|
+
* CI command options
|
|
10
|
+
*/
|
|
11
|
+
export interface CIOptions {
|
|
12
|
+
/** Base reference for diff comparison */
|
|
13
|
+
base?: string;
|
|
14
|
+
/** Run only impacted specs */
|
|
15
|
+
impactedOnly?: boolean;
|
|
16
|
+
/** Output format */
|
|
17
|
+
format?: "human" | "json" | "github";
|
|
18
|
+
/** Write run results */
|
|
19
|
+
writeResults?: boolean;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* CI execution result
|
|
23
|
+
*/
|
|
24
|
+
export interface CIResult {
|
|
25
|
+
/** Success status */
|
|
26
|
+
success: boolean;
|
|
27
|
+
/** Exit code */
|
|
28
|
+
exitCode: number;
|
|
29
|
+
/** Execution narrative */
|
|
30
|
+
narrative: string;
|
|
31
|
+
/** Run result (if executed) */
|
|
32
|
+
runResult?: RunResult;
|
|
33
|
+
/** Diff result (if executed) */
|
|
34
|
+
diffResult?: DiffResult;
|
|
35
|
+
/** Error message (if failed) */
|
|
36
|
+
error?: string;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Run CI command
|
|
40
|
+
*/
|
|
41
|
+
export declare function runCI(options: CIOptions, projectRoot?: string): Promise<CIResult>;
|
|
42
|
+
/**
|
|
43
|
+
* CLI entry point
|
|
44
|
+
*/
|
|
45
|
+
export declare function runCICLI(options: CIOptions): Promise<void>;
|