@pauly4010/evalai-sdk 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/README.md +172 -251
- package/dist/cli/baseline.js +1 -1
- package/dist/cli/index.js +6 -0
- package/dist/cli/init.d.ts +11 -2
- package/dist/cli/init.js +227 -16
- package/dist/cli/regression-gate.d.ts +6 -2
- package/dist/cli/regression-gate.js +246 -61
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +491 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +7 -7
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,50 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.7.0] - 2026-02-25
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
#### CLI — `evalai init` Full Project Scaffolder
|
|
13
|
+
|
|
14
|
+
- **`evalai init`** — Zero-to-gate in under 5 minutes:
|
|
15
|
+
- Detects Node repo + package manager (npm/yarn/pnpm)
|
|
16
|
+
- Runs existing tests to capture real pass/fail + test count
|
|
17
|
+
- Creates `evals/baseline.json` with provenance metadata
|
|
18
|
+
- Installs `.github/workflows/evalai-gate.yml` (package-manager aware)
|
|
19
|
+
- Creates `evalai.config.json`
|
|
20
|
+
- Prints copy-paste next steps — just commit and push
|
|
21
|
+
- Idempotent: skips files that already exist
|
|
22
|
+
|
|
23
|
+
#### CLI — `evalai upgrade --full` (Tier 1 → Tier 2)
|
|
24
|
+
|
|
25
|
+
- **`evalai upgrade --full`** — Upgrade from built-in gate to full gate:
|
|
26
|
+
- Creates `scripts/regression-gate.ts` (full gate with `--update-baseline`)
|
|
27
|
+
- Adds `eval:regression-gate` + `eval:baseline-update` npm scripts
|
|
28
|
+
- Creates `.github/workflows/baseline-governance.yml` (label + diff enforcement)
|
|
29
|
+
- Upgrades `evalai-gate.yml` to project mode
|
|
30
|
+
- Adds `CODEOWNERS` entry for `evals/baseline.json`
|
|
31
|
+
|
|
32
|
+
#### Gate Output — Machine-Readable Improvements
|
|
33
|
+
|
|
34
|
+
- **`detectRunner()`** — Identifies test runner from `package.json` scripts: vitest, jest, mocha, node:test, ava, tap, or unknown
|
|
35
|
+
- **BuiltinReport** now always emits: `durationMs`, `command`, `runner`, `baseline` metadata
|
|
36
|
+
- Report schema updated with optional `durationMs`, `command`, `runner` properties
|
|
37
|
+
|
|
38
|
+
#### Init Scaffolder Integration Tests
|
|
39
|
+
|
|
40
|
+
- 4 fixtures: npm+jest, pnpm+vitest, yarn+jest, pnpm monorepo
|
|
41
|
+
- 25 tests: files created, YAML valid, pm-aware workflow, idempotent runs
|
|
42
|
+
- All fixtures use `node:test` (zero external deps)
|
|
43
|
+
|
|
44
|
+
### 🔧 Changed
|
|
45
|
+
|
|
46
|
+
- CLI help text updated to include `upgrade` command
|
|
47
|
+
- Gate report includes runner detection and timing metadata
|
|
48
|
+
- SDK test count: 147 → 172 tests (12 → 15 contract tests)
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
8
52
|
## [1.6.0] - 2026-02-24
|
|
9
53
|
|
|
10
54
|
### ✨ Added
|
package/README.md
CHANGED
|
@@ -1,22 +1,108 @@
|
|
|
1
1
|
# @pauly4010/evalai-sdk
|
|
2
2
|
|
|
3
|
-
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
3
|
+
[@pauly4010/evalai-sdk on npm](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
4
4
|
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
5
5
|
|
|
6
6
|
**Stop LLM regressions in CI in minutes.**
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
No lock-in — remove by deleting `evalai.config.json`.
|
|
8
|
+
Zero to gate in under 5 minutes. No infra. No lock-in. Remove anytime.
|
|
10
9
|
|
|
11
10
|
---
|
|
12
11
|
|
|
13
|
-
|
|
12
|
+
## Quick Start (2 minutes)
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
```bash
|
|
15
|
+
npx @pauly4010/evalai-sdk init
|
|
16
|
+
git add evals/ .github/workflows/evalai-gate.yml evalai.config.json
|
|
17
|
+
git commit -m "chore: add EvalAI regression gate"
|
|
18
|
+
git push
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
That's it. Open a PR and CI blocks regressions automatically.
|
|
22
|
+
|
|
23
|
+
`evalai init` detects your project, creates a baseline from your current tests, and installs a GitHub Actions workflow. No manual config needed.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## What `evalai init` does
|
|
28
|
+
|
|
29
|
+
1. **Detects** your Node repo and package manager (npm/yarn/pnpm)
|
|
30
|
+
2. **Runs your tests** to capture a real baseline (pass/fail + test count)
|
|
31
|
+
3. **Creates `evals/baseline.json`** with provenance metadata
|
|
32
|
+
4. **Installs `.github/workflows/evalai-gate.yml`** (package-manager aware)
|
|
33
|
+
5. **Creates `evalai.config.json`**
|
|
34
|
+
6. **Prints next steps** — just commit and push
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## CLI Commands
|
|
39
|
+
|
|
40
|
+
### Regression Gate (local, no account needed)
|
|
41
|
+
|
|
42
|
+
| Command | Description |
|
|
43
|
+
|---------|-------------|
|
|
44
|
+
| `npx evalai init` | Full project scaffolder — creates everything you need |
|
|
45
|
+
| `npx evalai gate` | Run regression gate locally |
|
|
46
|
+
| `npx evalai gate --format json` | Machine-readable JSON output |
|
|
47
|
+
| `npx evalai gate --format github` | GitHub Step Summary with delta table |
|
|
48
|
+
| `npx evalai baseline init` | Create starter `evals/baseline.json` |
|
|
49
|
+
| `npx evalai baseline update` | Re-run tests and update baseline with real scores |
|
|
50
|
+
| `npx evalai upgrade --full` | Upgrade from Tier 1 (built-in) to Tier 2 (full gate) |
|
|
51
|
+
|
|
52
|
+
### API Gate (requires account)
|
|
53
|
+
|
|
54
|
+
| Command | Description |
|
|
55
|
+
|---------|-------------|
|
|
56
|
+
| `npx evalai check` | Gate on quality score from dashboard |
|
|
57
|
+
| `npx evalai doctor` | Verify CI/CD setup |
|
|
58
|
+
| `npx evalai share` | Create share link for a run |
|
|
59
|
+
|
|
60
|
+
### Gate Exit Codes
|
|
61
|
+
|
|
62
|
+
| Code | Meaning |
|
|
63
|
+
|------|---------|
|
|
64
|
+
| 0 | Pass — no regression |
|
|
65
|
+
| 1 | Regression detected |
|
|
66
|
+
| 2 | Infra error (baseline missing, tests crashed) |
|
|
67
|
+
|
|
68
|
+
### Check Exit Codes (API mode)
|
|
69
|
+
|
|
70
|
+
| Code | Meaning |
|
|
71
|
+
|------|---------|
|
|
72
|
+
| 0 | Pass |
|
|
73
|
+
| 1 | Score below threshold |
|
|
74
|
+
| 2 | Regression failure |
|
|
75
|
+
| 3 | Policy violation |
|
|
76
|
+
| 4 | API error |
|
|
77
|
+
| 5 | Bad arguments |
|
|
78
|
+
| 6 | Low test count |
|
|
79
|
+
| 7 | Weak evidence |
|
|
80
|
+
| 8 | Warn (soft regression) |
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## How the Gate Works
|
|
85
|
+
|
|
86
|
+
**Built-in mode** (any Node project, no config needed):
|
|
87
|
+
- Runs `<pm> test`, captures exit code + test count
|
|
88
|
+
- Compares against `evals/baseline.json`
|
|
89
|
+
- Writes `evals/regression-report.json`
|
|
90
|
+
- Fails CI if tests regress
|
|
91
|
+
|
|
92
|
+
**Project mode** (advanced, for full regression gate):
|
|
93
|
+
- If `eval:regression-gate` script exists in `package.json`, delegates to it
|
|
94
|
+
- Supports golden eval scores, confidence tests, p95 latency, cost tracking
|
|
95
|
+
- Full delta table with tolerances
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Run a Regression Test Locally (no account)
|
|
17
100
|
|
|
18
101
|
```bash
|
|
19
102
|
npm install @pauly4010/evalai-sdk openai
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
20
106
|
import { openAIChatEval } from "@pauly4010/evalai-sdk";
|
|
21
107
|
|
|
22
108
|
await openAIChatEval({
|
|
@@ -26,14 +112,14 @@ await openAIChatEval({
|
|
|
26
112
|
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
27
113
|
],
|
|
28
114
|
});
|
|
29
|
-
|
|
115
|
+
```
|
|
30
116
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
} from "@pauly4010/evalai-sdk";
|
|
117
|
+
Output: `PASS 2/2 (score: 100)`. No account needed. Just a score.
|
|
118
|
+
|
|
119
|
+
### Vitest Integration
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
import { openAIChatEval, extendExpectWithToPassGate } from "@pauly4010/evalai-sdk";
|
|
37
123
|
import { expect } from "vitest";
|
|
38
124
|
|
|
39
125
|
extendExpectWithToPassGate(expect);
|
|
@@ -46,278 +132,113 @@ it("passes gate", async () => {
|
|
|
46
132
|
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
47
133
|
],
|
|
48
134
|
});
|
|
49
|
-
|
|
50
135
|
expect(result).toPassGate();
|
|
51
136
|
});
|
|
52
|
-
|
|
53
|
-
PASS 2/2 (score: 100)
|
|
54
|
-
|
|
55
|
-
Tip: Want dashboards and history?
|
|
56
|
-
Set EVALAI_API_KEY and connect this to the platform.
|
|
57
|
-
Failures show:
|
|
58
|
-
|
|
59
|
-
FAIL 9/10 (score: 90)
|
|
60
|
-
with failed cases and CI guidance.
|
|
61
|
-
|
|
62
|
-
⚡ 2) Optional: Add a CI gate (2 minutes)
|
|
63
|
-
When you're ready to gate PRs on quality and regressions:
|
|
64
|
-
|
|
65
|
-
npx -y @pauly4010/evalai-sdk@^1 init
|
|
66
|
-
Create an evaluation in the dashboard and paste its ID into:
|
|
67
|
-
|
|
68
|
-
{
|
|
69
|
-
"evaluationId": "42"
|
|
70
|
-
}
|
|
71
|
-
Add to your CI:
|
|
72
|
-
|
|
73
|
-
- name: EvalAI gate
|
|
74
|
-
env:
|
|
75
|
-
EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}
|
|
76
|
-
run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import --warnDrop 1
|
|
77
|
-
You’ll get:
|
|
78
|
-
|
|
79
|
-
GitHub annotations
|
|
80
|
-
|
|
81
|
-
Step summary
|
|
82
|
-
|
|
83
|
-
Optional dashboard link
|
|
84
|
-
|
|
85
|
-
PASS / WARN / FAIL (v1.5.7)
|
|
86
|
-
EvalAI introduces a WARN band so teams can see meaningful regressions without always blocking merges.
|
|
87
|
-
|
|
88
|
-
Behavior
|
|
89
|
-
|
|
90
|
-
PASS → within thresholds
|
|
91
|
-
|
|
92
|
-
WARN → regression > warnDrop but < maxDrop
|
|
93
|
-
|
|
94
|
-
FAIL → regression > maxDrop
|
|
95
|
-
|
|
96
|
-
Key flags
|
|
97
|
-
|
|
98
|
-
--warnDrop → soft regression warning
|
|
99
|
-
|
|
100
|
-
--maxDrop → hard regression fail
|
|
101
|
-
|
|
102
|
-
--fail-on-flake → fail if unknown test is unstable
|
|
103
|
-
|
|
104
|
-
This lets teams tune signal vs noise in CI.
|
|
105
|
-
|
|
106
|
-
🔒 3) No lock-in
|
|
107
|
-
To stop using EvalAI:
|
|
108
|
-
|
|
109
|
-
rm evalai.config.json
|
|
110
|
-
Your local openAIChatEval runs continue to work exactly the same.
|
|
111
|
-
|
|
112
|
-
No account cancellation. No data export required.
|
|
113
|
-
|
|
114
|
-
📦 Installation
|
|
115
|
-
npm install @pauly4010/evalai-sdk openai
|
|
116
|
-
# or
|
|
117
|
-
yarn add @pauly4010/evalai-sdk openai
|
|
118
|
-
# or
|
|
119
|
-
pnpm add @pauly4010/evalai-sdk openai
|
|
120
|
-
🖥️ Environment Support
|
|
121
|
-
This SDK works in both Node.js and browsers, with some Node-only features.
|
|
122
|
-
|
|
123
|
-
✅ Works Everywhere (Node.js + Browser)
|
|
124
|
-
Traces API
|
|
125
|
-
|
|
126
|
-
Evaluations API
|
|
127
|
-
|
|
128
|
-
LLM Judge API
|
|
129
|
-
|
|
130
|
-
Annotations API
|
|
131
|
-
|
|
132
|
-
Developer API (API Keys, Webhooks, Usage)
|
|
133
|
-
|
|
134
|
-
Organizations API
|
|
135
|
-
|
|
136
|
-
Assertions Library
|
|
137
|
-
|
|
138
|
-
Test Suites
|
|
139
|
-
|
|
140
|
-
Error Handling
|
|
141
|
-
|
|
142
|
-
CJS/ESM Compatibility
|
|
137
|
+
```
|
|
143
138
|
|
|
144
|
-
|
|
145
|
-
These require Node.js:
|
|
139
|
+
---
|
|
146
140
|
|
|
147
|
-
|
|
141
|
+
## SDK Exports
|
|
148
142
|
|
|
149
|
-
|
|
143
|
+
### Regression Gate Constants
|
|
150
144
|
|
|
151
|
-
|
|
145
|
+
```typescript
|
|
146
|
+
import {
|
|
147
|
+
GATE_EXIT, // { PASS: 0, REGRESSION: 1, INFRA_ERROR: 2, ... }
|
|
148
|
+
GATE_CATEGORY, // { PASS: "pass", REGRESSION: "regression", INFRA_ERROR: "infra_error" }
|
|
149
|
+
REPORT_SCHEMA_VERSION,
|
|
150
|
+
ARTIFACTS, // { BASELINE, REGRESSION_REPORT, CONFIDENCE_SUMMARY, LATENCY_BENCHMARK }
|
|
151
|
+
} from "@pauly4010/evalai-sdk";
|
|
152
152
|
|
|
153
|
-
|
|
153
|
+
// Or tree-shakeable:
|
|
154
|
+
import { GATE_EXIT } from "@pauly4010/evalai-sdk/regression";
|
|
155
|
+
```
|
|
154
156
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
+
### Types
|
|
158
|
+
|
|
159
|
+
```typescript
|
|
160
|
+
import type {
|
|
161
|
+
RegressionReport,
|
|
162
|
+
RegressionDelta,
|
|
163
|
+
Baseline,
|
|
164
|
+
BaselineTolerance,
|
|
165
|
+
GateExitCode,
|
|
166
|
+
GateCategory,
|
|
167
|
+
} from "@pauly4010/evalai-sdk/regression";
|
|
168
|
+
```
|
|
157
169
|
|
|
158
|
-
|
|
170
|
+
### Platform Client
|
|
159
171
|
|
|
160
|
-
|
|
172
|
+
```typescript
|
|
161
173
|
import { AIEvalClient } from "@pauly4010/evalai-sdk";
|
|
162
174
|
|
|
163
|
-
//
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
const client2 = new AIEvalClient({
|
|
168
|
-
apiKey: "your-api-key",
|
|
169
|
-
organizationId: 123,
|
|
170
|
-
debug: true,
|
|
171
|
-
});
|
|
172
|
-
🧪 evalai CLI (v1.5.7)
|
|
173
|
-
The CLI gates deployments on quality, regression, and policy.
|
|
174
|
-
|
|
175
|
-
Quick start
|
|
176
|
-
npx -y @pauly4010/evalai-sdk@^1 check \
|
|
177
|
-
--evaluationId 42 \
|
|
178
|
-
--apiKey $EVALAI_API_KEY
|
|
179
|
-
evalai check
|
|
180
|
-
Option Description
|
|
181
|
-
--evaluationId <id> Required. Evaluation to gate on
|
|
182
|
-
--apiKey <key> API key (or EVALAI_API_KEY)
|
|
183
|
-
--format <fmt> human, json, or github
|
|
184
|
-
--onFail import Import failing run to dashboard
|
|
185
|
-
--explain Show score breakdown
|
|
186
|
-
--minScore <n> Fail if score < n
|
|
187
|
-
--warnDrop <n> Warn if regression exceeds n
|
|
188
|
-
--maxDrop <n> Fail if regression exceeds n
|
|
189
|
-
--minN <n> Fail if test count < n
|
|
190
|
-
--allowWeakEvidence Permit weak evidence
|
|
191
|
-
--policy <name> HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511
|
|
192
|
-
--baseline <mode> published, previous, production
|
|
193
|
-
--fail-on-flake Fail if unknown case is flaky
|
|
194
|
-
--baseUrl <url> Override API base URL
|
|
195
|
-
|
|
196
|
-
Exit codes
|
|
197
|
-
Code Meaning
|
|
198
|
-
0 PASS
|
|
199
|
-
8 WARN
|
|
200
|
-
1 Score below threshold
|
|
201
|
-
2 Regression failure
|
|
202
|
-
3 Policy violation
|
|
203
|
-
4 API error
|
|
204
|
-
5 Bad arguments
|
|
205
|
-
6 Low test count
|
|
206
|
-
7 Weak evidence
|
|
207
|
-
evalai doctor
|
|
208
|
-
Verify CI setup before running the gate:
|
|
209
|
-
|
|
210
|
-
npx -y @pauly4010/evalai-sdk@^1 doctor \
|
|
211
|
-
--evaluationId 42 \
|
|
212
|
-
--apiKey $EVALAI_API_KEY
|
|
213
|
-
If doctor passes, check will work.
|
|
214
|
-
|
|
215
|
-
🧯 Error Handling
|
|
216
|
-
import { EvalAIError, RateLimitError } from "@pauly4010/evalai-sdk";
|
|
217
|
-
|
|
218
|
-
try {
|
|
219
|
-
await client.traces.create({ name: "User Query" });
|
|
220
|
-
} catch (err) {
|
|
221
|
-
if (err instanceof RateLimitError) {
|
|
222
|
-
console.log("Retry after:", err.retryAfter);
|
|
223
|
-
} else if (err instanceof EvalAIError) {
|
|
224
|
-
console.log(err.code, err.message, err.requestId);
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
🔍 Traces
|
|
230
|
-
const trace = await client.traces.create({
|
|
231
|
-
name: "User Query",
|
|
232
|
-
traceId: "trace-123",
|
|
233
|
-
metadata: { userId: "456" },
|
|
234
|
-
});
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
📝 Evaluations
|
|
238
|
-
import { EvaluationTemplates } from "@pauly4010/evalai-sdk";
|
|
239
|
-
|
|
240
|
-
const evaluation = await client.evaluations.create({
|
|
241
|
-
name: "Chatbot Responses",
|
|
242
|
-
type: EvaluationTemplates.OUTPUT_QUALITY,
|
|
243
|
-
createdBy: userId,
|
|
244
|
-
});
|
|
175
|
+
const client = AIEvalClient.init(); // from EVALAI_API_KEY env
|
|
176
|
+
// or
|
|
177
|
+
// const client = new AIEvalClient({ apiKey: "...", organizationId: 123 });
|
|
178
|
+
```
|
|
245
179
|
|
|
180
|
+
### Framework Integrations
|
|
246
181
|
|
|
247
|
-
|
|
182
|
+
```typescript
|
|
248
183
|
import { traceOpenAI } from "@pauly4010/evalai-sdk/integrations/openai";
|
|
249
|
-
import
|
|
250
|
-
|
|
251
|
-
const openai = traceOpenAI(new OpenAI(), client);
|
|
252
|
-
|
|
253
|
-
await openai.chat.completions.create({
|
|
254
|
-
model: "gpt-4",
|
|
255
|
-
messages: [{ role: "user", content: "Hello" }],
|
|
256
|
-
});
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
🧭 Changelog
|
|
260
|
-
v1.5.8 (Latest)
|
|
261
|
-
Fixed secureRoute TypeScript overload compatibility
|
|
262
|
-
|
|
263
|
-
Fixed test infrastructure (expect.any, NextRequest constructor)
|
|
264
|
-
|
|
265
|
-
Fixed 304 response handling in exports API
|
|
266
|
-
|
|
267
|
-
Improved error catalog test coverage
|
|
268
|
-
|
|
269
|
-
v1.5.7
|
|
270
|
-
Documentation updates for CJS compatibility
|
|
271
|
-
|
|
272
|
-
Version alignment across README and changelog
|
|
273
|
-
|
|
274
|
-
Environment support section updated
|
|
275
|
-
|
|
276
|
-
v1.5.6
|
|
277
|
-
PASS/WARN/FAIL gate semantics
|
|
184
|
+
import { traceAnthropic } from "@pauly4010/evalai-sdk/integrations/anthropic";
|
|
185
|
+
```
|
|
278
186
|
|
|
279
|
-
|
|
187
|
+
---
|
|
280
188
|
|
|
281
|
-
|
|
189
|
+
## Installation
|
|
282
190
|
|
|
283
|
-
|
|
191
|
+
```bash
|
|
192
|
+
npm install @pauly4010/evalai-sdk
|
|
193
|
+
# or
|
|
194
|
+
yarn add @pauly4010/evalai-sdk
|
|
195
|
+
# or
|
|
196
|
+
pnpm add @pauly4010/evalai-sdk
|
|
197
|
+
```
|
|
284
198
|
|
|
285
|
-
|
|
199
|
+
Install the `openai` peer dependency if you use `openAIChatEval`:
|
|
286
200
|
|
|
287
|
-
|
|
201
|
+
```bash
|
|
202
|
+
npm install openai
|
|
203
|
+
```
|
|
288
204
|
|
|
289
|
-
|
|
205
|
+
## Environment Support
|
|
290
206
|
|
|
291
|
-
|
|
207
|
+
| Feature | Node.js | Browser |
|
|
208
|
+
|---------|---------|---------|
|
|
209
|
+
| Platform APIs (Traces, Evaluations, LLM Judge) | ✅ | ✅ |
|
|
210
|
+
| Assertions, Test Suites, Error Handling | ✅ | ✅ |
|
|
211
|
+
| CJS/ESM | ✅ | ✅ |
|
|
212
|
+
| CLI, Snapshots, File Export | ✅ | — |
|
|
213
|
+
| Context Propagation | ✅ Full | ⚠️ Basic |
|
|
292
214
|
|
|
293
|
-
|
|
294
|
-
GitHub annotations formatter
|
|
215
|
+
## No Lock-in
|
|
295
216
|
|
|
296
|
-
|
|
217
|
+
```bash
|
|
218
|
+
rm evalai.config.json
|
|
219
|
+
```
|
|
297
220
|
|
|
298
|
-
|
|
221
|
+
Your local `openAIChatEval` runs continue to work. No account cancellation. No data export required.
|
|
299
222
|
|
|
300
|
-
|
|
223
|
+
## Changelog
|
|
301
224
|
|
|
302
|
-
|
|
225
|
+
See [CHANGELOG.md](CHANGELOG.md) for the full release history.
|
|
303
226
|
|
|
304
|
-
|
|
227
|
+
**v1.7.0** — `evalai init` scaffolder, `evalai upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
|
|
305
228
|
|
|
229
|
+
**v1.6.0** — `evalai gate`, `evalai baseline`, regression gate constants & types
|
|
306
230
|
|
|
307
|
-
|
|
231
|
+
**v1.5.8** — secureRoute fix, test infra fixes, 304 handling fix
|
|
308
232
|
|
|
309
|
-
|
|
233
|
+
**v1.5.5** — PASS/WARN/FAIL semantics, flake intelligence, golden regression suite
|
|
310
234
|
|
|
311
|
-
|
|
235
|
+
**v1.5.0** — GitHub annotations, `--onFail import`, `evalai doctor`
|
|
312
236
|
|
|
237
|
+
## License
|
|
313
238
|
|
|
314
|
-
📄 License
|
|
315
239
|
MIT
|
|
316
240
|
|
|
317
|
-
|
|
318
|
-
Documentation:
|
|
319
|
-
https://v0-ai-evaluation-platform-nu.vercel.app/documentation
|
|
241
|
+
## Support
|
|
320
242
|
|
|
321
|
-
|
|
322
|
-
https://github.com/pauly7610/ai-evaluation-platform/issues
|
|
323
|
-
```
|
|
243
|
+
- **Docs:** https://v0-ai-evaluation-platform-nu.vercel.app/documentation
|
|
244
|
+
- **Issues:** https://github.com/pauly7610/ai-evaluation-platform/issues
|
package/dist/cli/baseline.js
CHANGED
|
@@ -142,7 +142,7 @@ function runBaselineUpdate(cwd) {
|
|
|
142
142
|
}
|
|
143
143
|
if (!pkg.scripts?.["eval:baseline-update"]) {
|
|
144
144
|
console.error("❌ Missing 'eval:baseline-update' script in package.json.");
|
|
145
|
-
console.error(
|
|
145
|
+
console.error(' Add it: "eval:baseline-update": "npx tsx scripts/regression-gate.ts --update-baseline"');
|
|
146
146
|
return 1;
|
|
147
147
|
}
|
|
148
148
|
console.log("📊 Running baseline update...\n");
|
package/dist/cli/index.js
CHANGED
|
@@ -14,6 +14,7 @@ const doctor_1 = require("./doctor");
|
|
|
14
14
|
const init_1 = require("./init");
|
|
15
15
|
const regression_gate_1 = require("./regression-gate");
|
|
16
16
|
const share_1 = require("./share");
|
|
17
|
+
const upgrade_1 = require("./upgrade");
|
|
17
18
|
const argv = process.argv.slice(2);
|
|
18
19
|
const subcommand = argv[0];
|
|
19
20
|
if (subcommand === "init") {
|
|
@@ -29,6 +30,10 @@ else if (subcommand === "gate") {
|
|
|
29
30
|
const code = (0, regression_gate_1.runGate)(argv.slice(1));
|
|
30
31
|
process.exit(code);
|
|
31
32
|
}
|
|
33
|
+
else if (subcommand === "upgrade") {
|
|
34
|
+
const code = (0, upgrade_1.runUpgrade)(argv.slice(1));
|
|
35
|
+
process.exit(code);
|
|
36
|
+
}
|
|
32
37
|
else if (subcommand === "doctor") {
|
|
33
38
|
(0, doctor_1.runDoctor)(argv.slice(1))
|
|
34
39
|
.then((code) => process.exit(code))
|
|
@@ -71,6 +76,7 @@ Usage:
|
|
|
71
76
|
evalai baseline init Create starter evals/baseline.json
|
|
72
77
|
evalai baseline update Run tests and update baseline with real scores
|
|
73
78
|
evalai gate [options] Run regression gate (local test-based)
|
|
79
|
+
evalai upgrade --full Upgrade from Tier 1 to Tier 2 (full gate)
|
|
74
80
|
evalai doctor [options] Verify CI/CD setup (same endpoint as check)
|
|
75
81
|
evalai check [options] CI/CD evaluation gate (API-based)
|
|
76
82
|
evalai share [options] Create share link for a run
|
package/dist/cli/init.d.ts
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
/**
|
|
3
|
-
* evalai init —
|
|
3
|
+
* evalai init — Full project scaffolder
|
|
4
4
|
*
|
|
5
|
-
*
|
|
5
|
+
* Zero-to-gate in under 5 minutes:
|
|
6
|
+
* npx evalai init
|
|
7
|
+
* git push
|
|
8
|
+
* …CI starts blocking regressions.
|
|
9
|
+
*
|
|
10
|
+
* What it does:
|
|
11
|
+
* 1. Detects Node repo + package manager
|
|
12
|
+
* 2. Creates evals/ directory + baseline.json
|
|
13
|
+
* 3. Installs .github/workflows/evalai-gate.yml
|
|
14
|
+
* 4. Prints next steps (no docs required)
|
|
6
15
|
*/
|
|
7
16
|
export declare function runInit(cwd?: string): boolean;
|