@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
package/README.md
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
# @evalgate/sdk
|
|
2
|
+
|
|
3
|
+
[](https://www.npmjs.com/package/@evalgate/sdk)
|
|
4
|
+
[](https://www.npmjs.com/package/@evalgate/sdk)
|
|
5
|
+
[TypeScript](https://www.typescriptlang.org/)
|
|
6
|
+
[](#)
|
|
7
|
+
[](#)
|
|
8
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
|
+
|
|
10
|
+
**One-command CI for AI evaluation. Complete pipeline: discover → manifest → impact → run → diff → PR summary.**
|
|
11
|
+
|
|
12
|
+
Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Quick Start (60 seconds)
|
|
17
|
+
|
|
18
|
+
Add this to your `.github/workflows/evalai.yml`:
|
|
19
|
+
|
|
20
|
+
```yaml
|
|
21
|
+
name: EvalGate CI
|
|
22
|
+
on: [push, pull_request]
|
|
23
|
+
jobs:
|
|
24
|
+
evalai:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: actions/setup-node@v4
|
|
29
|
+
- run: npm ci
|
|
30
|
+
- run: npx @evalgate/sdk ci --format github --write-results --base main
|
|
31
|
+
- uses: actions/upload-artifact@v4
|
|
32
|
+
if: always()
|
|
33
|
+
with:
|
|
34
|
+
name: evalai-results
|
|
35
|
+
path: .evalai/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Create `eval/your-spec.spec.ts`:
|
|
39
|
+
|
|
40
|
+
```typescript
|
|
41
|
+
import { defineEval } from "@evalgate/sdk";
|
|
42
|
+
|
|
43
|
+
defineEval({
|
|
44
|
+
name: "Basic Math Operations",
|
|
45
|
+
description: "Test fundamental arithmetic",
|
|
46
|
+
prompt: "Test: 1+1=2, string concatenation, array includes",
|
|
47
|
+
expected: "All tests should pass",
|
|
48
|
+
tags: ["basic", "math"],
|
|
49
|
+
category: "unit-test"
|
|
50
|
+
});
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git add .github/workflows/evalai.yml eval/
|
|
55
|
+
git commit -m "feat: add EvalGate CI pipeline"
|
|
56
|
+
git push
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
That's it! Your CI now:
|
|
60
|
+
- ✅ Discovers evaluation specs automatically
|
|
61
|
+
- ✅ Runs only impacted specs (smart caching)
|
|
62
|
+
- ✅ Compares results against base branch
|
|
63
|
+
- ✅ Posts rich summary in PR with regressions
|
|
64
|
+
- ✅ Exits with proper codes (0=clean, 1=regressions, 2=config)
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 🚀 New in v2.0.0: One-Command CI
|
|
69
|
+
|
|
70
|
+
### `evalai ci` - Complete CI Pipeline
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
npx @evalgate/sdk ci --format github --write-results --base main
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**What it does:**
|
|
77
|
+
1. **Discover** - Finds all evaluation specs automatically
|
|
78
|
+
2. **Manifest** - Builds stable manifest if missing
|
|
79
|
+
3. **Impact Analysis** - Runs only specs impacted by changes (optional)
|
|
80
|
+
4. **Run** - Executes evaluations with artifact retention
|
|
81
|
+
5. **Diff** - Compares results against base branch
|
|
82
|
+
6. **PR Summary** - Posts rich markdown summary to GitHub
|
|
83
|
+
7. **Debug Flow** - Prints copy/paste next step on failure
|
|
84
|
+
|
|
85
|
+
**Advanced Options:**
|
|
86
|
+
```bash
|
|
87
|
+
npx @evalgate/sdk ci --base main --impacted-only # Run only impacted specs
|
|
88
|
+
npx @evalgate/sdk ci --format json --write-results # JSON output for automation
|
|
89
|
+
npx @evalgate/sdk ci --base develop # Custom base branch
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Smart Diffing & GitHub Integration
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
npx @evalgate/sdk diff --base main --head last --format github
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Features:**
|
|
99
|
+
- 📊 Pass rate delta and score changes
|
|
100
|
+
- 🚨 Regression detection with classifications
|
|
101
|
+
- 📈 Improvements and new specs
|
|
102
|
+
- 📁 Artifact links and technical details
|
|
103
|
+
- 🎯 Exit codes: 0=clean, 1=regressions, 2=config
|
|
104
|
+
|
|
105
|
+
### Self-Documenting Failures
|
|
106
|
+
|
|
107
|
+
Every failure prints a clear next step:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
🔧 Next step for debugging:
|
|
111
|
+
Download base artifact and run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json
|
|
112
|
+
Artifacts: .evalai/runs/
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## CLI Commands
|
|
118
|
+
|
|
119
|
+
### 🚀 One-Command CI (v2.0.0)
|
|
120
|
+
|
|
121
|
+
| Command | Description |
|
|
122
|
+
|---------|-------------|
|
|
123
|
+
| `npx evalgate ci` | Complete CI pipeline: discover → manifest → impact → run → diff → PR summary |
|
|
124
|
+
| `npx evalgate ci --base main` | Run CI with diff against main branch |
|
|
125
|
+
| `npx evalgate ci --impacted-only` | Run only specs impacted by changes |
|
|
126
|
+
| `npx evalgate ci --format github` | GitHub Step Summary with rich markdown |
|
|
127
|
+
| `npx evalgate ci --format json` | JSON output for automation |
|
|
128
|
+
|
|
129
|
+
### Discovery & Manifest
|
|
130
|
+
|
|
131
|
+
| Command | Description |
|
|
132
|
+
|---------|-------------|
|
|
133
|
+
| `npx evalgate discover` | Find and analyze evaluation specs |
|
|
134
|
+
| `npx evalgate discover --manifest` | Generate stable manifest for incremental analysis |
|
|
135
|
+
|
|
136
|
+
### Impact Analysis
|
|
137
|
+
|
|
138
|
+
| Command | Description |
|
|
139
|
+
|---------|-------------|
|
|
140
|
+
| `npx evalgate impact-analysis --base main` | Analyze impact of changes |
|
|
141
|
+
| `npx evalgate impact-analysis --changed-files file1.ts,file2.ts` | Analyze specific changed files |
|
|
142
|
+
|
|
143
|
+
### Run & Diff
|
|
144
|
+
|
|
145
|
+
| Command | Description |
|
|
146
|
+
|---------|-------------|
|
|
147
|
+
| `npx evalgate run` | Run evaluation specifications |
|
|
148
|
+
| `npx evalgate run --write-results` | Run with artifact retention |
|
|
149
|
+
| `npx evalgate diff --base main` | Compare results against base branch |
|
|
150
|
+
| `npx evalgate diff --base last --head last` | Compare last two runs |
|
|
151
|
+
| `npx evalgate diff --format github` | GitHub Step Summary with regressions |
|
|
152
|
+
|
|
153
|
+
### Legacy Regression Gate (local, no account needed)
|
|
154
|
+
|
|
155
|
+
| Command | Description |
|
|
156
|
+
|---------|-------------|
|
|
157
|
+
| `npx evalgate init` | Full project scaffolder — creates everything you need |
|
|
158
|
+
| `npx evalgate gate` | Run regression gate locally |
|
|
159
|
+
| `npx evalgate gate --format json` | Machine-readable JSON output |
|
|
160
|
+
| `npx evalgate gate --format github` | GitHub Step Summary with delta table |
|
|
161
|
+
| `npx evalgate baseline init` | Create starter `evals/baseline.json` |
|
|
162
|
+
| `npx evalgate baseline update` | Re-run tests and update baseline with real scores |
|
|
163
|
+
| `npx evalgate upgrade --full` | Upgrade from Tier 1 (built-in) to Tier 2 (full gate) |
|
|
164
|
+
|
|
165
|
+
### API Gate (requires account)
|
|
166
|
+
|
|
167
|
+
| Command | Description |
|
|
168
|
+
|---------|-------------|
|
|
169
|
+
| `npx evalgate check` | Gate on quality score from dashboard |
|
|
170
|
+
| `npx evalgate share` | Create share link for a run |
|
|
171
|
+
|
|
172
|
+
### Debugging & Diagnostics
|
|
173
|
+
|
|
174
|
+
| Command | Description |
|
|
175
|
+
|---------|-------------|
|
|
176
|
+
| `npx evalgate doctor` | Comprehensive preflight checklist — verifies config, baseline, auth, API, CI wiring |
|
|
177
|
+
| `npx evalgate explain` | Offline report explainer — top failures, root cause classification, suggested fixes |
|
|
178
|
+
| `npx evalgate print-config` | Show resolved config with source-of-truth annotations (file/env/default/arg) |
|
|
179
|
+
|
|
180
|
+
### Migration Tools
|
|
181
|
+
|
|
182
|
+
| Command | Description |
|
|
183
|
+
|---------|-------------|
|
|
184
|
+
| `npx evalgate migrate config --in evalai.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
|
|
185
|
+
|
|
186
|
+
**Guided failure flow:**
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
evalai ci → fails → "Next: evalai explain --report .evalai/last-run.json"
|
|
190
|
+
↓
|
|
191
|
+
evalai explain → root causes + fixes
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
|
|
195
|
+
|
|
196
|
+

|
|
197
|
+
|
|
198
|
+
**`evalai explain` terminal output** — root causes + fix commands:
|
|
199
|
+
|
|
200
|
+

|
|
201
|
+
|
|
202
|
+
All commands automatically write artifacts so `explain` works with zero flags.
|
|
203
|
+
|
|
204
|
+
### Gate Exit Codes
|
|
205
|
+
|
|
206
|
+
| Code | Meaning |
|
|
207
|
+
|------|---------|
|
|
208
|
+
| 0 | Pass — no regression |
|
|
209
|
+
| 1 | Regression detected |
|
|
210
|
+
| 2 | Infra error (baseline missing, tests crashed) |
|
|
211
|
+
|
|
212
|
+
### Check Exit Codes (API mode)
|
|
213
|
+
|
|
214
|
+
| Code | Meaning |
|
|
215
|
+
|------|---------|
|
|
216
|
+
| 0 | Pass |
|
|
217
|
+
| 1 | Score below threshold |
|
|
218
|
+
| 2 | Regression failure |
|
|
219
|
+
| 3 | Policy violation |
|
|
220
|
+
| 4 | API error |
|
|
221
|
+
| 5 | Bad arguments |
|
|
222
|
+
| 6 | Low test count |
|
|
223
|
+
| 7 | Weak evidence |
|
|
224
|
+
| 8 | Warn (soft regression) |
|
|
225
|
+
|
|
226
|
+
### Doctor Exit Codes
|
|
227
|
+
|
|
228
|
+
| Code | Meaning |
|
|
229
|
+
|------|---------|
|
|
230
|
+
| 0 | Ready — all checks passed |
|
|
231
|
+
| 2 | Not ready — one or more checks failed |
|
|
232
|
+
| 3 | Infrastructure error |
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## How the Gate Works
|
|
237
|
+
|
|
238
|
+
**Built-in mode** (any Node project, no config needed):
|
|
239
|
+
- Runs `<pm> test`, captures exit code + test count
|
|
240
|
+
- Compares against `evals/baseline.json`
|
|
241
|
+
- Writes `evals/regression-report.json`
|
|
242
|
+
- Fails CI if tests regress
|
|
243
|
+
|
|
244
|
+
**Project mode** (advanced, for full regression gate):
|
|
245
|
+
- If `eval:regression-gate` script exists in `package.json`, delegates to it
|
|
246
|
+
- Supports golden eval scores, confidence tests, p95 latency, cost tracking
|
|
247
|
+
- Full delta table with tolerances
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Run a Regression Test Locally (no account)
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
npm install @evalgate/sdk openai
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
```typescript
|
|
258
|
+
import { openAIChatEval } from "@evalgate/sdk";
|
|
259
|
+
|
|
260
|
+
await openAIChatEval({
|
|
261
|
+
name: "chat-regression",
|
|
262
|
+
cases: [
|
|
263
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
264
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
265
|
+
],
|
|
266
|
+
});
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
Output: `PASS 2/2 (score: 100)`. No account needed. Just a score.
|
|
270
|
+
|
|
271
|
+
### Vitest Integration
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import { openAIChatEval, extendExpectWithToPassGate } from "@evalgate/sdk";
|
|
275
|
+
import { expect } from "vitest";
|
|
276
|
+
|
|
277
|
+
extendExpectWithToPassGate(expect);
|
|
278
|
+
|
|
279
|
+
it("passes gate", async () => {
|
|
280
|
+
const result = await openAIChatEval({
|
|
281
|
+
name: "chat-regression",
|
|
282
|
+
cases: [
|
|
283
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
284
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
285
|
+
],
|
|
286
|
+
});
|
|
287
|
+
expect(result).toPassGate();
|
|
288
|
+
});
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## SDK Exports
|
|
294
|
+
|
|
295
|
+
### Regression Gate Constants
|
|
296
|
+
|
|
297
|
+
```typescript
|
|
298
|
+
import {
|
|
299
|
+
GATE_EXIT, // { PASS: 0, REGRESSION: 1, INFRA_ERROR: 2, ... }
|
|
300
|
+
GATE_CATEGORY, // { PASS: "pass", REGRESSION: "regression", INFRA_ERROR: "infra_error" }
|
|
301
|
+
REPORT_SCHEMA_VERSION,
|
|
302
|
+
ARTIFACTS, // { BASELINE, REGRESSION_REPORT, CONFIDENCE_SUMMARY, LATENCY_BENCHMARK }
|
|
303
|
+
} from "@evalgate/sdk";
|
|
304
|
+
|
|
305
|
+
// Or tree-shakeable:
|
|
306
|
+
import { GATE_EXIT } from "@evalgate/sdk/regression";
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### Types
|
|
310
|
+
|
|
311
|
+
```typescript
|
|
312
|
+
import type {
|
|
313
|
+
RegressionReport,
|
|
314
|
+
RegressionDelta,
|
|
315
|
+
Baseline,
|
|
316
|
+
BaselineTolerance,
|
|
317
|
+
GateExitCode,
|
|
318
|
+
GateCategory,
|
|
319
|
+
} from "@evalgate/sdk/regression";
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### Platform Client
|
|
323
|
+
|
|
324
|
+
```typescript
|
|
325
|
+
import { AIEvalClient } from "@evalgate/sdk";
|
|
326
|
+
|
|
327
|
+
const client = AIEvalClient.init(); // from EVALAI_API_KEY env
|
|
328
|
+
// or
|
|
329
|
+
const client = new AIEvalClient({ apiKey: "...", organizationId: 123 });
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
### Framework Integrations
|
|
333
|
+
|
|
334
|
+
```typescript
|
|
335
|
+
import { traceOpenAI } from "@evalgate/sdk/integrations/openai";
|
|
336
|
+
import { traceAnthropic } from "@evalgate/sdk/integrations/anthropic";
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Installation
|
|
342
|
+
|
|
343
|
+
```bash
|
|
344
|
+
npm install @evalgate/sdk
|
|
345
|
+
# or
|
|
346
|
+
yarn add @evalgate/sdk
|
|
347
|
+
# or
|
|
348
|
+
pnpm add @evalgate/sdk
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
Install `openai` (a peer dependency) if you use `openAIChatEval`:
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
npm install openai
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
## Environment Support
|
|
358
|
+
|
|
359
|
+
| Feature | Node.js | Browser |
|
|
360
|
+
|---------|---------|---------|
|
|
361
|
+
| Platform APIs (Traces, Evaluations, LLM Judge) | ✅ | ✅ |
|
|
362
|
+
| Assertions, Test Suites, Error Handling | ✅ | ✅ |
|
|
363
|
+
| CJS/ESM | ✅ | ✅ |
|
|
364
|
+
| CLI, Snapshots, File Export | ✅ | — |
|
|
365
|
+
| Context Propagation | ✅ Full | ⚠️ Basic |
|
|
366
|
+
|
|
367
|
+
## No Lock-in
|
|
368
|
+
|
|
369
|
+
```bash
|
|
370
|
+
rm evalai.config.json
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
Your local `openAIChatEval` runs continue to work. No account cancellation. No data export required.
|
|
374
|
+
|
|
375
|
+
## Changelog
|
|
376
|
+
|
|
377
|
+
See [CHANGELOG.md](CHANGELOG.md) for the full release history.
|
|
378
|
+
|
|
379
|
+
**v1.8.0** — `evalai doctor` rewrite (9-check checklist), `evalai explain` command, guided failure flow, CI template with doctor preflight
|
|
380
|
+
|
|
381
|
+
**v1.7.0** — `evalai init` scaffolder, `evalai upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
|
|
382
|
+
|
|
383
|
+
**v1.6.0** — `evalai gate`, `evalai baseline`, regression gate constants & types
|
|
384
|
+
|
|
385
|
+
**v1.5.8** — secureRoute fix, test infra fixes, 304 handling fix
|
|
386
|
+
|
|
387
|
+
**v1.5.5** — PASS/WARN/FAIL semantics, flake intelligence, golden regression suite
|
|
388
|
+
|
|
389
|
+
**v1.5.0** — GitHub annotations, `--onFail import`, `evalai doctor`
|
|
390
|
+
|
|
391
|
+
## License
|
|
392
|
+
|
|
393
|
+
MIT
|
|
394
|
+
|
|
395
|
+
## Support
|
|
396
|
+
|
|
397
|
+
- **Docs:** https://evalgate.com/documentation
|
|
398
|
+
- **Issues:** https://github.com/pauly7610/ai-evaluation-platform/issues
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Enhanced Assertion Library
|
|
3
|
+
* Tier 1.3: Pre-Built Assertion Library with 20+ built-in assertions
|
|
4
|
+
*
|
|
5
|
+
* @example
|
|
6
|
+
* ```typescript
|
|
7
|
+
* import { expect } from '@evalgate/sdk';
|
|
8
|
+
*
|
|
9
|
+
* const output = "Hello, how can I help you today?";
|
|
10
|
+
*
|
|
11
|
+
* expect(output).toContainKeywords(['help', 'today']);
|
|
12
|
+
* expect(output).toHaveSentiment('positive');
|
|
13
|
+
* expect(output).toMatchPattern(/help/i);
|
|
14
|
+
* expect(output).toHaveLength({ min: 10, max: 100 });
|
|
15
|
+
* ```
|
|
16
|
+
*/
|
|
17
|
+
export interface AssertionResult {
|
|
18
|
+
name: string;
|
|
19
|
+
passed: boolean;
|
|
20
|
+
expected: unknown;
|
|
21
|
+
actual: unknown;
|
|
22
|
+
message?: string;
|
|
23
|
+
}
|
|
24
|
+
export declare class AssertionError extends Error {
|
|
25
|
+
expected: unknown;
|
|
26
|
+
actual: unknown;
|
|
27
|
+
constructor(message: string, expected: unknown, actual: unknown);
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Fluent assertion builder
|
|
31
|
+
*/
|
|
32
|
+
export declare class Expectation {
|
|
33
|
+
private value;
|
|
34
|
+
constructor(value: unknown);
|
|
35
|
+
/**
|
|
36
|
+
* Assert value equals expected
|
|
37
|
+
* @example expect(output).toEqual("Hello")
|
|
38
|
+
*/
|
|
39
|
+
toEqual(expected: unknown, message?: string): AssertionResult;
|
|
40
|
+
/**
|
|
41
|
+
* Assert value contains substring
|
|
42
|
+
* @example expect(output).toContain("help")
|
|
43
|
+
*/
|
|
44
|
+
toContain(substring: string, message?: string): AssertionResult;
|
|
45
|
+
/**
|
|
46
|
+
* Assert value contains all keywords
|
|
47
|
+
* @example expect(output).toContainKeywords(['help', 'support'])
|
|
48
|
+
*/
|
|
49
|
+
toContainKeywords(keywords: string[], message?: string): AssertionResult;
|
|
50
|
+
/**
|
|
51
|
+
* Assert value does not contain substring
|
|
52
|
+
* @example expect(output).toNotContain("error")
|
|
53
|
+
*/
|
|
54
|
+
toNotContain(substring: string, message?: string): AssertionResult;
|
|
55
|
+
/**
|
|
56
|
+
* Assert value does not contain PII (emails, phone numbers, SSN)
|
|
57
|
+
* @example expect(output).toNotContainPII()
|
|
58
|
+
*/
|
|
59
|
+
toNotContainPII(message?: string): AssertionResult;
|
|
60
|
+
/**
|
|
61
|
+
* Assert value matches regular expression
|
|
62
|
+
* @example expect(output).toMatchPattern(/\d{3}-\d{3}-\d{4}/)
|
|
63
|
+
*/
|
|
64
|
+
toMatchPattern(pattern: RegExp, message?: string): AssertionResult;
|
|
65
|
+
/**
|
|
66
|
+
* Assert value is valid JSON
|
|
67
|
+
* @example expect(output).toBeValidJSON()
|
|
68
|
+
*/
|
|
69
|
+
toBeValidJSON(message?: string): AssertionResult;
|
|
70
|
+
/**
|
|
71
|
+
* Assert JSON matches schema
|
|
72
|
+
* @example expect(output).toMatchJSON({ status: 'success' })
|
|
73
|
+
*/
|
|
74
|
+
toMatchJSON(schema: Record<string, unknown>, message?: string): AssertionResult;
|
|
75
|
+
/**
|
|
76
|
+
* Assert value has expected sentiment
|
|
77
|
+
* @example expect(output).toHaveSentiment('positive')
|
|
78
|
+
*/
|
|
79
|
+
toHaveSentiment(expected: "positive" | "negative" | "neutral", message?: string): AssertionResult;
|
|
80
|
+
/**
|
|
81
|
+
* Assert string length is within range
|
|
82
|
+
* @example expect(output).toHaveLength({ min: 10, max: 100 })
|
|
83
|
+
*/
|
|
84
|
+
toHaveLength(range: {
|
|
85
|
+
min?: number;
|
|
86
|
+
max?: number;
|
|
87
|
+
}, message?: string): AssertionResult;
|
|
88
|
+
/**
|
|
89
|
+
* Assert no hallucinations (all ground truth facts present)
|
|
90
|
+
* @example expect(output).toNotHallucinate(['fact1', 'fact2'])
|
|
91
|
+
*/
|
|
92
|
+
toNotHallucinate(groundTruth: string[], message?: string): AssertionResult;
|
|
93
|
+
/**
|
|
94
|
+
* Assert response latency is within limit
|
|
95
|
+
* @example expect(durationMs).toBeFasterThan(1000)
|
|
96
|
+
*/
|
|
97
|
+
toBeFasterThan(maxMs: number, message?: string): AssertionResult;
|
|
98
|
+
/**
|
|
99
|
+
* Assert value is truthy
|
|
100
|
+
* @example expect(result).toBeTruthy()
|
|
101
|
+
*/
|
|
102
|
+
toBeTruthy(message?: string): AssertionResult;
|
|
103
|
+
/**
|
|
104
|
+
* Assert value is falsy
|
|
105
|
+
* @example expect(error).toBeFalsy()
|
|
106
|
+
*/
|
|
107
|
+
toBeFalsy(message?: string): AssertionResult;
|
|
108
|
+
/**
|
|
109
|
+
* Assert value is greater than expected
|
|
110
|
+
* @example expect(score).toBeGreaterThan(0.8)
|
|
111
|
+
*/
|
|
112
|
+
toBeGreaterThan(expected: number, message?: string): AssertionResult;
|
|
113
|
+
/**
|
|
114
|
+
* Assert value is less than expected
|
|
115
|
+
* @example expect(errorRate).toBeLessThan(0.05)
|
|
116
|
+
*/
|
|
117
|
+
toBeLessThan(expected: number, message?: string): AssertionResult;
|
|
118
|
+
/**
|
|
119
|
+
* Assert value is between min and max
|
|
120
|
+
* @example expect(score).toBeBetween(0, 1)
|
|
121
|
+
*/
|
|
122
|
+
toBeBetween(min: number, max: number, message?: string): AssertionResult;
|
|
123
|
+
/**
|
|
124
|
+
* Assert value contains code block
|
|
125
|
+
* @example expect(output).toContainCode()
|
|
126
|
+
*/
|
|
127
|
+
toContainCode(message?: string): AssertionResult;
|
|
128
|
+
/**
|
|
129
|
+
* Assert value has a professional tone (no profanity)
|
|
130
|
+
* @example expect(output).toBeProfessional()
|
|
131
|
+
*/
|
|
132
|
+
toBeProfessional(message?: string): AssertionResult;
|
|
133
|
+
/**
|
|
134
|
+
* Assert value has proper grammar (basic checks)
|
|
135
|
+
* @example expect(output).toHaveProperGrammar()
|
|
136
|
+
*/
|
|
137
|
+
toHaveProperGrammar(message?: string): AssertionResult;
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Create an expectation for fluent assertions
|
|
141
|
+
*
|
|
142
|
+
* @example
|
|
143
|
+
* ```typescript
|
|
144
|
+
* const output = "Hello, how can I help you?";
|
|
145
|
+
*
|
|
146
|
+
* expect(output).toContain("help");
|
|
147
|
+
* expect(output).toHaveSentiment('positive');
|
|
148
|
+
* expect(output).toHaveLength({ min: 10, max: 100 });
|
|
149
|
+
* ```
|
|
150
|
+
*/
|
|
151
|
+
export declare function expect(value: unknown): Expectation;
|
|
152
|
+
/**
|
|
153
|
+
* Run multiple assertions and collect results
|
|
154
|
+
*
|
|
155
|
+
* @example
|
|
156
|
+
* ```typescript
|
|
157
|
+
* const results = runAssertions([
|
|
158
|
+
* () => expect(output).toContain("help"),
|
|
159
|
+
* () => expect(output).toHaveSentiment('positive'),
|
|
160
|
+
* () => expect(output).toHaveLength({ min: 10 })
|
|
161
|
+
* ]);
|
|
162
|
+
*
|
|
163
|
+
* const allPassed = results.every(r => r.passed);
|
|
164
|
+
* ```
|
|
165
|
+
*/
|
|
166
|
+
export declare function runAssertions(assertions: (() => AssertionResult)[]): AssertionResult[];
|
|
167
|
+
export declare function containsKeywords(text: string, keywords: string[]): boolean;
|
|
168
|
+
export declare function matchesPattern(text: string, pattern: RegExp): boolean;
|
|
169
|
+
export declare function hasLength(text: string, range: {
|
|
170
|
+
min?: number;
|
|
171
|
+
max?: number;
|
|
172
|
+
}): boolean;
|
|
173
|
+
export declare function containsJSON(text: string): boolean;
|
|
174
|
+
export declare function notContainsPII(text: string): boolean;
|
|
175
|
+
export declare function hasSentiment(text: string, expected: "positive" | "negative" | "neutral"): boolean;
|
|
176
|
+
export declare function similarTo(text1: string, text2: string, threshold?: number): boolean;
|
|
177
|
+
export declare function withinRange(value: number, min: number, max: number): boolean;
|
|
178
|
+
export declare function isValidEmail(email: string): boolean;
|
|
179
|
+
export declare function isValidURL(url: string): boolean;
|
|
180
|
+
export declare function hasNoHallucinations(text: string, groundTruth: string[]): boolean;
|
|
181
|
+
export declare function matchesSchema(value: unknown, schema: Record<string, unknown>): boolean;
|
|
182
|
+
export declare function hasReadabilityScore(text: string, minScore: number): boolean;
|
|
183
|
+
export declare function containsLanguage(text: string, language: string): boolean;
|
|
184
|
+
export declare function hasFactualAccuracy(text: string, facts: string[]): boolean;
|
|
185
|
+
export declare function respondedWithinTime(startTime: number, maxMs: number): boolean;
|
|
186
|
+
export declare function hasNoToxicity(text: string): boolean;
|
|
187
|
+
export declare function followsInstructions(text: string, instructions: string[]): boolean;
|
|
188
|
+
export declare function containsAllRequiredFields(obj: unknown, requiredFields: string[]): boolean;
|
|
189
|
+
export declare function hasValidCodeSyntax(code: string, language: string): boolean;
|