@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,638 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to the @evalgate/sdk package will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [2.0.0] - 2026-03-01
|
|
9
|
+
|
|
10
|
+
### Breaking — EvalGate Rebrand
|
|
11
|
+
|
|
12
|
+
- **Package:** `@pauly4010/evalai-sdk` → `@evalgate/sdk`
|
|
13
|
+
- **CLI:** `evalai` → `evalgate`
|
|
14
|
+
- **Config dir:** `.evalai/` → `.evalgate/` (legacy still read with deprecation warning)
|
|
15
|
+
- **Env vars:** `EVALAI_*` → `EVALGATE_*` (legacy still work with deprecation warning)
|
|
16
|
+
- **Error class:** `EvalAIError` → `EvalGateError`
|
|
17
|
+
- **HTTP headers:** `X-EvalAI-*` → `X-EvalGate-*`
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
|
|
21
|
+
- Deprecation warnings when using `EVALAI_*` env vars or `.evalai/` config
|
|
22
|
+
|
|
23
|
+
### Deprecated
|
|
24
|
+
|
|
25
|
+
- `@pauly4010/evalai-sdk` — use `@evalgate/sdk` instead
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## [1.9.0] - 2026-02-27
|
|
30
|
+
|
|
31
|
+
### ✨ Added
|
|
32
|
+
|
|
33
|
+
#### CLI — One-Command CI Loop (`evalai ci`)
|
|
34
|
+
|
|
35
|
+
- **`evalai ci`** — Single command teams put in GitHub workflows and never think about again
|
|
36
|
+
- **Complete CI pipeline**: discover → manifest → impact → run → diff → PR summary → safe failure → "next step"
|
|
37
|
+
- **Automatic manifest building**: Builds manifest if missing, no manual steps required
|
|
38
|
+
- **Impact analysis integration**: `--impacted-only` flag for targeted testing
|
|
39
|
+
- **Smart exit codes**: 0=clean, 1=regressions, 2=config/infra issues
|
|
40
|
+
- **Self-documenting failures**: Always prints copy/paste next step for debugging
|
|
41
|
+
- **GitHub Step Summary integration**: Automatic PR summaries with regressions and artifacts
|
|
42
|
+
|
|
43
|
+
#### CLI — Durable Run History & Diff System
|
|
44
|
+
|
|
45
|
+
- **Run artifact retention**: Timestamped artifacts in `.evalai/runs/run-<runId>.json`
|
|
46
|
+
- **Run index file**: `.evalai/runs/index.json` tracks all runs with metadata
|
|
47
|
+
- **Schema versioning**: `RunResult` and `DiffResult` include `schemaVersion` for compatibility
|
|
48
|
+
- **Base/head shortcuts**: `--base baseline`, `--base last`, `--head last` for common cases
|
|
49
|
+
- **Floating point normalization**: Consistent score/delta calculations across runs
|
|
50
|
+
- **Comprehensive diff comparison**: Classifies regressions, improvements, added, removed specs
|
|
51
|
+
|
|
52
|
+
#### CLI — Centralized Architecture
|
|
53
|
+
|
|
54
|
+
- **Environment detection**: `isCI()`, `isGitHubActions()`, `getGitHubStepSummaryPath()` unified
|
|
55
|
+
- **Workspace resolution**: `resolveEvalWorkspace()` provides all `.evalai` paths
|
|
56
|
+
- **Git reference detection**: Comprehensive patterns for branches, tags, and ranges
|
|
57
|
+
- **No more duplication**: All commands use shared utilities for consistency
|
|
58
|
+
|
|
59
|
+
#### CLI — CI Friendliness
|
|
60
|
+
|
|
61
|
+
- **Fail-safe base resolution**: Clear error messages when base artifacts missing in CI
|
|
62
|
+
- **GitHub Step Summary**: Rich markdown summaries with metrics, regressions, and artifact links
|
|
63
|
+
- **CI-specific error handling**: Exit code 2 for config issues with helpful guidance
|
|
64
|
+
- **Artifact download instructions**: Exact commands for manual base artifact setup
|
|
65
|
+
|
|
66
|
+
### 🔧 Changed
|
|
67
|
+
|
|
68
|
+
- **Exit codes standardized**: 0=clean, 1=regressions, 2=config/infra issues across all commands
|
|
69
|
+
- **Schema compatibility**: Added `schemaVersion` validation for future-proofing
|
|
70
|
+
- **Path resolution**: All commands use centralized workspace helpers
|
|
71
|
+
- **Error messages**: More actionable and context-aware guidance
|
|
72
|
+
|
|
73
|
+
### 📊 New Features Summary
|
|
74
|
+
|
|
75
|
+
- **One-command CI**: `evalai ci` replaces multi-step workflows
|
|
76
|
+
- **Durable history**: Run artifacts preserved with smart indexing
|
|
77
|
+
- **Smart diffing**: Automated regression detection with GitHub integration
|
|
78
|
+
- **Centralized utilities**: Environment detection and workspace resolution unified
|
|
79
|
+
- **Self-documenting**: Clear next steps for any failure scenario
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## [1.8.0] - 2026-02-26
|
|
84
|
+
|
|
85
|
+
### ✨ Added
|
|
86
|
+
|
|
87
|
+
#### CLI — `evalai doctor` Rewrite (Comprehensive Checklist)
|
|
88
|
+
|
|
89
|
+
- **9 itemized checks** with pass/fail/warn/skip status and exact remediation commands:
|
|
90
|
+
1. Project detection (package.json + lockfile + package manager)
|
|
91
|
+
2. Config file validity (evalai.config.json)
|
|
92
|
+
3. Baseline file (evals/baseline.json — schema, staleness)
|
|
93
|
+
4. Authentication (API key presence, redacted display)
|
|
94
|
+
5. Evaluation target (evaluationId configured)
|
|
95
|
+
6. API connectivity (reachable, latency)
|
|
96
|
+
7. Evaluation access (permissions, baseline presence)
|
|
97
|
+
8. CI wiring (.github/workflows/evalai-gate.yml)
|
|
98
|
+
9. Provider env vars (OpenAI/Anthropic/Azure — optional)
|
|
99
|
+
- **Exit codes**: `0` ready, `2` not ready, `3` infrastructure error
|
|
100
|
+
- **`--report`** flag outputs full JSON diagnostic bundle (versions, hashes, latency, all checks)
|
|
101
|
+
- **`--format json`** for machine-readable output
|
|
102
|
+
|
|
103
|
+
#### CLI — `evalai explain` (New Command)
|
|
104
|
+
|
|
105
|
+
- **Offline report explainer** — reads `.evalai/last-report.json` or `evals/regression-report.json` with zero flags
|
|
106
|
+
- **Top 3 failing test cases** with input/expected/actual
|
|
107
|
+
- **What changed** — baseline vs current with directional indicators
|
|
108
|
+
- **Root cause classification**: prompt drift, retrieval drift, formatting drift, tool-use drift, safety/cost/latency regression, coverage drop, baseline stale
|
|
109
|
+
- **Prioritized suggested fixes** with actionable commands
|
|
110
|
+
- Works with both `evalai check` reports (CheckReport) and `evalai gate` reports (BuiltinReport)
|
|
111
|
+
- **`--format json`** for CI pipeline consumption
|
|
112
|
+
|
|
113
|
+
#### Guided Failure Flow
|
|
114
|
+
|
|
115
|
+
- **`evalai check` now writes `.evalai/last-report.json`** automatically after every run
|
|
116
|
+
- **Failure hint**: prints `Next: evalai explain` on gate failure
|
|
117
|
+
- **GitHub step summary**: adds tip about `evalai explain` and report artifact location on failure
|
|
118
|
+
|
|
119
|
+
#### CI Template Improvements
|
|
120
|
+
|
|
121
|
+
- **Doctor preflight step** added to generated workflow (`continue-on-error: true`)
|
|
122
|
+
- **Report artifact upload** now includes both `evals/regression-report.json` and `.evalai/last-report.json`
|
|
123
|
+
|
|
124
|
+
#### `evalai init` Output Updated
|
|
125
|
+
|
|
126
|
+
- First recommendation: `npx evalai doctor` (verify setup)
|
|
127
|
+
- Full command reference: doctor, gate, check, explain, baseline update
|
|
128
|
+
|
|
129
|
+
#### CLI — `evalai print-config` (New Command)
|
|
130
|
+
|
|
131
|
+
- **Resolved config viewer** — prints every config field with its current value
|
|
132
|
+
- **Source-of-truth annotations**: `[file]`, `[env]`, `[default]`, `[profile]`, `[arg]` for each field
|
|
133
|
+
- **Secrets redacted** — API keys shown as `sk_t...abcd`
|
|
134
|
+
- **Environment summary** — shows all relevant env vars (EVALAI_API_KEY, OPENAI_API_KEY, CI, etc.)
|
|
135
|
+
- **`--format json`** for machine-readable output
|
|
136
|
+
- Accepts `--evaluationId`, `--baseUrl`, etc. to show how CLI args would merge
|
|
137
|
+
|
|
138
|
+
#### Minimal Green Example
|
|
139
|
+
|
|
140
|
+
- **`examples/minimal-green/`** — passes on first run, no account needed
|
|
141
|
+
- Zero dependencies, 3 `node:test` tests
|
|
142
|
+
- Clone → init → doctor → gate → ✅
|
|
143
|
+
|
|
144
|
+
### 🔧 Changed
|
|
145
|
+
|
|
146
|
+
- `evalai doctor` exit codes changed: was `0`/`1`, now `0`/`2`/`3`
|
|
147
|
+
- SDK README: added Debugging & Diagnostics section with guided flow diagram
|
|
148
|
+
- SDK README: added Doctor Exit Codes table
|
|
149
|
+
- Doctor test count: 4 → 29 tests; added 9 explain tests (38 total new tests)
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## [1.7.0] - 2026-02-25
|
|
154
|
+
|
|
155
|
+
### ✨ Added
|
|
156
|
+
|
|
157
|
+
#### CLI — `evalai init` Full Project Scaffolder
|
|
158
|
+
|
|
159
|
+
- **`evalai init`** — Zero-to-gate in under 5 minutes:
|
|
160
|
+
- Detects Node repo + package manager (npm/yarn/pnpm)
|
|
161
|
+
- Runs existing tests to capture real pass/fail + test count
|
|
162
|
+
- Creates `evals/baseline.json` with provenance metadata
|
|
163
|
+
- Installs `.github/workflows/evalai-gate.yml` (package-manager aware)
|
|
164
|
+
- Creates `evalai.config.json`
|
|
165
|
+
- Prints copy-paste next steps — just commit and push
|
|
166
|
+
- Idempotent: skips files that already exist
|
|
167
|
+
|
|
168
|
+
#### CLI — `evalai upgrade --full` (Tier 1 → Tier 2)
|
|
169
|
+
|
|
170
|
+
- **`evalai upgrade --full`** — Upgrade from built-in gate to full gate:
|
|
171
|
+
- Creates `scripts/regression-gate.ts` (full gate with `--update-baseline`)
|
|
172
|
+
- Adds `eval:regression-gate` + `eval:baseline-update` npm scripts
|
|
173
|
+
- Creates `.github/workflows/baseline-governance.yml` (label + diff enforcement)
|
|
174
|
+
- Upgrades `evalai-gate.yml` to project mode
|
|
175
|
+
- Adds `CODEOWNERS` entry for `evals/baseline.json`
|
|
176
|
+
|
|
177
|
+
#### Gate Output — Machine-Readable Improvements
|
|
178
|
+
|
|
179
|
+
- **`detectRunner()`** — Identifies test runner from `package.json` scripts: vitest, jest, mocha, node:test, ava, tap, or unknown
|
|
180
|
+
- **BuiltinReport** now always emits: `durationMs`, `command`, `runner`, `baseline` metadata
|
|
181
|
+
- Report schema updated with optional `durationMs`, `command`, `runner` properties
|
|
182
|
+
|
|
183
|
+
#### Init Scaffolder Integration Tests
|
|
184
|
+
|
|
185
|
+
- 4 fixtures: npm+jest, pnpm+vitest, yarn+jest, pnpm monorepo
|
|
186
|
+
- 25 tests: files created, YAML valid, pm-aware workflow, idempotent runs
|
|
187
|
+
- All fixtures use `node:test` (zero external deps)
|
|
188
|
+
|
|
189
|
+
### 🔧 Changed
|
|
190
|
+
|
|
191
|
+
- CLI help text updated to include `upgrade` command
|
|
192
|
+
- Gate report includes runner detection and timing metadata
|
|
193
|
+
- SDK test count: 147 → 172 tests (12 → 15 contract tests)
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## [1.6.0] - 2026-02-24
|
|
198
|
+
|
|
199
|
+
### ✨ Added
|
|
200
|
+
|
|
201
|
+
#### CLI — Regression Gate & Baseline Management
|
|
202
|
+
|
|
203
|
+
- **`evalai baseline init`** — Create a starter `evals/baseline.json` with sample values and provenance metadata
|
|
204
|
+
- **`evalai baseline update`** — Run confidence tests, golden eval, and latency benchmark, then update baseline with real scores
|
|
205
|
+
- **`evalai gate`** — Run the local regression gate with proper exit code taxonomy (0=pass, 1=regression, 2=infra_error, 3=confidence_failed, 4=confidence_missing)
|
|
206
|
+
- **`evalai gate --format json`** — Output `evals/regression-report.json` as machine-readable JSON to stdout
|
|
207
|
+
- **`evalai gate --format github`** — Output GitHub Step Summary markdown with delta table
|
|
208
|
+
|
|
209
|
+
#### SDK Exports — Regression Gate Constants & Types
|
|
210
|
+
|
|
211
|
+
- **`GATE_EXIT`** — Exit code constants (`PASS`, `REGRESSION`, `INFRA_ERROR`, `CONFIDENCE_FAILED`, `CONFIDENCE_MISSING`)
|
|
212
|
+
- **`GATE_CATEGORY`** — Report category constants (`pass`, `regression`, `infra_error`)
|
|
213
|
+
- **`REPORT_SCHEMA_VERSION`** — Current schema version for `regression-report.json`
|
|
214
|
+
- **`ARTIFACTS`** — Well-known artifact paths (`BASELINE`, `REGRESSION_REPORT`, `CONFIDENCE_SUMMARY`, `LATENCY_BENCHMARK`)
|
|
215
|
+
- **Types**: `RegressionReport`, `RegressionDelta`, `Baseline`, `BaselineTolerance`, `GateExitCode`, `GateCategory`
|
|
216
|
+
- **Subpath export**: `@pauly4010/evalai-sdk/regression` for tree-shakeable imports
|
|
217
|
+
|
|
218
|
+
### 🔧 Changed
|
|
219
|
+
|
|
220
|
+
- CLI help text updated to include `baseline` and `gate` commands
|
|
221
|
+
- SDK becomes the public contract for regression gate — scripts are implementation detail
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## [1.5.8] - 2026-02-22
|
|
226
|
+
|
|
227
|
+
### 🐛 Fixed
|
|
228
|
+
|
|
229
|
+
- **secureRoute TypeScript overload compatibility** — Fixed implementation signature to use `ctx: any` for proper overload compatibility
|
|
230
|
+
- **Test infrastructure fixes** — Replaced invalid `expect.unknown()` with `expect.any()` across test files
|
|
231
|
+
- **NextRequest constructor** — Fixed test mocks using incorrect `(NextRequest as any)()` syntax
|
|
232
|
+
- **304 response handling** — Fixed exports API returning invalid 304 response with body
|
|
233
|
+
- **Error catalog tests** — Updated test expectations to match actual EvalAIError behavior
|
|
234
|
+
- **Redis cache timeout** — Added explicit timeout to prevent test hangs
|
|
235
|
+
|
|
236
|
+
### 🔧 Changed
|
|
237
|
+
|
|
238
|
+
- **Biome formatting** — Applied consistent line endings across 199 files
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## [1.5.7] - 2026-02-20
|
|
243
|
+
|
|
244
|
+
### 📚 Documentation
|
|
245
|
+
|
|
246
|
+
- **Version bump** — Updated documentation to reflect v1.5.6 changes including CJS compatibility
|
|
247
|
+
- **README consistency** — Aligned version references across CLI section and changelog
|
|
248
|
+
- **Environment support** — Added CJS/ESM compatibility to supported features list
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## [1.5.6] - 2026-02-19
|
|
253
|
+
|
|
254
|
+
### 🔧 Changed
|
|
255
|
+
|
|
256
|
+
- **CJS compatibility** — Added `require` entries for all subpath exports (`./assertions`, `./testing`, `./integrations/*`, `./matchers`). CJS consumers no longer need custom resolve configuration.
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## [1.5.5] - 2026-02-19
|
|
261
|
+
|
|
262
|
+
### ✨ Added
|
|
263
|
+
|
|
264
|
+
#### Gate semantics (PASS / WARN / FAIL)
|
|
265
|
+
|
|
266
|
+
- **`--warnDrop <n>`** — Introduce a WARN band when score drops > `warnDrop` but < `maxDrop`
|
|
267
|
+
- **Gate verdicts:** PASS, WARN, FAIL
|
|
268
|
+
- **Profiles:** `strict` (warnDrop: 0), `balanced` (warnDrop: 1), `fast` (warnDrop: 2)
|
|
269
|
+
- **`--fail-on-flake`** — Fail the gate if any case is flagged as flaky (partial pass rate across determinism runs)
|
|
270
|
+
|
|
271
|
+
#### Determinism & flake intelligence
|
|
272
|
+
|
|
273
|
+
- **Adaptive variance thresholds** — Determinism audit passes if `absVariance ≤ 5` OR `relVariance ≤ 2%`
|
|
274
|
+
- **Per-case variance reporting** — Reports per-case pass rate across N runs and flags `[FLAKY]` cases
|
|
275
|
+
- **Golden dataset regression** — Added `evals/golden` with `pnpm eval:golden` to prevent semantic regressions
|
|
276
|
+
- **Golden drift output** — Writes `evals/golden/golden-results.json` with `currentScore`, `baselineScore`, `delta`, `passed`, and timestamps
|
|
277
|
+
|
|
278
|
+
#### CI audits & workflows
|
|
279
|
+
|
|
280
|
+
- **Nightly audits** — Added `audit-nightly.yml` for determinism + performance budgets (skips without `OPENAI_API_KEY`)
|
|
281
|
+
- **SDK compatibility matrix** — Added `sdk-compat.yml` to validate older SDK versions against current API
|
|
282
|
+
- **New audits:** `audit:retention`, `audit:migrations`, `audit:performance`, `audit:determinism`
|
|
283
|
+
|
|
284
|
+
#### Platform safety & governance (docs + proofs)
|
|
285
|
+
|
|
286
|
+
- **Audit trail docs** — Added `docs/audit-trail.md`
|
|
287
|
+
- **Observability docs** — Added `docs/observability.md` (log schema + requestId)
|
|
288
|
+
- **Retention docs** — Added `docs/data-retention.md`
|
|
289
|
+
- **Migration safety docs** — Added `docs/migration-safety.md`
|
|
290
|
+
- **Adoption benchmark** — Added `docs/adoption-benchmark.md`
|
|
291
|
+
- **Examples** — Added real-world example suites (RAG regression + agent tool-use)
|
|
292
|
+
|
|
293
|
+
### 🔧 Changed
|
|
294
|
+
|
|
295
|
+
- **Exit codes updated** — 0=pass, **8=warn**, failures remain as documented for score/regression/policy/API/config
|
|
296
|
+
- **GitHub + human formatters** — Render WARN state, top contributors, and flake indicators where available
|
|
297
|
+
- **Rate limiting** — Adds `Retry-After` header on 429 responses
|
|
298
|
+
- **RequestId propagation** — `EvalAIError` surfaces `requestId` from response body or `x-request-id` header
|
|
299
|
+
|
|
300
|
+
### 🧪 Testing
|
|
301
|
+
|
|
302
|
+
- Added tests for:
|
|
303
|
+
- access boundaries (no tenant info leak)
|
|
304
|
+
- rate-limit abuse patterns + `Retry-After`
|
|
305
|
+
- executor failure modes (timeouts / upstream 429 / malformed responses)
|
|
306
|
+
- error catalog stability + graceful handling of unknown codes
|
|
307
|
+
- exports contract (retention visibility, 410 semantics)
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## [1.5.0] - 2026-02-18
|
|
312
|
+
|
|
313
|
+
### ✨ Added
|
|
314
|
+
|
|
315
|
+
#### evalai CLI — CI DevX
|
|
316
|
+
|
|
317
|
+
- **`--format github`** — GitHub Actions annotations + step summary (`$GITHUB_STEP_SUMMARY`)
|
|
318
|
+
- **`--format json`** — Machine-readable output only
|
|
319
|
+
- **`--onFail import`** — On gate failure, import run metadata + failures to dashboard (idempotent per CI run)
|
|
320
|
+
- **`--explain`** — Show score breakdown (contribPts) and thresholds
|
|
321
|
+
- **`evalai doctor`** — Verify CI setup (config, API key, quality endpoint, baseline)
|
|
322
|
+
- **Pinned CLI invocation** — Use `npx -y @pauly4010/evalai-sdk@^1` for stable CI (avoids surprise v2 breaks)
|
|
323
|
+
|
|
324
|
+
#### Documentation
|
|
325
|
+
|
|
326
|
+
- **README** — 3-section adoption flow: 60s local → optional CI gate → no lock-in
|
|
327
|
+
- **Init output** — Shows path written, pinned snippet with `--format github --onFail import`
|
|
328
|
+
- **openAIChatEval** — "Gate this in CI" hint uses pinned invocation
|
|
329
|
+
|
|
330
|
+
### 🔧 Changed
|
|
331
|
+
|
|
332
|
+
- **evalai init** — Output: "Wrote evalai.config.json at {path}", one next step, uninstall line
|
|
333
|
+
- **Baseline missing** — Treated as config failure (BAD_ARGS), not API error
|
|
334
|
+
- **parseArgs** — Returns `{ ok, args }` or `{ ok: false }` (no `process.exit` inside) for testability
|
|
335
|
+
|
|
336
|
+
### 📦 Internal
|
|
337
|
+
|
|
338
|
+
- Refactored `check.ts` into modules: `api.ts`, `gate.ts`, `report/build-check-report.ts`, `formatters/`
|
|
339
|
+
- Deterministic helpers: `truncateSnippet`, `sortFailedCases`
|
|
340
|
+
- Formatter tests: `json.test.ts`, `github.test.ts`
|
|
341
|
+
- Doctor tests: `doctor.test.ts`
|
|
342
|
+
|
|
343
|
+
---
|
|
344
|
+
|
|
345
|
+
## [1.4.1] - 2026-02-18
|
|
346
|
+
|
|
347
|
+
### ✨ Added
|
|
348
|
+
|
|
349
|
+
- **evalai check `--baseline production`** — Compare against latest run tagged with `environment=prod`
|
|
350
|
+
- **Baseline missing handling** — Clear failure when baseline not found and comparison requested
|
|
351
|
+
|
|
352
|
+
### 🔧 Changed
|
|
353
|
+
|
|
354
|
+
- **Package hardening** — `files`, `module`, `sideEffects: false` for leaner npm publish
|
|
355
|
+
- **CLI** — Passes `baseline` param to quality API for deterministic CI gates
|
|
356
|
+
|
|
357
|
+
## [1.3.0] - 2025-10-21
|
|
358
|
+
|
|
359
|
+
### ✨ Added
|
|
360
|
+
|
|
361
|
+
#### Performance Optimizations
|
|
362
|
+
|
|
363
|
+
- **Client-side Request Caching**: Automatic caching of GET requests with smart TTL
|
|
364
|
+
- Configurable cache size via `config.cacheSize` (default: 1000 entries)
|
|
365
|
+
- Automatic cache invalidation on mutations (POST/PUT/DELETE/PATCH)
|
|
366
|
+
- Intelligent TTL based on data type (automatic)
|
|
367
|
+
- Cache hit/miss logging in debug mode
|
|
368
|
+
- Advanced: Manual cache control available via `RequestCache` class for power users
|
|
369
|
+
|
|
370
|
+
- **Cursor-based Pagination**: Modern pagination utilities for efficient data fetching
|
|
371
|
+
- `PaginatedIterator` class for easy iteration over all pages
|
|
372
|
+
- `autoPaginate()` async generator for streaming individual items
|
|
373
|
+
- `encodeCursor()` / `decodeCursor()` for pagination state management
|
|
374
|
+
- `createPaginationMeta()` helper for response metadata
|
|
375
|
+
- Works in both Node.js and browser environments
|
|
376
|
+
|
|
377
|
+
- **Request Batching**: Combine multiple API requests for better performance
|
|
378
|
+
- Configurable batch size via `config.batchSize` (default: 10)
|
|
379
|
+
- Configurable batch delay via `config.batchDelay` (default: 50ms)
|
|
380
|
+
- Automatic batching for compatible endpoints
|
|
381
|
+
- `RequestBatcher` class for custom batching logic
|
|
382
|
+
- Reduces network overhead by 50-80% for bulk operations
|
|
383
|
+
|
|
384
|
+
- **Connection Pooling**: HTTP keep-alive for connection reuse
|
|
385
|
+
- Enable via `config.keepAlive` option (default: true)
|
|
386
|
+
- Reduces connection overhead for sequential requests
|
|
387
|
+
- Improves performance for high-frequency API usage
|
|
388
|
+
|
|
389
|
+
- **Enhanced Retry Logic**: Already had exponential backoff, now fully configurable
|
|
390
|
+
- Choose between 'exponential', 'linear', or 'fixed' backoff strategies
|
|
391
|
+
- Configure retry attempts via `config.retry.maxAttempts`
|
|
392
|
+
- Customize retryable error codes
|
|
393
|
+
|
|
394
|
+
#### Developer Experience
|
|
395
|
+
|
|
396
|
+
- **Comprehensive Examples**: New example files with real-world usage patterns
|
|
397
|
+
- `examples/performance-optimization.ts`: All performance features demonstrated
|
|
398
|
+
- `examples/complete-workflow.ts`: End-to-end SDK usage guide
|
|
399
|
+
- Examples show caching, batching, pagination, and combined optimizations
|
|
400
|
+
|
|
401
|
+
- **New Configuration Options**:
|
|
402
|
+
```typescript
|
|
403
|
+
new AIEvalClient({
|
|
404
|
+
enableCaching: true, // Enable request caching
|
|
405
|
+
cacheSize: 1000, // Max cache entries
|
|
406
|
+
enableBatching: true, // Enable request batching
|
|
407
|
+
batchSize: 10, // Requests per batch
|
|
408
|
+
batchDelay: 50, // ms to wait before processing batch
|
|
409
|
+
keepAlive: true, // Enable connection pooling
|
|
410
|
+
});
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
### 🔧 Changed
|
|
414
|
+
|
|
415
|
+
- Updated `ClientConfig` interface with performance options
|
|
416
|
+
- Enhanced `request()` method with automatic caching and invalidation
|
|
417
|
+
- Improved TypeScript types for pagination utilities
|
|
418
|
+
- SDK description updated to reflect performance optimizations
|
|
419
|
+
|
|
420
|
+
### 📚 Documentation
|
|
421
|
+
|
|
422
|
+
- Added detailed performance optimization guide
|
|
423
|
+
- Created complete workflow documentation
|
|
424
|
+
- Updated README with new features and configuration options
|
|
425
|
+
- Added JSDoc comments for all new utilities
|
|
426
|
+
|
|
427
|
+
### 🚀 Performance Improvements
|
|
428
|
+
|
|
429
|
+
- **50-80% reduction** in network requests through batching
|
|
430
|
+
- **30-60% faster** repeated queries through caching
|
|
431
|
+
- **20-40% lower** latency for sequential requests through connection pooling
|
|
432
|
+
- **Automatic optimization** with zero code changes (backward compatible)
|
|
433
|
+
|
|
434
|
+
## [1.2.2] - 2025-10-20
|
|
435
|
+
|
|
436
|
+
### 🐛 Fixed
|
|
437
|
+
|
|
438
|
+
#### Additional Browser Compatibility Fixes
|
|
439
|
+
|
|
440
|
+
- **process.env Access**: Added safe `getEnvVar()` helper function for browser compatibility
|
|
441
|
+
- Client constructor now works in browsers without `process.env`
|
|
442
|
+
- `AIEvalClient.init()` now safe in browsers
|
|
443
|
+
- Falls back gracefully when environment variables are not available
|
|
444
|
+
- **Type Name Collision**: Renamed test suite types to avoid confusion
|
|
445
|
+
- `TestCase` → `TestSuiteCase` (for test suite definitions)
|
|
446
|
+
- `TestCaseResult` → `TestSuiteCaseResult`
|
|
447
|
+
- Legacy type aliases provided for backward compatibility
|
|
448
|
+
- API `TestCase` type (from types.ts) remains unchanged
|
|
449
|
+
- Removed duplicate `TestCase` export from main index to prevent TypeScript errors
|
|
450
|
+
|
|
451
|
+
#### TypeScript Compilation Fixes
|
|
452
|
+
|
|
453
|
+
- **AsyncLocalStorage Type Error**: Fixed `TS2347` error in `context.ts`
|
|
454
|
+
- Removed generic type argument from dynamically required `AsyncLocalStorage`
|
|
455
|
+
- Now compiles without errors in strict mode
|
|
456
|
+
- **Duplicate Identifier**: Fixed `TS2300` error for `TestCase` in `index.ts`
|
|
457
|
+
- Resolved export collision between test suite and API types
|
|
458
|
+
- Use `TestSuiteCase` for test definitions, `TestCase` for API responses
|
|
459
|
+
|
|
460
|
+
### 📚 Documentation
|
|
461
|
+
|
|
462
|
+
- Updated `AIEvalClient.init()` JSDoc with browser usage examples
|
|
463
|
+
- Added deprecation notices for legacy test suite type names
|
|
464
|
+
- Clarified environment variable behavior (Node.js only)
|
|
465
|
+
|
|
466
|
+
### 🔄 Migration Notes
|
|
467
|
+
|
|
468
|
+
No breaking changes! Legacy type names are aliased for backward compatibility:
|
|
469
|
+
|
|
470
|
+
- `TestCase` still works (aliased to `TestSuiteCase`)
|
|
471
|
+
- `TestCaseResult` still works (aliased to `TestSuiteCaseResult`)
|
|
472
|
+
|
|
473
|
+
**Recommended**: Update to new type names to avoid future deprecation:
|
|
474
|
+
|
|
475
|
+
```typescript
|
|
476
|
+
// OLD (still works, but deprecated)
|
|
477
|
+
import { TestCase } from "@pauly4010/evalai-sdk";
|
|
478
|
+
|
|
479
|
+
// NEW (recommended)
|
|
480
|
+
import { TestSuiteCase } from "@pauly4010/evalai-sdk";
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
---
|
|
484
|
+
|
|
485
|
+
## [1.2.1] - 2025-01-20
|
|
486
|
+
|
|
487
|
+
### 🐛 Fixed
|
|
488
|
+
|
|
489
|
+
#### Critical Bug Fixes
|
|
490
|
+
|
|
491
|
+
- **CLI Import Paths**: Fixed imports in CLI to use compiled paths (`../client.js`) instead of source paths (`../src/client`)
|
|
492
|
+
- **Duplicate Traces**: Fixed OpenAI and Anthropic integrations creating duplicate trace entries. Now creates a single trace with the final status
|
|
493
|
+
- **Commander.js Syntax**: Fixed invalid nested command structure (`eval` -> `run` to `eval:run`)
|
|
494
|
+
- **Context System Browser Compatibility**: Replaced Node.js-only `AsyncLocalStorage` with environment-aware implementation
|
|
495
|
+
- Node.js: Uses `AsyncLocalStorage` for true async context propagation
|
|
496
|
+
- Browser: Uses stack-based approach with helpful limitations documented
|
|
497
|
+
- **Path Traversal Security**: Added comprehensive security checks to snapshot path sanitization
|
|
498
|
+
- Prevents empty names
|
|
499
|
+
- Prevents path traversal attacks (`../`)
|
|
500
|
+
- Validates paths stay within snapshot directory
|
|
501
|
+
- Sanitizes to alphanumeric, hyphens, and underscores only
|
|
502
|
+
|
|
503
|
+
#### Developer Experience Improvements
|
|
504
|
+
|
|
505
|
+
- **Environment Detection**: Added runtime checks for Node.js-only features
|
|
506
|
+
- `snapshot.ts` - Throws helpful error in browsers
|
|
507
|
+
- `local.ts` - Throws helpful error in browsers
|
|
508
|
+
- `context.ts` - Gracefully degrades in browsers
|
|
509
|
+
- **Empty Exports Removed**: Removed misleading empty `StreamingClient` and `BatchClient` objects
|
|
510
|
+
- Now exports actual implementations: `batchProcess`, `streamEvaluation`, `batchRead`, `RateLimiter`
|
|
511
|
+
- **Error Handling**: Integration wrappers now catch and ignore trace creation errors to avoid masking original errors
|
|
512
|
+
|
|
513
|
+
### 📦 Changed
|
|
514
|
+
|
|
515
|
+
#### Dependencies
|
|
516
|
+
|
|
517
|
+
- **Updated**: `commander` from `^12.0.0` to `^14.0.0`
|
|
518
|
+
- **Added**: Peer dependencies (optional)
|
|
519
|
+
- `openai`: `^4.0.0`
|
|
520
|
+
- `@anthropic-ai/sdk`: `^0.20.0`
|
|
521
|
+
- **Added**: Node.js engine requirement `>=16.0.0`
|
|
522
|
+
|
|
523
|
+
#### Package Metadata
|
|
524
|
+
|
|
525
|
+
- **Version**: Bumped to `1.2.1`
|
|
526
|
+
- **Keywords**: Added `openai` and `anthropic`
|
|
527
|
+
|
|
528
|
+
### 📚 Documentation
|
|
529
|
+
|
|
530
|
+
#### README Updates
|
|
531
|
+
|
|
532
|
+
- **Environment Support Section**: New section clarifying Node.js vs Browser features
|
|
533
|
+
- ✅ Works Everywhere: Core APIs, assertions, test suites
|
|
534
|
+
- 🟡 Node.js Only: Snapshots, local storage, CLI, file exports
|
|
535
|
+
- 🔄 Context: Full support in Node.js, basic in browsers
|
|
536
|
+
- **Changelog**: Updated with v1.2.1 fixes
|
|
537
|
+
- **Installation**: Unchanged
|
|
538
|
+
- **Examples**: All existing examples remain valid
|
|
539
|
+
|
|
540
|
+
#### Code Documentation
|
|
541
|
+
|
|
542
|
+
- Added JSDoc warnings to Node.js-only modules
|
|
543
|
+
- Added inline comments explaining environment checks
|
|
544
|
+
- Updated integration examples to reflect single-trace behavior
|
|
545
|
+
|
|
546
|
+
### 🔒 Security
|
|
547
|
+
|
|
548
|
+
- **Path Traversal Prevention**: Multiple layers of validation in snapshot system
|
|
549
|
+
- **Input Sanitization**: Comprehensive name validation before filesystem operations
|
|
550
|
+
- **Directory Boundary Enforcement**: Prevents writing outside designated directories
|
|
551
|
+
|
|
552
|
+
### ⚡ Performance
|
|
553
|
+
|
|
554
|
+
- **Reduced API Calls**: Integration wrappers now make 1 trace call instead of 2
|
|
555
|
+
- **Faster Errors**: Environment checks happen at module load time
|
|
556
|
+
|
|
557
|
+
### 🔄 Migration Guide from 1.2.0 to 1.2.1
|
|
558
|
+
|
|
559
|
+
#### No Breaking Changes! ✅
|
|
560
|
+
|
|
561
|
+
All fixes are backward compatible. However, you may notice:
|
|
562
|
+
|
|
563
|
+
1. **Integration Tracing**: You'll see fewer trace entries (1 per call instead of 2)
|
|
564
|
+
- **Before**: `pending` trace → `success` trace (2 entries)
|
|
565
|
+
- **After**: `success` trace (1 entry)
|
|
566
|
+
|
|
567
|
+
2. **CLI Command**: Use `evalai eval:run` instead of `evalai eval run`
|
|
568
|
+
- Old syntax will fail, update your scripts
|
|
569
|
+
|
|
570
|
+
3. **Browser Usage**: Node.js-only features now throw helpful errors
|
|
571
|
+
|
|
572
|
+
```javascript
|
|
573
|
+
// In browser:
|
|
574
|
+
import { snapshot } from "@pauly4010/evalai-sdk";
|
|
575
|
+
snapshot("test", "name"); // ❌ Throws: "Snapshot testing requires Node.js..."
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
4. **Context in Browsers**: Limited async propagation
|
|
579
|
+
```javascript
|
|
580
|
+
// Works in both Node.js and browser, but browser has limitations
|
|
581
|
+
await withContext({ userId: "123" }, async () => {
|
|
582
|
+
await client.traces.create({ name: "test" });
|
|
583
|
+
// Node.js: ✅ Full context propagation
|
|
584
|
+
// Browser: ⚠️ Basic context, not safe across all async boundaries
|
|
585
|
+
});
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
#### Recommended Actions
|
|
589
|
+
|
|
590
|
+
1. **Update CLI scripts** if using `evalai eval run`
|
|
591
|
+
2. **Test browser builds** if using SDK in browsers
|
|
592
|
+
3. **Review trace counts** if you have monitoring based on trace volume
|
|
593
|
+
4. **Update dependencies**: Run `npm update @pauly4010/evalai-sdk`
|
|
594
|
+
|
|
595
|
+
### 🧪 Testing
|
|
596
|
+
|
|
597
|
+
All fixes have been:
|
|
598
|
+
|
|
599
|
+
- ✅ Syntax validated
|
|
600
|
+
- ✅ Import paths verified
|
|
601
|
+
- ✅ Security tests for path traversal
|
|
602
|
+
- ✅ Environment detection tested
|
|
603
|
+
- ✅ No linting errors
|
|
604
|
+
|
|
605
|
+
---
|
|
606
|
+
|
|
607
|
+
## [1.2.0] - 2025-10-15
|
|
608
|
+
|
|
609
|
+
### 🎉 Added
|
|
610
|
+
|
|
611
|
+
- **100% API Coverage** - All backend endpoints now supported
|
|
612
|
+
- **Annotations API** - Complete human-in-the-loop evaluation
|
|
613
|
+
- **Developer API** - Full API key and webhook management
|
|
614
|
+
- **LLM Judge Extended** - Enhanced judge capabilities
|
|
615
|
+
- **Organizations API** - Organization details access
|
|
616
|
+
- **Enhanced Types** - 40+ new TypeScript interfaces
|
|
617
|
+
|
|
618
|
+
---
|
|
619
|
+
|
|
620
|
+
## [1.1.0] - 2025-01-10
|
|
621
|
+
|
|
622
|
+
### ✨ Added
|
|
623
|
+
|
|
624
|
+
- Comprehensive evaluation template types
|
|
625
|
+
- Organization resource limits tracking
|
|
626
|
+
- `getOrganizationLimits()` method
|
|
627
|
+
|
|
628
|
+
---
|
|
629
|
+
|
|
630
|
+
## [1.0.0] - 2025-01-01
|
|
631
|
+
|
|
632
|
+
### 🎉 Initial Release
|
|
633
|
+
|
|
634
|
+
- Traces, Evaluations, LLM Judge APIs
|
|
635
|
+
- Framework integrations (OpenAI, Anthropic)
|
|
636
|
+
- Test suite builder
|
|
637
|
+
- Context propagation
|
|
638
|
+
- Error handling & retries
|