@pauly4010/evalai-sdk 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +85 -0
- package/README.md +205 -543
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.js +104 -71
- package/dist/batch.js +12 -17
- package/dist/cache.js +7 -11
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +130 -0
- package/dist/cli/check.d.ts +28 -13
- package/dist/cli/check.js +249 -142
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +110 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +207 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +130 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +107 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +101 -0
- package/dist/cli/formatters/types.d.ts +100 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +175 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +67 -23
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +83 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +124 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +83 -0
- package/dist/client.d.ts +2 -2
- package/dist/client.js +144 -132
- package/dist/context.d.ts +1 -1
- package/dist/context.js +4 -6
- package/dist/errors.d.ts +2 -0
- package/dist/errors.js +116 -107
- package/dist/export.d.ts +6 -6
- package/dist/export.js +39 -33
- package/dist/index.d.ts +25 -24
- package/dist/index.js +62 -56
- package/dist/integrations/anthropic.d.ts +1 -1
- package/dist/integrations/anthropic.js +23 -19
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +230 -0
- package/dist/integrations/openai.d.ts +1 -1
- package/dist/integrations/openai.js +23 -19
- package/dist/local.d.ts +2 -2
- package/dist/local.js +25 -25
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +24 -28
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +1 -1
- package/dist/pagination.js +6 -6
- package/dist/snapshot.js +24 -24
- package/dist/streaming.js +11 -11
- package/dist/testing.d.ts +6 -2
- package/dist/testing.js +30 -12
- package/dist/types.d.ts +22 -22
- package/dist/types.js +13 -13
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +7 -7
- package/dist/workflows.js +44 -44
- package/package.json +102 -90
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
package/README.md
CHANGED
|
@@ -1,641 +1,303 @@
|
|
|
1
1
|
# @pauly4010/evalai-sdk
|
|
2
2
|
|
|
3
|
-
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
3
|
+
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
4
4
|
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
**Stop LLM regressions in CI in minutes.**
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
Evaluate locally in 60 seconds. Add an optional CI gate in 2 minutes.
|
|
9
|
+
No lock-in — remove by deleting `evalai.config.json`.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# 🚀 1) 60 seconds: Run locally (no account)
|
|
14
|
+
|
|
15
|
+
Install, run, get a score.
|
|
16
|
+
No EvalAI account. No API key. No dashboard required.
|
|
9
17
|
|
|
10
18
|
```bash
|
|
11
|
-
npm install @pauly4010/evalai-sdk
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
19
|
+
npm install @pauly4010/evalai-sdk openai
|
|
20
|
+
import { openAIChatEval } from "@pauly4010/evalai-sdk";
|
|
21
|
+
|
|
22
|
+
await openAIChatEval({
|
|
23
|
+
name: "chat-regression",
|
|
24
|
+
cases: [
|
|
25
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
26
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
27
|
+
],
|
|
28
|
+
});
|
|
29
|
+
Set your OpenAI API key:
|
|
30
|
+
|
|
31
|
+
OPENAI_API_KEY=...
|
|
32
|
+
✅ Vitest integration (recommended)
|
|
33
|
+
import {
|
|
34
|
+
openAIChatEval,
|
|
35
|
+
extendExpectWithToPassGate,
|
|
36
|
+
} from "@pauly4010/evalai-sdk";
|
|
37
|
+
import { expect } from "vitest";
|
|
38
|
+
|
|
39
|
+
extendExpectWithToPassGate(expect);
|
|
40
|
+
|
|
41
|
+
it("passes gate", async () => {
|
|
42
|
+
const result = await openAIChatEval({
|
|
43
|
+
name: "chat-regression",
|
|
44
|
+
cases: [
|
|
45
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
46
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
47
|
+
],
|
|
48
|
+
});
|
|
17
49
|
|
|
18
|
-
|
|
50
|
+
expect(result).toPassGate();
|
|
51
|
+
});
|
|
52
|
+
Example output
|
|
53
|
+
PASS 2/2 (score: 100)
|
|
19
54
|
|
|
20
|
-
|
|
55
|
+
Tip: Want dashboards and history?
|
|
56
|
+
Set EVALAI_API_KEY and connect this to the platform.
|
|
57
|
+
Failures show:
|
|
21
58
|
|
|
22
|
-
|
|
59
|
+
FAIL 9/10 (score: 90)
|
|
60
|
+
with failed cases and CI guidance.
|
|
23
61
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
- LLM Judge API
|
|
27
|
-
- Annotations API
|
|
28
|
-
- Developer API (API Keys, Webhooks, Usage)
|
|
29
|
-
- Organizations API
|
|
30
|
-
- Assertions Library
|
|
31
|
-
- Test Suites
|
|
32
|
-
- Error Handling
|
|
62
|
+
⚡ 2) Optional: Add a CI gate (2 minutes)
|
|
63
|
+
When you're ready to gate PRs on quality and regressions:
|
|
33
64
|
|
|
34
|
-
|
|
65
|
+
npx -y @pauly4010/evalai-sdk@^1 init
|
|
66
|
+
Create an evaluation in the dashboard and paste its ID into:
|
|
35
67
|
|
|
36
|
-
|
|
68
|
+
{
|
|
69
|
+
"evaluationId": "42"
|
|
70
|
+
}
|
|
71
|
+
Add to your CI:
|
|
37
72
|
|
|
38
|
-
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
-
|
|
73
|
+
- name: EvalAI gate
|
|
74
|
+
env:
|
|
75
|
+
EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}
|
|
76
|
+
run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import --warnDrop 1
|
|
77
|
+
You’ll get:
|
|
42
78
|
|
|
43
|
-
|
|
79
|
+
GitHub annotations
|
|
44
80
|
|
|
45
|
-
|
|
46
|
-
- **Browser**: Basic context support (not safe across all async boundaries)
|
|
81
|
+
Step summary
|
|
47
82
|
|
|
48
|
-
|
|
83
|
+
Optional dashboard link
|
|
49
84
|
|
|
50
|
-
|
|
85
|
+
PASS / WARN / FAIL (v1.5.5)
|
|
86
|
+
EvalAI introduces a WARN band so teams can see meaningful regressions without always blocking merges.
|
|
51
87
|
|
|
52
|
-
|
|
53
|
-
import { AIEvalClient } from "@pauly4010/evalai-sdk";
|
|
88
|
+
Behavior
|
|
54
89
|
|
|
55
|
-
|
|
56
|
-
const client = AIEvalClient.init();
|
|
90
|
+
PASS → within thresholds
|
|
57
91
|
|
|
58
|
-
|
|
59
|
-
const client = new AIEvalClient({
|
|
60
|
-
apiKey: "your-api-key",
|
|
61
|
-
organizationId: 123,
|
|
62
|
-
debug: true,
|
|
63
|
-
});
|
|
64
|
-
```
|
|
92
|
+
WARN → regression > warnDrop but < maxDrop
|
|
65
93
|
|
|
66
|
-
|
|
94
|
+
FAIL → regression > maxDrop
|
|
67
95
|
|
|
68
|
-
|
|
96
|
+
Key flags
|
|
69
97
|
|
|
70
|
-
|
|
98
|
+
--warnDrop → soft regression warning
|
|
71
99
|
|
|
72
|
-
|
|
73
|
-
import { EvaluationTemplates } from "@pauly4010/evalai-sdk";
|
|
100
|
+
--maxDrop → hard regression fail
|
|
74
101
|
|
|
75
|
-
|
|
76
|
-
await client.evaluations.create({
|
|
77
|
-
name: "Prompt Optimization Test",
|
|
78
|
-
type: EvaluationTemplates.PROMPT_OPTIMIZATION,
|
|
79
|
-
createdBy: userId,
|
|
80
|
-
});
|
|
102
|
+
--fail-on-flake → fail if any test is unstable
|
|
81
103
|
|
|
82
|
-
|
|
83
|
-
// Core Testing
|
|
84
|
-
EvaluationTemplates.UNIT_TESTING;
|
|
85
|
-
EvaluationTemplates.OUTPUT_QUALITY;
|
|
86
|
-
|
|
87
|
-
// Advanced Evaluation
|
|
88
|
-
EvaluationTemplates.PROMPT_OPTIMIZATION;
|
|
89
|
-
EvaluationTemplates.CHAIN_OF_THOUGHT;
|
|
90
|
-
EvaluationTemplates.LONG_CONTEXT_TESTING;
|
|
91
|
-
EvaluationTemplates.MODEL_STEERING;
|
|
92
|
-
EvaluationTemplates.REGRESSION_TESTING;
|
|
93
|
-
EvaluationTemplates.CONFIDENCE_CALIBRATION;
|
|
94
|
-
|
|
95
|
-
// Safety & Compliance
|
|
96
|
-
EvaluationTemplates.SAFETY_COMPLIANCE;
|
|
97
|
-
|
|
98
|
-
// Domain-Specific
|
|
99
|
-
EvaluationTemplates.RAG_EVALUATION;
|
|
100
|
-
EvaluationTemplates.CODE_GENERATION;
|
|
101
|
-
EvaluationTemplates.SUMMARIZATION;
|
|
102
|
-
```
|
|
104
|
+
This lets teams tune the signal-to-noise ratio in CI.
|
|
103
105
|
|
|
104
|
-
|
|
106
|
+
🔒 3) No lock-in
|
|
107
|
+
To stop using EvalAI:
|
|
105
108
|
|
|
106
|
-
|
|
109
|
+
rm evalai.config.json
|
|
110
|
+
Your local openAIChatEval runs continue to work exactly the same.
|
|
107
111
|
|
|
108
|
-
|
|
109
|
-
// Get current usage and limits
|
|
110
|
-
const limits = await client.getOrganizationLimits();
|
|
112
|
+
No account cancellation needed. No data export required.
|
|
111
113
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
114
|
+
📦 Installation
|
|
115
|
+
npm install @pauly4010/evalai-sdk openai
|
|
116
|
+
# or
|
|
117
|
+
yarn add @pauly4010/evalai-sdk openai
|
|
118
|
+
# or
|
|
119
|
+
pnpm add @pauly4010/evalai-sdk openai
|
|
120
|
+
🖥️ Environment Support
|
|
121
|
+
This SDK works in both Node.js and browsers, with some Node-only features.
|
|
117
122
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
balance: limits.evals_per_organization?.balance,
|
|
121
|
-
total: limits.evals_per_organization?.included_usage,
|
|
122
|
-
});
|
|
123
|
+
✅ Works Everywhere (Node.js + Browser)
|
|
124
|
+
Traces API
|
|
123
125
|
|
|
124
|
-
|
|
125
|
-
usage: limits.annotations_per_organization?.usage,
|
|
126
|
-
balance: limits.annotations_per_organization?.balance,
|
|
127
|
-
total: limits.annotations_per_organization?.included_usage,
|
|
128
|
-
});
|
|
129
|
-
```
|
|
126
|
+
Evaluations API
|
|
130
127
|
|
|
131
|
-
|
|
128
|
+
LLM Judge API
|
|
132
129
|
|
|
133
|
-
|
|
134
|
-
// Create a trace
|
|
135
|
-
const trace = await client.traces.create({
|
|
136
|
-
name: "User Query",
|
|
137
|
-
traceId: "trace-123",
|
|
138
|
-
metadata: { userId: "456" },
|
|
139
|
-
});
|
|
130
|
+
Annotations API
|
|
140
131
|
|
|
141
|
-
|
|
142
|
-
const traces = await client.traces.list({
|
|
143
|
-
limit: 10,
|
|
144
|
-
status: "success",
|
|
145
|
-
});
|
|
132
|
+
Developer API (API Keys, Webhooks, Usage)
|
|
146
133
|
|
|
147
|
-
|
|
148
|
-
const span = await client.traces.createSpan(trace.id, {
|
|
149
|
-
name: "LLM Call",
|
|
150
|
-
spanId: "span-456",
|
|
151
|
-
startTime: new Date().toISOString(),
|
|
152
|
-
metadata: { model: "gpt-4" },
|
|
153
|
-
});
|
|
154
|
-
```
|
|
134
|
+
Organizations API
|
|
155
135
|
|
|
156
|
-
|
|
136
|
+
Assertions Library
|
|
157
137
|
|
|
158
|
-
|
|
159
|
-
// Create evaluation
|
|
160
|
-
const evaluation = await client.evaluations.create({
|
|
161
|
-
name: "Chatbot Responses",
|
|
162
|
-
type: EvaluationTemplates.OUTPUT_QUALITY,
|
|
163
|
-
description: "Test chatbot response quality",
|
|
164
|
-
createdBy: userId,
|
|
165
|
-
});
|
|
138
|
+
Test Suites
|
|
166
139
|
|
|
167
|
-
|
|
168
|
-
await client.evaluations.createTestCase(evaluation.id, {
|
|
169
|
-
input: "What is the capital of France?",
|
|
170
|
-
expectedOutput: "Paris",
|
|
171
|
-
});
|
|
140
|
+
Error Handling
|
|
172
141
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
status: "running",
|
|
176
|
-
});
|
|
177
|
-
```
|
|
142
|
+
🟡 Node.js Only
|
|
143
|
+
These require Node.js:
|
|
178
144
|
|
|
179
|
-
|
|
145
|
+
Snapshot Testing
|
|
180
146
|
|
|
181
|
-
|
|
182
|
-
// Evaluate with LLM judge
|
|
183
|
-
const result = await client.llmJudge.evaluate({
|
|
184
|
-
configId: 1,
|
|
185
|
-
input: "Translate: Hello world",
|
|
186
|
-
output: "Bonjour le monde",
|
|
187
|
-
metadata: { language: "French" },
|
|
188
|
-
});
|
|
147
|
+
Local Storage Mode
|
|
189
148
|
|
|
190
|
-
|
|
191
|
-
console.log("Reasoning:", result.result.reasoning);
|
|
192
|
-
```
|
|
149
|
+
CLI Tool
|
|
193
150
|
|
|
194
|
-
|
|
151
|
+
Export to File
|
|
195
152
|
|
|
196
|
-
|
|
153
|
+
🔄 Context Propagation
|
|
154
|
+
Node.js: full async context via AsyncLocalStorage
|
|
197
155
|
|
|
198
|
-
|
|
199
|
-
# Required
|
|
200
|
-
EVALAI_API_KEY=your-api-key
|
|
156
|
+
Browser: basic support (not safe across all async boundaries)
|
|
201
157
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
EVALAI_BASE_URL=https://api.example.com
|
|
205
|
-
```
|
|
158
|
+
🧠 AIEvalClient (Platform API)
|
|
159
|
+
import { AIEvalClient } from "@pauly4010/evalai-sdk";
|
|
206
160
|
|
|
207
|
-
|
|
161
|
+
// From env
|
|
162
|
+
const client = AIEvalClient.init();
|
|
208
163
|
|
|
209
|
-
|
|
210
|
-
const
|
|
164
|
+
// Explicit
|
|
165
|
+
const client2 = new AIEvalClient({
|
|
211
166
|
apiKey: "your-api-key",
|
|
212
167
|
organizationId: 123,
|
|
213
|
-
baseUrl: "https://api.example.com",
|
|
214
|
-
timeout: 30000,
|
|
215
168
|
debug: true,
|
|
216
|
-
logLevel: "debug",
|
|
217
|
-
retry: {
|
|
218
|
-
maxAttempts: 3,
|
|
219
|
-
backoff: "exponential",
|
|
220
|
-
retryableErrors: ["RATE_LIMIT_EXCEEDED", "TIMEOUT"],
|
|
221
|
-
},
|
|
222
169
|
});
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
170
|
+
🧪 evalai CLI (v1.5.5)
|
|
171
|
+
The CLI gates deployments on quality, regression, and policy.
|
|
172
|
+
|
|
173
|
+
Quick start
|
|
174
|
+
npx -y @pauly4010/evalai-sdk@^1 check \
|
|
175
|
+
--evaluationId 42 \
|
|
176
|
+
--apiKey $EVALAI_API_KEY
|
|
177
|
+
evalai check
|
|
178
|
+
Option Description
|
|
179
|
+
--evaluationId <id> Required. Evaluation to gate on
|
|
180
|
+
--apiKey <key> API key (or EVALAI_API_KEY)
|
|
181
|
+
--format <fmt> human, json, or github
|
|
182
|
+
--onFail import Import failing run to dashboard
|
|
183
|
+
--explain Show score breakdown
|
|
184
|
+
--minScore <n> Fail if score < n
|
|
185
|
+
--warnDrop <n> Warn if regression exceeds n
|
|
186
|
+
--maxDrop <n> Fail if regression exceeds n
|
|
187
|
+
--minN <n> Fail if test count < n
|
|
188
|
+
--allowWeakEvidence Permit weak evidence
|
|
189
|
+
--policy <name> HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511
|
|
190
|
+
--baseline <mode> published, previous, production
|
|
191
|
+
--fail-on-flake Fail if any case is flaky
|
|
192
|
+
--baseUrl <url> Override API base URL
|
|
193
|
+
|
|
194
|
+
Exit codes
|
|
195
|
+
Code Meaning
|
|
196
|
+
0 PASS
|
|
197
|
+
8 WARN
|
|
198
|
+
1 Score below threshold
|
|
199
|
+
2 Regression failure
|
|
200
|
+
3 Policy violation
|
|
201
|
+
4 API error
|
|
202
|
+
5 Bad arguments
|
|
203
|
+
6 Low test count
|
|
204
|
+
7 Weak evidence
|
|
205
|
+
evalai doctor
|
|
206
|
+
Verify CI setup before running the gate:
|
|
207
|
+
|
|
208
|
+
npx -y @pauly4010/evalai-sdk@^1 doctor \
|
|
209
|
+
--evaluationId 42 \
|
|
210
|
+
--apiKey $EVALAI_API_KEY
|
|
211
|
+
If `doctor` passes, `check` will work.
|
|
212
|
+
|
|
213
|
+
🧯 Error Handling
|
|
214
|
+
import { EvalAIError, RateLimitError } from "@pauly4010/evalai-sdk";
|
|
229
215
|
|
|
230
216
|
try {
|
|
231
|
-
await client.traces.create({
|
|
232
|
-
} catch (
|
|
233
|
-
if (
|
|
234
|
-
console.log(
|
|
235
|
-
} else if (
|
|
236
|
-
console.log(
|
|
217
|
+
await client.traces.create({ name: "User Query" });
|
|
218
|
+
} catch (err) {
|
|
219
|
+
if (err instanceof RateLimitError) {
|
|
220
|
+
console.log("Retry after:", err.retryAfter);
|
|
221
|
+
} else if (err instanceof EvalAIError) {
|
|
222
|
+
console.log(err.code, err.message, err.requestId);
|
|
237
223
|
}
|
|
238
224
|
}
|
|
239
|
-
```
|
|
240
225
|
|
|
241
|
-
## Advanced Features
|
|
242
226
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
withContext({ userId: "123", sessionId: "abc" }, async () => {
|
|
249
|
-
// Context automatically included in all traces
|
|
250
|
-
await client.traces.create({
|
|
251
|
-
name: "Query",
|
|
252
|
-
traceId: "trace-1",
|
|
253
|
-
});
|
|
227
|
+
🔍 Traces
|
|
228
|
+
const trace = await client.traces.create({
|
|
229
|
+
name: "User Query",
|
|
230
|
+
traceId: "trace-123",
|
|
231
|
+
metadata: { userId: "456" },
|
|
254
232
|
});
|
|
255
|
-
```
|
|
256
233
|
|
|
257
|
-
### Test Suites
|
|
258
234
|
|
|
259
|
-
|
|
260
|
-
import {
|
|
235
|
+
📝 Evaluations
|
|
236
|
+
import { EvaluationTemplates } from "@pauly4010/evalai-sdk";
|
|
261
237
|
|
|
262
|
-
const
|
|
263
|
-
name: "Chatbot
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
name: "Greeting",
|
|
267
|
-
input: "Hello",
|
|
268
|
-
expectedOutput: "Hi there!",
|
|
269
|
-
},
|
|
270
|
-
],
|
|
238
|
+
const evaluation = await client.evaluations.create({
|
|
239
|
+
name: "Chatbot Responses",
|
|
240
|
+
type: EvaluationTemplates.OUTPUT_QUALITY,
|
|
241
|
+
createdBy: userId,
|
|
271
242
|
});
|
|
272
243
|
|
|
273
|
-
await suite.run(client);
|
|
274
|
-
```
|
|
275
244
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
```typescript
|
|
245
|
+
🔌 Framework Integrations
|
|
279
246
|
import { traceOpenAI } from "@pauly4010/evalai-sdk/integrations/openai";
|
|
280
247
|
import OpenAI from "openai";
|
|
281
248
|
|
|
282
249
|
const openai = traceOpenAI(new OpenAI(), client);
|
|
283
250
|
|
|
284
|
-
|
|
285
|
-
const response = await openai.chat.completions.create({
|
|
251
|
+
await openai.chat.completions.create({
|
|
286
252
|
model: "gpt-4",
|
|
287
253
|
messages: [{ role: "user", content: "Hello" }],
|
|
288
254
|
});
|
|
289
|
-
```
|
|
290
|
-
|
|
291
|
-
## TypeScript Support
|
|
292
|
-
|
|
293
|
-
The SDK is fully typed with TypeScript generics for type-safe metadata:
|
|
294
|
-
|
|
295
|
-
```typescript
|
|
296
|
-
interface CustomMetadata {
|
|
297
|
-
userId: string;
|
|
298
|
-
sessionId: string;
|
|
299
|
-
model: string;
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
const trace = await client.traces.create<CustomMetadata>({
|
|
303
|
-
name: "Query",
|
|
304
|
-
traceId: "trace-1",
|
|
305
|
-
metadata: {
|
|
306
|
-
userId: "123",
|
|
307
|
-
sessionId: "abc",
|
|
308
|
-
model: "gpt-4",
|
|
309
|
-
},
|
|
310
|
-
});
|
|
311
|
-
|
|
312
|
-
// TypeScript knows the exact metadata type
|
|
313
|
-
console.log(trace.metadata.userId);
|
|
314
|
-
```
|
|
315
|
-
|
|
316
|
-
## 📋 Annotations API (v1.2.0)
|
|
317
|
-
|
|
318
|
-
Human-in-the-loop evaluation for quality assurance:
|
|
319
|
-
|
|
320
|
-
```typescript
|
|
321
|
-
// Create an annotation
|
|
322
|
-
const annotation = await client.annotations.create({
|
|
323
|
-
evaluationRunId: 123,
|
|
324
|
-
testCaseId: 456,
|
|
325
|
-
rating: 5,
|
|
326
|
-
feedback: "Excellent response!",
|
|
327
|
-
labels: { category: "helpful", sentiment: "positive" },
|
|
328
|
-
});
|
|
329
|
-
|
|
330
|
-
// List annotations
|
|
331
|
-
const annotations = await client.annotations.list({
|
|
332
|
-
evaluationRunId: 123,
|
|
333
|
-
});
|
|
334
|
-
|
|
335
|
-
// Annotation Tasks
|
|
336
|
-
const task = await client.annotations.tasks.create({
|
|
337
|
-
name: "Q4 Quality Review",
|
|
338
|
-
type: "classification",
|
|
339
|
-
organizationId: 1,
|
|
340
|
-
instructions: "Rate responses from 1-5",
|
|
341
|
-
});
|
|
342
|
-
|
|
343
|
-
const tasks = await client.annotations.tasks.list({
|
|
344
|
-
organizationId: 1,
|
|
345
|
-
status: "pending",
|
|
346
|
-
});
|
|
347
|
-
|
|
348
|
-
const taskDetail = await client.annotations.tasks.get(taskId);
|
|
349
|
-
|
|
350
|
-
// Annotation Items
|
|
351
|
-
const item = await client.annotations.tasks.items.create(taskId, {
|
|
352
|
-
content: "Response to evaluate",
|
|
353
|
-
annotation: { rating: 4, category: "good" },
|
|
354
|
-
});
|
|
355
|
-
|
|
356
|
-
const items = await client.annotations.tasks.items.list(taskId);
|
|
357
|
-
```
|
|
358
|
-
|
|
359
|
-
## 🔑 Developer API (v1.2.0)
|
|
360
|
-
|
|
361
|
-
Manage API keys, webhooks, and monitor usage:
|
|
362
|
-
|
|
363
|
-
### API Keys
|
|
364
|
-
|
|
365
|
-
```typescript
|
|
366
|
-
// Create an API key
|
|
367
|
-
const { apiKey, id, keyPrefix } = await client.developer.apiKeys.create({
|
|
368
|
-
name: "Production Key",
|
|
369
|
-
organizationId: 1,
|
|
370
|
-
scopes: ["traces:read", "traces:write", "evaluations:read"],
|
|
371
|
-
expiresAt: "2025-12-31T23:59:59Z",
|
|
372
|
-
});
|
|
373
255
|
|
|
374
|
-
// IMPORTANT: Save the apiKey securely - it's only shown once!
|
|
375
256
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
});
|
|
257
|
+
🧭 Changelog
|
|
258
|
+
v1.5.5 (Latest)
|
|
259
|
+
PASS/WARN/FAIL gate semantics
|
|
380
260
|
|
|
381
|
-
|
|
382
|
-
await client.developer.apiKeys.update(keyId, {
|
|
383
|
-
name: "Updated Name",
|
|
384
|
-
scopes: ["traces:read"],
|
|
385
|
-
});
|
|
261
|
+
--warnDrop soft regression band
|
|
386
262
|
|
|
387
|
-
|
|
388
|
-
await client.developer.apiKeys.revoke(keyId);
|
|
263
|
+
Flake intelligence + per-case pass rates
|
|
389
264
|
|
|
390
|
-
|
|
391
|
-
const usage = await client.developer.apiKeys.getUsage(keyId);
|
|
392
|
-
console.log("Total requests:", usage.totalRequests);
|
|
393
|
-
console.log("By endpoint:", usage.usageByEndpoint);
|
|
394
|
-
```
|
|
265
|
+
--fail-on-flake enforcement
|
|
395
266
|
|
|
396
|
-
|
|
267
|
+
Golden regression suite
|
|
397
268
|
|
|
398
|
-
|
|
399
|
-
// Create a webhook
|
|
400
|
-
const webhook = await client.developer.webhooks.create({
|
|
401
|
-
organizationId: 1,
|
|
402
|
-
url: "https://your-app.com/webhooks/evalai",
|
|
403
|
-
events: ["trace.created", "evaluation.completed", "annotation.created"],
|
|
404
|
-
});
|
|
269
|
+
Nightly determinism + performance audits
|
|
405
270
|
|
|
406
|
-
|
|
407
|
-
const webhooks = await client.developer.webhooks.list({
|
|
408
|
-
organizationId: 1,
|
|
409
|
-
status: "active",
|
|
410
|
-
});
|
|
271
|
+
Audit trail, observability, retention, and migration safety docs
|
|
411
272
|
|
|
412
|
-
|
|
413
|
-
|
|
273
|
+
v1.5.0
|
|
274
|
+
GitHub annotations formatter
|
|
414
275
|
|
|
415
|
-
|
|
416
|
-
await client.developer.webhooks.update(webhookId, {
|
|
417
|
-
url: "https://new-url.com/webhooks",
|
|
418
|
-
events: ["trace.created"],
|
|
419
|
-
status: "inactive",
|
|
420
|
-
});
|
|
276
|
+
JSON formatter
|
|
421
277
|
|
|
422
|
-
|
|
423
|
-
await client.developer.webhooks.delete(webhookId);
|
|
278
|
+
--onFail import
|
|
424
279
|
|
|
425
|
-
|
|
426
|
-
const deliveries = await client.developer.webhooks.getDeliveries(webhookId, {
|
|
427
|
-
limit: 50,
|
|
428
|
-
success: false, // Only failed deliveries
|
|
429
|
-
});
|
|
430
|
-
```
|
|
280
|
+
--explain
|
|
431
281
|
|
|
432
|
-
|
|
282
|
+
evalai doctor
|
|
433
283
|
|
|
434
|
-
|
|
435
|
-
// Get detailed usage statistics
|
|
436
|
-
const stats = await client.developer.getUsage({
|
|
437
|
-
organizationId: 1,
|
|
438
|
-
startDate: "2025-01-01",
|
|
439
|
-
endDate: "2025-01-31",
|
|
440
|
-
});
|
|
284
|
+
CI pinned invocation guidance
|
|
441
285
|
|
|
442
|
-
console.log("Traces:", stats.traces.total);
|
|
443
|
-
console.log("Evaluations by type:", stats.evaluations.byType);
|
|
444
|
-
console.log("API calls by endpoint:", stats.apiCalls.byEndpoint);
|
|
445
286
|
|
|
446
|
-
|
|
447
|
-
const summary = await client.developer.getUsageSummary(organizationId);
|
|
448
|
-
console.log("Current period:", summary.currentPeriod);
|
|
449
|
-
console.log("Limits:", summary.limits);
|
|
450
|
-
```
|
|
451
|
-
|
|
452
|
-
## ⚖️ LLM Judge Extended (v1.2.0)
|
|
287
|
+
Environment Variable Safety
|
|
453
288
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
```typescript
|
|
457
|
-
// Create a judge configuration
|
|
458
|
-
const config = await client.llmJudge.createConfig({
|
|
459
|
-
name: "GPT-4 Accuracy Judge",
|
|
460
|
-
description: "Evaluates factual accuracy",
|
|
461
|
-
model: "gpt-4",
|
|
462
|
-
rubric: "Score 1-10 based on factual accuracy...",
|
|
463
|
-
temperature: 0.3,
|
|
464
|
-
maxTokens: 500,
|
|
465
|
-
organizationId: 1,
|
|
466
|
-
createdBy: userId,
|
|
467
|
-
});
|
|
289
|
+
The SDK never assumes `process.env` exists. All environment reads are guarded, so the client can initialize safely in browser, edge, and server runtimes.
|
|
468
290
|
|
|
469
|
-
|
|
470
|
-
const configs = await client.llmJudge.listConfigs({
|
|
471
|
-
organizationId: 1,
|
|
472
|
-
});
|
|
291
|
+
If environment variables are unavailable, the SDK falls back to explicit config.
|
|
473
292
|
|
|
474
|
-
// List results
|
|
475
|
-
const results = await client.llmJudge.listResults({
|
|
476
|
-
configId: config.id,
|
|
477
|
-
evaluationId: 123,
|
|
478
|
-
});
|
|
479
|
-
|
|
480
|
-
// Get alignment analysis
|
|
481
|
-
const alignment = await client.llmJudge.getAlignment({
|
|
482
|
-
configId: config.id,
|
|
483
|
-
startDate: "2025-01-01",
|
|
484
|
-
endDate: "2025-01-31",
|
|
485
|
-
});
|
|
486
|
-
|
|
487
|
-
console.log("Average score:", alignment.averageScore);
|
|
488
|
-
console.log("Accuracy:", alignment.alignmentMetrics.accuracy);
|
|
489
|
-
console.log("Agreement with human:", alignment.comparisonWithHuman?.agreement);
|
|
490
|
-
```
|
|
491
|
-
|
|
492
|
-
## 🏢 Organizations API (v1.2.0)
|
|
493
|
-
|
|
494
|
-
Manage organization details:
|
|
495
|
-
|
|
496
|
-
```typescript
|
|
497
|
-
// Get current organization
|
|
498
|
-
const org = await client.organizations.getCurrent();
|
|
499
|
-
console.log("Organization:", org.name);
|
|
500
|
-
console.log("Plan:", org.plan);
|
|
501
|
-
console.log("Status:", org.status);
|
|
502
|
-
```
|
|
503
|
-
|
|
504
|
-
## evalai CLI (v1.4.1)
|
|
505
|
-
|
|
506
|
-
The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
|
|
507
|
-
|
|
508
|
-
```bash
|
|
509
|
-
# Via npx (no global install)
|
|
510
|
-
npx @pauly4010/evalai-sdk check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
511
|
-
|
|
512
|
-
# Or install globally
|
|
513
|
-
npm install -g @pauly4010/evalai-sdk
|
|
514
|
-
evalai check --minScore 92 --evaluationId 42
|
|
515
|
-
```
|
|
516
|
-
|
|
517
|
-
### evalai check
|
|
518
|
-
|
|
519
|
-
Gate deployments on quality scores, regression, and compliance:
|
|
520
|
-
|
|
521
|
-
| Option | Description |
|
|
522
|
-
|--------|-------------|
|
|
523
|
-
| `--evaluationId <id>` | **Required.** Evaluation to gate on |
|
|
524
|
-
| `--apiKey <key>` | API key (or `EVALAI_API_KEY` env) |
|
|
525
|
-
| `--minScore <n>` | Fail if score < n (0–100) |
|
|
526
|
-
| `--maxDrop <n>` | Fail if score dropped > n from baseline |
|
|
527
|
-
| `--minN <n>` | Fail if total test cases < n |
|
|
528
|
-
| `--allowWeakEvidence` | Permit weak evidence level |
|
|
529
|
-
| `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
|
|
530
|
-
| `--baseline <mode>` | `published`, `previous`, or `production` |
|
|
531
|
-
| `--baseUrl <url>` | API base URL |
|
|
532
|
-
|
|
533
|
-
**Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
|
|
534
|
-
|
|
535
|
-
## Changelog
|
|
536
|
-
|
|
537
|
-
### v1.4.1 (Latest)
|
|
538
|
-
|
|
539
|
-
- **evalai check `--baseline production`** — Compare against latest prod-tagged run
|
|
540
|
-
- **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
|
|
541
|
-
|
|
542
|
-
### v1.4.0
|
|
543
|
-
|
|
544
|
-
- **evalai CLI** — Command-line tool for CI/CD evaluation gates
|
|
545
|
-
- `evalai check` — Gate deployments on quality scores, regression, and compliance
|
|
546
|
-
- `--minScore <n>` — Fail if quality score < n (0–100)
|
|
547
|
-
- `--maxDrop <n>` — Fail if score dropped > n points from baseline
|
|
548
|
-
- `--minN <n>` — Fail if total test cases < n
|
|
549
|
-
- `--allowWeakEvidence` — Permit weak evidence level (default: fail)
|
|
550
|
-
- `--policy <name>` — Enforce compliance (HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511)
|
|
551
|
-
- `--baseline <mode>` — Compare to `published` or `previous` run
|
|
552
|
-
- `--evaluationId <id>` — Required. Evaluation to gate on
|
|
553
|
-
- Environment: `EVALAI_API_KEY`, `EVALAI_BASE_URL`
|
|
554
|
-
- Exit codes: 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
|
|
555
|
-
- **CLI Exports** — `parseArgs`, `runCheck`, `EXIT` from `@pauly4010/evalai-sdk` for programmatic use
|
|
556
|
-
|
|
557
|
-
### v1.3.0
|
|
558
|
-
|
|
559
|
-
- **Workflow Tracing** — Multi-agent orchestration with full lifecycle instrumentation
|
|
560
|
-
- `WorkflowTracer` class with `startWorkflow`, `endWorkflow`, `startAgentSpan`, `endAgentSpan`
|
|
561
|
-
- `createWorkflowTracer` convenience factory
|
|
562
|
-
- `traceWorkflowStep` generic wrapper for any async function
|
|
563
|
-
- Agent handoff recording (`delegation`, `escalation`, `parallel`, `fallback`)
|
|
564
|
-
- Decision auditing with alternatives, confidence scores, reasoning, and context factors
|
|
565
|
-
- Cost tracking per span/workflow with automatic pricing (16+ models)
|
|
566
|
-
- Cost breakdown by category (`llm`, `tool`, `embedding`, `other`)
|
|
567
|
-
- **Framework Integrations** — Wrap popular multi-agent frameworks:
|
|
568
|
-
- `traceLangChainAgent` — wraps `.invoke()` and `.call()` with auto-tracing
|
|
569
|
-
- `traceCrewAI` — wraps `.kickoff()` with workflow start/end
|
|
570
|
-
- `traceAutoGen` — wraps `.initiate_chat()` with workflow start/end
|
|
571
|
-
- **Performance Utilities**
|
|
572
|
-
- `RequestCache` with configurable TTL (`CacheTTL` presets)
|
|
573
|
-
- `PaginatedIterator` / `createPaginatedIterator` / `autoPaginate` for cursor-based pagination
|
|
574
|
-
- `RequestBatcher` for batching API calls
|
|
575
|
-
- `RateLimiter` client-side rate limit handling
|
|
576
|
-
- **Cost Tracking Types** — `CostRecord`, `CostBreakdown`, `ProviderPricing` interfaces
|
|
577
|
-
- **Agent Decision Auditing Types** — `AgentDecision`, `DecisionAlternative`, `RecordDecisionParams` interfaces
|
|
578
|
-
- **Benchmark Types** — `Benchmark`, `BenchmarkResult`, `AgentConfig` interfaces
|
|
579
|
-
|
|
580
|
-
### v1.2.1 (Bug Fixes)
|
|
581
|
-
|
|
582
|
-
- 🐛 **Critical Fixes**
|
|
583
|
-
- Fixed CLI import paths for proper npm package distribution
|
|
584
|
-
- Fixed duplicate trace creation in OpenAI/Anthropic integrations
|
|
585
|
-
- Fixed Commander.js command structure
|
|
586
|
-
- Added browser/Node.js environment detection and helpful errors
|
|
587
|
-
- Fixed context system to work in both Node.js and browsers
|
|
588
|
-
- Added security checks to snapshot path sanitization
|
|
589
|
-
- Removed misleading empty exports (StreamingClient, BatchClient)
|
|
590
|
-
- 📦 **Dependencies**
|
|
591
|
-
- Updated Commander to v14
|
|
592
|
-
- Added peer dependencies for OpenAI and Anthropic SDKs (optional)
|
|
593
|
-
- Added Node.js engine requirement (>=16.0.0)
|
|
594
|
-
- 📚 **Documentation**
|
|
595
|
-
- Clarified Node.js-only vs universal features
|
|
596
|
-
- Added environment support section
|
|
597
|
-
- Updated examples with security best practices
|
|
598
|
-
|
|
599
|
-
### v1.2.0
|
|
600
|
-
|
|
601
|
-
- 🎉 **100% API Coverage** - All backend endpoints now supported!
|
|
602
|
-
- 📋 **Annotations API** - Complete human-in-the-loop evaluation
|
|
603
|
-
- Create and list annotations
|
|
604
|
-
- Manage annotation tasks
|
|
605
|
-
- Handle annotation items
|
|
606
|
-
- 🔑 **Developer API** - Full API key and webhook management
|
|
607
|
-
- CRUD operations for API keys
|
|
608
|
-
- Webhook management with delivery tracking
|
|
609
|
-
- Usage analytics and monitoring
|
|
610
|
-
- ⚖️ **LLM Judge Extended** - Enhanced judge capabilities
|
|
611
|
-
- Configuration management
|
|
612
|
-
- Results querying
|
|
613
|
-
- Alignment analysis
|
|
614
|
-
- 🏢 **Organizations API** - Organization details access
|
|
615
|
-
- 📊 **Enhanced Types** - 40+ new TypeScript interfaces
|
|
616
|
-
- 📚 **Comprehensive Documentation** - Examples for all new features
|
|
617
|
-
|
|
618
|
-
### v1.1.0
|
|
619
|
-
|
|
620
|
-
- ✨ Added comprehensive evaluation template types
|
|
621
|
-
- ✨ Added organization resource limits tracking
|
|
622
|
-
- ✨ Added `getOrganizationLimits()` method
|
|
623
|
-
- 📚 Enhanced documentation with new features
|
|
624
|
-
|
|
625
|
-
### v1.0.0
|
|
626
|
-
|
|
627
|
-
- 🎉 Initial release
|
|
628
|
-
- ✅ Traces, Evaluations, LLM Judge APIs
|
|
629
|
-
- ✅ Framework integrations (OpenAI, Anthropic)
|
|
630
|
-
- ✅ Test suite builder
|
|
631
|
-
- ✅ Context propagation
|
|
632
|
-
- ✅ Error handling & retries
|
|
633
|
-
|
|
634
|
-
## License
|
|
635
293
|
|
|
294
|
+
📄 License
|
|
636
295
|
MIT
|
|
637
296
|
|
|
638
|
-
|
|
297
|
+
🤝 Support
|
|
298
|
+
Documentation:
|
|
299
|
+
https://v0-ai-evaluation-platform-nu.vercel.app/documentation
|
|
639
300
|
|
|
640
|
-
|
|
641
|
-
|
|
301
|
+
Issues:
|
|
302
|
+
https://github.com/pauly7610/ai-evaluation-platform/issues
|
|
303
|
+
```
|