@pauly4010/evalai-sdk 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +8 -3
- package/dist/cli/check.d.ts +2 -2
- package/dist/cli/check.js +14 -3
- package/package.json +6 -3
- package/.env.example +0 -0
- package/ADDITIONAL_ISSUES_FOUND.md +0 -174
- package/evalai-sdk-1.2.0.tgz +0 -0
- package/postcss.config.mjs +0 -2
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,18 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.4.1] - 2026-02-18
|
|
9
|
+
|
|
10
|
+
### ✨ Added
|
|
11
|
+
|
|
12
|
+
- **evalai check `--baseline production`** — Compare against latest run tagged with `environment=prod`
|
|
13
|
+
- **Baseline missing handling** — Clear failure when baseline not found and comparison requested
|
|
14
|
+
|
|
15
|
+
### 🔧 Changed
|
|
16
|
+
|
|
17
|
+
- **Package hardening** — `files`, `module`, `sideEffects: false` for leaner npm publish
|
|
18
|
+
- **CLI** — Passes `baseline` param to quality API for deterministic CI gates
|
|
19
|
+
|
|
8
20
|
## [1.3.0] - 2025-10-21
|
|
9
21
|
|
|
10
22
|
### ✨ Added
|
package/README.md
CHANGED
|
@@ -501,7 +501,7 @@ console.log("Plan:", org.plan);
|
|
|
501
501
|
console.log("Status:", org.status);
|
|
502
502
|
```
|
|
503
503
|
|
|
504
|
-
## evalai CLI (v1.4.0)
|
|
504
|
+
## evalai CLI (v1.4.1)
|
|
505
505
|
|
|
506
506
|
The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
|
|
507
507
|
|
|
@@ -527,14 +527,19 @@ Gate deployments on quality scores, regression, and compliance:
|
|
|
527
527
|
| `--minN <n>` | Fail if total test cases < n |
|
|
528
528
|
| `--allowWeakEvidence` | Permit weak evidence level |
|
|
529
529
|
| `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
|
|
530
|
-
| `--baseline <mode>` | `published` or `previous` |
|
|
530
|
+
| `--baseline <mode>` | `published`, `previous`, or `production` |
|
|
531
531
|
| `--baseUrl <url>` | API base URL |
|
|
532
532
|
|
|
533
533
|
**Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
|
|
534
534
|
|
|
535
535
|
## Changelog
|
|
536
536
|
|
|
537
|
-
### v1.4.0 (Latest)
|
|
537
|
+
### v1.4.1 (Latest)
|
|
538
|
+
|
|
539
|
+
- **evalai check `--baseline production`** — Compare against latest prod-tagged run
|
|
540
|
+
- **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
|
|
541
|
+
|
|
542
|
+
### v1.4.0
|
|
538
543
|
|
|
539
544
|
- **evalai CLI** — Command-line tool for CI/CD evaluation gates
|
|
540
545
|
- `evalai check` — Gate deployments on quality scores, regression, and compliance
|
package/dist/cli/check.d.ts
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
* --minN <n> Fail if total test cases < n (low sample size)
|
|
15
15
|
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
16
16
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
17
|
-
* --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
|
|
17
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
18
18
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
19
19
|
* --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
|
|
20
20
|
* --apiKey <key> API key (default: EVALAI_API_KEY env var)
|
|
@@ -52,7 +52,7 @@ export interface CheckArgs {
|
|
|
52
52
|
allowWeakEvidence: boolean;
|
|
53
53
|
evaluationId: string;
|
|
54
54
|
policy?: string;
|
|
55
|
-
baseline: 'published' | 'previous';
|
|
55
|
+
baseline: 'published' | 'previous' | 'production';
|
|
56
56
|
}
|
|
57
57
|
export declare function parseArgs(argv: string[]): CheckArgs;
|
|
58
58
|
export declare function runCheck(args: CheckArgs): Promise<number>;
|
package/dist/cli/check.js
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* --minN <n> Fail if total test cases < n (low sample size)
|
|
16
16
|
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
17
17
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
18
|
-
* --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
|
|
18
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
19
19
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
20
20
|
* --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
|
|
21
21
|
* --apiKey <key> API key (default: EVALAI_API_KEY env var)
|
|
@@ -73,7 +73,11 @@ function parseArgs(argv) {
|
|
|
73
73
|
const allowWeakEvidence = args.allowWeakEvidence === 'true' || args.allowWeakEvidence === '1';
|
|
74
74
|
const evaluationId = args.evaluationId || '';
|
|
75
75
|
const policy = args.policy || undefined;
|
|
76
|
-
const baseline = (args.baseline === 'previous' ? 'previous' : 'published');
|
|
76
|
+
const baseline = (args.baseline === 'previous'
|
|
77
|
+
? 'previous'
|
|
78
|
+
: args.baseline === 'production'
|
|
79
|
+
? 'production'
|
|
80
|
+
: 'published');
|
|
77
81
|
if (!apiKey) {
|
|
78
82
|
console.error('Error: --apiKey or EVALAI_API_KEY is required');
|
|
79
83
|
process.exit(exports.EXIT.BAD_ARGS);
|
|
@@ -95,7 +99,7 @@ function parseArgs(argv) {
|
|
|
95
99
|
async function runCheck(args) {
|
|
96
100
|
const headers = { Authorization: `Bearer ${args.apiKey}` };
|
|
97
101
|
// ── 1. Fetch latest quality score ──
|
|
98
|
-
const scoreUrl = `${args.baseUrl}/api/quality?evaluationId=${args.evaluationId}&action=latest`;
|
|
102
|
+
const scoreUrl = `${args.baseUrl}/api/quality?evaluationId=${args.evaluationId}&action=latest&baseline=${args.baseline}`;
|
|
99
103
|
let scoreRes;
|
|
100
104
|
try {
|
|
101
105
|
scoreRes = await fetch(scoreUrl, { headers });
|
|
@@ -115,7 +119,14 @@ async function runCheck(args) {
|
|
|
115
119
|
const evidenceLevel = data?.evidenceLevel ?? null;
|
|
116
120
|
const baselineScore = data?.baselineScore ?? null;
|
|
117
121
|
const regressionDelta = data?.regressionDelta ?? null;
|
|
122
|
+
const baselineMissing = data?.baselineMissing === true;
|
|
118
123
|
const breakdown = data?.breakdown ?? {};
|
|
124
|
+
// ── Gate: baseline missing (when baseline comparison requested) ──
|
|
125
|
+
if (baselineMissing && (args.baseline !== 'published' || args.maxDrop !== undefined)) {
|
|
126
|
+
console.error(`\n✗ FAILED: baseline (${args.baseline}) not found. ` +
|
|
127
|
+
`Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`);
|
|
128
|
+
return exports.EXIT.API_ERROR;
|
|
129
|
+
}
|
|
119
130
|
// ── Gate: minN (low sample size) ──
|
|
120
131
|
if (args.minN !== undefined && total !== null && total < args.minN) {
|
|
121
132
|
console.error(`\n✗ FAILED: total test cases (${total}) < minN (${args.minN})`);
|
package/package.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pauly4010/evalai-sdk",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "1.4.1",
|
|
4
4
|
"description": "AI Evaluation Platform SDK - Complete API Coverage with Performance Optimizations",
|
|
5
|
-
"main": "./dist/index.js",
|
|
6
|
-
"types": "./dist/index.d.ts",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"sideEffects": false,
|
|
9
|
+
"files": ["dist", "README.md", "CHANGELOG.md"],
|
|
7
10
|
"bin": {
|
|
8
11
|
"evalai": "./dist/cli/index.js"
|
|
9
12
|
},
|
package/.env.example
DELETED
|
Binary file
|
|
package/ADDITIONAL_ISSUES_FOUND.md
DELETED
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
# Additional Issues Found in Second Review
|
|
2
|
-
|
|
3
|
-
## 🔴 Issues Discovered
|
|
4
|
-
|
|
5
|
-
### 1. **process.env Usage in Browser Context** ⚠️ HIGH PRIORITY
|
|
6
|
-
|
|
7
|
-
**Files**: `client.ts` (lines 105, 116, 178)
|
|
8
|
-
|
|
9
|
-
**Problem**: The SDK uses `process.env` directly, which is undefined in browsers:
|
|
10
|
-
|
|
11
|
-
```typescript
|
|
12
|
-
// Line 105
|
|
13
|
-
this.apiKey = config.apiKey || process.env.EVALAI_API_KEY || ...
|
|
14
|
-
|
|
15
|
-
// Line 116
|
|
16
|
-
const orgIdFromEnv = process.env.EVALAI_ORGANIZATION_ID || ...
|
|
17
|
-
|
|
18
|
-
// Line 178 (in static init method)
|
|
19
|
-
baseUrl: process.env.EVALAI_BASE_URL,
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
**Impact**:
|
|
23
|
-
- Will cause "Cannot read property of undefined" errors in browsers
|
|
24
|
-
- Breaks zero-config initialization in browsers
|
|
25
|
-
- `AIEvalClient.init()` won't work in browsers
|
|
26
|
-
|
|
27
|
-
**Severity**: HIGH - Core functionality breaks in browsers
|
|
28
|
-
|
|
29
|
-
---
|
|
30
|
-
|
|
31
|
-
### 2. **Type Name Collision** 🟡 MEDIUM PRIORITY
|
|
32
|
-
|
|
33
|
-
**Files**: `types.ts` (line 209) and `testing.ts` (line 27)
|
|
34
|
-
|
|
35
|
-
**Problem**: Two different `TestCase` interfaces with same name but different purposes:
|
|
36
|
-
|
|
37
|
-
**types.ts** (Database Model):
|
|
38
|
-
```typescript
|
|
39
|
-
export interface TestCase {
|
|
40
|
-
id: number;
|
|
41
|
-
evaluationId: number;
|
|
42
|
-
input: string;
|
|
43
|
-
expectedOutput: string | null;
|
|
44
|
-
metadata: Record<string, any> | null;
|
|
45
|
-
createdAt: string;
|
|
46
|
-
}
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
**testing.ts** (Test Suite Model):
|
|
50
|
-
```typescript
|
|
51
|
-
export interface TestCase {
|
|
52
|
-
id?: string;
|
|
53
|
-
input: string;
|
|
54
|
-
expected?: string;
|
|
55
|
-
metadata?: Record<string, any>;
|
|
56
|
-
assertions?: ((output: string) => AssertionResult)[];
|
|
57
|
-
}
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
**Impact**:
|
|
61
|
-
- Confusing for developers
|
|
62
|
-
- IDE autocomplete shows wrong interface
|
|
63
|
-
- Only `types.ts` version is exported from index.ts (line 117)
|
|
64
|
-
- Could cause type errors if both are imported
|
|
65
|
-
|
|
66
|
-
**Severity**: MEDIUM - Causes confusion but only types.ts version is publicly exported
|
|
67
|
-
|
|
68
|
-
---
|
|
69
|
-
|
|
70
|
-
### 3. **Dynamic Import Pattern in export.ts** 🟢 LOW PRIORITY
|
|
71
|
-
|
|
72
|
-
**Files**: `export.ts` (lines 296, 316)
|
|
73
|
-
|
|
74
|
-
**Pattern**:
|
|
75
|
-
```typescript
|
|
76
|
-
const fs = await import('fs');
|
|
77
|
-
fs.writeFileSync(filePath, ...);
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
**Issue**:
|
|
81
|
-
- Dynamic import returns a module namespace object
|
|
82
|
-
- Works but is unusual pattern (normally use static imports in Node.js-only files)
|
|
83
|
-
- Could fail in some bundler configurations
|
|
84
|
-
|
|
85
|
-
**Impact**:
|
|
86
|
-
- Works but non-standard
|
|
87
|
-
- Tree-shaking friendly but unnecessary for Node.js-only code
|
|
88
|
-
- Some bundlers might have issues
|
|
89
|
-
|
|
90
|
-
**Severity**: LOW - Works but not best practice
|
|
91
|
-
|
|
92
|
-
---
|
|
93
|
-
|
|
94
|
-
### 4. **TypeScript Module Configuration** 🟢 INFO
|
|
95
|
-
|
|
96
|
-
**File**: `tsconfig.json`
|
|
97
|
-
|
|
98
|
-
**Current**:
|
|
99
|
-
```json
|
|
100
|
-
{
|
|
101
|
-
"module": "commonjs"
|
|
102
|
-
}
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
**Observation**:
|
|
106
|
-
- Using CommonJS but package.json has ES module exports
|
|
107
|
-
- CLI uses `.js` extensions in imports (which is correct for ES modules)
|
|
108
|
-
- Mismatch between TypeScript config and runtime expectations
|
|
109
|
-
|
|
110
|
-
**Impact**:
|
|
111
|
-
- May cause issues with module resolution
|
|
112
|
-
- CLI imports might not work as expected
|
|
113
|
-
- Bundlers might be confused
|
|
114
|
-
|
|
115
|
-
**Severity**: LOW - Currently working but could cause subtle issues
|
|
116
|
-
|
|
117
|
-
---
|
|
118
|
-
|
|
119
|
-
## 📊 Summary
|
|
120
|
-
|
|
121
|
-
| Issue | Severity | Impact | Affected |
|
|
122
|
-
|-------|----------|--------|----------|
|
|
123
|
-
| process.env in browser | 🔴 HIGH | Breaks in browsers | Core client |
|
|
124
|
-
| TestCase collision | 🟡 MEDIUM | Developer confusion | Types |
|
|
125
|
-
| Dynamic imports | 🟢 LOW | Unusual pattern | export.ts |
|
|
126
|
-
| Module config | 🟢 INFO | Potential confusion | Build system |
|
|
127
|
-
|
|
128
|
-
---
|
|
129
|
-
|
|
130
|
-
## ✅ Recommended Fixes
|
|
131
|
-
|
|
132
|
-
### Fix 1: Safe process.env Access
|
|
133
|
-
|
|
134
|
-
Add helper function:
|
|
135
|
-
```typescript
|
|
136
|
-
// utils.ts or client.ts
|
|
137
|
-
function getEnvVar(name: string): string | undefined {
|
|
138
|
-
if (typeof process !== 'undefined' && process.env) {
|
|
139
|
-
return process.env[name];
|
|
140
|
-
}
|
|
141
|
-
return undefined;
|
|
142
|
-
}
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
Then use:
|
|
146
|
-
```typescript
|
|
147
|
-
this.apiKey = config.apiKey || getEnvVar('EVALAI_API_KEY') || ...
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
### Fix 2: Rename Test Suite TestCase
|
|
151
|
-
|
|
152
|
-
Rename in `testing.ts`:
|
|
153
|
-
```typescript
|
|
154
|
-
export interface TestSuiteCase { // Was: TestCase
|
|
155
|
-
id?: string;
|
|
156
|
-
input: string;
|
|
157
|
-
expected?: string;
|
|
158
|
-
// ...
|
|
159
|
-
}
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
### Fix 3: Static Imports in export.ts
|
|
163
|
-
|
|
164
|
-
Since already checked for Node.js environment:
|
|
165
|
-
```typescript
|
|
166
|
-
import * as fs from 'fs'; // Instead of: const fs = await import('fs')
|
|
167
|
-
```
|
|
168
|
-
|
|
169
|
-
### Fix 4: Consider ES Modules
|
|
170
|
-
|
|
171
|
-
Either:
|
|
172
|
-
- Change tsconfig to `"module": "es2020"`
|
|
173
|
-
- Or change package.json exports to use `.cjs` extensions
|
|
174
|
-
|
package/evalai-sdk-1.2.0.tgz
DELETED
|
Binary file
|
package/postcss.config.mjs
DELETED