@pauly4010/evalai-sdk 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,18 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.4.1] - 2026-02-18
9
+
10
+ ### ✨ Added
11
+
12
+ - **evalai check `--baseline production`** — Compare against latest run tagged with `environment=prod`
13
+ - **Baseline missing handling** — Fails with a clear error when a baseline comparison is requested but no baseline run is found
14
+
15
+ ### 🔧 Changed
16
+
17
+ - **Package hardening** — `files`, `module`, `sideEffects: false` for leaner npm publish
18
+ - **CLI** — Passes `baseline` param to quality API for deterministic CI gates
19
+
8
20
  ## [1.3.0] - 2025-10-21
9
21
 
10
22
  ### ✨ Added
package/README.md CHANGED
@@ -501,7 +501,7 @@ console.log("Plan:", org.plan);
501
501
  console.log("Status:", org.status);
502
502
  ```
503
503
 
504
- ## evalai CLI (v1.4.0)
504
+ ## evalai CLI (v1.4.1)
505
505
 
506
506
  The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
507
507
 
@@ -527,14 +527,19 @@ Gate deployments on quality scores, regression, and compliance:
527
527
  | `--minN <n>` | Fail if total test cases &lt; n |
528
528
  | `--allowWeakEvidence` | Permit weak evidence level |
529
529
  | `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
530
- | `--baseline <mode>` | `published` or `previous` |
530
+ | `--baseline <mode>` | `published`, `previous`, or `production` |
531
531
  | `--baseUrl <url>` | API base URL |
532
532
 
533
533
  **Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
534
534
 
535
535
  ## Changelog
536
536
 
537
- ### v1.4.0 (Latest)
537
+ ### v1.4.1 (Latest)
538
+
539
+ - **evalai check `--baseline production`** — Compare against latest prod-tagged run
540
+ - **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
541
+
542
+ ### v1.4.0
538
543
 
539
544
  - **evalai CLI** — Command-line tool for CI/CD evaluation gates
540
545
  - `evalai check` — Gate deployments on quality scores, regression, and compliance
@@ -14,7 +14,7 @@
14
14
  * --minN <n> Fail if total test cases < n (low sample size)
15
15
  * --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
16
16
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
17
- * --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
17
+ * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
18
18
  * --evaluationId <id> Required. The evaluation to gate on.
19
19
  * --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
20
20
  * --apiKey <key> API key (default: EVALAI_API_KEY env var)
@@ -52,7 +52,7 @@ export interface CheckArgs {
52
52
  allowWeakEvidence: boolean;
53
53
  evaluationId: string;
54
54
  policy?: string;
55
- baseline: 'published' | 'previous';
55
+ baseline: 'published' | 'previous' | 'production';
56
56
  }
57
57
  export declare function parseArgs(argv: string[]): CheckArgs;
58
58
  export declare function runCheck(args: CheckArgs): Promise<number>;
package/dist/cli/check.js CHANGED
@@ -15,7 +15,7 @@
15
15
  * --minN <n> Fail if total test cases < n (low sample size)
16
16
  * --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
17
17
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
18
- * --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
18
+ * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
19
19
  * --evaluationId <id> Required. The evaluation to gate on.
20
20
  * --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
21
21
  * --apiKey <key> API key (default: EVALAI_API_KEY env var)
@@ -73,7 +73,11 @@ function parseArgs(argv) {
73
73
  const allowWeakEvidence = args.allowWeakEvidence === 'true' || args.allowWeakEvidence === '1';
74
74
  const evaluationId = args.evaluationId || '';
75
75
  const policy = args.policy || undefined;
76
- const baseline = (args.baseline === 'previous' ? 'previous' : 'published');
76
+ const baseline = (args.baseline === 'previous'
77
+ ? 'previous'
78
+ : args.baseline === 'production'
79
+ ? 'production'
80
+ : 'published');
77
81
  if (!apiKey) {
78
82
  console.error('Error: --apiKey or EVALAI_API_KEY is required');
79
83
  process.exit(exports.EXIT.BAD_ARGS);
@@ -95,7 +99,7 @@ function parseArgs(argv) {
95
99
  async function runCheck(args) {
96
100
  const headers = { Authorization: `Bearer ${args.apiKey}` };
97
101
  // ── 1. Fetch latest quality score ──
98
- const scoreUrl = `${args.baseUrl}/api/quality?evaluationId=${args.evaluationId}&action=latest`;
102
+ const scoreUrl = `${args.baseUrl}/api/quality?evaluationId=${args.evaluationId}&action=latest&baseline=${args.baseline}`;
99
103
  let scoreRes;
100
104
  try {
101
105
  scoreRes = await fetch(scoreUrl, { headers });
@@ -115,7 +119,14 @@ async function runCheck(args) {
115
119
  const evidenceLevel = data?.evidenceLevel ?? null;
116
120
  const baselineScore = data?.baselineScore ?? null;
117
121
  const regressionDelta = data?.regressionDelta ?? null;
122
+ const baselineMissing = data?.baselineMissing === true;
118
123
  const breakdown = data?.breakdown ?? {};
124
+ // ── Gate: baseline missing (when baseline comparison requested) ──
125
+ if (baselineMissing && (args.baseline !== 'published' || args.maxDrop !== undefined)) {
126
+ console.error(`\n✗ FAILED: baseline (${args.baseline}) not found. ` +
127
+ `Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`);
128
+ return exports.EXIT.API_ERROR;
129
+ }
119
130
  // ── Gate: minN (low sample size) ──
120
131
  if (args.minN !== undefined && total !== null && total < args.minN) {
121
132
  console.error(`\n✗ FAILED: total test cases (${total}) < minN (${args.minN})`);
package/package.json CHANGED
@@ -1,9 +1,12 @@
1
1
  {
2
2
  "name": "@pauly4010/evalai-sdk",
3
- "version": "1.4.0",
3
+ "version": "1.4.1",
4
4
  "description": "AI Evaluation Platform SDK - Complete API Coverage with Performance Optimizations",
5
- "main": "./dist/index.js",
6
- "types": "./dist/index.d.ts",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "sideEffects": false,
9
+ "files": ["dist", "README.md", "CHANGELOG.md"],
7
10
  "bin": {
8
11
  "evalai": "./dist/cli/index.js"
9
12
  },
package/.env.example DELETED
Binary file
@@ -1,174 +0,0 @@
1
- # Additional Issues Found in Second Review
2
-
3
- ## 🔴 Issues Discovered
4
-
5
- ### 1. **process.env Usage in Browser Context** ⚠️ HIGH PRIORITY
6
-
7
- **Files**: `client.ts` (lines 105, 116, 178)
8
-
9
- **Problem**: The SDK uses `process.env` directly, which is undefined in browsers:
10
-
11
- ```typescript
12
- // Line 105
13
- this.apiKey = config.apiKey || process.env.EVALAI_API_KEY || ...
14
-
15
- // Line 116
16
- const orgIdFromEnv = process.env.EVALAI_ORGANIZATION_ID || ...
17
-
18
- // Line 178 (in static init method)
19
- baseUrl: process.env.EVALAI_BASE_URL,
20
- ```
21
-
22
- **Impact**:
23
- - Will throw in browsers ("ReferenceError: process is not defined", or "Cannot read properties of undefined" when a bundler stubs `process` without `env`)
24
- - Breaks zero-config initialization in browsers
25
- - `AIEvalClient.init()` won't work in browsers
26
-
27
- **Severity**: HIGH - Core functionality breaks in browsers
28
-
29
- ---
30
-
31
- ### 2. **Type Name Collision** 🟡 MEDIUM PRIORITY
32
-
33
- **Files**: `types.ts` (line 209) and `testing.ts` (line 27)
34
-
35
- **Problem**: Two different `TestCase` interfaces with same name but different purposes:
36
-
37
- **types.ts** (Database Model):
38
- ```typescript
39
- export interface TestCase {
40
- id: number;
41
- evaluationId: number;
42
- input: string;
43
- expectedOutput: string | null;
44
- metadata: Record<string, any> | null;
45
- createdAt: string;
46
- }
47
- ```
48
-
49
- **testing.ts** (Test Suite Model):
50
- ```typescript
51
- export interface TestCase {
52
- id?: string;
53
- input: string;
54
- expected?: string;
55
- metadata?: Record<string, any>;
56
- assertions?: ((output: string) => AssertionResult)[];
57
- }
58
- ```
59
-
60
- **Impact**:
61
- - Confusing for developers
62
- - IDE autocomplete shows wrong interface
63
- - Only `types.ts` version is exported from index.ts (line 117)
64
- - Could cause type errors if both are imported
65
-
66
- **Severity**: MEDIUM - Causes confusion but only types.ts version is publicly exported
67
-
68
- ---
69
-
70
- ### 3. **Dynamic Import Pattern in export.ts** 🟢 LOW PRIORITY
71
-
72
- **Files**: `export.ts` (lines 296, 316)
73
-
74
- **Pattern**:
75
- ```typescript
76
- const fs = await import('fs');
77
- fs.writeFileSync(filePath, ...);
78
- ```
79
-
80
- **Issue**:
81
- - Dynamic import returns a module namespace object
82
- - Works, but is an unusual pattern (static imports are the norm in Node.js-only files)
83
- - Could fail in some bundler configurations
84
-
85
- **Impact**:
86
- - Works but non-standard
87
- - Tree-shaking friendly but unnecessary for Node.js-only code
88
- - Some bundlers might have issues
89
-
90
- **Severity**: LOW - Works but not best practice
91
-
92
- ---
93
-
94
- ### 4. **TypeScript Module Configuration** 🟢 INFO
95
-
96
- **File**: `tsconfig.json`
97
-
98
- **Current**:
99
- ```json
100
- {
101
- "module": "commonjs"
102
- }
103
- ```
104
-
105
- **Observation**:
106
- - Using CommonJS but package.json has ES module exports
107
- - CLI uses `.js` extensions in imports (which is correct for ES modules)
108
- - Mismatch between TypeScript config and runtime expectations
109
-
110
- **Impact**:
111
- - May cause issues with module resolution
112
- - CLI imports might not work as expected
113
- - Bundlers might be confused
114
-
115
- **Severity**: LOW - Currently working but could cause subtle issues
116
-
117
- ---
118
-
119
- ## 📊 Summary
120
-
121
- | Issue | Severity | Impact | Affected |
122
- |-------|----------|--------|----------|
123
- | process.env in browser | 🔴 HIGH | Breaks in browsers | Core client |
124
- | TestCase collision | 🟡 MEDIUM | Developer confusion | Types |
125
- | Dynamic imports | 🟢 LOW | Unusual pattern | export.ts |
126
- | Module config | 🟢 INFO | Potential confusion | Build system |
127
-
128
- ---
129
-
130
- ## ✅ Recommended Fixes
131
-
132
- ### Fix 1: Safe process.env Access
133
-
134
- Add helper function:
135
- ```typescript
136
- // utils.ts or client.ts
137
- function getEnvVar(name: string): string | undefined {
138
- if (typeof process !== 'undefined' && process.env) {
139
- return process.env[name];
140
- }
141
- return undefined;
142
- }
143
- ```
144
-
145
- Then use:
146
- ```typescript
147
- this.apiKey = config.apiKey || getEnvVar('EVALAI_API_KEY') || ...
148
- ```
149
-
150
- ### Fix 2: Rename Test Suite TestCase
151
-
152
- Rename in `testing.ts`:
153
- ```typescript
154
- export interface TestSuiteCase { // Was: TestCase
155
- id?: string;
156
- input: string;
157
- expected?: string;
158
- // ...
159
- }
160
- ```
161
-
162
- ### Fix 3: Static Imports in export.ts
163
-
164
- Since the code path is already guarded by a Node.js environment check:
165
- ```typescript
166
- import * as fs from 'fs'; // Instead of: const fs = await import('fs')
167
- ```
168
-
169
- ### Fix 4: Consider ES Modules
170
-
171
- Either:
172
- - Change tsconfig to `"module": "es2020"`
173
- - Or change package.json exports to use `.cjs` extensions
174
-
Binary file
@@ -1,2 +0,0 @@
1
- // Empty PostCSS config to prevent inheriting root config
2
- export default { plugins: {} };