@pauly4010/evalai-sdk 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +46 -0
  2. package/README.md +108 -9
  3. package/dist/cli/api.d.ts +79 -0
  4. package/dist/cli/api.js +74 -0
  5. package/dist/cli/check.d.ts +16 -13
  6. package/dist/cli/check.js +117 -127
  7. package/dist/cli/ci-context.d.ts +6 -0
  8. package/dist/cli/ci-context.js +51 -0
  9. package/dist/cli/config.d.ts +24 -0
  10. package/dist/cli/config.js +158 -0
  11. package/dist/cli/constants.d.ts +13 -0
  12. package/dist/cli/constants.js +16 -0
  13. package/dist/cli/doctor.d.ts +11 -0
  14. package/dist/cli/doctor.js +82 -0
  15. package/dist/cli/formatters/github.d.ts +8 -0
  16. package/dist/cli/formatters/github.js +119 -0
  17. package/dist/cli/formatters/human.d.ts +6 -0
  18. package/dist/cli/formatters/human.js +92 -0
  19. package/dist/cli/formatters/json.d.ts +6 -0
  20. package/dist/cli/formatters/json.js +10 -0
  21. package/dist/cli/formatters/types.d.ts +76 -0
  22. package/dist/cli/formatters/types.js +5 -0
  23. package/dist/cli/gate.d.ts +13 -0
  24. package/dist/cli/gate.js +108 -0
  25. package/dist/cli/index.d.ts +1 -0
  26. package/dist/cli/index.js +31 -5
  27. package/dist/cli/init.d.ts +7 -0
  28. package/dist/cli/init.js +69 -0
  29. package/dist/cli/render/snippet.d.ts +5 -0
  30. package/dist/cli/render/snippet.js +15 -0
  31. package/dist/cli/render/sort.d.ts +10 -0
  32. package/dist/cli/render/sort.js +24 -0
  33. package/dist/cli/report/build-check-report.d.ts +16 -0
  34. package/dist/cli/report/build-check-report.js +94 -0
  35. package/dist/index.d.ts +1 -0
  36. package/dist/index.js +4 -1
  37. package/dist/integrations/openai-eval.d.ts +53 -0
  38. package/dist/integrations/openai-eval.js +226 -0
  39. package/dist/utils/input-hash.d.ts +8 -0
  40. package/dist/utils/input-hash.js +38 -0
  41. package/package.json +10 -3
  42. package/.env.example +0 -0
  43. package/ADDITIONAL_ISSUES_FOUND.md +0 -174
  44. package/dist/__tests__/assertions.test.d.ts +0 -1
  45. package/dist/__tests__/assertions.test.js +0 -288
  46. package/dist/__tests__/client.test.d.ts +0 -1
  47. package/dist/__tests__/client.test.js +0 -185
  48. package/dist/__tests__/testing.test.d.ts +0 -1
  49. package/dist/__tests__/testing.test.js +0 -230
  50. package/dist/__tests__/workflows.test.d.ts +0 -1
  51. package/dist/__tests__/workflows.test.js +0 -222
  52. package/evalai-sdk-1.2.0.tgz +0 -0
  53. package/postcss.config.mjs +0 -2
package/CHANGELOG.md CHANGED
@@ -5,6 +5,52 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.5.0] - 2026-02-18
9
+
10
+ ### ✨ Added
11
+
12
+ #### evalai CLI — CI DevX
13
+
14
+ - **`--format github`** — GitHub Actions annotations + step summary (`$GITHUB_STEP_SUMMARY`)
15
+ - **`--format json`** — Machine-readable output only
16
+ - **`--onFail import`** — On gate failure, import run metadata + failures to dashboard (idempotent per CI run)
17
+ - **`--explain`** — Show score breakdown (contribPts) and thresholds
18
+ - **`evalai doctor`** — Verify CI setup (config, API key, quality endpoint, baseline)
19
+ - **Pinned CLI invocation** — Use `npx -y @pauly4010/evalai-sdk@^1` for stable CI (avoids surprise v2 breaks)
20
+
21
+ #### Documentation
22
+
23
+ - **README** — 3-section adoption flow: 60s local → optional CI gate → no lock-in
24
+ - **Init output** — Shows path written, pinned snippet with `--format github --onFail import`
25
+ - **openAIChatEval** — "Gate this in CI" hint uses pinned invocation
26
+
27
+ ### 🔧 Changed
28
+
29
+ - **evalai init** — Output: "Wrote evalai.config.json at {path}", one next step, uninstall line
30
+ - **Baseline missing** — Treated as config failure (BAD_ARGS), not API error
31
+ - **parseArgs** — Returns `{ ok, args }` or `{ ok: false }` (no `process.exit` inside) for testability
32
+
33
+ ### 📦 Internal
34
+
35
+ - Refactored `check.ts` into modules: `api.ts`, `gate.ts`, `report/build-check-report.ts`, `formatters/`
36
+ - Deterministic helpers: `truncateSnippet`, `sortFailedCases`
37
+ - Formatter tests: `json.test.ts`, `github.test.ts`
38
+ - Doctor tests: `doctor.test.ts`
39
+
40
+ ---
41
+
42
+ ## [1.4.1] - 2026-02-18
43
+
44
+ ### ✨ Added
45
+
46
+ - **evalai check `--baseline production`** — Compare against latest run tagged with `environment=prod`
47
+ - **Baseline missing handling** — Clear failure when baseline not found and comparison requested
48
+
49
+ ### 🔧 Changed
50
+
51
+ - **Package hardening** — `files`, `module`, `sideEffects: false` for leaner npm publish
52
+ - **CLI** — Passes `baseline` param to quality API for deterministic CI gates
53
+
8
54
  ## [1.3.0] - 2025-10-21
9
55
 
10
56
  ### ✨ Added
package/README.md CHANGED
@@ -3,16 +3,87 @@
3
3
  [![npm version](https://img.shields.io/npm/v/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
4
4
  [![npm downloads](https://img.shields.io/npm/dm/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
5
5
 
6
- Official TypeScript/JavaScript SDK for the AI Evaluation Platform. Build confidence in your AI systems with comprehensive evaluation tools.
6
+ Evaluate your AI systems locally in 60 seconds. Add an optional CI gate in 2 minutes. No lock-in — remove it by deleting the config file.
7
+
8
+ ---
9
+
10
+ ## 1. 60 seconds: Run locally (no account)
11
+
12
+ Install, run, get a score. No EvalAI account, no API key, no dashboard.
13
+
14
+ ```bash
15
+ npm install @pauly4010/evalai-sdk openai
16
+ ```
17
+
18
+ ```typescript
19
+ import { openAIChatEval } from "@pauly4010/evalai-sdk";
20
+
21
+ await openAIChatEval({
22
+ name: "chat-regression",
23
+ cases: [
24
+ { input: "Hello", expectedOutput: "greeting" },
25
+ { input: "2 + 2 = ?", expectedOutput: "4" },
26
+ ],
27
+ });
28
+ ```
29
+
30
+ Set `OPENAI_API_KEY` in your environment. You'll see something like:
31
+
32
+ ```
33
+ PASS 2/2 (score: 100)
34
+
35
+ Tip: Want dashboards and history?
36
+ Set EVALAI_API_KEY and connect this to the platform.
37
+ ```
38
+
39
+ With failures you get `FAIL 9/10 (score 90)`, failed cases listed, and a hint: `Gate this in CI: npx -y @pauly4010/evalai-sdk@^1 init`.
40
+
41
+ ---
42
+
43
+ ## 2. Optional: Add a CI gate (2 minutes)
44
+
45
+ When you're ready to gate PRs on quality:
46
+
47
+ ```bash
48
+ npx -y @pauly4010/evalai-sdk@^1 init
49
+ ```
50
+
51
+ **Create an evaluation in the dashboard → paste its ID into `evalai.config.json`:**
52
+
53
+ ```json
54
+ { "evaluationId": "42" }
55
+ ```
56
+
57
+ Then add to your CI:
58
+
59
+ ```yaml
60
+ - name: EvalAI gate
61
+ env:
62
+ EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}
63
+ run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import
64
+ ```
65
+
66
+ You'll get GitHub annotations + a step summary + a dashboard link.
67
+
68
+ - `--format github` — Annotations and step summary in GitHub Actions
69
+ - `--onFail import` — On failure, EvalAI imports the run metadata + failures into the dashboard (idempotent per CI run)
70
+
71
+ ---
72
+
73
+ ## 3. No lock-in
74
+
75
+ To stop using EvalAI: delete `evalai.config.json`. Your local `openAIChatEval` runs work the same without it. No account cancellation, no data export needed.
76
+
77
+ ---
7
78
 
8
79
  ## Installation
9
80
 
10
81
  ```bash
11
- npm install @pauly4010/evalai-sdk
82
+ npm install @pauly4010/evalai-sdk openai
12
83
  # or
13
- yarn add @pauly4010/evalai-sdk
84
+ yarn add @pauly4010/evalai-sdk openai
14
85
  # or
15
- pnpm add @pauly4010/evalai-sdk
86
+ pnpm add @pauly4010/evalai-sdk openai
16
87
  ```
17
88
 
18
89
  ## Environment Support
@@ -47,7 +118,7 @@ The following features require Node.js and **will not work in browsers**:
47
118
 
48
119
  Use appropriate features based on your environment. The SDK will throw helpful errors if you try to use Node.js-only features in a browser.
49
120
 
50
- ## Quick Start
121
+ ## AIEvalClient (Platform API)
51
122
 
52
123
  ```typescript
53
124
  import { AIEvalClient } from "@pauly4010/evalai-sdk";
@@ -501,13 +572,13 @@ console.log("Plan:", org.plan);
501
572
  console.log("Status:", org.status);
502
573
  ```
503
574
 
504
- ## evalai CLI (v1.4.0)
575
+ ## evalai CLI (v1.5.0)
505
576
 
506
577
  The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
507
578
 
508
579
  ```bash
509
580
  # Via npx (no global install)
510
- npx @pauly4010/evalai-sdk check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
581
+ npx -y @pauly4010/evalai-sdk@^1 check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
511
582
 
512
583
  # Or install globally
513
584
  npm install -g @pauly4010/evalai-sdk
@@ -522,19 +593,47 @@ Gate deployments on quality scores, regression, and compliance:
522
593
  |--------|-------------|
523
594
  | `--evaluationId <id>` | **Required.** Evaluation to gate on |
524
595
  | `--apiKey <key>` | API key (or `EVALAI_API_KEY` env) |
596
+ | `--format <fmt>` | `human` (default), `json`, or `github` (annotations + step summary) |
597
+ | `--onFail import` | When gate fails, import run with CI context for debugging |
598
+ | `--explain` | Show score breakdown and thresholds |
525
599
  | `--minScore <n>` | Fail if score &lt; n (0–100) |
526
600
  | `--maxDrop <n>` | Fail if score dropped &gt; n from baseline |
527
601
  | `--minN <n>` | Fail if total test cases &lt; n |
528
602
  | `--allowWeakEvidence` | Permit weak evidence level |
529
603
  | `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
530
- | `--baseline <mode>` | `published` or `previous` |
604
+ | `--baseline <mode>` | `published`, `previous`, or `production` |
531
605
  | `--baseUrl <url>` | API base URL |
532
606
 
533
607
  **Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
534
608
 
609
+ ### evalai doctor
610
+
611
+ Verify CI/CD setup before running check:
612
+
613
+ ```bash
614
+ npx -y @pauly4010/evalai-sdk@^1 doctor --evaluationId 42 --apiKey $EVALAI_API_KEY
615
+ ```
616
+
617
+ Uses the same quality endpoint as `check` — if doctor passes, check works.
618
+
535
619
  ## Changelog
536
620
 
537
- ### v1.4.0 (Latest)
621
+ ### v1.5.0 (Latest)
622
+
623
+ - **`--format github`** — Annotations + step summary in GitHub Actions
624
+ - **`--format json`** — Machine-readable output
625
+ - **`--onFail import`** — Import failing runs to dashboard (idempotent per CI run)
626
+ - **`--explain`** — Score breakdown and thresholds
627
+ - **`evalai doctor`** — Verify CI setup
628
+ - **Pinned invocation** — Use `npx -y @pauly4010/evalai-sdk@^1` for stable CI
629
+ - **README** — 3-section adoption flow (60s local → CI gate → no lock-in)
630
+
631
+ ### v1.4.1
632
+
633
+ - **evalai check `--baseline production`** — Compare against latest prod-tagged run
634
+ - **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
635
+
636
+ ### v1.4.0
538
637
 
539
638
  - **evalai CLI** — Command-line tool for CI/CD evaluation gates
540
639
  - `evalai check` — Gate deployments on quality scores, regression, and compliance
@@ -0,0 +1,79 @@
1
+ /**
2
+ * API fetch helpers for evalai check.
3
+ * Captures x-request-id from response headers.
4
+ */
5
+ export type QualityLatestData = {
6
+ score?: number;
7
+ total?: number | null;
8
+ evidenceLevel?: string | null;
9
+ baselineScore?: number | null;
10
+ regressionDelta?: number | null;
11
+ baselineMissing?: boolean | null;
12
+ breakdown?: {
13
+ passRate?: number;
14
+ safety?: number;
15
+ judge?: number;
16
+ };
17
+ flags?: string[];
18
+ evaluationRunId?: number;
19
+ evaluationId?: number;
20
+ };
21
+ export type RunDetailsData = {
22
+ results?: Array<{
23
+ testCaseId?: number;
24
+ status?: string;
25
+ output?: string;
26
+ durationMs?: number;
27
+ assertionsJson?: Record<string, unknown>;
28
+ test_cases?: {
29
+ name?: string;
30
+ input?: string;
31
+ expectedOutput?: string;
32
+ };
33
+ }>;
34
+ };
35
+ export declare function fetchQualityLatest(baseUrl: string, apiKey: string, evaluationId: string, baseline: string): Promise<{
36
+ ok: true;
37
+ data: QualityLatestData;
38
+ requestId?: string;
39
+ } | {
40
+ ok: false;
41
+ status: number;
42
+ body: string;
43
+ requestId?: string;
44
+ }>;
45
+ export declare function fetchRunDetails(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{
46
+ ok: true;
47
+ data: RunDetailsData;
48
+ } | {
49
+ ok: false;
50
+ }>;
51
+ export type CiContext = {
52
+ provider?: 'github' | 'gitlab' | 'circle' | 'unknown';
53
+ repo?: string;
54
+ sha?: string;
55
+ branch?: string;
56
+ pr?: number;
57
+ runUrl?: string;
58
+ actor?: string;
59
+ };
60
+ export type ImportResult = {
61
+ testCaseId: number;
62
+ status: 'passed' | 'failed';
63
+ output: string;
64
+ latencyMs?: number;
65
+ costUsd?: number;
66
+ assertionsJson?: Record<string, unknown>;
67
+ };
68
+ export declare function importRunOnFail(baseUrl: string, apiKey: string, evaluationId: string, results: ImportResult[], options: {
69
+ idempotencyKey?: string;
70
+ ci?: CiContext;
71
+ importClientVersion?: string;
72
+ }): Promise<{
73
+ ok: true;
74
+ runId: number;
75
+ } | {
76
+ ok: false;
77
+ status: number;
78
+ body: string;
79
+ }>;
@@ -0,0 +1,74 @@
1
+ "use strict";
2
+ /**
3
+ * API fetch helpers for evalai check.
4
+ * Captures x-request-id from response headers.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.fetchQualityLatest = fetchQualityLatest;
8
+ exports.fetchRunDetails = fetchRunDetails;
9
+ exports.importRunOnFail = importRunOnFail;
10
+ async function fetchQualityLatest(baseUrl, apiKey, evaluationId, baseline) {
11
+ const headers = { Authorization: `Bearer ${apiKey}` };
12
+ const url = `${baseUrl.replace(/\/$/, '')}/api/quality?evaluationId=${evaluationId}&action=latest&baseline=${baseline}`;
13
+ try {
14
+ const res = await fetch(url, { headers });
15
+ const requestId = res.headers.get('x-request-id') ?? undefined;
16
+ const body = await res.text();
17
+ if (!res.ok) {
18
+ return { ok: false, status: res.status, body, requestId };
19
+ }
20
+ const data = JSON.parse(body);
21
+ return { ok: true, data, requestId };
22
+ }
23
+ catch (err) {
24
+ const msg = err instanceof Error ? err.message : String(err);
25
+ return { ok: false, status: 0, body: msg, requestId: undefined };
26
+ }
27
+ }
28
+ async function fetchRunDetails(baseUrl, apiKey, evaluationId, runId) {
29
+ const headers = { Authorization: `Bearer ${apiKey}` };
30
+ const url = `${baseUrl.replace(/\/$/, '')}/api/evaluations/${evaluationId}/runs/${runId}`;
31
+ try {
32
+ const res = await fetch(url, { headers });
33
+ if (!res.ok)
34
+ return { ok: false };
35
+ const data = (await res.json());
36
+ return { ok: true, data };
37
+ }
38
+ catch {
39
+ return { ok: false };
40
+ }
41
+ }
42
+ async function importRunOnFail(baseUrl, apiKey, evaluationId, results, options) {
43
+ const headers = {
44
+ Authorization: `Bearer ${apiKey}`,
45
+ 'Content-Type': 'application/json',
46
+ };
47
+ if (options.idempotencyKey) {
48
+ headers['Idempotency-Key'] = options.idempotencyKey;
49
+ }
50
+ const body = {
51
+ environment: 'dev',
52
+ results,
53
+ importClientVersion: options.importClientVersion ?? 'evalai-cli',
54
+ ci: options.ci,
55
+ };
56
+ const url = `${baseUrl.replace(/\/$/, '')}/api/evaluations/${evaluationId}/runs/import`;
57
+ try {
58
+ const res = await fetch(url, {
59
+ method: 'POST',
60
+ headers,
61
+ body: JSON.stringify(body),
62
+ });
63
+ const text = await res.text();
64
+ if (!res.ok) {
65
+ return { ok: false, status: res.status, body: text };
66
+ }
67
+ const data = JSON.parse(text);
68
+ return { ok: true, runId: data.runId };
69
+ }
70
+ catch (err) {
71
+ const msg = err instanceof Error ? err.message : String(err);
72
+ return { ok: false, status: 0, body: msg };
73
+ }
74
+ }
@@ -14,7 +14,7 @@
14
14
  * --minN <n> Fail if total test cases < n (low sample size)
15
15
  * --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
16
16
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
17
- * --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
17
+ * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
18
18
  * --evaluationId <id> Required. The evaluation to gate on.
19
19
  * --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
20
20
  * --apiKey <key> API key (default: EVALAI_API_KEY env var)
@@ -33,16 +33,8 @@
33
33
  * EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
34
34
  * EVALAI_API_KEY — API key for authentication
35
35
  */
36
- export declare const EXIT: {
37
- readonly PASS: 0;
38
- readonly SCORE_BELOW: 1;
39
- readonly REGRESSION: 2;
40
- readonly POLICY_VIOLATION: 3;
41
- readonly API_ERROR: 4;
42
- readonly BAD_ARGS: 5;
43
- readonly LOW_N: 6;
44
- readonly WEAK_EVIDENCE: 7;
45
- };
36
+ export { EXIT } from './constants';
37
+ export type FormatType = 'human' | 'json' | 'github';
46
38
  export interface CheckArgs {
47
39
  baseUrl: string;
48
40
  apiKey: string;
@@ -52,7 +44,18 @@ export interface CheckArgs {
52
44
  allowWeakEvidence: boolean;
53
45
  evaluationId: string;
54
46
  policy?: string;
55
- baseline: 'published' | 'previous';
47
+ baseline: 'published' | 'previous' | 'production';
48
+ format: FormatType;
49
+ explain: boolean;
50
+ onFail?: 'import';
56
51
  }
57
- export declare function parseArgs(argv: string[]): CheckArgs;
52
+ export type ParseArgsResult = {
53
+ ok: true;
54
+ args: CheckArgs;
55
+ } | {
56
+ ok: false;
57
+ exitCode: number;
58
+ message: string;
59
+ };
60
+ export declare function parseArgs(argv: string[]): ParseArgsResult;
58
61
  export declare function runCheck(args: CheckArgs): Promise<number>;