@pauly4010/evalai-sdk 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +46 -0
  2. package/README.md +108 -9
  3. package/dist/cli/api.d.ts +79 -0
  4. package/dist/cli/api.js +74 -0
  5. package/dist/cli/check.d.ts +16 -13
  6. package/dist/cli/check.js +117 -127
  7. package/dist/cli/ci-context.d.ts +6 -0
  8. package/dist/cli/ci-context.js +51 -0
  9. package/dist/cli/config.d.ts +24 -0
  10. package/dist/cli/config.js +158 -0
  11. package/dist/cli/constants.d.ts +13 -0
  12. package/dist/cli/constants.js +16 -0
  13. package/dist/cli/doctor.d.ts +11 -0
  14. package/dist/cli/doctor.js +82 -0
  15. package/dist/cli/formatters/github.d.ts +8 -0
  16. package/dist/cli/formatters/github.js +119 -0
  17. package/dist/cli/formatters/human.d.ts +6 -0
  18. package/dist/cli/formatters/human.js +92 -0
  19. package/dist/cli/formatters/json.d.ts +6 -0
  20. package/dist/cli/formatters/json.js +10 -0
  21. package/dist/cli/formatters/types.d.ts +76 -0
  22. package/dist/cli/formatters/types.js +5 -0
  23. package/dist/cli/gate.d.ts +13 -0
  24. package/dist/cli/gate.js +108 -0
  25. package/dist/cli/index.d.ts +1 -0
  26. package/dist/cli/index.js +31 -5
  27. package/dist/cli/init.d.ts +7 -0
  28. package/dist/cli/init.js +69 -0
  29. package/dist/cli/render/snippet.d.ts +5 -0
  30. package/dist/cli/render/snippet.js +15 -0
  31. package/dist/cli/render/sort.d.ts +10 -0
  32. package/dist/cli/render/sort.js +24 -0
  33. package/dist/cli/report/build-check-report.d.ts +16 -0
  34. package/dist/cli/report/build-check-report.js +94 -0
  35. package/dist/index.d.ts +1 -0
  36. package/dist/index.js +4 -1
  37. package/dist/integrations/openai-eval.d.ts +53 -0
  38. package/dist/integrations/openai-eval.js +226 -0
  39. package/dist/utils/input-hash.d.ts +8 -0
  40. package/dist/utils/input-hash.js +38 -0
  41. package/package.json +10 -3
  42. package/.env.example +0 -0
  43. package/ADDITIONAL_ISSUES_FOUND.md +0 -174
  44. package/dist/__tests__/assertions.test.d.ts +0 -1
  45. package/dist/__tests__/assertions.test.js +0 -288
  46. package/dist/__tests__/client.test.d.ts +0 -1
  47. package/dist/__tests__/client.test.js +0 -185
  48. package/dist/__tests__/testing.test.d.ts +0 -1
  49. package/dist/__tests__/testing.test.js +0 -230
  50. package/dist/__tests__/workflows.test.d.ts +0 -1
  51. package/dist/__tests__/workflows.test.js +0 -222
  52. package/evalai-sdk-1.2.0.tgz +0 -0
  53. package/postcss.config.mjs +0 -2
package/CHANGELOG.md CHANGED
@@ -5,6 +5,52 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.5.0] - 2026-02-18
9
+
10
+ ### ✨ Added
11
+
12
+ #### evalai CLI — CI DevX
13
+
14
+ - **`--format github`** — GitHub Actions annotations + step summary (`$GITHUB_STEP_SUMMARY`)
15
+ - **`--format json`** — Machine-readable output only
16
+ - **`--onFail import`** — On gate failure, import run metadata + failures to dashboard (idempotent per CI run)
17
+ - **`--explain`** — Show score breakdown (contribPts) and thresholds
18
+ - **`evalai doctor`** — Verify CI setup (config, API key, quality endpoint, baseline)
19
+ - **Pinned CLI invocation** — Use `npx -y @pauly4010/evalai-sdk@^1` for stable CI (avoids surprise v2 breaks)
20
+
21
+ #### Documentation
22
+
23
+ - **README** — 3-section adoption flow: 60s local → optional CI gate → no lock-in
24
+ - **Init output** — Shows path written, pinned snippet with `--format github --onFail import`
25
+ - **openAIChatEval** — "Gate this in CI" hint uses pinned invocation
26
+
27
+ ### 🔧 Changed
28
+
29
+ - **evalai init** — Output: "Wrote evalai.config.json at {path}", one next step, uninstall line
30
+ - **Baseline missing** — Treated as config failure (BAD_ARGS), not API error
31
+ - **parseArgs** — Returns `{ ok, args }` or `{ ok: false }` (no `process.exit` inside) for testability
32
+
33
+ ### 📦 Internal
34
+
35
+ - Refactored `check.ts` into modules: `api.ts`, `gate.ts`, `report/build-check-report.ts`, `formatters/`
36
+ - Deterministic helpers: `truncateSnippet`, `sortFailedCases`
37
+ - Formatter tests: `json.test.ts`, `github.test.ts`
38
+ - Doctor tests: `doctor.test.ts`
39
+
40
+ ---
41
+
42
+ ## [1.4.1] - 2026-02-18
43
+
44
+ ### ✨ Added
45
+
46
+ - **evalai check `--baseline production`** — Compare against latest run tagged with `environment=prod`
47
+ - **Baseline missing handling** — Clear failure when baseline not found and comparison requested
48
+
49
+ ### 🔧 Changed
50
+
51
+ - **Package hardening** — `files`, `module`, `sideEffects: false` for leaner npm publish
52
+ - **CLI** — Passes `baseline` param to quality API for deterministic CI gates
53
+
8
54
  ## [1.3.0] - 2025-10-21
9
55
 
10
56
  ### ✨ Added
package/README.md CHANGED
@@ -3,16 +3,87 @@
3
3
  [![npm version](https://img.shields.io/npm/v/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
4
4
  [![npm downloads](https://img.shields.io/npm/dm/@pauly4010/evalai-sdk.svg)](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
5
5
 
6
- Official TypeScript/JavaScript SDK for the AI Evaluation Platform. Build confidence in your AI systems with comprehensive evaluation tools.
6
+ Evaluate your AI systems locally in 60 seconds. Add an optional CI gate in 2 minutes. No lock-in — remove it by deleting the config file.
7
+
8
+ ---
9
+
10
+ ## 1. 60 seconds: Run locally (no account)
11
+
12
+ Install, run, get a score. No EvalAI account, no API key, no dashboard.
13
+
14
+ ```bash
15
+ npm install @pauly4010/evalai-sdk openai
16
+ ```
17
+
18
+ ```typescript
19
+ import { openAIChatEval } from "@pauly4010/evalai-sdk";
20
+
21
+ await openAIChatEval({
22
+ name: "chat-regression",
23
+ cases: [
24
+ { input: "Hello", expectedOutput: "greeting" },
25
+ { input: "2 + 2 = ?", expectedOutput: "4" },
26
+ ],
27
+ });
28
+ ```
29
+
30
+ Set `OPENAI_API_KEY` in your environment. You'll see something like:
31
+
32
+ ```
33
+ PASS 2/2 (score: 100)
34
+
35
+ Tip: Want dashboards and history?
36
+ Set EVALAI_API_KEY and connect this to the platform.
37
+ ```
38
+
39
+ With failures you get `FAIL 9/10 (score 90)`, failed cases listed, and a hint: `Gate this in CI: npx -y @pauly4010/evalai-sdk@^1 init`.
40
+
41
+ ---
42
+
43
+ ## 2. Optional: Add a CI gate (2 minutes)
44
+
45
+ When you're ready to gate PRs on quality:
46
+
47
+ ```bash
48
+ npx -y @pauly4010/evalai-sdk@^1 init
49
+ ```
50
+
51
+ **Create an evaluation in the dashboard → paste its ID into `evalai.config.json`:**
52
+
53
+ ```json
54
+ { "evaluationId": "42" }
55
+ ```
56
+
57
+ Then add to your CI:
58
+
59
+ ```yaml
60
+ - name: EvalAI gate
61
+ env:
62
+ EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}
63
+ run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import
64
+ ```
65
+
66
+ You'll get GitHub annotations + a step summary + a dashboard link.
67
+
68
+ - `--format github` — Annotations and step summary in GitHub Actions
69
+ - `--onFail import` — On failure, EvalAI imports the run metadata + failures into the dashboard (idempotent per CI run)
70
+
71
+ ---
72
+
73
+ ## 3. No lock-in
74
+
75
+ To stop using EvalAI: delete `evalai.config.json`. Your local `openAIChatEval` runs work the same without it. No account cancellation, no data export needed.
76
+
77
+ ---
7
78
 
8
79
  ## Installation
9
80
 
10
81
  ```bash
11
- npm install @pauly4010/evalai-sdk
82
+ npm install @pauly4010/evalai-sdk openai
12
83
  # or
13
- yarn add @pauly4010/evalai-sdk
84
+ yarn add @pauly4010/evalai-sdk openai
14
85
  # or
15
- pnpm add @pauly4010/evalai-sdk
86
+ pnpm add @pauly4010/evalai-sdk openai
16
87
  ```
17
88
 
18
89
  ## Environment Support
@@ -47,7 +118,7 @@ The following features require Node.js and **will not work in browsers**:
47
118
 
48
119
  Use appropriate features based on your environment. The SDK will throw helpful errors if you try to use Node.js-only features in a browser.
49
120
 
50
- ## Quick Start
121
+ ## AIEvalClient (Platform API)
51
122
 
52
123
  ```typescript
53
124
  import { AIEvalClient } from "@pauly4010/evalai-sdk";
@@ -501,13 +572,13 @@ console.log("Plan:", org.plan);
501
572
  console.log("Status:", org.status);
502
573
  ```
503
574
 
504
- ## evalai CLI (v1.4.0)
575
+ ## evalai CLI (v1.5.0)
505
576
 
506
577
  The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
507
578
 
508
579
  ```bash
509
580
  # Via npx (no global install)
510
- npx @pauly4010/evalai-sdk check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
581
+ npx -y @pauly4010/evalai-sdk@^1 check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
511
582
 
512
583
  # Or install globally
513
584
  npm install -g @pauly4010/evalai-sdk
@@ -522,19 +593,47 @@ Gate deployments on quality scores, regression, and compliance:
522
593
  |--------|-------------|
523
594
  | `--evaluationId <id>` | **Required.** Evaluation to gate on |
524
595
  | `--apiKey <key>` | API key (or `EVALAI_API_KEY` env) |
596
+ | `--format <fmt>` | `human` (default), `json`, or `github` (annotations + step summary) |
597
+ | `--onFail import` | When gate fails, import run with CI context for debugging |
598
+ | `--explain` | Show score breakdown and thresholds |
525
599
  | `--minScore <n>` | Fail if score &lt; n (0–100) |
526
600
  | `--maxDrop <n>` | Fail if score dropped &gt; n from baseline |
527
601
  | `--minN <n>` | Fail if total test cases &lt; n |
528
602
  | `--allowWeakEvidence` | Permit weak evidence level |
529
603
  | `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
530
- | `--baseline <mode>` | `published` or `previous` |
604
+ | `--baseline <mode>` | `published`, `previous`, or `production` |
531
605
  | `--baseUrl <url>` | API base URL |
532
606
 
533
607
  **Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
534
608
 
609
+ ### evalai doctor
610
+
611
+ Verify CI/CD setup before running check:
612
+
613
+ ```bash
614
+ npx -y @pauly4010/evalai-sdk@^1 doctor --evaluationId 42 --apiKey $EVALAI_API_KEY
615
+ ```
616
+
617
+ Uses the same quality endpoint as `check` — if doctor passes, check works.
618
+
535
619
  ## Changelog
536
620
 
537
- ### v1.4.0 (Latest)
621
+ ### v1.5.0 (Latest)
622
+
623
+ - **`--format github`** — Annotations + step summary in GitHub Actions
624
+ - **`--format json`** — Machine-readable output
625
+ - **`--onFail import`** — Import failing runs to dashboard (idempotent per CI run)
626
+ - **`--explain`** — Score breakdown and thresholds
627
+ - **`evalai doctor`** — Verify CI setup
628
+ - **Pinned invocation** — Use `npx -y @pauly4010/evalai-sdk@^1` for stable CI
629
+ - **README** — 3-section adoption flow (60s local → CI gate → no lock-in)
630
+
631
+ ### v1.4.1
632
+
633
+ - **evalai check `--baseline production`** — Compare against latest prod-tagged run
634
+ - **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
635
+
636
+ ### v1.4.0
538
637
 
539
638
  - **evalai CLI** — Command-line tool for CI/CD evaluation gates
540
639
  - `evalai check` — Gate deployments on quality scores, regression, and compliance
@@ -0,0 +1,79 @@
1
+ /**
2
+ * API fetch helpers for evalai check.
3
+ * Captures x-request-id from response headers.
4
+ */
5
+ export type QualityLatestData = {
6
+ score?: number;
7
+ total?: number | null;
8
+ evidenceLevel?: string | null;
9
+ baselineScore?: number | null;
10
+ regressionDelta?: number | null;
11
+ baselineMissing?: boolean | null;
12
+ breakdown?: {
13
+ passRate?: number;
14
+ safety?: number;
15
+ judge?: number;
16
+ };
17
+ flags?: string[];
18
+ evaluationRunId?: number;
19
+ evaluationId?: number;
20
+ };
21
+ export type RunDetailsData = {
22
+ results?: Array<{
23
+ testCaseId?: number;
24
+ status?: string;
25
+ output?: string;
26
+ durationMs?: number;
27
+ assertionsJson?: Record<string, unknown>;
28
+ test_cases?: {
29
+ name?: string;
30
+ input?: string;
31
+ expectedOutput?: string;
32
+ };
33
+ }>;
34
+ };
35
+ export declare function fetchQualityLatest(baseUrl: string, apiKey: string, evaluationId: string, baseline: string): Promise<{
36
+ ok: true;
37
+ data: QualityLatestData;
38
+ requestId?: string;
39
+ } | {
40
+ ok: false;
41
+ status: number;
42
+ body: string;
43
+ requestId?: string;
44
+ }>;
45
+ export declare function fetchRunDetails(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{
46
+ ok: true;
47
+ data: RunDetailsData;
48
+ } | {
49
+ ok: false;
50
+ }>;
51
+ export type CiContext = {
52
+ provider?: 'github' | 'gitlab' | 'circle' | 'unknown';
53
+ repo?: string;
54
+ sha?: string;
55
+ branch?: string;
56
+ pr?: number;
57
+ runUrl?: string;
58
+ actor?: string;
59
+ };
60
+ export type ImportResult = {
61
+ testCaseId: number;
62
+ status: 'passed' | 'failed';
63
+ output: string;
64
+ latencyMs?: number;
65
+ costUsd?: number;
66
+ assertionsJson?: Record<string, unknown>;
67
+ };
68
+ export declare function importRunOnFail(baseUrl: string, apiKey: string, evaluationId: string, results: ImportResult[], options: {
69
+ idempotencyKey?: string;
70
+ ci?: CiContext;
71
+ importClientVersion?: string;
72
+ }): Promise<{
73
+ ok: true;
74
+ runId: number;
75
+ } | {
76
+ ok: false;
77
+ status: number;
78
+ body: string;
79
+ }>;
@@ -0,0 +1,74 @@
1
+ "use strict";
2
+ /**
3
+ * API fetch helpers for evalai check.
4
+ * Captures x-request-id from response headers.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.fetchQualityLatest = fetchQualityLatest;
8
+ exports.fetchRunDetails = fetchRunDetails;
9
+ exports.importRunOnFail = importRunOnFail;
10
+ async function fetchQualityLatest(baseUrl, apiKey, evaluationId, baseline) {
11
+ const headers = { Authorization: `Bearer ${apiKey}` };
12
+ const url = `${baseUrl.replace(/\/$/, '')}/api/quality?evaluationId=${evaluationId}&action=latest&baseline=${baseline}`;
13
+ try {
14
+ const res = await fetch(url, { headers });
15
+ const requestId = res.headers.get('x-request-id') ?? undefined;
16
+ const body = await res.text();
17
+ if (!res.ok) {
18
+ return { ok: false, status: res.status, body, requestId };
19
+ }
20
+ const data = JSON.parse(body);
21
+ return { ok: true, data, requestId };
22
+ }
23
+ catch (err) {
24
+ const msg = err instanceof Error ? err.message : String(err);
25
+ return { ok: false, status: 0, body: msg, requestId: undefined };
26
+ }
27
+ }
28
+ async function fetchRunDetails(baseUrl, apiKey, evaluationId, runId) {
29
+ const headers = { Authorization: `Bearer ${apiKey}` };
30
+ const url = `${baseUrl.replace(/\/$/, '')}/api/evaluations/${evaluationId}/runs/${runId}`;
31
+ try {
32
+ const res = await fetch(url, { headers });
33
+ if (!res.ok)
34
+ return { ok: false };
35
+ const data = (await res.json());
36
+ return { ok: true, data };
37
+ }
38
+ catch {
39
+ return { ok: false };
40
+ }
41
+ }
42
+ async function importRunOnFail(baseUrl, apiKey, evaluationId, results, options) {
43
+ const headers = {
44
+ Authorization: `Bearer ${apiKey}`,
45
+ 'Content-Type': 'application/json',
46
+ };
47
+ if (options.idempotencyKey) {
48
+ headers['Idempotency-Key'] = options.idempotencyKey;
49
+ }
50
+ const body = {
51
+ environment: 'dev',
52
+ results,
53
+ importClientVersion: options.importClientVersion ?? 'evalai-cli',
54
+ ci: options.ci,
55
+ };
56
+ const url = `${baseUrl.replace(/\/$/, '')}/api/evaluations/${evaluationId}/runs/import`;
57
+ try {
58
+ const res = await fetch(url, {
59
+ method: 'POST',
60
+ headers,
61
+ body: JSON.stringify(body),
62
+ });
63
+ const text = await res.text();
64
+ if (!res.ok) {
65
+ return { ok: false, status: res.status, body: text };
66
+ }
67
+ const data = JSON.parse(text);
68
+ return { ok: true, runId: data.runId };
69
+ }
70
+ catch (err) {
71
+ const msg = err instanceof Error ? err.message : String(err);
72
+ return { ok: false, status: 0, body: msg };
73
+ }
74
+ }
@@ -14,7 +14,7 @@
14
14
  * --minN <n> Fail if total test cases < n (low sample size)
15
15
  * --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
16
16
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
17
- * --baseline <mode> Baseline comparison mode: "published" (default) or "previous"
17
+ * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
18
18
  * --evaluationId <id> Required. The evaluation to gate on.
19
19
  * --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
20
20
  * --apiKey <key> API key (default: EVALAI_API_KEY env var)
@@ -33,16 +33,8 @@
33
33
  * EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
34
34
  * EVALAI_API_KEY — API key for authentication
35
35
  */
36
- export declare const EXIT: {
37
- readonly PASS: 0;
38
- readonly SCORE_BELOW: 1;
39
- readonly REGRESSION: 2;
40
- readonly POLICY_VIOLATION: 3;
41
- readonly API_ERROR: 4;
42
- readonly BAD_ARGS: 5;
43
- readonly LOW_N: 6;
44
- readonly WEAK_EVIDENCE: 7;
45
- };
36
+ export { EXIT } from './constants';
37
+ export type FormatType = 'human' | 'json' | 'github';
46
38
  export interface CheckArgs {
47
39
  baseUrl: string;
48
40
  apiKey: string;
@@ -52,7 +44,18 @@ export interface CheckArgs {
52
44
  allowWeakEvidence: boolean;
53
45
  evaluationId: string;
54
46
  policy?: string;
55
- baseline: 'published' | 'previous';
47
+ baseline: 'published' | 'previous' | 'production';
48
+ format: FormatType;
49
+ explain: boolean;
50
+ onFail?: 'import';
56
51
  }
57
- export declare function parseArgs(argv: string[]): CheckArgs;
52
+ export type ParseArgsResult = {
53
+ ok: true;
54
+ args: CheckArgs;
55
+ } | {
56
+ ok: false;
57
+ exitCode: number;
58
+ message: string;
59
+ };
60
+ export declare function parseArgs(argv: string[]): ParseArgsResult;
58
61
  export declare function runCheck(args: CheckArgs): Promise<number>;