npm - @pauly4010/evalai-sdk - Versions diffs - 1.4.1 → 1.5.0 - Mend

@pauly4010/evalai-sdk 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/CHANGELOG.md +34 -0
package/README.md +102 -8
package/dist/cli/api.d.ts +79 -0
package/dist/cli/api.js +74 -0
package/dist/cli/check.d.ts +15 -12
package/dist/cli/check.js +113 -134
package/dist/cli/ci-context.d.ts +6 -0
package/dist/cli/ci-context.js +51 -0
package/dist/cli/config.d.ts +24 -0
package/dist/cli/config.js +158 -0
package/dist/cli/constants.d.ts +13 -0
package/dist/cli/constants.js +16 -0
package/dist/cli/doctor.d.ts +11 -0
package/dist/cli/doctor.js +82 -0
package/dist/cli/formatters/github.d.ts +8 -0
package/dist/cli/formatters/github.js +119 -0
package/dist/cli/formatters/human.d.ts +6 -0
package/dist/cli/formatters/human.js +92 -0
package/dist/cli/formatters/json.d.ts +6 -0
package/dist/cli/formatters/json.js +10 -0
package/dist/cli/formatters/types.d.ts +76 -0
package/dist/cli/formatters/types.js +5 -0
package/dist/cli/gate.d.ts +13 -0
package/dist/cli/gate.js +108 -0
package/dist/cli/index.d.ts +1 -0
package/dist/cli/index.js +31 -5
package/dist/cli/init.d.ts +7 -0
package/dist/cli/init.js +69 -0
package/dist/cli/render/snippet.d.ts +5 -0
package/dist/cli/render/snippet.js +15 -0
package/dist/cli/render/sort.d.ts +10 -0
package/dist/cli/render/sort.js +24 -0
package/dist/cli/report/build-check-report.d.ts +16 -0
package/dist/cli/report/build-check-report.js +94 -0
package/dist/index.d.ts +1 -0
package/dist/index.js +4 -1
package/dist/integrations/openai-eval.d.ts +53 -0
package/dist/integrations/openai-eval.js +226 -0
package/dist/utils/input-hash.d.ts +8 -0
package/dist/utils/input-hash.js +38 -0
package/package.json +5 -1
package/dist/__tests__/assertions.test.d.ts +0 -1
package/dist/__tests__/assertions.test.js +0 -288
package/dist/__tests__/client.test.d.ts +0 -1
package/dist/__tests__/client.test.js +0 -185
package/dist/__tests__/testing.test.d.ts +0 -1
package/dist/__tests__/testing.test.js +0 -230
package/dist/__tests__/workflows.test.d.ts +0 -1
package/dist/__tests__/workflows.test.js +0 -222

package/dist/cli/formatters/github.js ADDED Viewed

@@ -0,0 +1,119 @@
+"use strict";
+/**
+ * GitHub formatter for evalai check.
+ * - stdout: minimal (verdict + score + link) + ::error annotations for failed cases
+ * - Step summary: full Markdown written to GITHUB_STEP_SUMMARY (not stdout)
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.appendStepSummary = appendStepSummary;
+exports.formatGitHub = formatGitHub;
+const fs = __importStar(require("fs"));
+const snippet_1 = require("../render/snippet");
+const ANNOTATION_MAX = 10;
+function escapeAnnotationMessage(s) {
+    return s.replace(/\r/g, '').replace(/\n/g, '%0A');
+}
+function formatAnnotation(fc) {
+    const id = fc.testCaseId ?? fc.name ?? 'unknown';
+    const reason = fc.reason ?? fc.outputSnippet ?? fc.output ?? 'no output';
+    const msg = escapeAnnotationMessage(`TestCase ${id} failed - ${(0, snippet_1.truncateSnippet)(reason, 100)}`);
+    return `::error title=EvalAI regression::${msg}`;
+}
+function appendStepSummary(report) {
+    const path = typeof process !== 'undefined' && process.env?.GITHUB_STEP_SUMMARY;
+    if (!path)
+        return;
+    const lines = [];
+    const passed = report.verdict === 'pass';
+    lines.push('## EvalAI Gate');
+    lines.push('');
+    lines.push(passed ? '✅ **PASSED**' : `❌ **FAILED**: ${report.reasonMessage ?? report.reasonCode}`);
+    lines.push('');
+    const deltaStr = report.baselineScore != null && report.delta != null
+        ? ` (baseline ${report.baselineScore}, ${report.delta >= 0 ? '+' : ''}${report.delta} pts)`
+        : '';
+    lines.push(`**Score:** ${report.score ?? 0}/100${deltaStr}`);
+    lines.push('');
+    const failedCases = report.failedCases ?? [];
+    if (failedCases.length > 0) {
+        lines.push(`### ${failedCases.length} failing case${failedCases.length === 1 ? '' : 's'}`);
+        lines.push('');
+        for (const fc of failedCases.slice(0, 10)) {
+            const label = fc.name ?? fc.input ?? '(unnamed)';
+            const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 80);
+            const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 80);
+            const reason = out ? `got "${out}"` : 'no output';
+            lines.push(`- **${(0, snippet_1.truncateSnippet)(label, 60)}** — expected: ${exp || '(any)'}, ${reason}`);
+        }
+        if (failedCases.length > 10) {
+            lines.push(`- _+ ${failedCases.length - 10} more_`);
+        }
+        lines.push('');
+    }
+    if (report.dashboardUrl) {
+        lines.push(`[View Dashboard](${report.dashboardUrl})`);
+        lines.push('');
+    }
+    try {
+        fs.appendFileSync(path, lines.join('\n'), 'utf8');
+    }
+    catch {
+        // Non-fatal: step summary is best-effort
+    }
+}
+function formatGitHub(report) {
+    const stdoutLines = [];
+    // Emit ::error annotations for failed cases (up to N)
+    const failedCases = report.failedCases ?? [];
+    const toAnnotate = failedCases.slice(0, ANNOTATION_MAX);
+    for (const fc of toAnnotate) {
+        stdoutLines.push(formatAnnotation(fc));
+    }
+    // Minimal summary: verdict + score + link
+    const passed = report.verdict === 'pass';
+    const failReason = report.reasonMessage ?? report.reasonCode;
+    stdoutLines.push(passed ? '\n✓ EvalAI gate PASSED' : `\n✗ EvalAI gate FAILED: ${failReason}`);
+    const deltaStr = report.baselineScore != null && report.delta != null
+        ? ` (baseline ${report.baselineScore}, ${report.delta >= 0 ? '+' : ''}${report.delta} pts)`
+        : '';
+    stdoutLines.push(`Score: ${report.score ?? 0}/100${deltaStr}`);
+    if (report.dashboardUrl) {
+        stdoutLines.push(`Dashboard: ${report.dashboardUrl}`);
+    }
+    // Write full markdown to GITHUB_STEP_SUMMARY (not stdout)
+    appendStepSummary(report);
+    return stdoutLines.join('\n');
+}

package/dist/cli/formatters/human.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+/**
+ * Human-readable formatter for evalai check output.
+ * Deterministic: verdict → score → failures → link → hint.
+ */
+import type { CheckReport } from './types';
+export declare function formatHuman(report: CheckReport): string;

package/dist/cli/formatters/human.js ADDED Viewed

@@ -0,0 +1,92 @@
+"use strict";
+/**
+ * Human-readable formatter for evalai check output.
+ * Deterministic: verdict → score → failures → link → hint.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.formatHuman = formatHuman;
+const snippet_1 = require("../render/snippet");
+const TOP_N = 3;
+function formatHuman(report) {
+    const lines = [];
+    const passed = report.verdict === 'pass';
+    const failReason = report.reasonMessage;
+    lines.push(passed ? '\n✓ EvalAI gate PASSED' : `\n✗ EvalAI gate FAILED: ${failReason ?? report.reasonCode}`);
+    const deltaStr = report.baselineScore != null && report.delta != null
+        ? ` (baseline ${report.baselineScore}, ${report.delta >= 0 ? '+' : ''}${report.delta} pts)`
+        : '';
+    lines.push(`Score: ${report.score ?? 0}/100${deltaStr}`);
+    const failedCases = report.failedCases ?? [];
+    if (failedCases.length > 0) {
+        const toShow = failedCases.slice(0, TOP_N);
+        lines.push(`${failedCases.length} failing case${failedCases.length === 1 ? '' : 's'}:`);
+        for (const fc of toShow) {
+            const label = fc.name ?? fc.input ?? '(unnamed)';
+            const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 50);
+            const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 50);
+            const reason = out ? `got "${out}"` : 'no output';
+            lines.push(`  - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || '(any)'}, ${reason}`);
+        }
+        if (failedCases.length > toShow.length) {
+            lines.push(`  + ${failedCases.length - toShow.length} more`);
+        }
+    }
+    if (report.dashboardUrl) {
+        lines.push(`Dashboard: ${report.dashboardUrl}`);
+    }
+    if (!passed) {
+        lines.push('Next: View full report above, fix failing cases, or adjust gate with --minScore / --maxDrop');
+    }
+    if (report.explain && (report.breakdown01 || report.contribPts || report.flags?.length)) {
+        lines.push('');
+        lines.push('--- Explain ---');
+        if (report.contribPts) {
+            const cp = report.contribPts;
+            const pts = [];
+            if (cp.passRatePts != null)
+                pts.push(`passRate: ${cp.passRatePts}`);
+            if (cp.safetyPts != null)
+                pts.push(`safety: ${cp.safetyPts}`);
+            if (cp.compliancePts != null)
+                pts.push(`compliance: ${cp.compliancePts}`);
+            if (cp.performancePts != null)
+                pts.push(`performance: ${cp.performancePts}`);
+            if (pts.length)
+                lines.push(`Contrib pts: ${pts.join(', ')}`);
+        }
+        if (report.breakdown01) {
+            const b = report.breakdown01;
+            const parts = [];
+            if (b.passRate != null)
+                parts.push(`passRate=${b.passRate}`);
+            if (b.safety != null)
+                parts.push(`safety=${b.safety}`);
+            if (b.judge != null)
+                parts.push(`judge=${b.judge}`);
+            if (b.schema != null)
+                parts.push(`schema=${b.schema}`);
+            if (b.latency != null)
+                parts.push(`latency=${b.latency}`);
+            if (b.cost != null)
+                parts.push(`cost=${b.cost}`);
+            if (parts.length)
+                lines.push(`Breakdown: ${parts.join(', ')}`);
+        }
+        if (report.flags && report.flags.length > 0) {
+            lines.push(`Flags: ${report.flags.join(', ')}`);
+        }
+        if (report.thresholds) {
+            const t = report.thresholds;
+            const parts = [];
+            if (t.minScore != null)
+                parts.push(`minScore=${t.minScore}`);
+            if (t.maxDrop != null)
+                parts.push(`maxDrop=${t.maxDrop}`);
+            if (t.minN != null)
+                parts.push(`minN=${t.minN}`);
+            if (parts.length)
+                lines.push(`Thresholds: ${parts.join(', ')}`);
+        }
+    }
+    return lines.join('\n');
+}

package/dist/cli/formatters/json.d.ts ADDED Viewed

@@ -0,0 +1,6 @@
+/**
+ * JSON formatter for evalai check.
+ * Outputs only JSON, no extra logs.
+ */
+import type { CheckReport } from './types';
+export declare function formatJson(report: CheckReport): string;

package/dist/cli/formatters/json.js ADDED Viewed

@@ -0,0 +1,10 @@
+"use strict";
+/**
+ * JSON formatter for evalai check.
+ * Outputs only JSON, no extra logs.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.formatJson = formatJson;
+function formatJson(report) {
+    return JSON.stringify(report, null, 0);
+}

package/dist/cli/formatters/types.d.ts ADDED Viewed

@@ -0,0 +1,76 @@
+/**
+ * CheckReport and related types for formatters.
+ */
+/** Overall gate outcome. The formatters treat 'pass' as success and any other value as failure. */
+export type GateVerdict = 'pass' | 'fail';
+/** Machine-readable failure reason. The formatters print it when `reasonMessage` is absent. */
+export type FailureReasonCode = 'LOW_SCORE' | 'LOW_PASS_RATE' | 'SAFETY_RISK' | 'LATENCY_RISK' | 'COST_RISK' | 'BASELINE_MISSING' | 'MAX_DROP_EXCEEDED' | 'INSUFFICIENT_EVIDENCE' | 'POLICY_VIOLATION' | 'UNKNOWN';
+/**
+ * Per-component scores, printed as "Breakdown:" in the human formatter's Explain section.
+ * NOTE(review): the `01` suffix suggests values in the 0–1 range — confirm against the API.
+ */
+export type ScoreBreakdown01 = {
+    passRate?: number;
+    safety?: number;
+    judge?: number;
+    schema?: number;
+    latency?: number;
+    cost?: number;
+};
+/** Per-component point contributions, printed as "Contrib pts:" in the human formatter's Explain section. */
+export type ScoreContribPts = {
+    passRatePts?: number;
+    safetyPts?: number;
+    compliancePts?: number;
+    performancePts?: number;
+};
+/**
+ * Gate thresholds attached to a report.
+ * The human formatter prints only `minScore`, `maxDrop` and `minN`.
+ */
+export type GateThresholds = {
+    minScore?: number;
+    minPassRate?: number;
+    minSafety?: number;
+    maxDrop?: number;
+    minN?: number;
+    allowWeakEvidence?: boolean;
+    baseline?: 'published' | 'previous' | 'production';
+};
+/**
+ * One test case included in a report's failure list.
+ * Formatters label a case with `name`, falling back to `input`. They prefer
+ * `expectedOutput` over `expectedSnippet` and `output` over `outputSnippet`.
+ */
+export type FailedCase = {
+    /** Used as the case identifier in GitHub annotations and as the sort tie-breaker. */
+    testCaseId?: number;
+    /** Sort severity order is failed, error, skipped, passed. */
+    status?: 'failed' | 'error' | 'skipped' | 'passed';
+    name?: string;
+    input?: string;
+    inputSnippet?: string;
+    expectedOutput?: string;
+    expectedSnippet?: string;
+    output?: string;
+    outputSnippet?: string;
+    /** Preferred detail text for GitHub annotations when present. */
+    reason?: string;
+};
+/**
+ * CI run metadata carried on a report.
+ * NOTE(review): presumably populated from the CI environment (see cli/ci-context) — confirm.
+ */
+export type CiContext = {
+    provider?: 'github' | 'gitlab' | 'circle' | 'unknown';
+    repo?: string;
+    sha?: string;
+    branch?: string;
+    pr?: number;
+    runUrl?: string;
+    actor?: string;
+};
+/** Report object consumed by the human, json and github formatters. */
+export type CheckReport = {
+    evaluationId: string;
+    runId?: number;
+    verdict: GateVerdict;
+    reasonCode: FailureReasonCode;
+    /** Human-readable failure text; formatters fall back to `reasonCode` when absent. */
+    reasonMessage?: string;
+    /** Rendered as "<score>/100"; formatters show 0 when absent. */
+    score?: number;
+    /** Shown next to the score only when `delta` is also present. */
+    baselineScore?: number;
+    /** Printed in points, with an explicit '+' when non-negative. */
+    delta?: number;
+    passRate?: number;
+    safetyPassRate?: number;
+    flags?: string[];
+    breakdown01?: ScoreBreakdown01;
+    contribPts?: ScoreContribPts;
+    thresholds?: GateThresholds;
+    n?: number;
+    evidenceLevel?: 'strong' | 'medium' | 'weak';
+    baselineMissing?: boolean;
+    /** Printed as a link by the human and github formatters when present. */
+    dashboardUrl?: string;
+    failedCases?: FailedCase[];
+    failedCasesShown?: number;
+    failedCasesMore?: number;
+    requestId?: string;
+    durationMs?: number;
+    ci?: CiContext;
+    /** Enables the Explain section of the human formatter. */
+    explain?: boolean;
+};

package/dist/cli/formatters/types.js ADDED Viewed

@@ -0,0 +1,5 @@
+"use strict";
+/**
+ * CheckReport and related types for formatters.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });

package/dist/cli/gate.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * Pure gate evaluation. No console output.
+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
+ */
+import type { CheckArgs } from './check';
+import type { QualityLatestData } from './api';
+/** Outcome of evaluateGate. */
+export type GateResult = {
+    /** Process exit code, taken from the EXIT table in ./constants. */
+    exitCode: number;
+    /** True only when no gate failed. */
+    passed: boolean;
+    /** 'PASS' on success, otherwise the code of the first gate that failed. */
+    reasonCode: string;
+    /** Explanation of the failure; null on success. */
+    reasonMessage: string | null;
+};
+export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;

package/dist/cli/gate.js ADDED Viewed

@@ -0,0 +1,108 @@
+"use strict";
+/**
+ * Pure gate evaluation. No console output.
+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.evaluateGate = evaluateGate;
+const constants_1 = require("./constants");
+function evaluateGate(args, quality) {
+    const score = quality?.score ?? 0;
+    const total = quality?.total ?? null;
+    const evidenceLevel = quality?.evidenceLevel ?? null;
+    const baselineScore = quality?.baselineScore ?? null;
+    const regressionDelta = quality?.regressionDelta ?? null;
+    const baselineMissing = quality?.baselineMissing === true;
+    const breakdown = quality?.breakdown ?? {};
+    const policyFlags = (quality?.flags ?? []);
+    // Baseline missing → configuration failure (not API error)
+    if (baselineMissing && (args.baseline !== 'published' || args.maxDrop !== undefined)) {
+        const msg = args.baseline === 'production'
+            ? 'No prod runs exist for this evaluation. Tag runs with environment=prod before using --baseline production.'
+            : `Baseline (${args.baseline}) not found. Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`;
+        return {
+            exitCode: constants_1.EXIT.BAD_ARGS,
+            passed: false,
+            reasonCode: 'BASELINE_MISSING',
+            reasonMessage: msg,
+        };
+    }
+    // minN gate
+    if (args.minN !== undefined && total !== null && total < args.minN) {
+        return {
+            exitCode: constants_1.EXIT.LOW_N,
+            passed: false,
+            reasonCode: 'INSUFFICIENT_EVIDENCE',
+            reasonMessage: `total test cases (${total}) < minN (${args.minN})`,
+        };
+    }
+    // allowWeakEvidence gate
+    if (!args.allowWeakEvidence && evidenceLevel === 'weak') {
+        return {
+            exitCode: constants_1.EXIT.WEAK_EVIDENCE,
+            passed: false,
+            reasonCode: 'INSUFFICIENT_EVIDENCE',
+            reasonMessage: "evidence level is 'weak' (use --allowWeakEvidence to permit)",
+        };
+    }
+    // Compute gate result
+    if (args.minScore > 0 && score < args.minScore) {
+        return {
+            exitCode: constants_1.EXIT.SCORE_BELOW,
+            passed: false,
+            reasonCode: 'LOW_SCORE',
+            reasonMessage: `score ${score} < minScore ${args.minScore}`,
+        };
+    }
+    if (args.maxDrop !== undefined && regressionDelta !== null && regressionDelta < -(args.maxDrop)) {
+        return {
+            exitCode: constants_1.EXIT.REGRESSION,
+            passed: false,
+            reasonCode: 'MAX_DROP_EXCEEDED',
+            reasonMessage: `score dropped ${Math.abs(regressionDelta)} pts from baseline (max allowed: ${args.maxDrop})`,
+        };
+    }
+    if (args.policy) {
+        const policyChecks = {
+            HIPAA: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK'] },
+            SOC2: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
+            GDPR: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
+            PCI_DSS: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
+            FINRA_4511: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
+        };
+        const policyName = args.policy.toUpperCase();
+        const check = policyChecks[policyName];
+        if (!check) {
+            return {
+                exitCode: constants_1.EXIT.BAD_ARGS,
+                passed: false,
+                reasonCode: 'UNKNOWN',
+                reasonMessage: `Unknown policy: ${args.policy}. Available: ${Object.keys(policyChecks).join(', ')}`,
+            };
+        }
+        const safetyRate = breakdown?.safety ?? 0;
+        if (safetyRate < check.requiredSafetyRate) {
+            return {
+                exitCode: constants_1.EXIT.POLICY_VIOLATION,
+                passed: false,
+                reasonCode: 'POLICY_VIOLATION',
+                reasonMessage: `policy ${policyName}: safety ${Math.round(safetyRate * 100)}% < required ${Math.round(check.requiredSafetyRate * 100)}%`,
+            };
+        }
+        const violations = policyFlags.filter((f) => check.maxFlags.includes(f));
+        if (violations.length > 0) {
+            return {
+                exitCode: constants_1.EXIT.POLICY_VIOLATION,
+                passed: false,
+                reasonCode: 'POLICY_VIOLATION',
+                reasonMessage: `policy ${policyName}: ${violations.join(', ')}`,
+            };
+        }
+    }
+    return {
+        exitCode: constants_1.EXIT.PASS,
+        passed: true,
+        reasonCode: 'PASS',
+        reasonMessage: null,
+    };
+}

package/dist/cli/index.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@
  * evalai — EvalAI CLI
  *
  * Commands:
+ *   evalai init   — Create evalai.config.json
  *   evalai check  — CI/CD evaluation gate (see evalai check --help)
  */
 export {};

package/dist/cli/index.js CHANGED Viewed

@@ -4,15 +4,35 @@
  * evalai — EvalAI CLI
  *
  * Commands:
+ *   evalai init   — Create evalai.config.json
  *   evalai check  — CI/CD evaluation gate (see evalai check --help)
  */
 Object.defineProperty(exports, "__esModule", { value: true });
 const check_1 = require("./check");
+const init_1 = require("./init");
+const doctor_1 = require("./doctor");
 const argv = process.argv.slice(2);
 const subcommand = argv[0];
-if (subcommand === 'check') {
-    const args = (0, check_1.parseArgs)(argv.slice(1));
-    (0, check_1.runCheck)(args)
+if (subcommand === 'init') {
+    const cwd = process.cwd();
+    const ok = (0, init_1.runInit)(cwd);
+    process.exit(ok ? 0 : 1);
+}
+else if (subcommand === 'doctor') {
+    (0, doctor_1.runDoctor)(argv.slice(1))
+        .then((code) => process.exit(code))
+        .catch((err) => {
+        console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
+else if (subcommand === 'check') {
+    const parsed = (0, check_1.parseArgs)(argv.slice(1));
+    if (!parsed.ok) {
+        console.error(parsed.message);
+        process.exit(parsed.exitCode);
+    }
+    (0, check_1.runCheck)(parsed.args)
         .then((code) => process.exit(code))
         .catch((err) => {
         console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
@@ -23,20 +43,26 @@ else {
     console.log(`EvalAI CLI
 Usage:
+  evalai init              Create evalai.config.json
+  evalai doctor [options]  Verify CI/CD setup (same endpoint as check)
   evalai check [options]   CI/CD evaluation gate
 Options for check:
-  --evaluationId <id>  Required. Evaluation to gate on.
+  --evaluationId <id>  Evaluation to gate on (or from config)
   --apiKey <key>      API key (or EVALAI_API_KEY env)
+  --format <fmt>      Output format: human (default), json, github
+  --explain           Show score breakdown and thresholds
+  --onFail import     When gate fails, import run with CI context
   --minScore <n>      Fail if score < n (0-100)
   --maxDrop <n>       Fail if score dropped > n from baseline
   --minN <n>          Fail if total test cases < n
   --allowWeakEvidence Allow weak evidence level
   --policy <name>     Enforce policy (HIPAA, SOC2, GDPR, etc.)
-  --baseline <mode>   "published" or "previous"
+  --baseline <mode>   "published", "previous", or "production"
   --baseUrl <url>     API base URL
 Examples:
+  evalai init
   evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
   evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
 `);

package/dist/cli/init.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env node
+/**
+ * evalai init — Create evalai.config.json
+ *
+ * Creates the smallest possible config file. Defaults belong in code.
+ */
+export declare function runInit(cwd?: string): boolean;

package/dist/cli/init.js ADDED Viewed

@@ -0,0 +1,69 @@
+#!/usr/bin/env node
+"use strict";
+/**
+ * evalai init — Create evalai.config.json
+ *
+ * Creates the smallest possible config file. Defaults belong in code.
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.runInit = runInit;
+const fs = __importStar(require("fs"));
+const path = __importStar(require("path"));
+const CONFIG_CONTENT = `{
+  "evaluationId": ""
+}
+`;
+function runInit(cwd = process.cwd()) {
+    const configPath = path.join(cwd, 'evalai.config.json');
+    if (fs.existsSync(configPath)) {
+        console.log(`evalai.config.json already exists at ${path.resolve(configPath)}`);
+        return false;
+    }
+    fs.writeFileSync(configPath, CONFIG_CONTENT, 'utf-8');
+    const resolvedPath = path.resolve(configPath);
+    console.log(`Wrote evalai.config.json at ${resolvedPath}`);
+    console.log('');
+    console.log('Next: paste evaluationId into evalai.config.json, then run npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import');
+    console.log('');
+    console.log('GitHub Actions snippet (add to your workflow):');
+    console.log('  - name: EvalAI gate');
+    console.log('    env:');
+    console.log('      EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}');
+    console.log('    run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import');
+    console.log('');
+    console.log('To uninstall: delete evalai.config.json.');
+    return true;
+}

package/dist/cli/render/snippet.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+/**
+ * Truncate a string for deterministic output.
+ * Replaces newlines with space, caps length.
+ */
+export declare function truncateSnippet(s: string | undefined | null, maxLen?: number): string;

package/dist/cli/render/snippet.js ADDED Viewed

@@ -0,0 +1,15 @@
+"use strict";
+/**
+ * Truncate a string for deterministic output.
+ * Replaces newlines with space, caps length.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.truncateSnippet = truncateSnippet;
+function truncateSnippet(s, maxLen = 140) {
+    if (s == null)
+        return '';
+    const normalized = s.replace(/\s+/g, ' ').trim();
+    if (normalized.length <= maxLen)
+        return normalized;
+    return normalized.slice(0, maxLen) + '…';
+}

package/dist/cli/render/sort.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * Deterministic ordering for failed cases.
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
+ */
+export interface SortableCase {
+    status?: string;
+    testCaseId?: number;
+    [key: string]: unknown;
+}
+export declare function sortFailedCases<T extends SortableCase>(cases: T[]): T[];

package/dist/cli/render/sort.js ADDED Viewed

@@ -0,0 +1,24 @@
+"use strict";
+/**
+ * Deterministic ordering for failed cases.
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.sortFailedCases = sortFailedCases;
+const STATUS_SEVERITY = {
+    failed: 0,
+    error: 1,
+    skipped: 2,
+    passed: 3,
+};
+function sortFailedCases(cases) {
+    return [...cases].sort((a, b) => {
+        const sevA = STATUS_SEVERITY[a.status?.toLowerCase() ?? ''] ?? 4;
+        const sevB = STATUS_SEVERITY[b.status?.toLowerCase() ?? ''] ?? 4;
+        if (sevA !== sevB)
+            return sevA - sevB;
+        const idA = a.testCaseId ?? 0;
+        const idB = b.testCaseId ?? 0;
+        return idA - idB;
+    });
+}