npm - @pauly4010/evalai-sdk - Versions diffs - 1.4.1 → 1.5.5 - Mend

@pauly4010/evalai-sdk 1.4.1 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

package/CHANGELOG.md +85 -0
package/README.md +205 -543
package/dist/assertions.d.ts +2 -2
package/dist/assertions.js +104 -71
package/dist/batch.js +12 -17
package/dist/cache.js +7 -11
package/dist/cli/api.d.ts +108 -0
package/dist/cli/api.js +130 -0
package/dist/cli/check.d.ts +28 -13
package/dist/cli/check.js +249 -142
package/dist/cli/ci-context.d.ts +6 -0
package/dist/cli/ci-context.js +110 -0
package/dist/cli/config.d.ts +30 -0
package/dist/cli/config.js +207 -0
package/dist/cli/constants.d.ts +15 -0
package/dist/cli/constants.js +18 -0
package/dist/cli/doctor.d.ts +11 -0
package/dist/cli/doctor.js +82 -0
package/dist/cli/formatters/github.d.ts +8 -0
package/dist/cli/formatters/github.js +130 -0
package/dist/cli/formatters/human.d.ts +6 -0
package/dist/cli/formatters/human.js +107 -0
package/dist/cli/formatters/json.d.ts +6 -0
package/dist/cli/formatters/json.js +10 -0
package/dist/cli/formatters/pr-comment.d.ts +12 -0
package/dist/cli/formatters/pr-comment.js +101 -0
package/dist/cli/formatters/types.d.ts +100 -0
package/dist/cli/formatters/types.js +5 -0
package/dist/cli/gate.d.ts +21 -0
package/dist/cli/gate.js +175 -0
package/dist/cli/index.d.ts +1 -0
package/dist/cli/index.js +67 -23
package/dist/cli/init.d.ts +7 -0
package/dist/cli/init.js +69 -0
package/dist/cli/policy-packs.d.ts +23 -0
package/dist/cli/policy-packs.js +83 -0
package/dist/cli/profiles.d.ts +28 -0
package/dist/cli/profiles.js +30 -0
package/dist/cli/reason-codes.d.ts +17 -0
package/dist/cli/reason-codes.js +19 -0
package/dist/cli/render/snippet.d.ts +5 -0
package/dist/cli/render/snippet.js +15 -0
package/dist/cli/render/sort.d.ts +10 -0
package/dist/cli/render/sort.js +24 -0
package/dist/cli/report/build-check-report.d.ts +19 -0
package/dist/cli/report/build-check-report.js +124 -0
package/dist/cli/share.d.ts +17 -0
package/dist/cli/share.js +83 -0
package/dist/client.d.ts +2 -2
package/dist/client.js +144 -132
package/dist/context.d.ts +1 -1
package/dist/context.js +4 -6
package/dist/errors.d.ts +2 -0
package/dist/errors.js +116 -107
package/dist/export.d.ts +6 -6
package/dist/export.js +39 -33
package/dist/index.d.ts +25 -24
package/dist/index.js +62 -56
package/dist/integrations/anthropic.d.ts +1 -1
package/dist/integrations/anthropic.js +23 -19
package/dist/integrations/openai-eval.d.ts +57 -0
package/dist/integrations/openai-eval.js +230 -0
package/dist/integrations/openai.d.ts +1 -1
package/dist/integrations/openai.js +23 -19
package/dist/local.d.ts +2 -2
package/dist/local.js +25 -25
package/dist/logger.d.ts +1 -1
package/dist/logger.js +24 -28
package/dist/matchers/index.d.ts +1 -0
package/dist/matchers/index.js +6 -0
package/dist/matchers/to-pass-gate.d.ts +29 -0
package/dist/matchers/to-pass-gate.js +35 -0
package/dist/pagination.d.ts +1 -1
package/dist/pagination.js +6 -6
package/dist/snapshot.js +24 -24
package/dist/streaming.js +11 -11
package/dist/testing.d.ts +6 -2
package/dist/testing.js +30 -12
package/dist/types.d.ts +22 -22
package/dist/types.js +13 -13
package/dist/utils/input-hash.d.ts +8 -0
package/dist/utils/input-hash.js +38 -0
package/dist/version.d.ts +7 -0
package/dist/version.js +10 -0
package/dist/workflows.d.ts +7 -7
package/dist/workflows.js +44 -44
package/package.json +102 -90
package/dist/__tests__/assertions.test.d.ts +0 -1
package/dist/__tests__/assertions.test.js +0 -288
package/dist/__tests__/client.test.d.ts +0 -1
package/dist/__tests__/client.test.js +0 -185
package/dist/__tests__/testing.test.d.ts +0 -1
package/dist/__tests__/testing.test.js +0 -230
package/dist/__tests__/workflows.test.d.ts +0 -1
package/dist/__tests__/workflows.test.js +0 -222

package/dist/cli/formatters/pr-comment.js ADDED Viewed

@@ -0,0 +1,101 @@
+"use strict";
+/**
+ * PR comment markdown builder for evalai check --pr-comment-out.
+ * Produces deterministic markdown for GitHub Action to post as PR comment.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.PR_COMMENT_MARKER = void 0;
+exports.buildPrComment = buildPrComment;
+const snippet_1 = require("../render/snippet");
+const TOP_FAILURES = 3;
+/**
+ * Prepares free text for embedding on a single markdown line.
+ *
+ * - Escapes pipe characters so the text cannot open a table cell.
+ * - Collapses every line break (LF, CRLF, or a lone CR) into one space so the
+ *   text cannot break out of the list item / emphasis span it is placed in.
+ *
+ * @param s Raw text (callers pass a string; fallbacks are applied before the call).
+ * @returns The single-line, pipe-escaped text.
+ */
+function escapeMarkdown(s) {
+    // CRLF must be matched before the single-character alternatives so it becomes ONE space.
+    return s.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
+}
+/**
+ * Hidden marker for GitHub Action to find and update existing comment (sticky update).
+ * Action should: 1) post body from file 2) search PR comments for this marker 3) update if found, else create.
+ * Export for use in Action scripts.
+ */
+exports.PR_COMMENT_MARKER = "<!-- evalai-gate-comment -->";
+/**
+ * Builds the markdown body of the PR comment for a check report.
+ *
+ * Layout (each section is emitted only when its data is present):
+ *   hidden sticky marker, verdict heading, score (+ delta vs. baseline),
+ *   reason code/message, policy, top failed cases, explain breakdown,
+ *   dashboard and share links.
+ *
+ * The heading distinguishes four states: gate not applied, passed,
+ * passed with a regression warning (verdict "warn"), and failed.
+ *
+ * @param report CheckReport-shaped object describing the run.
+ * @returns Markdown text, lines joined with "\n".
+ */
+function buildPrComment(report) {
+    const lines = [];
+    lines.push(exports.PR_COMMENT_MARKER);
+    lines.push("");
+    const gateApplied = report.gateApplied !== false;
+    // Verdict badge — distinguish "PASS" from "NOT GATED"
+    if (!gateApplied) {
+        lines.push("## ⚠️ EvalAI Regression Gate — NOT APPLIED");
+        lines.push("");
+        lines.push("**Gate not applied: baseline missing.**");
+        if (report.actionableMessage) {
+            lines.push("");
+            lines.push(report.actionableMessage);
+        }
+    }
+    else if (report.verdict === "pass") {
+        lines.push("## ✅ EvalAI Regression Gate — PASSED");
+    }
+    else if (report.verdict === "warn") {
+        // A warn verdict means the gate passed with a soft regression warning;
+        // it must not be presented as a failure.
+        lines.push("## ⚠️ EvalAI Regression Gate — PASSED WITH WARNING");
+    }
+    else {
+        lines.push("## 🚨 EvalAI Regression Gate — FAILED");
+    }
+    lines.push("");
+    // Score + Delta (delta is shown only when both baseline score and delta are present)
+    const deltaStr = report.baselineScore != null && report.delta != null
+        ? ` (${report.delta >= 0 ? "+" : ""}${report.delta} from baseline ${report.baselineScore})`
+        : "";
+    lines.push(`**Score:** ${report.score ?? 0}/100${deltaStr}`);
+    lines.push("");
+    // ReasonCode
+    lines.push(`**Reason:** ${report.reasonCode}`);
+    if (report.reasonMessage) {
+        lines.push(`_${escapeMarkdown(report.reasonMessage)}_`);
+    }
+    lines.push("");
+    // Policy (if any)
+    if (report.policy) {
+        lines.push(`**Policy:** ${report.policy}`);
+        lines.push("");
+    }
+    // Top failures (at most TOP_FAILURES entries, then a "+ N more" line)
+    const failedCases = report.failedCases ?? [];
+    if (failedCases.length > 0) {
+        lines.push("### Top Issues");
+        lines.push("");
+        for (const fc of failedCases.slice(0, TOP_FAILURES)) {
+            // Label falls back name → input; reason falls back reason → outputSnippet → output.
+            const label = fc.name ?? fc.input ?? "(unnamed)";
+            const reason = fc.reason ?? fc.outputSnippet ?? fc.output ?? "no output";
+            lines.push(`- **${(0, snippet_1.truncateSnippet)(escapeMarkdown(label), 60)}** — ${(0, snippet_1.truncateSnippet)(escapeMarkdown(reason), 80)}`);
+        }
+        if (failedCases.length > TOP_FAILURES) {
+            lines.push(`- _+ ${failedCases.length - TOP_FAILURES} more_`);
+        }
+        lines.push("");
+    }
+    // Explain summary (if --explain)
+    if (report.explain && report.contribPts) {
+        const pts = report.contribPts;
+        const parts = [];
+        if (pts.passRatePts != null)
+            parts.push(`pass rate: ${pts.passRatePts} pts`);
+        if (pts.safetyPts != null)
+            parts.push(`safety: ${pts.safetyPts} pts`);
+        if (pts.compliancePts != null)
+            parts.push(`compliance: ${pts.compliancePts} pts`);
+        if (pts.performancePts != null)
+            parts.push(`performance: ${pts.performancePts} pts`);
+        if (parts.length > 0) {
+            lines.push("### Breakdown");
+            lines.push("");
+            lines.push(parts.join(" | "));
+            lines.push("");
+        }
+    }
+    // Dashboard URL
+    if (report.dashboardUrl) {
+        lines.push(`🔎 [Dashboard](${report.dashboardUrl})`);
+    }
+    // Share URL (if exists)
+    if (report.shareUrl) {
+        lines.push(`🔗 [Share Snapshot](${report.shareUrl})`);
+    }
+    lines.push("");
+    return lines.join("\n");
+}

package/dist/cli/formatters/types.d.ts ADDED Viewed

@@ -0,0 +1,100 @@
+/**
+ * CheckReport and related types for formatters.
+ */
+/** Overall outcome of a check; carried on CheckReport.verdict. */
+export type GateVerdict = "pass" | "warn" | "fail";
+/**
+ * How the gate was applied; carried on CheckReport.gateMode.
+ * "neutral" = exit 0 but gate not applied (e.g. baseline missing with --baseline auto)
+ */
+export type GateMode = "enforced" | "neutral";
+/**
+ * Canonical reason codes. Import REASON_CODES from ../reason-codes for constants.
+ * NOTE(review): the union contains similar-looking pairs (e.g. SCORE_TOO_LOW / LOW_SCORE,
+ * POLICY_FAILED / POLICY_VIOLATION, DELTA_TOO_HIGH / MAX_DROP_EXCEEDED); presumably one
+ * group is kept for backward compatibility — confirm before removing any member.
+ */
+export type FailureReasonCode = "PASS" | "WARN_REGRESSION" | "LOW_SAMPLE_SIZE" | "BASELINE_MISSING" | "SCORE_TOO_LOW" | "DELTA_TOO_HIGH" | "COST_BUDGET_EXCEEDED" | "LATENCY_BUDGET_EXCEEDED" | "POLICY_FAILED" | "UNKNOWN" | "LOW_SCORE" | "LOW_PASS_RATE" | "SAFETY_RISK" | "LATENCY_RISK" | "COST_RISK" | "MAX_DROP_EXCEEDED" | "INSUFFICIENT_EVIDENCE" | "POLICY_VIOLATION";
+/**
+ * Per-dimension score breakdown; carried on CheckReport.breakdown01.
+ * NOTE(review): the "01" suffix suggests each value is normalized to the 0..1 range — confirm.
+ */
+export type ScoreBreakdown01 = {
+    passRate?: number;
+    safety?: number;
+    judge?: number;
+    schema?: number;
+    latency?: number;
+    cost?: number;
+};
+/**
+ * Points contributed to the score by each dimension; carried on CheckReport.contribPts.
+ * The PR comment prints every non-null field in its "Breakdown" section when explain is set.
+ */
+export type ScoreContribPts = {
+    passRatePts?: number;
+    safetyPts?: number;
+    compliancePts?: number;
+    performancePts?: number;
+};
+/**
+ * Thresholds recorded on a report (CheckReport.thresholds).
+ * NOTE(review): field names mirror the `evalai check` options of the same name;
+ * presumably this is a snapshot of the thresholds the run was gated with — confirm.
+ */
+export type GateThresholds = {
+    minScore?: number;
+    minPassRate?: number;
+    minSafety?: number;
+    maxDrop?: number;
+    warnDrop?: number;
+    minN?: number;
+    allowWeakEvidence?: boolean;
+    // Baseline selection mode.
+    baseline?: "published" | "previous" | "production" | "auto";
+    maxCostUsd?: number;
+    maxLatencyMs?: number;
+    maxCostDeltaUsd?: number;
+};
+/**
+ * One non-passing test case listed on a report (CheckReport.failedCases).
+ * All fields are optional; the PR comment labels a case with `name`, falling back to
+ * `input`, and explains it with `reason`, falling back to `outputSnippet` then `output`.
+ */
+export type FailedCase = {
+    testCaseId?: number;
+    status?: "failed" | "error" | "skipped" | "passed";
+    name?: string;
+    input?: string;
+    inputSnippet?: string;
+    expectedOutput?: string;
+    expectedSnippet?: string;
+    output?: string;
+    outputSnippet?: string;
+    reason?: string;
+};
+/** CI environment metadata attached to a report (CheckReport.ci). All fields are optional. */
+export type CiContext = {
+    provider?: "github" | "gitlab" | "circle" | "unknown";
+    repo?: string;
+    sha?: string;
+    branch?: string;
+    // NOTE(review): presumably the pull/merge request number — confirm.
+    pr?: number;
+    runUrl?: string;
+    actor?: string;
+};
+/**
+ * Result of one `evalai check` run in the shape consumed by the output formatters
+ * (e.g. the PR comment builder reads verdict, gateApplied, score, delta, reasonCode,
+ * failedCases, contribPts, dashboardUrl and shareUrl).
+ */
+export type CheckReport = {
+    evaluationId: string;
+    runId?: number;
+    verdict: GateVerdict;
+    /** false when gate not applied (e.g. baseline missing, exit 0) — prevents false confidence */
+    gateApplied: boolean;
+    /** "enforced" = gate ran; "neutral" = exit 0, gate skipped */
+    gateMode: GateMode;
+    reasonCode: FailureReasonCode;
+    /** Actionable message for PR comment / UX */
+    actionableMessage?: string;
+    reasonMessage?: string;
+    /** Rendered by the PR comment as "<score>/100" */
+    score?: number;
+    baselineScore?: number;
+    /** Score change versus baselineScore; the PR comment shows it only when both are present */
+    delta?: number;
+    passRate?: number;
+    safetyPassRate?: number;
+    flags?: string[];
+    breakdown01?: ScoreBreakdown01;
+    contribPts?: ScoreContribPts;
+    thresholds?: GateThresholds;
+    n?: number;
+    evidenceLevel?: "strong" | "medium" | "weak";
+    baselineMissing?: boolean;
+    baselineStatus?: "found" | "missing";
+    dashboardUrl?: string;
+    failedCases?: FailedCase[];
+    failedCasesShown?: number;
+    failedCasesMore?: number;
+    requestId?: string;
+    durationMs?: number;
+    ci?: CiContext;
+    /** When true (--explain), formatters may print the contribPts breakdown */
+    explain?: boolean;
+    shareUrl?: string;
+    policy?: string;
+    baselineRunId?: number;
+    ciRunUrl?: string;
+    /** When --explain and policy failed: which sub-check failed, remediation, snapshot */
+    policyEvidence?: {
+        failedCheck?: string;
+        remediation?: string;
+        snapshot?: Record<string, unknown>;
+    };
+};

package/dist/cli/formatters/types.js ADDED Viewed

@@ -0,0 +1,5 @@
+"use strict";
+/**
+ * CheckReport and related types for formatters.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });

package/dist/cli/gate.d.ts ADDED Viewed

@@ -0,0 +1,21 @@
+/**
+ * Pure gate evaluation. No console output.
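+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
+ * Exceptions: --baseline auto skips the gate (exit PASS, gateSkipped = true);
+ * --baseline published without maxDrop continues to the remaining gates.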
+ */
+import type { QualityLatestData } from "./api";
+import type { CheckArgs } from "./check";
+/** Outcome returned by evaluateGate. */
+export type GateResult = {
+    /** Process exit code to use for this outcome (one of the EXIT constants) */
+    exitCode: number;
+    /** true for a clean pass and for a warn-band regression; false otherwise (including a skipped gate) */
+    passed: boolean;
+    reasonCode: string;
+    /** Human-readable explanation; null on a clean pass */
+    reasonMessage: string | null;
+    /** true when gate was skipped (e.g. baseline missing + auto) */
+    gateSkipped?: boolean;
+    /** When policy failed: sub-check, remediation, snapshot for explain */
+    policyEvidence?: {
+        failedCheck: string;
+        remediation: string;
+        snapshot?: Record<string, unknown>;
+    };
+};
+/**
+ * Evaluates the configured gates against the latest quality data.
+ * @param args Parsed `evalai check` arguments (thresholds, baseline mode, policy).
+ * @param quality Quality data for the evaluation being gated.
+ * @returns The first gate outcome that applies, or a PASS result when no gate trips.
+ */
+export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;

package/dist/cli/gate.js ADDED Viewed

@@ -0,0 +1,175 @@
+"use strict";
+/**
+ * Pure gate evaluation. No console output.
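+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
+ * Exceptions: --baseline auto skips the gate (exit PASS, gateSkipped = true);
+ * --baseline published without maxDrop continues to the remaining gates.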
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.evaluateGate = evaluateGate;
+const constants_1 = require("./constants");
+const policy_packs_1 = require("./policy-packs");
+const reason_codes_1 = require("./reason-codes");
+/**
+ * Evaluates every configured gate against the quality data and returns the first
+ * outcome that applies. Pure: no console output, no I/O.
+ *
+ * Order of evaluation:
+ *   1. Baseline missing (auto → neutral exit PASS; otherwise BAD_ARGS, except
+ *      "published" without maxDrop, which continues)
+ *   2. Budget gates: maxCostUsd, maxLatencyMs, maxCostDeltaUsd
+ *   3. minN, then weak-evidence gate
+ *   4. minScore
+ *   5. maxDrop (hard regression fail)
+ *   6. Policy pack (unknown policy → BAD_ARGS; safety rate; restricted flags)
+ *   7. warnDrop (soft warning, still passes)
+ *   8. PASS
+ *
+ * The warn band is reported only after the policy checks so that a soft
+ * regression warning can never mask a hard policy violation.
+ *
+ * @param args Parsed check arguments (thresholds, baseline mode, policy spec).
+ * @param quality Latest quality data; missing fields fall back to neutral defaults.
+ * @returns GateResult with exit code, pass flag, reason code and message.
+ */
+function evaluateGate(args, quality) {
+    const score = quality?.score ?? 0;
+    const total = quality?.total ?? null;
+    const evidenceLevel = quality?.evidenceLevel ?? null;
+    const regressionDelta = quality?.regressionDelta ?? null;
+    const baselineMissing = quality?.baselineMissing === true;
+    const breakdown = quality?.breakdown ?? {};
+    const policyFlags = quality?.flags ?? [];
+    const avgLatencyMs = quality?.avgLatencyMs ?? null;
+    const costUsd = quality?.costUsd ?? null;
+    const baselineCostUsd = quality?.baselineCostUsd ?? null;
+    // Baseline missing FIRST: --baseline auto → exit 0 (neutral, gate not applied); others → BAD_ARGS
+    // Must run before budget gates so baseline missing + maxCostDeltaUsd ⇒ neutral, not budget failure
+    if (baselineMissing) {
+        const msg = args.baseline === "auto"
+            ? "No baseline found. Tip: Publish a baseline from the dashboard, or run with --baseline previous once you have runs."
+            : args.baseline === "production"
+                ? "No prod runs exist for this evaluation. Tag runs with environment=prod before using --baseline production."
+                : `Baseline (${args.baseline}) not found. Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`;
+        if (args.baseline === "auto") {
+            return {
+                exitCode: constants_1.EXIT.PASS,
+                passed: false,
+                reasonCode: reason_codes_1.REASON_CODES.BASELINE_MISSING,
+                reasonMessage: msg,
+                gateSkipped: true,
+            };
+        }
+        // "published" without maxDrop does not need a baseline, so it falls through.
+        if (args.baseline !== "published" || args.maxDrop !== undefined) {
+            return {
+                exitCode: constants_1.EXIT.BAD_ARGS,
+                passed: false,
+                reasonCode: reason_codes_1.REASON_CODES.BASELINE_MISSING,
+                reasonMessage: msg,
+            };
+        }
+    }
+    // Budget gates (after baseline check)
+    if (args.maxCostUsd != null && costUsd != null && costUsd > args.maxCostUsd) {
+        return {
+            exitCode: constants_1.EXIT.SCORE_BELOW,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.COST_BUDGET_EXCEEDED,
+            reasonMessage: `cost $${costUsd.toFixed(4)} exceeds maxCostUsd $${args.maxCostUsd.toFixed(4)}`,
+        };
+    }
+    if (args.maxLatencyMs != null && avgLatencyMs != null && avgLatencyMs > args.maxLatencyMs) {
+        return {
+            exitCode: constants_1.EXIT.SCORE_BELOW,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.LATENCY_BUDGET_EXCEEDED,
+            reasonMessage: `avg latency ${avgLatencyMs}ms exceeds maxLatencyMs ${args.maxLatencyMs}`,
+        };
+    }
+    if (args.maxCostDeltaUsd != null &&
+        costUsd != null &&
+        baselineCostUsd != null &&
+        costUsd - baselineCostUsd > args.maxCostDeltaUsd) {
+        return {
+            exitCode: constants_1.EXIT.SCORE_BELOW,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.COST_BUDGET_EXCEEDED,
+            reasonMessage: `cost delta $${(costUsd - baselineCostUsd).toFixed(4)} exceeds maxCostDeltaUsd $${args.maxCostDeltaUsd.toFixed(4)}`,
+        };
+    }
+    // minN gate
+    if (args.minN !== undefined && total !== null && total < args.minN) {
+        return {
+            exitCode: constants_1.EXIT.LOW_N,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.LOW_SAMPLE_SIZE,
+            reasonMessage: `total test cases (${total}) < minN (${args.minN})`,
+        };
+    }
+    // allowWeakEvidence gate
+    if (!args.allowWeakEvidence && evidenceLevel === "weak") {
+        return {
+            exitCode: constants_1.EXIT.WEAK_EVIDENCE,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.LOW_SAMPLE_SIZE,
+            reasonMessage: "evidence level is 'weak' (use --allowWeakEvidence to permit)",
+        };
+    }
+    // Score gate (minScore of 0 disables it)
+    if (args.minScore > 0 && score < args.minScore) {
+        return {
+            exitCode: constants_1.EXIT.SCORE_BELOW,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.SCORE_TOO_LOW,
+            reasonMessage: `score ${score} < minScore ${args.minScore}`,
+        };
+    }
+    // maxDrop: hard fail when the score fell by more than the allowed amount
+    if (args.maxDrop !== undefined && regressionDelta !== null && regressionDelta < -args.maxDrop) {
+        return {
+            exitCode: constants_1.EXIT.REGRESSION,
+            passed: false,
+            reasonCode: reason_codes_1.REASON_CODES.DELTA_TOO_HIGH,
+            reasonMessage: `score dropped ${Math.abs(regressionDelta)} pts from baseline (max allowed: ${args.maxDrop})`,
+        };
+    }
+    // warnDrop: soft warning band. Only recorded here; it is returned after the
+    // policy checks so a hard policy failure always takes precedence.
+    const inWarnBand = args.warnDrop !== undefined &&
+        regressionDelta !== null &&
+        regressionDelta < -args.warnDrop &&
+        (args.maxDrop === undefined || regressionDelta >= -args.maxDrop);
+    if (args.policy) {
+        const pack = (0, policy_packs_1.resolvePolicyPack)(args.policy);
+        if (!pack) {
+            const valid = (0, policy_packs_1.getValidPolicyVersions)().join(", ");
+            return {
+                exitCode: constants_1.EXIT.BAD_ARGS,
+                passed: false,
+                reasonCode: reason_codes_1.REASON_CODES.UNKNOWN,
+                reasonMessage: `Unknown policy or version: ${args.policy}. Valid: ${valid}`,
+            };
+        }
+        const { requiredSafetyRate, maxFlags } = pack.thresholds;
+        // A missing safety score counts as 0, i.e. it fails any positive requirement.
+        const safetyRate = breakdown?.safety ?? 0;
+        if (safetyRate < requiredSafetyRate) {
+            return {
+                exitCode: constants_1.EXIT.POLICY_VIOLATION,
+                passed: false,
+                reasonCode: reason_codes_1.REASON_CODES.POLICY_FAILED,
+                reasonMessage: `policy ${pack.policyId}@${pack.version}: safety ${Math.round(safetyRate * 100)}% < required ${Math.round(requiredSafetyRate * 100)}%`,
+                policyEvidence: {
+                    failedCheck: "safety_rate",
+                    remediation: `Increase safety pass rate to at least ${Math.round(requiredSafetyRate * 100)}%. Review failing test cases for safety-related assertions.`,
+                    snapshot: {
+                        safety: safetyRate,
+                        required: requiredSafetyRate,
+                        policy: `${pack.policyId}@${pack.version}`,
+                    },
+                },
+            };
+        }
+        // Any reported flag that the pack lists in maxFlags is a violation.
+        const violations = policyFlags.filter((f) => maxFlags.includes(f));
+        if (violations.length > 0) {
+            return {
+                exitCode: constants_1.EXIT.POLICY_VIOLATION,
+                passed: false,
+                reasonCode: reason_codes_1.REASON_CODES.POLICY_FAILED,
+                reasonMessage: `policy ${pack.policyId}@${pack.version}: ${violations.join(", ")}`,
+                policyEvidence: {
+                    failedCheck: "flag_restrictions",
+                    remediation: `Resolve flags: ${violations.join(", ")}. These indicate policy violations that must be addressed.`,
+                    snapshot: { violations, policy: `${pack.policyId}@${pack.version}` },
+                },
+            };
+        }
+    }
+    if (inWarnBand) {
+        return {
+            exitCode: constants_1.EXIT.WARN_REGRESSION,
+            passed: true, // gate passes but with warning
+            reasonCode: reason_codes_1.REASON_CODES.WARN_REGRESSION,
+            reasonMessage: `score dropped ${Math.abs(regressionDelta)} pts from baseline (warn threshold: ${args.warnDrop}${args.maxDrop != null ? `, fail at ${args.maxDrop}` : ""})`,
+        };
+    }
+    return {
+        exitCode: constants_1.EXIT.PASS,
+        passed: true,
+        reasonCode: reason_codes_1.REASON_CODES.PASS,
+        reasonMessage: null,
+    };
+}

package/dist/cli/index.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@
  * evalai — EvalAI CLI
  *
  * Commands:
+ *   evalai init   — Create evalai.config.json
  *   evalai check  — CI/CD evaluation gate (see evalai check --help)
  */
 export {};

package/dist/cli/index.js CHANGED Viewed

@@ -4,41 +4,85 @@
  * evalai — EvalAI CLI
  *
  * Commands:
+ *   evalai init   — Create evalai.config.json
  *   evalai check  — CI/CD evaluation gate (see evalai check --help)
  */
 Object.defineProperty(exports, "__esModule", { value: true });
 const check_1 = require("./check");
+const doctor_1 = require("./doctor");
+const init_1 = require("./init");
+const share_1 = require("./share");
 const argv = process.argv.slice(2);
 const subcommand = argv[0];
-if (subcommand === 'check') {
-    const args = (0, check_1.parseArgs)(argv.slice(1));
-    (0, check_1.runCheck)(args)
+if (subcommand === "init") {
+    const cwd = process.cwd();
+    const ok = (0, init_1.runInit)(cwd);
+    process.exit(ok ? 0 : 1);
+}
+else if (subcommand === "doctor") {
+    (0, doctor_1.runDoctor)(argv.slice(1))
+        .then((code) => process.exit(code))
+        .catch((err) => {
+        console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
+else if (subcommand === "check") {
+    const parsed = (0, check_1.parseArgs)(argv.slice(1));
+    if (!parsed.ok) {
+        console.error(parsed.message);
+        process.exit(parsed.exitCode);
+    }
+    (0, check_1.runCheck)(parsed.args)
         .then((code) => process.exit(code))
         .catch((err) => {
         console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
         process.exit(4);
     });
 }
+else if (subcommand === "share") {
+    const parsed = (0, share_1.parseShareArgs)(argv.slice(1));
+    if ("error" in parsed) {
+        console.error(parsed.error);
+        process.exit(1);
+    }
+    (0, share_1.runShare)(parsed)
+        .then((code) => process.exit(code))
+        .catch((err) => {
+        console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
 else {
-    console.log(`EvalAI CLI
-Usage:
-  evalai check [options]   CI/CD evaluation gate
-Options for check:
-  --evaluationId <id>  Required. Evaluation to gate on.
-  --apiKey <key>      API key (or EVALAI_API_KEY env)
-  --minScore <n>      Fail if score < n (0-100)
-  --maxDrop <n>       Fail if score dropped > n from baseline
-  --minN <n>          Fail if total test cases < n
-  --allowWeakEvidence Allow weak evidence level
-  --policy <name>     Enforce policy (HIPAA, SOC2, GDPR, etc.)
-  --baseline <mode>   "published" or "previous"
-  --baseUrl <url>     API base URL
-Examples:
-  evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
-  evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
+    console.log(`EvalAI CLI
+Usage:
+  evalai init              Create evalai.config.json
+  evalai doctor [options]  Verify CI/CD setup (same endpoint as check)
+  evalai check [options]   CI/CD evaluation gate
+  evalai share [options]   Create share link for a run
+Options for check:
+  --evaluationId <id>  Evaluation to gate on (or from config)
+  --apiKey <key>      API key (or EVALAI_API_KEY env)
+  --format <fmt>      Output format: human (default), json, github
+  --explain           Show score breakdown and thresholds
+  --onFail import     When gate fails, import run with CI context
+  --minScore <n>      Fail if score < n (0-100)
+  --maxDrop <n>       Fail if score dropped > n from baseline
+  --warnDrop <n>      Warn (exit 8) if score dropped > n but < maxDrop
+  --minN <n>          Fail if total test cases < n
+  --allowWeakEvidence Allow weak evidence level
+  --policy <name>     Enforce policy (HIPAA, SOC2, GDPR, etc.)
+  --baseline <mode>   "published", "previous", or "production"
+  --share <mode>      Share link: always | fail | never (fail = only when gate fails)
+  --baseUrl <url>     API base URL
+Examples:
+  evalai init
+  evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
+  evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
+  evalai share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
 `);
-    process.exit(subcommand === '--help' || subcommand === '-h' ? 0 : 1);
+    process.exit(subcommand === "--help" || subcommand === "-h" ? 0 : 1);
 }

package/dist/cli/init.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env node
+/**
+ * evalai init — Create evalai.config.json
+ *
+ * Creates the smallest possible config file. Defaults belong in code.
+ */
+/**
+ * Writes evalai.config.json into `cwd` and prints next-step instructions.
+ * @param cwd Directory to create the file in; defaults to process.cwd().
+ * @returns true when the file was written, false when it already existed.
+ */
+export declare function runInit(cwd?: string): boolean;

package/dist/cli/init.js ADDED Viewed

@@ -0,0 +1,69 @@
+#!/usr/bin/env node
+"use strict";
+/**
+ * evalai init — Create evalai.config.json
+ *
+ * Creates the smallest possible config file. Defaults belong in code.
+ */
+// Import helper (reuses an existing `this.__createBinding` when one is present).
+// Defines property `k2` (defaults to `k`) on `o` so that it exposes `m[k]`.
+// When Object.create is available the binding is a property descriptor — a live
+// getter unless the source descriptor can be reused as-is; otherwise a plain copy.
+// NOTE(review): this matches the shape of compiler-emitted interop code — do not hand-edit.
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+// Import helper (reuses an existing `this.__setModuleDefault` when one is present).
+// Sets `o.default` to `v`: as an enumerable data property when Object.create is
+// available, otherwise by plain assignment.
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+// Import helper (reuses an existing `this.__importStar` when one is present).
+// Returns `mod` unchanged when it is already flagged `__esModule`; otherwise builds
+// a new object that re-exposes every own key of `mod` except "default" (via
+// __createBinding) and sets `default` to `mod` itself (via __setModuleDefault).
+var __importStar = (this && this.__importStar) || (function () {
+    // Lazily resolves to Object.getOwnPropertyNames, or a for-in fallback, on first call.
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.runInit = runInit;
+const fs = __importStar(require("node:fs"));
+const path = __importStar(require("node:path"));
+// Smallest possible config: only the evaluationId placeholder. Defaults live in code.
+const CONFIG_CONTENT = `{
+  "evaluationId": ""
+}
+`;
+/**
+ * Creates evalai.config.json in `cwd` and prints next-step instructions.
+ *
+ * The file is created with the exclusive "wx" flag, so an existing file is never
+ * overwritten — even if it appears between the check and the write.
+ *
+ * @param cwd Directory to create the config in (defaults to process.cwd()).
+ * @returns true when the file was written; false when it already existed.
+ * @throws Any file-system error other than "already exists" (e.g. EACCES, ENOENT).
+ */
+function runInit(cwd = process.cwd()) {
+    const configPath = path.join(cwd, "evalai.config.json");
+    const resolvedPath = path.resolve(configPath);
+    try {
+        // "wx" = create exclusively; fails with EEXIST instead of overwriting.
+        fs.writeFileSync(configPath, CONFIG_CONTENT, { encoding: "utf-8", flag: "wx" });
+    }
+    catch (err) {
+        if (err?.code === "EEXIST") {
+            console.log(`evalai.config.json already exists at ${resolvedPath}`);
+            return false;
+        }
+        throw err;
+    }
+    console.log(`Wrote evalai.config.json at ${resolvedPath}`);
+    console.log("");
+    console.log("Next: paste evaluationId into evalai.config.json, then run npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import");
+    console.log("");
+    console.log("GitHub Actions snippet (add to your workflow):");
+    console.log("  - name: EvalAI gate");
+    console.log("    env:");
+    // Plain string on purpose: "${{ ... }}" is GitHub Actions syntax, not a template literal.
+    console.log("      EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}");
+    console.log("    run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import");
+    console.log("");
+    console.log("To uninstall: delete evalai.config.json.");
+    return true;
+}

package/dist/cli/policy-packs.d.ts ADDED Viewed

@@ -0,0 +1,23 @@
+/**
+ * Versioned policy packs for evalai check --policy.
+ * Schema: policyId, version, thresholds, rationale, checks.
+ * Usage: --policy HIPAA@1
+ */
+/** One version of a policy pack, selected with `--policy <policyId>@<version>`. */
+export type PolicyPack = {
+    policyId: string;
+    version: number;
+    thresholds: {
+        // Minimum safety rate; the gate fails when the run's safety score is below it
+        // and reports both values as percentages (value × 100).
+        requiredSafetyRate: number;
+        // Flag names treated as violations: the gate fails when the run reports any of them.
+        maxFlags: string[];
+    };
+    rationale: string;
+    checks: string[];
+};
+/**
+ * Registry of available policy packs.
+ * NOTE(review): presumably keyed by policy id, then by version number — confirm against policy-packs.js.
+ */
+export declare const POLICY_PACKS: Record<string, Record<number, PolicyPack>>;
+/**
+ * Parse --policy flag (e.g. "HIPAA@1" or "HIPAA") and resolve to PolicyPack.
+ * Default version is 1 when omitted.
+ * @param spec Policy spec as given on the command line.
+ * @returns The matching pack, or null (the gate reports this as "Unknown policy or version").
+ */
+export declare function resolvePolicyPack(spec: string): PolicyPack | null;
+/**
+ * List valid policy@version specs for error messages.
+ * The gate joins the result with ", " after "Valid:" when a policy cannot be resolved.
+ */
+export declare function getValidPolicyVersions(): string[];