npm - @evalgate/sdk - Versions diffs - 2.2.3 → 2.3.0 - Mend

@evalgate/sdk 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

package/CHANGELOG.md +31 -0
package/README.md +39 -2
package/dist/assertions.d.ts +186 -6
package/dist/assertions.js +515 -61
package/dist/batch.js +4 -4
package/dist/cache.d.ts +4 -0
package/dist/cache.js +4 -0
package/dist/cli/baseline.d.ts +14 -0
package/dist/cli/baseline.js +43 -3
package/dist/cli/check.d.ts +5 -2
package/dist/cli/check.js +20 -12
package/dist/cli/compare.d.ts +80 -0
package/dist/cli/compare.js +266 -0
package/dist/cli/index.js +244 -101
package/dist/cli/regression-gate.js +23 -0
package/dist/cli/run.js +22 -0
package/dist/cli/start.d.ts +26 -0
package/dist/cli/start.js +130 -0
package/dist/cli/templates.d.ts +24 -0
package/dist/cli/templates.js +314 -0
package/dist/cli/traces.d.ts +109 -0
package/dist/cli/traces.js +152 -0
package/dist/cli/validate.d.ts +37 -0
package/dist/cli/validate.js +252 -0
package/dist/cli/watch.d.ts +19 -0
package/dist/cli/watch.js +175 -0
package/dist/client.js +6 -13
package/dist/constants.d.ts +2 -0
package/dist/constants.js +5 -0
package/dist/index.d.ts +8 -6
package/dist/index.js +26 -6
package/dist/integrations/openai.js +83 -60
package/dist/logger.d.ts +3 -1
package/dist/logger.js +2 -1
package/dist/otel.d.ts +130 -0
package/dist/otel.js +309 -0
package/dist/runtime/eval.d.ts +14 -4
package/dist/runtime/eval.js +127 -2
package/dist/runtime/registry.d.ts +4 -2
package/dist/runtime/registry.js +11 -3
package/dist/runtime/run-report.d.ts +1 -1
package/dist/runtime/run-report.js +7 -4
package/dist/runtime/types.d.ts +38 -0
package/dist/testing.d.ts +8 -0
package/dist/testing.js +45 -10
package/dist/version.d.ts +2 -2
package/dist/version.js +2 -2
package/dist/workflows.d.ts +2 -0
package/dist/workflows.js +184 -102
package/package.json +124 -117

package/dist/cli/index.js CHANGED Viewed

@@ -11,6 +11,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
 const baseline_1 = require("./baseline");
 const check_1 = require("./check");
 const ci_1 = require("./ci");
+const compare_1 = require("./compare");
 const diff_1 = require("./diff");
 const discover_1 = require("./discover");
 const doctor_1 = require("./doctor");
@@ -22,13 +23,161 @@ const print_config_1 = require("./print-config");
 const regression_gate_1 = require("./regression-gate");
 const run_1 = require("./run");
 const share_1 = require("./share");
+const start_1 = require("./start");
+const templates_1 = require("./templates");
 const upgrade_1 = require("./upgrade");
+const validate_1 = require("./validate");
+const watch_1 = require("./watch");
 const argv = process.argv.slice(2);
 const subcommand = argv[0];
+const subArgs = argv.slice(1);
+const wantsHelp = subArgs.includes("--help") || subArgs.includes("-h");
+// ── Per-subcommand help text ──
+const SUBCOMMAND_HELP = {
+    init: `evalgate init — Create evalgate.config.json + baseline + CI workflow\n\nUsage:\n  evalgate init [options]\n\nOptions:\n  --template <name>  Start with a real working template (chatbot, codegen, agent, safety, rag)\n  --list-templates   Show all available templates\n\nCreates project scaffolding for EvalGate in the current directory.`,
+    start: `evalgate start — Zero-config startup (one command → passing run)\n\nUsage:\n  evalgate start [options]\n\nOptions:\n  --format <fmt>   Output format: human (default), json\n  --watch          Enable watch mode after first run\n  --skip-init      Skip initialization if not set up\n\nExamples:\n  evalgate start\n  evalgate start --watch\n  evalgate start --format json`,
+    compare: `evalgate compare — Side-by-side result file comparison\n\nCompares two or more saved run result JSON files. Does NOT re-run anything.\nYou run each model/config separately (evalgate run --write-results), then compare the artifacts.\n\nUsage:\n  evalgate compare --base <file> --head <file> [options]\n  evalgate compare --runs <file1> <file2> [file3...] [options]\n\nOptions:\n  --base <file>      Baseline run result JSON file\n  --head <file>      Head run result JSON file\n  --runs <files>     N-way compare (3+ run result JSON files)\n  --labels <names>   Optional cosmetic labels for the output table (e.g., model names)\n  --format <fmt>     Output format: human (default), json\n  --sort-by <key>    Sort by: name (default), score, duration\n\nExamples:\n  evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json\n  evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"\n  evalgate compare --runs run-a.json run-b.json run-c.json`,
+    watch: `evalgate watch — Watch mode (re-execute on file save)\n\nUsage:\n  evalgate run --watch [options]\n  evalgate watch [options]\n\nOptions:\n  --debounce <ms>    Debounce interval (default: 300ms)\n  --no-clear         Don't clear screen between runs\n  --format <fmt>     Output format: human (default), json\n  --write-results    Write results to .evalgate/last-run.json\n\nExamples:\n  evalgate run --watch\n  evalgate watch --write-results`,
+    gate: `evalgate gate — Run the regression gate\n\nUsage:\n  evalgate gate [options]\n\nOptions:\n  --format <fmt>   Output format: human (default), json, github\n  --dry-run        Run checks but always exit 0 (preview mode)\n\nExamples:\n  evalgate gate\n  evalgate gate --format json\n  evalgate gate --dry-run`,
+    check: `evalgate check — CI/CD evaluation gate (API-based)\n\nUsage:\n  evalgate check [options]\n\nOptions:\n  --evaluationId <id>  Evaluation to gate on\n  --apiKey <key>       API key (or EVALGATE_API_KEY env)\n  --format <fmt>       Output format: human (default), json, github\n  --explain            Show score breakdown\n  --minScore <n>       Fail if score < n\n  --maxDrop <n>        Fail if score dropped > n\n  --policy <name>      Enforce policy (HIPAA, SOC2, etc.)\n\nExamples:\n  evalgate check --minScore 92 --evaluationId 42`,
+    explain: `evalgate explain — Explain last gate/check failure\n\nUsage:\n  evalgate explain [options]\n\nOptions:\n  --report <path>  Path to report JSON (default: evals/regression-report.json)\n  --format <fmt>   Output format: human (default), json`,
+    discover: `evalgate discover — Discover behavioral specs\n\nUsage:\n  evalgate discover [options]\n\nOptions:\n  --manifest  Generate evaluation manifest for incremental analysis`,
+    run: `evalgate run — Run evaluation specifications\n\nUsage:\n  evalgate run [options]\n\nOptions:\n  --spec-ids <ids>    Comma-separated list of spec IDs\n  --impacted-only     Run only impacted specs (requires --base)\n  --base <branch>     Base branch for impact analysis\n  --format <fmt>      Output format: human (default), json\n  --write-results     Write results to .evalgate/last-run.json`,
+    diff: `evalgate diff — Compare two run reports\n\nUsage:\n  evalgate diff [options]\n\nOptions:\n  --base <ref>   Base branch or report path\n  --head <path>  Head report path\n  --format <fmt> Output format: human (default), json`,
+    validate: `evalgate validate — Validate spec files without running them\n\nUsage:\n  evalgate validate [options]\n\nOptions:\n  --format <fmt>  Output format: human (default), json`,
+    doctor: `evalgate doctor — Comprehensive CI/CD readiness checklist\n\nUsage:\n  evalgate doctor [options]\n\nOptions:\n  --report         Output JSON diagnostic bundle\n  --format <fmt>   Output format: human (default), json\n  --strict         Treat warnings as failures\n  --apiKey <key>   API key\n  --evaluationId <id>  Evaluation to verify`,
+    baseline: `evalgate baseline — Manage regression gate baselines\n\nUsage:\n  evalgate baseline init     Create starter evals/baseline.json\n  evalgate baseline update   Run tests and update baseline`,
+    upgrade: `evalgate upgrade — Upgrade from Tier 1 to Tier 2\n\nUsage:\n  evalgate upgrade --full`,
+    ci: `evalgate ci — One-command CI loop (manifest → impact → run → diff)\n\nUsage:\n  evalgate ci [options]\n\nOptions:\n  --base <ref>       Base reference for diff\n  --impacted-only    Run only impacted specs\n  --format <fmt>     Output format: human (default), json, github\n  --write-results    Write run results`,
+    share: `evalgate share — Create share link for a run\n\nUsage:\n  evalgate share [options]\n\nOptions:\n  --scope <s>         Share scope\n  --evaluationId <id> Evaluation ID\n  --runId <id>        Run ID\n  --expires <dur>     Expiry duration (e.g. 7d)\n  --apiKey <key>      API key`,
+    "impact-analysis": `evalgate impact-analysis — Analyze impact of changes\n\nUsage:\n  evalgate impact-analysis [options]\n\nOptions:\n  --base <branch>          Base branch (default: main)\n  --changed-files <files>  Comma-separated list of changed files\n  --format <fmt>           Output format: human (default), json`,
+    "print-config": `evalgate print-config — Show resolved config\n\nUsage:\n  evalgate print-config [options]\n\nOptions:\n  --format <fmt>  Output format: human (default), json`,
+};
+// Intercept --help for any known subcommand
+if (subcommand && wantsHelp && subcommand in SUBCOMMAND_HELP) {
+    console.log(SUBCOMMAND_HELP[subcommand]);
+    process.exit(0);
+}
 if (subcommand === "init") {
     const cwd = process.cwd();
+    const args = argv.slice(1);
+    // Handle --list-templates
+    if (args.includes("--list-templates")) {
+        (0, templates_1.printTemplateList)();
+        process.exit(0);
+    }
+    // Handle --template <name>
+    const templateIndex = args.indexOf("--template");
+    const templateName = templateIndex !== -1 ? args[templateIndex + 1] : undefined;
+    if (templateName) {
+        if (!templates_1.AVAILABLE_TEMPLATES.includes(templateName)) {
+            console.error(`  ✖ Unknown template: ${templateName}`);
+            (0, templates_1.printTemplateList)();
+            process.exit(1);
+        }
+    }
     const ok = (0, init_1.runInit)(cwd);
-    process.exit(ok ? 0 : 1);
+    if (!ok)
+        process.exit(1);
+    // Install template after init if requested
+    if (templateName) {
+        console.log(`\n  📋 Installing template: ${templateName}\n`);
+        const { filesCreated, filesSkipped } = (0, templates_1.installTemplate)(templateName, cwd);
+        for (const f of filesCreated)
+            console.log(`  ✔ Created ${f}`);
+        for (const f of filesSkipped)
+            console.log(`  – Skipped ${f} (already exists)`);
+        console.log("");
+    }
+    process.exit(0);
+}
+else if (subcommand === "start") {
+    // Parse arguments for start command
+    const args = argv.slice(1);
+    const formatIndex = args.indexOf("--format");
+    const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
+    const watch = args.includes("--watch");
+    const skipInit = args.includes("--skip-init");
+    (0, start_1.runStart)({ format, watch, skipInit })
+        .then((code) => process.exit(code))
+        .catch((err) => {
+        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
+else if (subcommand === "watch") {
+    // Parse arguments for watch command
+    const args = argv.slice(1);
+    const formatIndex = args.indexOf("--format");
+    const debounceIndex = args.indexOf("--debounce");
+    const writeResultsIndex = args.indexOf("--write-results");
+    const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
+    const debounceMs = debounceIndex !== -1 ? parseInt(args[debounceIndex + 1], 10) : undefined;
+    const writeResults = writeResultsIndex !== -1;
+    const clearScreen = !args.includes("--no-clear");
+    (0, watch_1.runWatch)({ format, writeResults, debounceMs, clearScreen })
+        .then(() => process.exit(0))
+        .catch((err) => {
+        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
+else if (subcommand === "compare") {
+    // Parse arguments for compare command
+    const args = argv.slice(1);
+    const runsIndex = args.indexOf("--runs");
+    const baseIndex = args.indexOf("--base");
+    const headIndex = args.indexOf("--head");
+    const labelsIndex = args.indexOf("--labels");
+    const formatIndex = args.indexOf("--format");
+    const sortByIndex = args.indexOf("--sort-by");
+    // Collect run files: --runs <f1> <f2> ... OR --base <f1> --head <f2>
+    const runs = [];
+    if (runsIndex !== -1) {
+        for (let i = runsIndex + 1; i < args.length; i++) {
+            if (args[i].startsWith("--"))
+                break;
+            runs.push(args[i]);
+        }
+    }
+    else {
+        // --base / --head shorthand for 2-file compare
+        if (baseIndex !== -1 && args[baseIndex + 1])
+            runs.push(args[baseIndex + 1]);
+        if (headIndex !== -1 && args[headIndex + 1])
+            runs.push(args[headIndex + 1]);
+    }
+    // Collect labels (all args after --labels until next flag)
+    const labels = [];
+    if (labelsIndex !== -1) {
+        for (let i = labelsIndex + 1; i < args.length; i++) {
+            if (args[i].startsWith("--"))
+                break;
+            labels.push(args[i]);
+        }
+    }
+    const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
+    const sortBy = sortByIndex !== -1
+        ? args[sortByIndex + 1]
+        : "name";
+    if (runs.length < 2) {
+        console.error("Error: At least 2 run files are required.");
+        console.error("Usage: evalgate compare --base results-a.json --head results-b.json");
+        console.error("       evalgate compare --runs <file1> <file2> [<file3> ...]");
+        console.error("       --labels are optional metadata, not required identifiers.");
+        process.exit(1);
+    }
+    (0, compare_1.runCompareCLI)({
+        runs,
+        labels: labels.length > 0 ? labels : undefined,
+        format,
+        sortBy,
+    })
+        .then(() => process.exit(0))
+        .catch((err) => {
+        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
 }
 else if (subcommand === "baseline") {
     const code = (0, baseline_1.runBaseline)(argv.slice(1));
@@ -179,23 +328,38 @@ else if (subcommand === "run") {
     const baseIndex = args.indexOf("--base");
     const formatIndex = args.indexOf("--format");
     const writeResultsIndex = args.indexOf("--write-results");
+    const watchFlag = args.includes("--watch");
     const specIds = specIdsIndex !== -1 ? args[specIdsIndex + 1]?.split(",") : undefined;
     const impactedOnly = impactedOnlyIndex !== -1;
     const baseBranch = baseIndex !== -1 ? args[baseIndex + 1] : undefined;
     const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
     const writeResults = writeResultsIndex !== -1;
-    (0, run_1.runEvaluationsCLI)({
-        specIds,
-        impactedOnly: impactedOnly ? !!baseBranch : false,
-        baseBranch,
-        format,
-        writeResults,
-    })
-        .then(() => process.exit(0))
-        .catch((err) => {
-        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
-        process.exit(2);
-    });
+    if (watchFlag) {
+        // Delegate to watch mode
+        const debounceIndex = args.indexOf("--debounce");
+        const debounceMs = debounceIndex !== -1 ? parseInt(args[debounceIndex + 1], 10) : undefined;
+        const clearScreen = !args.includes("--no-clear");
+        (0, watch_1.runWatch)({ specIds, format, writeResults, debounceMs, clearScreen })
+            .then(() => process.exit(0))
+            .catch((err) => {
+            console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+            process.exit(1);
+        });
+    }
+    else {
+        (0, run_1.runEvaluationsCLI)({
+            specIds,
+            impactedOnly: impactedOnly ? !!baseBranch : false,
+            baseBranch,
+            format,
+            writeResults,
+        })
+            .then(() => process.exit(0))
+            .catch((err) => {
+            console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+            process.exit(2);
+        });
+    }
 }
 else if (subcommand === "diff") {
     // Parse arguments for diff command
@@ -213,6 +377,14 @@ else if (subcommand === "diff") {
         process.exit(2);
     });
 }
+else if (subcommand === "validate") {
+    (0, validate_1.runValidate)(argv.slice(1))
+        .then((result) => process.exit(result.passed ? 0 : 1))
+        .catch((err) => {
+        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
+        process.exit(1);
+    });
+}
 else if (subcommand === "ci") {
     // Parse arguments for ci command
     const args = argv.slice(1);
@@ -237,96 +409,67 @@ else {
     console.log(`EvalGate CLI
 Usage:
-  evalgate init                Create evalgate.config.json + baseline + CI workflow
-  evalgate discover            Discover behavioral specs in project and show statistics
-  evalgate discover --manifest  Generate evaluation manifest for incremental analysis
-  evalgate impact-analysis     Analyze impact of changes and suggest targeted tests
-    --base <branch>          Base branch to compare against (default: main)
-    --changed-files <files>  Comma-separated list of changed files (for CI)
-    --format <fmt>           Output format: human (default), json
-  evalgate ci                  One-command CI loop (manifest → impact → run → diff)
-    --base <ref>            Base reference for diff (baseline|last|<runId>|<path>|<gitref>)
-    --impacted-only          Run only specs impacted by changes
-    --format <fmt>           Output format: human (default), json, github
-    --write-results          Write run results to .evalgate/last-run.json
-  evalgate run                 Run evaluation specifications
-    --spec-ids <ids>         Comma-separated list of spec IDs to run
-    --impacted-only          Run only specs impacted by changes (requires --base)
-    --base <branch>          Base branch for impact analysis (with --impacted-only)
-    --format <fmt>           Output format: human (default), json
-    --write-results          Write results to .evalgate/last-run.json
-  evalgate diff                Compare two run reports and show behavioral changes
-    --base <branch>          Base branch or report path (default: main)
-    --head <path>            Head report path (default: .evalgate/last-run.json)
-    --format <fmt>           Output format: human (default), json
-  evalgate gate [options]      Run regression gate (local test-based, no API needed)
-  evalgate check [options]     CI/CD evaluation gate (API-based)
-  evalgate explain [options]   Explain last gate/check failure with root causes + fixes
-  evalgate doctor [options]    Comprehensive CI/CD readiness checklist
-  evalgate baseline init       Create starter evals/baseline.json
-  evalgate baseline update     Run tests and update baseline with real scores
-  evalgate upgrade --full      Upgrade from Tier 1 to Tier 2 (full gate)
-  evalgate print-config        Show resolved config with source-of-truth annotations
-  evalgate share [options]     Create share link for a run
-Options for gate:
-  --format <fmt>      Output format: human (default), json, github
-Options for check:
-  --evaluationId <id>  Evaluation to gate on (or from config)
-  --apiKey <key>      API key (or EVALAI_API_KEY env)
-  --format <fmt>      Output format: human (default), json, github
-  --explain           Show score breakdown and thresholds
-  --onFail import     When gate fails, import run with CI context
-  --minScore <n>      Fail if score < n (0-100)
-  --maxDrop <n>       Fail if score dropped > n from baseline
-  --warnDrop <n>      Warn (exit 8) if score dropped > n but < maxDrop
-  --minN <n>          Fail if total test cases < n
-  --allowWeakEvidence Allow weak evidence level
-  --policy <name>     Enforce policy (HIPAA, SOC2, GDPR, etc.)
-  --baseline <mode>   "published", "previous", or "production"
-  --share <mode>      Share link: always | fail | never (fail = only when gate fails)
-  --baseUrl <url>     API base URL
-Options for explain:
-  --report <path>     Path to report JSON (default: evals/regression-report.json)
-  --format <fmt>      Output format: human (default), json
-Options for print-config:
-  --format <fmt>      Output format: human (default), json
-Options for doctor:
-  --report            Output JSON diagnostic bundle
-  --format <fmt>      Output format: human (default), json
-  --strict            Treat warnings as failures (exit 2)
-  --apiKey <key>      API key (or EVALAI_API_KEY env)
-  --baseUrl <url>     API base URL
-  --evaluationId <id> Evaluation to verify
+  evalgate start                 Zero-config startup (init + discover + run in one command)
+    --watch                    Enable watch mode after first run
+    --format <fmt>             Output format: human (default), json
+  evalgate init                  Create evalgate.config.json + baseline + CI workflow
+    --template <name>          Start with a template (chatbot, codegen, agent, safety, rag)
+    --list-templates           Show all available templates
+  evalgate discover              Discover behavioral specs in project and show statistics
+    --manifest                 Generate evaluation manifest for incremental analysis
+  evalgate run                   Run evaluation specifications
+    --spec-ids <ids>           Comma-separated list of spec IDs to run
+    --impacted-only            Run only specs impacted by changes (requires --base)
+    --base <branch>            Base branch for impact analysis (with --impacted-only)
+    --format <fmt>             Output format: human (default), json
+    --write-results            Write results to .evalgate/last-run.json
+    --watch                    Re-execute on file save (watch mode)
+    --debounce <ms>            Watch debounce interval (default: 300ms)
+    --no-clear                 Don't clear screen between watch runs
+  evalgate watch                 Watch mode (alias for evalgate run --watch)
+  evalgate compare               Side-by-side run comparison
+    --base <file>              Baseline run result JSON file
+    --head <file>              Head run result JSON file
+    --runs <f1> <f2> [...]     N-way compare (3+ run files)
+    --labels <l1> <l2> [...]   Optional human-readable labels (e.g., model names)
+    --sort-by <key>            Sort by: name (default), score, duration
+    --format <fmt>             Output format: human (default), json
+  evalgate diff                  Compare two run reports and show behavioral changes
+    --base <branch>            Base branch or report path (default: main)
+    --head <path>              Head report path (default: .evalgate/last-run.json)
+    --format <fmt>             Output format: human (default), json
+  evalgate impact-analysis       Analyze impact of changes and suggest targeted tests
+    --base <branch>            Base branch to compare against (default: main)
+    --changed-files <files>    Comma-separated list of changed files (for CI)
+    --format <fmt>             Output format: human (default), json
+  evalgate ci                    One-command CI loop (manifest → impact → run → diff)
+    --base <ref>               Base reference for diff
+    --impacted-only            Run only specs impacted by changes
+    --format <fmt>             Output format: human (default), json, github
+    --write-results            Write run results to .evalgate/last-run.json
+  evalgate gate [options]        Run regression gate (local test-based, no API needed)
+  evalgate check [options]       CI/CD evaluation gate (API-based)
+  evalgate explain [options]     Explain last gate/check failure with root causes + fixes
+  evalgate doctor [options]      Comprehensive CI/CD readiness checklist
+  evalgate validate              Validate spec files without running them
+  evalgate baseline init         Create starter evals/baseline.json
+  evalgate baseline update       Run tests and update baseline with real scores
+  evalgate upgrade --full        Upgrade from Tier 1 to Tier 2 (full gate)
+  evalgate print-config          Show resolved config with source-of-truth annotations
+  evalgate share [options]       Create share link for a run
 Examples:
-  evalgate init
-  evalgate discover
-  evalgate discover --manifest
-  evalgate impact-analysis --base main
-  evalgate impact-analysis --base main --format json
-  evalgate impact-analysis --changed-files src/utils.ts,datasets/test.json
-  evalgate run
-  evalgate run --spec-ids spec1,spec2
-  evalgate run --impacted-only --base main
-  evalgate run --format json --write-results
-  evalgate diff
-  evalgate diff --base main
-  evalgate diff --base main --format json
-  evalgate diff --a .evalgate/runs/base.json --b .evalgate/last-run.json
-  evalgate gate
-  evalgate gate --format json
-  evalgate explain
-  evalgate doctor
-  evalgate print-config
-  evalgate doctor --report
-  evalgate check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
-  evalgate check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
-  evalgate share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
+  evalgate start                                          Zero to eval in one command
+  evalgate init --template chatbot                        Scaffold with chatbot evals
+  evalgate run --watch                                    Re-run on file save
+  evalgate compare --base gpt4o.json --head claude.json    Side-by-side run diff
+  evalgate run --spec-ids spec1,spec2                     Run specific specs
+  evalgate run --impacted-only --base main                Run only impacted specs
+  evalgate diff --base main                               Behavioral diff
+  evalgate ci --base main --impacted-only                 Full CI loop
+  evalgate gate --format json                             Regression gate
+  evalgate check --minScore 92 --evaluationId 42          API-based gate
+  evalgate doctor                                         Preflight check
 `);
     process.exit(subcommand === "--help" || subcommand === "-h" ? 0 : 1);
 }

package/dist/cli/regression-gate.js CHANGED Viewed

@@ -48,6 +48,7 @@ exports.runGate = runGate;
 const node_child_process_1 = require("node:child_process");
 const fs = __importStar(require("node:fs"));
 const path = __importStar(require("node:path"));
+const baseline_1 = require("./baseline");
 const REPORT_REL = "evals/regression-report.json";
 const BASELINE_REL = "evals/baseline.json";
 /** Detect the package manager used in the project */
@@ -147,6 +148,28 @@ function runBuiltinGate(cwd) {
             runner,
         };
     }
+    // Verify baseline integrity
+    const checksumResult = (0, baseline_1.verifyBaselineChecksum)(baselineData);
+    if (!checksumResult.valid) {
+        return {
+            schemaVersion: 1,
+            timestamp: now,
+            exitCode: 2,
+            category: "infra_error",
+            passed: false,
+            failures: [
+                checksumResult.reason ?? "Baseline checksum verification failed",
+            ],
+            deltas: [],
+            baseline: null,
+            durationMs: Date.now() - t0,
+            command,
+            runner,
+        };
+    }
+    if (checksumResult.reason === "no_checksum") {
+        console.warn("⚠ Baseline has no checksum. Run 'evalgate baseline update' to stamp one.");
+    }
     const baselineMeta = baselineData.updatedAt
         ? {
             updatedAt: baselineData.updatedAt,

package/dist/cli/run.js CHANGED Viewed

@@ -54,6 +54,7 @@ const fs = __importStar(require("node:fs/promises"));
 const path = __importStar(require("node:path"));
 const registry_1 = require("../runtime/registry");
 const impact_analysis_1 = require("./impact-analysis");
+const traces_1 = require("./traces");
 /**
  * Generate deterministic run ID
  */
@@ -377,6 +378,15 @@ function printHumanResults(result) {
     console.log(`   ❌ Failed: ${result.summary.failed}`);
     console.log(`   ⏭️  Skipped: ${result.summary.skipped}`);
     console.log(`   📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
+    // Latency percentiles
+    const durations = result.results
+        .filter((r) => r.result.status !== "skipped")
+        .map((r) => r.result.duration);
+    if (durations.length > 0) {
+        const latency = (0, traces_1.calculatePercentiles)(durations);
+        console.log("");
+        console.log((0, traces_1.formatLatencyTable)(latency));
+    }
     const hasScores = result.results.some((r) => r.result.score !== undefined);
     console.log(`\n📋 Individual Results:${hasScores ? "  (score = value returned by spec executor, 0–100)" : ""}`);
     for (const spec of result.results) {
@@ -404,6 +414,18 @@ function printJsonResults(result) {
 async function runEvaluationsCLI(options) {
     try {
         const result = await runEvaluations(options);
+        // Auto-write structured traces
+        if (result.results.length > 0) {
+            try {
+                const tracePath = await (0, traces_1.writeTraces)(result);
+                if (options.format !== "json") {
+                    console.log(`\n🔍 Trace written to ${tracePath}`);
+                }
+            }
+            catch {
+                // Trace writing is best-effort, don't fail the run
+            }
+        }
         if (options.format === "json") {
             printJsonResults(result);
         }

package/dist/cli/start.d.ts ADDED Viewed

@@ -0,0 +1,26 @@
+/**
+ * evalgate start — Zero-config startup
+ *
+ * One command to go from nothing to a passing eval run:
+ *   npx evalgate start
+ *
+ * What it does:
+ *   1. If no evalgate.config.json, runs init
+ *   2. If no manifest, runs discover --manifest
+ *   3. Runs evalgate run --write-results
+ *   4. Prints results
+ *
+ * The goal: zero decisions, one command, immediate value.
+ */
+export interface StartOptions {
+    /** Output format */
+    format?: "human" | "json";
+    /** Skip init if not already set up */
+    skipInit?: boolean;
+    /** Enable watch mode after first run */
+    watch?: boolean;
+}
+/**
+ * Zero-config startup: one command → passing run
+ */
+export declare function runStart(options?: StartOptions, projectRoot?: string): Promise<number>;

package/dist/cli/start.js ADDED Viewed

@@ -0,0 +1,130 @@
+"use strict";
+/**
+ * evalgate start — Zero-config startup
+ *
+ * One command to go from nothing to a passing eval run:
+ *   npx evalgate start
+ *
+ * What it does:
+ *   1. If no evalgate.config.json, runs init
+ *   2. If no manifest, runs discover --manifest
+ *   3. Runs evalgate run --write-results
+ *   4. Prints results
+ *
+ * The goal: zero decisions, one command, immediate value.
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // reuses this.__createBinding when present; otherwise exposes m[k] on o under key k2
+    if (k2 === undefined) k2 = k; // target key defaults to the source key
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // descriptor missing, accessor on a module without __esModule, or a writable/configurable data property
+      desc = { enumerable: true, get: function() { return m[k]; } }; // replace with a getter that reads m[k] on each access
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) { // branch used when Object.create is falsy: plain value copy
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { // reuses this.__setModuleDefault when present; otherwise defines o.default = v as an enumerable property
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) { // branch used when Object.create is falsy: plain assignment
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () { // reuses this.__importStar when present; otherwise builds the wrapper returned below
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) { // first call rebinds ownKeys to the native function or to this for-in fallback
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod; // modules flagged __esModule are returned unchanged
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); // bind every own key except "default" onto result
+        __setModuleDefault(result, mod); // the module value itself becomes result.default
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.runStart = runStart;
+const fs = __importStar(require("node:fs"));
+const path = __importStar(require("node:path"));
+const discover_1 = require("./discover");
+const init_1 = require("./init");
+const run_1 = require("./run");
+/**
+ * Zero-config startup: one command → passing run
+ */
+async function runStart(options = {}, projectRoot = process.cwd()) {
+    const format = options.format ?? "human";
+    if (format === "human") {
+        console.log("\n🚀 evalgate start — zero-config evaluation run\n");
+    }
+    // Step 1: Ensure project is initialized
+    const configPath = path.join(projectRoot, "evalgate.config.json");
+    if (!fs.existsSync(configPath) && !options.skipInit) {
+        if (format === "human") {
+            console.log("📦 No evalgate.config.json found. Initializing...\n");
+        }
+        const initOk = (0, init_1.runInit)(projectRoot);
+        if (!initOk) {
+            console.error("❌ Initialization failed. Run `evalgate init` manually.");
+            return 1;
+        }
+        if (format === "human")
+            console.log("");
+    }
+    // Step 2: Ensure manifest exists (discover specs)
+    const manifestPath = path.join(projectRoot, ".evalgate", "manifest.json");
+    if (!fs.existsSync(manifestPath)) {
+        if (format === "human") {
+            console.log("🔍 No manifest found. Discovering specs...\n");
+        }
+        try {
+            await (0, discover_1.discoverSpecs)({ manifest: true });
+        }
+        catch (err) {
+            // Discovery may fail if no spec files exist yet — that's OK for legacy mode
+            if (format === "human") {
+                console.log(`   ℹ️  Discovery: ${err instanceof Error ? err.message : String(err)}`);
+                console.log("   Falling back to gate mode...\n");
+            }
+        }
+    }
+    // Step 3: Run evaluations
+    if (format === "human") {
+        console.log("▶️  Running evaluations...\n");
+    }
+    try {
+        const result = await (0, run_1.runEvaluations)({ writeResults: true, format }, projectRoot);
+        if (format === "json") {
+            (0, run_1.printJsonResults)(result);
+        }
+        else {
+            (0, run_1.printHumanResults)(result);
+        }
+        // Step 4: If watch mode requested, transition to watch
+        if (options.watch) {
+            const { runWatch } = await Promise.resolve().then(() => __importStar(require("./watch")));
+            await runWatch({ writeResults: true, format }, projectRoot);
+            return 0; // Never reached (watch runs forever)
+        }
+        return result.summary.failed > 0 ? 1 : 0;
+    }
+    catch (error) {
+        if (format === "human") {
+            console.error(`\n❌ ${error instanceof Error ? error.message : String(error)}`);
+            console.log("\n💡 Tips:");
+            console.log("   • Create spec files with defineEval() in eval/ directory");
+            console.log("   • Run `evalgate discover` to verify spec detection");
+            console.log("   • Run `evalgate doctor` for full diagnostics");
+        }
+        else {
+            console.error(JSON.stringify({
+                error: error instanceof Error ? error.message : String(error),
+            }));
+        }
+        return 1;
+    }
+}