@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate baseline — Baseline management commands
|
|
4
|
+
*
|
|
5
|
+
* Subcommands:
|
|
6
|
+
* evalgate baseline init — Create a starter evals/baseline.json
|
|
7
|
+
* evalgate baseline update — Run tests + update baseline with real scores
|
|
8
|
+
*/
|
|
9
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
12
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
13
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
14
|
+
}
|
|
15
|
+
Object.defineProperty(o, k2, desc);
|
|
16
|
+
}) : (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
o[k2] = m[k];
|
|
19
|
+
}));
|
|
20
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
21
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
22
|
+
}) : function(o, v) {
|
|
23
|
+
o["default"] = v;
|
|
24
|
+
});
|
|
25
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
26
|
+
var ownKeys = function(o) {
|
|
27
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
28
|
+
var ar = [];
|
|
29
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
30
|
+
return ar;
|
|
31
|
+
};
|
|
32
|
+
return ownKeys(o);
|
|
33
|
+
};
|
|
34
|
+
return function (mod) {
|
|
35
|
+
if (mod && mod.__esModule) return mod;
|
|
36
|
+
var result = {};
|
|
37
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
38
|
+
__setModuleDefault(result, mod);
|
|
39
|
+
return result;
|
|
40
|
+
};
|
|
41
|
+
})();
|
|
42
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
43
|
+
exports.runBaselineInit = runBaselineInit;
|
|
44
|
+
exports.runBaselineUpdate = runBaselineUpdate;
|
|
45
|
+
exports.runBaseline = runBaseline;
|
|
46
|
+
const node_child_process_1 = require("node:child_process");
|
|
47
|
+
const fs = __importStar(require("node:fs"));
|
|
48
|
+
const path = __importStar(require("node:path"));
|
|
49
|
+
const BASELINE_REL = "evals/baseline.json";
|
|
50
|
+
/**
 * Work out which package manager a project uses by looking for its lockfile.
 *
 * Lockfiles are probed in priority order (pnpm first, then yarn); when neither
 * is present the answer is "npm".
 *
 * @param cwd - Directory in which to look for lockfiles.
 * @returns "pnpm", "yarn" or "npm".
 */
function detectPackageManager(cwd) {
    // Ordered [lockfile, manager] pairs — the first lockfile found wins.
    const lockfiles = [
        ["pnpm-lock.yaml", "pnpm"],
        ["yarn.lock", "yarn"],
    ];
    for (const [lockfile, manager] of lockfiles) {
        if (fs.existsSync(path.join(cwd, lockfile))) {
            return manager;
        }
    }
    return "npm";
}
|
|
58
|
+
/**
 * Run a package.json script through the package manager detected for `cwd`.
 *
 * The child process inherits this process's stdio, so the script's output goes
 * straight to the terminal.
 *
 * @param cwd - Project root; used both for lockfile detection and as the child's working directory.
 * @param scriptName - Script name passed to `<package manager> run`.
 * @returns The child's exit status, or 1 when no status is available
 *   (the process could not be spawned or was terminated by a signal).
 */
function runScript(cwd, scriptName) {
    const pm = detectPackageManager(cwd);
    const result = (0, node_child_process_1.spawnSync)(pm, ["run", scriptName], {
        cwd,
        stdio: "inherit",
        // A shell is only used on Windows — presumably because the package
        // managers are installed there as .cmd shims (TODO confirm).
        shell: process.platform === "win32",
    });
    if (result.error) {
        // Without this, a missing package manager binary looked exactly like a
        // failing script: exit code 1 and no explanation.
        console.error(`❌ Failed to run '${pm} run ${scriptName}': ${result.error.message}`);
    }
    return result.status ?? 1;
}
|
|
69
|
+
/**
 * `evalgate baseline init` — write a starter baseline file with sample values.
 *
 * Refuses to overwrite an existing baseline. Creates the `evals/` directory
 * when it is missing, writes the file as pretty-printed JSON followed by a
 * newline, then prints follow-up instructions.
 *
 * @param cwd - Project root under which the baseline file is created.
 * @returns 0 when the file was created, 1 when a baseline already exists.
 */
function runBaselineInit(cwd) {
    const target = path.join(cwd, BASELINE_REL);
    if (fs.existsSync(target)) {
        console.log(`⚠ ${BASELINE_REL} already exists. Delete it first or use 'evalgate baseline update'.`);
        return 1;
    }
    // Make sure the evals/ directory is there before writing into it.
    const evalsDir = path.join(cwd, "evals");
    if (!fs.existsSync(evalsDir)) {
        fs.mkdirSync(evalsDir, { recursive: true });
    }
    // Author comes from the environment: USER, then USERNAME, then "unknown".
    const author = process.env.USER || process.env.USERNAME || "unknown";
    const timestamp = new Date().toISOString();
    // Sample thresholds and scores; key order here is the key order in the file.
    const tolerance = {
        scoreDrop: 5,
        passRateDrop: 5,
        maxLatencyIncreaseMs: 200,
        maxCostIncreaseUsd: 0.05,
    };
    const goldenEval = {
        score: 100,
        passRate: 100,
        totalCases: 3,
        passedCases: 3,
    };
    const qualityScore = {
        overall: 90,
        grade: "A",
        accuracy: 85,
        safety: 100,
        latency: 90,
        cost: 90,
        consistency: 90,
    };
    const confidenceTests = {
        unitPassed: true,
        unitTotal: 0,
        dbPassed: true,
        dbTotal: 0,
    };
    const starter = {
        schemaVersion: 1,
        description: "Regression gate baseline — created by evalgate baseline init",
        generatedAt: timestamp,
        generatedBy: author,
        commitSha: "0000000",
        updatedAt: timestamp,
        updatedBy: author,
        tolerance,
        goldenEval,
        qualityScore,
        confidenceTests,
        productMetrics: {},
    };
    fs.writeFileSync(target, `${JSON.stringify(starter, null, 2)}\n`);
    const messages = [
        `✅ Created ${BASELINE_REL} with sample values\n`,
        "Next steps:",
        ` 1. Commit ${BASELINE_REL} to your repo`,
        " 2. Run 'evalgate baseline update' to populate with real scores",
        " 3. Run 'evalgate gate' to verify the regression gate\n",
    ];
    for (const line of messages) {
        console.log(line);
    }
    return 0;
}
|
|
127
|
+
// ── baseline update ──
|
|
128
|
+
/**
 * `evalgate baseline update` — run the project's `eval:baseline-update` script.
 *
 * Checks that `package.json` exists in `cwd`, can be read and parsed, and
 * defines an `eval:baseline-update` script; only then is the script run
 * through the detected package manager.
 *
 * @param cwd - Project root containing package.json.
 * @returns 1 when package.json is missing, unreadable/unparseable, or lacks
 *   the script; otherwise the exit code of the script run.
 */
function runBaselineUpdate(cwd) {
    const pkgPath = path.join(cwd, "package.json");
    if (!fs.existsSync(pkgPath)) {
        console.error("❌ No package.json found. Run this from your project root.");
        return 1;
    }
    let pkg;
    try {
        pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
    }
    catch {
        console.error("❌ Failed to parse package.json");
        return 1;
    }
    // JSON.parse can legitimately return null or a primitive (e.g. a file
    // containing just `null`); guard so that case reports a missing script
    // instead of throwing a TypeError on `pkg.scripts`.
    const scripts = pkg !== null && typeof pkg === "object" ? pkg.scripts : undefined;
    if (!scripts?.["eval:baseline-update"]) {
        console.error("❌ Missing 'eval:baseline-update' script in package.json.");
        console.error(' Add it: "eval:baseline-update": "npx tsx scripts/regression-gate.ts --update-baseline"');
        return 1;
    }
    console.log("📊 Running baseline update...\n");
    return runScript(cwd, "eval:baseline-update");
}
|
|
151
|
+
// ── baseline router ──
|
|
152
|
+
/**
 * Router for `evalgate baseline <subcommand>`.
 *
 * Dispatches `init` and `update` against the current working directory. Any
 * other first argument prints the usage text.
 *
 * @param argv - Arguments following `baseline`; only `argv[0]` is inspected.
 * @returns The subcommand's exit code; for the usage path, 0 when help was
 *   requested explicitly (`--help` / `-h`) and 1 otherwise.
 */
function runBaseline(argv) {
    const sub = argv[0];
    const cwd = process.cwd();
    if (sub === "init") {
        return runBaselineInit(cwd);
    }
    if (sub === "update") {
        return runBaselineUpdate(cwd);
    }
    // The banner previously used the stale command name "evalai"; every usage
    // line below (and the package itself) uses "evalgate".
    const usage = [
        "evalgate baseline — Manage regression gate baselines",
        "",
        "Usage:",
        `evalgate baseline init Create starter ${BASELINE_REL}`,
        "evalgate baseline update Run tests and update baseline with real scores",
        "",
        "Examples:",
        "evalgate baseline init",
        "evalgate baseline update",
        "",
    ].join("\n");
    console.log(usage);
    return sub === "--help" || sub === "-h" ? 0 : 1;
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* evalgate check — CI/CD evaluation gate
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* evalgate check --minScore 92 --evaluationId 42
|
|
7
|
+
* evalgate check --minScore 90 --maxDrop 5 --evaluationId 42
|
|
8
|
+
* evalgate check --policy HIPAA --evaluationId 42
|
|
9
|
+
* evalgate check --baseline published --evaluationId 42
|
|
10
|
+
*
|
|
11
|
+
* Flags:
|
|
12
|
+
* --minScore <n> Fail if quality score < n (0-100)
|
|
13
|
+
* --maxDrop <n> Fail if score dropped > n points from baseline
|
|
14
|
+
* --minN <n> Fail if total test cases < n (low sample size)
|
|
15
|
+
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
16
|
+
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
17
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", "production", or "auto"
|
|
18
|
+
* --evaluationId <id> Required. The evaluation to gate on.
|
|
19
|
+
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
|
|
20
|
+
* --apiKey <key> API key (default: EVALGATE_API_KEY env var)
|
|
21
|
+
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
22
|
+
* fail = create public share link only when gate fails (CI-friendly)
|
|
23
|
+
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
24
|
+
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
25
|
+
*
|
|
26
|
+
* Exit codes:
|
|
27
|
+
* 0 — Gate passed
|
|
28
|
+
* 1 — Gate failed: score below threshold
|
|
29
|
+
* 2 — Gate failed: regression exceeded maxDrop
|
|
30
|
+
* 3 — Gate failed: policy violation
|
|
31
|
+
* 4 — API error / network failure
|
|
32
|
+
* 5 — Invalid arguments
|
|
33
|
+
* 6 — Gate failed: total test cases < minN
|
|
34
|
+
* 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
|
|
35
|
+
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
36
|
+
*
|
|
37
|
+
* Environment:
|
|
38
|
+
* EVALGATE_BASE_URL — API base URL (default: http://localhost:3000)
|
|
39
|
+
* EVALGATE_API_KEY — API key for authentication
|
|
40
|
+
*/
|
|
41
|
+
export { EXIT } from "./constants";
|
|
42
|
+
/** Output format of the check report, selected with `--format`. */
export type FormatType = "human" | "json" | "github";
|
|
43
|
+
/** When to create a public share link (`--share`): always, only when the gate fails, or never. */
export type ShareMode = "always" | "fail" | "never";
|
|
44
|
+
/**
 * Fully resolved arguments for `evalgate check`, as produced by `parseArgs`
 * and consumed by `runCheck`. See the Flags section of this file's header
 * for the corresponding command-line options.
 */
export interface CheckArgs {
|
|
45
|
+
baseUrl: string; // API base URL (--baseUrl)
|
|
46
|
+
apiKey: string; // API key used for authentication (--apiKey)
|
|
47
|
+
minScore: number; // Fail if the quality score is below this value (0-100)
|
|
48
|
+
maxDrop?: number; // Fail if the score dropped more than this many points from baseline
|
|
49
|
+
warnDrop?: number; // Near-regression warning threshold (exit code 8: warnDrop ≤ drop < maxDrop)
|
|
50
|
+
minN?: number; // Fail if the total number of test cases is below this value
|
|
51
|
+
allowWeakEvidence: boolean; // When false, fail when evidenceLevel is 'weak'
|
|
52
|
+
evaluationId: string; // The evaluation to gate on (required)
|
|
53
|
+
policy?: string; // Compliance policy to enforce (e.g. HIPAA, SOC2, GDPR)
|
|
54
|
+
baseline: "published" | "previous" | "production" | "auto"; // Baseline comparison mode
|
|
55
|
+
format: FormatType; // Output format of the report
|
|
56
|
+
explain: boolean; // Set by --explain; NOTE(review): effect not visible in this declaration file
|
|
57
|
+
onFail?: "import"; // "import": when the gate fails, import the run with CI context
|
|
58
|
+
share: ShareMode; // Share-link mode
|
|
59
|
+
prCommentOut?: string; // File to which the PR comment markdown is written
|
|
60
|
+
maxCostUsd?: number; // From --max-cost-usd; presumably a cost ceiling in USD — confirm against the gate logic
|
|
61
|
+
maxLatencyMs?: number; // From --max-latency-ms; presumably a latency ceiling in ms — confirm against the gate logic
|
|
62
|
+
maxCostDeltaUsd?: number; // From --max-cost-delta-usd; presumably a max cost increase in USD — confirm against the gate logic
|
|
63
|
+
}
|
|
64
|
+
/**
 * Result of `parseArgs`, discriminated on `ok`: either the resolved
 * arguments, or an exit code plus a message explaining the rejection.
 */
export type ParseArgsResult = {
|
|
65
|
+
ok: true;
|
|
66
|
+
args: CheckArgs;
|
|
67
|
+
} | {
|
|
68
|
+
ok: false;
|
|
69
|
+
exitCode: number;
|
|
70
|
+
message: string;
|
|
71
|
+
};
|
|
72
|
+
/**
 * Parse `evalgate check` command-line arguments.
 *
 * @param argv - Argument list, without the node executable and script path.
 * @returns `{ ok: true, args }` on success, or `{ ok: false, exitCode, message }` when the arguments are rejected.
 */
export declare function parseArgs(argv: string[]): ParseArgsResult;
|
|
73
|
+
/**
 * Run the evaluation gate for the given arguments.
 *
 * @param args - Resolved arguments, typically from `parseArgs`.
 * @returns A promise resolving to the process exit code (see "Exit codes" in this file's header).
 */
export declare function runCheck(args: CheckArgs): Promise<number>;
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* evalgate check — CI/CD evaluation gate
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* evalgate check --minScore 92 --evaluationId 42
|
|
8
|
+
* evalgate check --minScore 90 --maxDrop 5 --evaluationId 42
|
|
9
|
+
* evalgate check --policy HIPAA --evaluationId 42
|
|
10
|
+
* evalgate check --baseline published --evaluationId 42
|
|
11
|
+
*
|
|
12
|
+
* Flags:
|
|
13
|
+
* --minScore <n> Fail if quality score < n (0-100)
|
|
14
|
+
* --maxDrop <n> Fail if score dropped > n points from baseline
|
|
15
|
+
* --minN <n> Fail if total test cases < n (low sample size)
|
|
16
|
+
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
17
|
+
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
18
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", "production", or "auto"
|
|
19
|
+
* --evaluationId <id> Required. The evaluation to gate on.
|
|
20
|
+
* --baseUrl <url> API base URL (default: EVALGATE_BASE_URL or http://localhost:3000)
|
|
21
|
+
* --apiKey <key> API key (default: EVALGATE_API_KEY env var)
|
|
22
|
+
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
23
|
+
* fail = create public share link only when gate fails (CI-friendly)
|
|
24
|
+
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
25
|
+
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
26
|
+
*
|
|
27
|
+
* Exit codes:
|
|
28
|
+
* 0 — Gate passed
|
|
29
|
+
* 1 — Gate failed: score below threshold
|
|
30
|
+
* 2 — Gate failed: regression exceeded maxDrop
|
|
31
|
+
* 3 — Gate failed: policy violation
|
|
32
|
+
* 4 — API error / network failure
|
|
33
|
+
* 5 — Invalid arguments
|
|
34
|
+
* 6 — Gate failed: total test cases < minN
|
|
35
|
+
* 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
|
|
36
|
+
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
37
|
+
*
|
|
38
|
+
* Environment:
|
|
39
|
+
* EVALGATE_BASE_URL — API base URL (default: http://localhost:3000)
|
|
40
|
+
* EVALGATE_API_KEY — API key for authentication
|
|
41
|
+
*/
|
|
42
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
43
|
+
if (k2 === undefined) k2 = k;
|
|
44
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
45
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
46
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
47
|
+
}
|
|
48
|
+
Object.defineProperty(o, k2, desc);
|
|
49
|
+
}) : (function(o, m, k, k2) {
|
|
50
|
+
if (k2 === undefined) k2 = k;
|
|
51
|
+
o[k2] = m[k];
|
|
52
|
+
}));
|
|
53
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
54
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
55
|
+
}) : function(o, v) {
|
|
56
|
+
o["default"] = v;
|
|
57
|
+
});
|
|
58
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
59
|
+
var ownKeys = function(o) {
|
|
60
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
61
|
+
var ar = [];
|
|
62
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
63
|
+
return ar;
|
|
64
|
+
};
|
|
65
|
+
return ownKeys(o);
|
|
66
|
+
};
|
|
67
|
+
return function (mod) {
|
|
68
|
+
if (mod && mod.__esModule) return mod;
|
|
69
|
+
var result = {};
|
|
70
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
71
|
+
__setModuleDefault(result, mod);
|
|
72
|
+
return result;
|
|
73
|
+
};
|
|
74
|
+
})();
|
|
75
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
76
|
+
exports.EXIT = void 0;
|
|
77
|
+
exports.parseArgs = parseArgs;
|
|
78
|
+
exports.runCheck = runCheck;
|
|
79
|
+
const fs = __importStar(require("node:fs"));
|
|
80
|
+
const path = __importStar(require("node:path"));
|
|
81
|
+
const api_1 = require("./api");
|
|
82
|
+
const ci_context_1 = require("./ci-context");
|
|
83
|
+
const config_1 = require("./config");
|
|
84
|
+
const constants_1 = require("./constants");
|
|
85
|
+
const github_1 = require("./formatters/github");
|
|
86
|
+
const human_1 = require("./formatters/human");
|
|
87
|
+
const json_1 = require("./formatters/json");
|
|
88
|
+
const pr_comment_1 = require("./formatters/pr-comment");
|
|
89
|
+
const gate_1 = require("./gate");
|
|
90
|
+
const build_check_report_1 = require("./report/build-check-report");
|
|
91
|
+
var constants_2 = require("./constants");
|
|
92
|
+
Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_2.EXIT; } });
|
|
93
|
+
/**
 * Parse and validate `evalgate check` command-line arguments.
 *
 * Flags may be written as `--key value`, `--key=value`, or a bare `--key`
 * (stored as "true"). Values are then combined with environment variables and
 * with the result of `mergeConfigWithArgs` over the config loaded from the
 * current working directory: for the gate thresholds an explicit flag wins,
 * and the merged value is only used when the flag was not given.
 *
 * @param argv - Argument list without the node executable and script path.
 * @returns `{ ok: true, args }` with the resolved arguments, or
 *   `{ ok: false, exitCode: EXIT.BAD_ARGS, message }` when the API key or
 *   evaluation id is missing or a numeric threshold is invalid.
 */
function parseArgs(argv) {
    const badArgs = (message) => ({
        ok: false,
        exitCode: constants_1.EXIT.BAD_ARGS,
        message,
    });
    // Collect raw flag values as strings.
    const args = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith("--")) {
            continue;
        }
        const key = arg.slice(2);
        // `--key=value`: previously stored under the literal key "key=value"
        // and therefore silently ignored (e.g. `--minScore=92` gated at 0).
        const eq = key.indexOf("=");
        if (eq > 0) {
            args[key.slice(0, eq)] = key.slice(eq + 1);
            continue;
        }
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith("--")) {
            args[key] = next;
            i++;
        }
        else {
            args[key] = "true";
        }
    }
    let baseUrl = args.baseUrl || process.env.EVALGATE_BASE_URL || "http://localhost:3000";
    // EVALAI_API_KEY is accepted as a fallback after EVALGATE_API_KEY.
    const apiKey = args.apiKey ||
        process.env.EVALGATE_API_KEY ||
        process.env.EVALAI_API_KEY ||
        "";
    let minScore = parseInt(args.minScore || "0", 10);
    let maxDrop = args.maxDrop ? parseInt(args.maxDrop, 10) : undefined;
    let warnDrop = args.warnDrop ? parseInt(args.warnDrop, 10) : undefined;
    let minN = args.minN ? parseInt(args.minN, 10) : undefined;
    let allowWeakEvidence = args.allowWeakEvidence === "true" || args.allowWeakEvidence === "1";
    let evaluationId = args.evaluationId || "";
    const policy = args.policy || undefined;
    // Unrecognised values for format / share / baseline fall back to the default.
    const formatRaw = args.format || "human";
    const format = formatRaw === "json" ? "json" : formatRaw === "github" ? "github" : "human";
    const explain = args.explain === "true" || args.explain === "1";
    const onFail = args.onFail === "import" ? "import" : undefined;
    const shareRaw = args.share || "never";
    const share = shareRaw === "always" ? "always" : shareRaw === "fail" ? "fail" : "never";
    const prCommentOut = args["pr-comment-out"] || args.prCommentOut || undefined;
    // Budget flags accept both kebab-case and camelCase spellings.
    const maxCostUsd = args["max-cost-usd"] || args.maxCostUsd
        ? parseFloat(args["max-cost-usd"] || args.maxCostUsd || "0")
        : undefined;
    const maxLatencyMs = args["max-latency-ms"] || args.maxLatencyMs
        ? parseInt(args["max-latency-ms"] || args.maxLatencyMs || "0", 10)
        : undefined;
    const maxCostDeltaUsd = args["max-cost-delta-usd"] || args.maxCostDeltaUsd
        ? parseFloat(args["max-cost-delta-usd"] || args.maxCostDeltaUsd || "0")
        : undefined;
    const profile = args.profile;
    let baseline = (args.baseline === "auto"
        ? "auto"
        : args.baseline === "previous"
            ? "previous"
            : args.baseline === "production"
                ? "production"
                : "published");
    const config = (0, config_1.loadConfig)(process.cwd());
    const merged = (0, config_1.mergeConfigWithArgs)(config, {
        evaluationId: args.evaluationId,
        baseUrl: args.baseUrl ||
            process.env.EVALGATE_BASE_URL ||
            process.env.EVALAI_BASE_URL,
        minScore: args.minScore,
        maxDrop: args.maxDrop,
        warnDrop: args.warnDrop,
        minN: args.minN,
        allowWeakEvidence: args.allowWeakEvidence,
        baseline: args.baseline,
        profile: profile,
        prCommentOut: args["pr-comment-out"] ?? args.prCommentOut,
    });
    // Fill in from the merged config only where no explicit flag was given.
    if (!evaluationId && merged.evaluationId)
        evaluationId = merged.evaluationId;
    if (merged.baseUrl)
        baseUrl = merged.baseUrl;
    if (merged.minScore != null && args.minScore === undefined)
        minScore = merged.minScore ?? 0;
    if (merged.maxDrop != null && args.maxDrop === undefined)
        maxDrop = merged.maxDrop;
    if (merged.warnDrop != null && args.warnDrop === undefined)
        warnDrop = merged.warnDrop;
    if (merged.minN != null && args.minN === undefined)
        minN = merged.minN;
    if (merged.allowWeakEvidence != null && args.allowWeakEvidence === undefined)
        allowWeakEvidence = merged.allowWeakEvidence ?? false;
    if (merged.baseline && !args.baseline)
        baseline = merged.baseline;
    if (!apiKey) {
        return badArgs("Error: --apiKey or EVALGATE_API_KEY is required");
    }
    if (!evaluationId) {
        return badArgs("Run npx evalgate init and paste your evaluationId, or pass --evaluationId.");
    }
    if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
        return badArgs("Error: --minScore must be 0-100");
    }
    if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
        return badArgs("Error: --minN must be a positive number");
    }
    // A non-numeric --maxDrop / --warnDrop used to pass through as NaN, and
    // every comparison against NaN is false — so reject it explicitly.
    if (maxDrop !== undefined && Number.isNaN(maxDrop)) {
        return badArgs("Error: --maxDrop must be a number");
    }
    if (warnDrop !== undefined && Number.isNaN(warnDrop)) {
        return badArgs("Error: --warnDrop must be a number");
    }
    return {
        ok: true,
        args: {
            baseUrl,
            apiKey,
            minScore,
            maxDrop,
            warnDrop,
            minN,
            allowWeakEvidence,
            evaluationId,
            policy,
            baseline,
            format,
            explain,
            onFail,
            share,
            prCommentOut,
            // Unparseable budget values are dropped rather than rejected.
            maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd)
                ? maxCostUsd
                : undefined,
            maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs)
                ? maxLatencyMs
                : undefined,
            maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd)
                ? maxCostDeltaUsd
                : undefined,
        },
    };
}
|
|
234
|
+
/**
 * Execute the evaluation gate.
 *
 * Steps, in order: fetch the latest quality data; fetch run details when a
 * run id is present; evaluate the gate; optionally publish a share link;
 * build the report and save it to `.evalgate/last-report.json` (best effort);
 * print the formatted report; optionally write the PR comment file; and, when
 * the gate failed with `onFail === "import"`, import the run's results.
 *
 * @param args - Resolved check arguments.
 * @returns `EXIT.API_ERROR` when the quality fetch fails, otherwise the exit
 *   code chosen by the gate evaluation.
 */
async function runCheck(args) {
    const latest = await (0, api_1.fetchQualityLatest)(args.baseUrl, args.apiKey, args.evaluationId, args.baseline);
    if (!latest.ok) {
        // Status 0 is reported as a network failure; anything else as an API error.
        const failure = latest.status === 0
            ? `EvalGate gate ERROR: Network failure — ${latest.body}`
            : `EvalGate gate ERROR: API returned ${latest.status} — ${latest.body}`;
        console.error(failure);
        return constants_1.EXIT.API_ERROR;
    }
    const quality = latest.data;
    const requestId = latest.requestId;
    // Run details are optional: a failed fetch leaves them null.
    let runDetails = null;
    const evaluationRunId = quality?.evaluationRunId;
    if (evaluationRunId != null) {
        const detailsRes = await (0, api_1.fetchRunDetails)(args.baseUrl, args.apiKey, args.evaluationId, evaluationRunId);
        if (detailsRes.ok) {
            runDetails = detailsRes.data;
        }
    }
    const gateResult = (0, gate_1.evaluateGate)(args, quality);
    // The share link is created before the report so the report (and the PR
    // comment built from it) can carry shareUrl.
    let shareUrl;
    const shareRequested = args.share === "always" || (args.share === "fail" && !gateResult.passed);
    if (quality?.evaluationRunId != null && shareRequested) {
        const exported = await (0, api_1.fetchRunExport)(args.baseUrl, args.apiKey, args.evaluationId, quality.evaluationRunId);
        if (exported.ok) {
            const published = await (0, api_1.publishShare)(args.baseUrl, args.apiKey, args.evaluationId, exported.exportData, quality.evaluationRunId);
            if (published.ok) {
                shareUrl = published.data.shareUrl;
                console.error(`\nPublic share link created: ${shareUrl}`);
            }
        }
    }
    const ci = (0, ci_context_1.captureCiContext)();
    const report = (0, build_check_report_1.buildCheckReport)({
        args,
        quality,
        runDetails,
        gateResult,
        requestId,
        shareUrl,
        baselineRunId: quality?.baselineRunId ?? undefined,
        ciRunUrl: ci?.runUrl ?? undefined,
    });
    // Save the report so `evalgate explain` can pick it up without flags.
    try {
        const artifactDir = path.join(process.cwd(), ".evalgate");
        if (!fs.existsSync(artifactDir)) {
            fs.mkdirSync(artifactDir, { recursive: true });
        }
        const artifactPath = path.join(artifactDir, "last-report.json");
        fs.writeFileSync(artifactPath, JSON.stringify(report, null, 2), "utf8");
    }
    catch {
        // Deliberately ignored: the artifact is best effort and must not fail the gate.
    }
    let rendered;
    switch (args.format) {
        case "json":
            rendered = (0, json_1.formatJson)(report);
            break;
        case "github":
            rendered = (0, github_1.formatGitHub)(report);
            break;
        default:
            rendered = (0, human_1.formatHuman)(report);
            break;
    }
    console.log(rendered);
    // Point the user at the next step when the gate failed.
    if (!gateResult.passed) {
        console.error("\nNext: evalgate explain");
    }
    // --pr-comment-out: write the markdown for a GitHub Action to post.
    if (args.prCommentOut) {
        try {
            fs.writeFileSync(args.prCommentOut, (0, pr_comment_1.buildPrComment)(report), "utf8");
        }
        catch (err) {
            console.error(`EvalGate: failed to write PR comment to ${args.prCommentOut}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // --onFail import: on a failed gate, import the run together with CI context.
    const shouldImport = !gateResult.passed &&
        args.onFail === "import" &&
        runDetails?.results &&
        quality?.evaluationRunId;
    if (shouldImport) {
        // Only results tied to a test case and with a definitive status are imported.
        const importable = runDetails.results.flatMap((r) => {
            const definitive = r.status === "passed" || r.status === "failed";
            if (r.testCaseId == null || !definitive) {
                return [];
            }
            return [{
                    testCaseId: r.testCaseId,
                    status: r.status,
                    output: r.output ?? "",
                    latencyMs: r.durationMs,
                    assertionsJson: r.assertionsJson,
                }];
        });
        if (importable.length > 0) {
            let idempotencyKey;
            if (ci) {
                idempotencyKey = (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci);
            }
            const importRes = await (0, api_1.importRunOnFail)(args.baseUrl, args.apiKey, args.evaluationId, importable, {
                idempotencyKey,
                ci,
                importClientVersion: "evalgate-cli",
                checkReport: report,
            });
            if (!importRes.ok) {
                console.error(`EvalGate import (onFail): ${importRes.status} — ${importRes.body}`);
            }
        }
    }
    return gateResult.exitCode;
}
|
|
341
|
+
// Main entry point: runs only when this file is the module node was started
// with (require.main === module), not when it is required by another module.
const isDirectRun = typeof require !== "undefined" && require.main === module;
if (isDirectRun) {
    const cli = parseArgs(process.argv.slice(2));
    if (!cli.ok) {
        console.error(cli.message);
        process.exit(cli.exitCode);
    }
    // Exit with the gate's code; any unexpected rejection maps to EXIT.API_ERROR.
    (async () => {
        try {
            const code = await runCheck(cli.args);
            process.exit(code);
        }
        catch (err) {
            console.error(`EvalGate gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
            process.exit(constants_1.EXIT.API_ERROR);
        }
    })();
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CI context capture and idempotency key for --onFail import.
|
|
3
|
+
*/
|
|
4
|
+
import type { CiContext } from "./api";
|
|
5
|
+
/**
 * Capture the CI context of the current process.
 *
 * @returns The captured context, or `undefined` when none is available.
 *   NOTE(review): the detection rules are not visible in this declaration file.
 */
export declare function captureCiContext(): CiContext | undefined;
|
|
6
|
+
/**
 * Compute the idempotency key used for `--onFail import`.
 *
 * @param evaluationId - The evaluation being gated.
 * @param ci - CI context, e.g. as returned by `captureCiContext`.
 * @returns The key, or `undefined`.
 *   NOTE(review): the conditions under which no key is produced are not visible here.
 */
export declare function computeIdempotencyKey(evaluationId: string, ci: CiContext): string | undefined;
|