@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* GitHub formatter for evalgate check.
|
|
4
|
+
* - stdout: minimal (verdict + score + link) + ::error annotations for failed cases
|
|
5
|
+
* - Step summary: full Markdown written to GITHUB_STEP_SUMMARY (not stdout)
|
|
6
|
+
*/
|
|
7
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
8
|
+
if (k2 === undefined) k2 = k;
|
|
9
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
10
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
11
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
12
|
+
}
|
|
13
|
+
Object.defineProperty(o, k2, desc);
|
|
14
|
+
}) : (function(o, m, k, k2) {
|
|
15
|
+
if (k2 === undefined) k2 = k;
|
|
16
|
+
o[k2] = m[k];
|
|
17
|
+
}));
|
|
18
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
19
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
20
|
+
}) : function(o, v) {
|
|
21
|
+
o["default"] = v;
|
|
22
|
+
});
|
|
23
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
24
|
+
var ownKeys = function(o) {
|
|
25
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
26
|
+
var ar = [];
|
|
27
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
28
|
+
return ar;
|
|
29
|
+
};
|
|
30
|
+
return ownKeys(o);
|
|
31
|
+
};
|
|
32
|
+
return function (mod) {
|
|
33
|
+
if (mod && mod.__esModule) return mod;
|
|
34
|
+
var result = {};
|
|
35
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
36
|
+
__setModuleDefault(result, mod);
|
|
37
|
+
return result;
|
|
38
|
+
};
|
|
39
|
+
})();
|
|
40
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
41
|
+
exports.appendStepSummary = appendStepSummary;
|
|
42
|
+
exports.formatGitHub = formatGitHub;
|
|
43
|
+
const fs = __importStar(require("node:fs"));
|
|
44
|
+
const snippet_1 = require("../render/snippet");
|
|
45
|
+
const ANNOTATION_MAX = 10;
|
|
46
|
+
/**
 * Encode text for the message part of a GitHub Actions workflow command
 * (`::error ...::<message>`).
 *
 * The Actions runner percent-decodes `%25`, `%0D` and `%0A` in command data,
 * so a literal `%` has to be encoded as `%25`; otherwise model output that
 * happens to contain e.g. `%0A` would be decoded into a line break and
 * corrupt the annotation. Carriage returns are dropped and newlines become
 * `%0A` so the whole annotation stays on one stdout line.
 *
 * @param s Raw annotation message.
 * @returns The message, safe to place after `::` in a workflow command.
 */
function escapeAnnotationMessage(s) {
    return s
        .replace(/%/g, "%25") // must run first so the `%0A` added below is not re-encoded
        .replace(/\r/g, "")
        .replace(/\n/g, "%0A");
}
|
|
49
|
+
/**
 * Render one failed case as a GitHub Actions `::error` workflow command.
 *
 * The case is identified by `testCaseId`, then `name`, then "unknown". The
 * detail text is the first available of `reason`, `outputSnippet`, `output`,
 * or "no output", passed through truncateSnippet with a limit of 100.
 *
 * @param fc Failed case to annotate.
 * @returns A single `::error title=EvalGate regression::...` line.
 */
function formatAnnotation(fc) {
    const caseLabel = fc.testCaseId ?? fc.name ?? "unknown";
    const rawDetail = fc.reason ?? fc.outputSnippet ?? fc.output ?? "no output";
    const detail = (0, snippet_1.truncateSnippet)(rawDetail, 100);
    const encoded = escapeAnnotationMessage(`TestCase ${caseLabel} failed - ${detail}`);
    return `::error title=EvalGate regression::${encoded}`;
}
|
|
55
|
+
/**
 * Append the full Markdown gate summary to the file named by the
 * GITHUB_STEP_SUMMARY environment variable.
 *
 * Returns without doing anything when the variable is unset or empty. The
 * summary contains the verdict line, the score (with baseline delta when both
 * baseline score and delta are present), up to 10 failing cases, an optional
 * dashboard link, and a tip block for any non-pass verdict.
 *
 * @param report Check report to render.
 */
function appendStepSummary(report) {
    const summaryFile = typeof process !== "undefined" && process.env?.GITHUB_STEP_SUMMARY;
    if (!summaryFile) {
        return;
    }
    const { verdict } = report;
    const isPass = verdict === "pass";
    let headline;
    if (isPass) {
        headline = "✅ **PASSED**";
    }
    else {
        const why = report.reasonMessage ?? report.reasonCode;
        headline = verdict === "warn"
            ? `⚠️ **WARNED**: ${why}`
            : `❌ **FAILED**: ${why}`;
    }
    const md = ["## EvalGate Gate", "", headline, ""];
    // Baseline comparison is only shown when both numbers are available.
    let baselineNote = "";
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? "+" : "";
        baselineNote = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    md.push(`**Score:** ${report.score ?? 0}/100${baselineNote}`, "");
    const failures = report.failedCases ?? [];
    const total = failures.length;
    if (total > 0) {
        md.push(`### ${total} failing case${total === 1 ? "" : "s"}`, "");
        failures.slice(0, 10).forEach((fc) => {
            const title = fc.name ?? fc.input ?? "(unnamed)";
            const expected = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 80);
            const actual = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 80);
            const outcome = actual ? `got "${actual}"` : "no output";
            md.push(`- **${(0, snippet_1.truncateSnippet)(title, 60)}** — expected: ${expected || "(unknown)"}, ${outcome}`);
        });
        if (total > 10) {
            md.push(`- _+ ${total - 10} more_`);
        }
        md.push("");
    }
    if (report.dashboardUrl) {
        md.push(`[View Dashboard](${report.dashboardUrl})`, "");
    }
    if (!isPass) {
        md.push("> **Tip:** Run `evalgate explain` locally to see root causes and suggested fixes.", "> Report saved to `.evalgate/last-report.json` — upload as a build artifact for offline analysis.", "");
    }
    try {
        fs.appendFileSync(summaryFile, md.join("\n"), "utf8");
    }
    catch {
        // Deliberately ignored: the step summary is best-effort and must not fail the check.
    }
}
|
|
107
|
+
/**
 * Build the stdout text for the GitHub output format and, as a side effect,
 * append the full Markdown summary via appendStepSummary.
 *
 * Stdout is kept minimal: `::error` annotations for the first ANNOTATION_MAX
 * failed cases, then the verdict line, the score line, and the dashboard URL
 * when one is present.
 *
 * @param report Check report to render.
 * @returns The stdout lines joined with "\n".
 */
function formatGitHub(report) {
    // Annotations come first, one per failed case up to the cap.
    const out = (report.failedCases ?? [])
        .slice(0, ANNOTATION_MAX)
        .map((fc) => formatAnnotation(fc));
    const why = report.reasonMessage ?? report.reasonCode;
    switch (report.verdict) {
        case "pass":
            out.push("\n✓ EvalGate gate PASSED");
            break;
        case "warn":
            out.push(`\n⚠ EvalGate gate WARNED: ${why}`);
            break;
        default:
            out.push(`\n✗ EvalGate gate FAILED: ${why}`);
    }
    let baselineNote = "";
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? "+" : "";
        baselineNote = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    out.push(`Score: ${report.score ?? 0}/100${baselineNote}`);
    if (report.dashboardUrl) {
        out.push(`Dashboard: ${report.dashboardUrl}`);
    }
    // The detailed Markdown goes to GITHUB_STEP_SUMMARY, never to stdout.
    appendStepSummary(report);
    return out.join("\n");
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Human-readable formatter for evalgate check output.
|
|
4
|
+
* Deterministic: verdict → score → failures → link → hint.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.formatHuman = formatHuman;
|
|
8
|
+
const snippet_1 = require("../render/snippet");
|
|
9
|
+
const TOP_N = 3;
|
|
10
|
+
/**
 * Render a check report as plain text for a terminal.
 *
 * Output order is fixed: verdict, score, failing cases (first TOP_N), dashboard
 * link, next-step hint for non-pass verdicts, and - when `report.explain` is
 * set and at least one of breakdown01 / contribPts / flags / policyEvidence
 * is present - an "--- Explain ---" section.
 *
 * @param report Check report to render.
 * @returns The lines joined with "\n".
 */
function formatHuman(report) {
    const { verdict } = report;
    const isPass = verdict === "pass";
    const out = [];
    if (isPass) {
        out.push("\n✓ EvalGate gate PASSED");
    }
    else {
        const why = report.reasonMessage ?? report.reasonCode;
        out.push(verdict === "warn"
            ? `\n⚠ EvalGate gate WARNED: ${why}`
            : `\n✗ EvalGate gate FAILED: ${why}`);
    }
    let baselineNote = "";
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? "+" : "";
        baselineNote = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    out.push(`Score: ${report.score ?? 0}/100${baselineNote}`);
    const failures = report.failedCases ?? [];
    if (failures.length > 0) {
        const shown = failures.slice(0, TOP_N);
        out.push(`${failures.length} failing case${failures.length === 1 ? "" : "s"}:`);
        shown.forEach((fc) => {
            const title = fc.name ?? fc.input ?? "(unnamed)";
            const expected = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 50);
            const actual = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 50);
            const outcome = actual ? `got "${actual}"` : "no output";
            out.push(` - "${(0, snippet_1.truncateSnippet)(title, 50)}" → expected: ${expected || "(unknown)"}, ${outcome}`);
        });
        const hidden = failures.length - shown.length;
        if (hidden > 0) {
            out.push(` + ${hidden} more`);
        }
    }
    if (report.dashboardUrl) {
        out.push(`Dashboard: ${report.dashboardUrl}`);
    }
    if (!isPass) {
        out.push("Next: View full report above, fix failing cases, or adjust gate with --minScore / --maxDrop / --warnDrop");
    }
    const hasExplainData = report.breakdown01 ||
        report.contribPts ||
        report.flags?.length ||
        report.policyEvidence;
    if (!(report.explain && hasExplainData)) {
        return out.join("\n");
    }
    // Keeps only the entries whose value is set and renders them as "<label><sep><value>".
    const renderPresent = (pairs, sep) => pairs
        .filter(([, value]) => value != null)
        .map(([label, value]) => `${label}${sep}${value}`);
    out.push("", "--- Explain ---");
    if (report.contribPts) {
        const cp = report.contribPts;
        const contrib = renderPresent([
            ["passRate", cp.passRatePts],
            ["safety", cp.safetyPts],
            ["compliance", cp.compliancePts],
            ["performance", cp.performancePts],
        ], ": ");
        if (contrib.length) {
            out.push(`Contrib pts: ${contrib.join(", ")}`);
        }
    }
    if (report.breakdown01) {
        const b = report.breakdown01;
        const breakdown = renderPresent([
            ["passRate", b.passRate],
            ["safety", b.safety],
            ["judge", b.judge],
            ["schema", b.schema],
            ["latency", b.latency],
            ["cost", b.cost],
        ], "=");
        if (breakdown.length) {
            out.push(`Breakdown: ${breakdown.join(", ")}`);
        }
    }
    if (report.flags && report.flags.length > 0) {
        out.push(`Flags: ${report.flags.join(", ")}`);
    }
    if (report.thresholds) {
        const t = report.thresholds;
        const limits = renderPresent([
            ["minScore", t.minScore],
            ["maxDrop", t.maxDrop],
            ["minN", t.minN],
        ], "=");
        if (limits.length) {
            out.push(`Thresholds: ${limits.join(", ")}`);
        }
    }
    if (report.policyEvidence) {
        const evidence = report.policyEvidence;
        out.push(`Policy sub-check failed: ${evidence.failedCheck ?? "unknown"}`);
        if (evidence.remediation) {
            out.push(`Remediation: ${evidence.remediation}`);
        }
        if (evidence.snapshot && Object.keys(evidence.snapshot).length > 0) {
            out.push(`Snapshot: ${JSON.stringify(evidence.snapshot)}`);
        }
    }
    return out.join("\n");
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* JSON formatter for evalgate check.
|
|
4
|
+
* Outputs only JSON, no extra logs.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.formatJson = formatJson;
|
|
8
|
+
/**
 * Serialize a check report as compact, single-line JSON.
 *
 * @param report Check report to serialize.
 * @returns The JSON text with no indentation.
 */
function formatJson(report) {
    // Omitting the replacer and indent arguments yields the same compact output as `null, 0`.
    const serialized = JSON.stringify(report);
    return serialized;
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PR comment markdown builder for evalgate check --pr-comment-out.
|
|
3
|
+
* Produces deterministic markdown for GitHub Action to post as PR comment.
|
|
4
|
+
*/
|
|
5
|
+
import type { CheckReport } from "./types";
|
|
6
|
+
/**
|
|
7
|
+
* Hidden marker for GitHub Action to find and update existing comment (sticky update).
|
|
8
|
+
* The Action should: 1) read the comment body from the output file, 2) search the PR's comments for this marker, 3) update the matching comment if one is found, otherwise create a new comment.
|
|
9
|
+
* Export for use in Action scripts.
|
|
10
|
+
*/
|
|
11
|
+
export declare const PR_COMMENT_MARKER = "<!-- evalgate-gate-comment -->";
|
|
12
|
+
/**
 * Builds the Markdown body of the PR comment for a check report.
 *
 * @param report The check report to render.
 * @returns The Markdown text; its first line is PR_COMMENT_MARKER.
 */
export declare function buildPrComment(report: CheckReport): string;
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PR comment markdown builder for evalgate check --pr-comment-out.
|
|
4
|
+
* Produces deterministic markdown for GitHub Action to post as PR comment.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.PR_COMMENT_MARKER = void 0;
|
|
8
|
+
exports.buildPrComment = buildPrComment;
|
|
9
|
+
const snippet_1 = require("../render/snippet");
|
|
10
|
+
const TOP_FAILURES = 3;
|
|
11
|
+
/**
 * Make free-form text safe to embed on a single line of the PR comment.
 *
 * Pipes are backslash-escaped, and every line break - `\r\n`, a lone `\r`, or
 * `\n` - is replaced by one space. Previously only `\n` was handled, so CRLF
 * input kept a stray carriage return in the "single-line" result.
 *
 * @param s Raw text (e.g. a case name or reason message).
 * @returns The text on one line with `|` escaped.
 */
function escapeMarkdown(s) {
    return s.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}
|
|
14
|
+
/**
|
|
15
|
+
* Hidden marker for GitHub Action to find and update existing comment (sticky update).
|
|
16
|
+
* The Action should: 1) read the comment body from the output file, 2) search the PR's comments for this marker, 3) update the matching comment if one is found, otherwise create a new comment.
|
|
17
|
+
* Export for use in Action scripts.
|
|
18
|
+
*/
|
|
19
|
+
exports.PR_COMMENT_MARKER = "<!-- evalgate-gate-comment -->";
|
|
20
|
+
/**
 * Build the Markdown body of the PR comment for a check report.
 *
 * Layout: hidden sticky-comment marker, verdict heading, score (with baseline
 * delta when available), reason code and message, optional policy, the first
 * TOP_FAILURES failed cases, an optional contribution breakdown (only with
 * `report.explain`), and dashboard / share links.
 *
 * A "warn" verdict gets its own WARNED heading, matching the GitHub and human
 * formatters, instead of being reported as FAILED.
 *
 * @param report Check report to render.
 * @returns The comment Markdown, lines joined with "\n".
 */
function buildPrComment(report) {
    const lines = [];
    lines.push(exports.PR_COMMENT_MARKER);
    lines.push("");
    // Only an explicit `false` means the gate was skipped; a missing field counts as applied.
    const gateApplied = report.gateApplied !== false;
    // Verdict heading - distinguish PASSED / WARNED / FAILED from "NOT APPLIED"
    if (!gateApplied) {
        lines.push("## ⚠️ EvalGate Regression Gate — NOT APPLIED");
        lines.push("");
        lines.push("**Gate not applied: baseline missing.**");
        if (report.actionableMessage) {
            lines.push("");
            lines.push(report.actionableMessage);
        }
    }
    else if (report.verdict === "pass") {
        lines.push("## ✅ EvalGate Regression Gate — PASSED");
    }
    else if (report.verdict === "warn") {
        lines.push("## ⚠️ EvalGate Regression Gate — WARNED");
    }
    else {
        lines.push("## 🚨 EvalGate Regression Gate — FAILED");
    }
    lines.push("");
    // Score is always shown; the delta suffix is omitted when there is no baseline score or delta
    const deltaStr = report.baselineScore != null && report.delta != null
        ? ` (${report.delta >= 0 ? "+" : ""}${report.delta} from baseline ${report.baselineScore})`
        : "";
    lines.push(`**Score:** ${report.score ?? 0}/100${deltaStr}`);
    lines.push("");
    // Reason code, plus the human-readable message when present
    lines.push(`**Reason:** ${report.reasonCode}`);
    if (report.reasonMessage) {
        lines.push(`_${escapeMarkdown(report.reasonMessage)}_`);
    }
    lines.push("");
    // Policy (only when the report names one)
    if (report.policy) {
        lines.push(`**Policy:** ${report.policy}`);
        lines.push("");
    }
    // Top failures (at most TOP_FAILURES)
    const failedCases = report.failedCases ?? [];
    if (failedCases.length > 0) {
        lines.push("### Top Issues");
        lines.push("");
        for (const fc of failedCases.slice(0, TOP_FAILURES)) {
            const label = fc.name ?? fc.input ?? "(unnamed)";
            const reason = fc.reason ?? fc.outputSnippet ?? fc.output ?? "no output";
            lines.push(`- **${(0, snippet_1.truncateSnippet)(escapeMarkdown(label), 60)}** — ${(0, snippet_1.truncateSnippet)(escapeMarkdown(reason), 80)}`);
        }
        if (failedCases.length > TOP_FAILURES) {
            lines.push(`- _+ ${failedCases.length - TOP_FAILURES} more_`);
        }
        lines.push("");
    }
    // Explain summary (only with --explain and when contribution points exist)
    if (report.explain && report.contribPts) {
        const pts = report.contribPts;
        const parts = [];
        if (pts.passRatePts != null)
            parts.push(`pass rate: ${pts.passRatePts} pts`);
        if (pts.safetyPts != null)
            parts.push(`safety: ${pts.safetyPts} pts`);
        if (pts.compliancePts != null)
            parts.push(`compliance: ${pts.compliancePts} pts`);
        if (pts.performancePts != null)
            parts.push(`performance: ${pts.performancePts} pts`);
        if (parts.length > 0) {
            lines.push("### Breakdown");
            lines.push("");
            lines.push(parts.join(" | "));
            lines.push("");
        }
    }
    // Dashboard URL
    if (report.dashboardUrl) {
        lines.push(`🔎 [Dashboard](${report.dashboardUrl})`);
    }
    // Share URL (if present)
    if (report.shareUrl) {
        lines.push(`🔗 [Share Snapshot](${report.shareUrl})`);
    }
    lines.push("");
    return lines.join("\n");
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CheckReport and related types for formatters.
|
|
3
|
+
*/
|
|
4
|
+
/** Verdict of a gate check; the formatters render these as PASSED / WARNED / FAILED. */
export type GateVerdict =
    | "pass"
    | "warn"
    | "fail";
|
|
5
|
+
/** "neutral" = exit 0 but gate not applied (e.g. baseline missing with --baseline auto) */
|
|
6
|
+
export type GateMode =
    | "enforced" // gate ran
    | "neutral"; // exit 0, gate not applied
|
|
7
|
+
/** Canonical reason codes. Import REASON_CODES from ../reason-codes for constants. */
|
|
8
|
+
export type FailureReasonCode =
    | "PASS"
    | "WARN_REGRESSION"
    | "LOW_SAMPLE_SIZE"
    | "BASELINE_MISSING"
    | "SCORE_TOO_LOW"
    | "DELTA_TOO_HIGH"
    | "COST_BUDGET_EXCEEDED"
    | "LATENCY_BUDGET_EXCEEDED"
    | "POLICY_FAILED"
    | "UNKNOWN"
    | "LOW_SCORE"
    | "LOW_PASS_RATE"
    | "SAFETY_RISK"
    | "LATENCY_RISK"
    | "COST_RISK"
    | "MAX_DROP_EXCEEDED"
    | "INSUFFICIENT_EVIDENCE"
    | "POLICY_VIOLATION";
|
|
9
|
+
/**
 * Per-dimension score components shown on the "Breakdown:" line of the human
 * formatter's explain section. Every component is optional.
 * NOTE(review): the "01" suffix suggests values normalized to 0..1 - confirm with the producer.
 */
export type ScoreBreakdown01 = {
    passRate?: number;
    safety?: number;
    judge?: number;
    schema?: number;
    latency?: number;
    cost?: number;
};
|
|
17
|
+
/**
 * Points each category contributes to the score, as printed by the explain
 * output ("Contrib pts" / "Breakdown"). Every category is optional.
 */
export type ScoreContribPts = {
    passRatePts?: number;
    safetyPts?: number;
    compliancePts?: number;
    performancePts?: number;
};
|
|
23
|
+
/**
 * Thresholds a gate run was configured with. All fields are optional.
 * The human formatter's explain section prints `minScore`, `maxDrop` and `minN`.
 */
export type GateThresholds = {
    // Score and rate floors
    minScore?: number;
    minPassRate?: number;
    minSafety?: number;

    // Regression limits relative to the baseline
    maxDrop?: number;
    warnDrop?: number;

    // Evidence requirements
    minN?: number;
    allowWeakEvidence?: boolean;

    // Which baseline to compare against
    baseline?: "published" | "previous" | "production" | "auto";

    // Cost and latency budgets
    maxCostUsd?: number;
    maxLatencyMs?: number;
    maxCostDeltaUsd?: number;
};
|
|
36
|
+
/**
 * One test case listed in a report's `failedCases`. All fields are optional;
 * the formatters fall back between the full value and its `*Snippet`
 * counterpart, and between `name` and `input` for the display label.
 */
export type FailedCase = {
    // Identity and status
    testCaseId?: number;
    status?: "failed" | "error" | "skipped" | "passed";
    name?: string;

    // Input given to the case
    input?: string;
    inputSnippet?: string;

    // Expected result
    expectedOutput?: string;
    expectedSnippet?: string;

    // Actual result
    output?: string;
    outputSnippet?: string;

    // Preferred over the output text by the annotation and PR-comment formatters
    reason?: string;
};
|
|
48
|
+
/** Optional description of the CI environment attached to a report (`CheckReport.ci`). */
export type CiContext = {
    provider?: "github" | "gitlab" | "circle" | "unknown";

    // Source revision
    repo?: string;
    sha?: string;
    branch?: string;
    pr?: number;

    // Run metadata
    runUrl?: string;
    actor?: string;
};
|
|
57
|
+
/** Current schema version for CheckReport (.evalgate/last-report.json). Bump on breaking changes. */
|
|
58
|
+
export declare const CHECK_REPORT_SCHEMA_VERSION: 1;
|
|
59
|
+
/**
 * Report consumed by the check formatters (github, human, json, pr-comment).
 *
 * Only `evaluationId`, `verdict`, `gateApplied`, `gateMode` and `reasonCode`
 * are required; everything else is optional and the formatters skip or
 * default whatever is missing.
 */
export type CheckReport = {
    // --- Identity ---
    /** Schema version of the report; see CHECK_REPORT_SCHEMA_VERSION. */
    schemaVersion?: number;
    evaluationId: string;
    runId?: number;

    // --- Gate outcome ---
    verdict: GateVerdict;
    /** false when the gate was not applied (e.g. baseline missing, exit 0) - guards against false confidence. */
    gateApplied: boolean;
    /** "enforced" = gate ran; "neutral" = exit 0, gate skipped. */
    gateMode: GateMode;
    reasonCode: FailureReasonCode;
    /** Actionable message for the PR comment / UX. */
    actionableMessage?: string;
    /** Shown in place of `reasonCode` on warn/fail verdict lines when present. */
    reasonMessage?: string;

    // --- Scores ---
    /** Rendered as `<score>/100`; formatters print 0 when absent. */
    score?: number;
    /** Together with `delta`, enables the baseline comparison suffix on the score line. */
    baselineScore?: number;
    delta?: number;
    passRate?: number;
    safetyPassRate?: number;
    flags?: string[];
    breakdown01?: ScoreBreakdown01;
    contribPts?: ScoreContribPts;
    thresholds?: GateThresholds;

    // --- Evidence and baseline ---
    n?: number;
    evidenceLevel?: "strong" | "medium" | "weak";
    baselineMissing?: boolean;
    baselineStatus?: "found" | "missing";

    // --- Links and failures ---
    dashboardUrl?: string;
    failedCases?: FailedCase[];
    failedCasesShown?: number;
    failedCasesMore?: number;

    // --- Run metadata ---
    requestId?: string;
    durationMs?: number;
    ci?: CiContext;
    /** Enables the explain sections in the human and PR-comment formatters. */
    explain?: boolean;
    shareUrl?: string;
    policy?: string;
    baselineRunId?: number;
    ciRunUrl?: string;

    /** When --explain and policy failed: which sub-check failed, remediation, snapshot. */
    policyEvidence?: {
        failedCheck?: string;
        remediation?: string;
        snapshot?: Record<string, unknown>;
    };
};
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* CheckReport and related types for formatters.
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.CHECK_REPORT_SCHEMA_VERSION = void 0;
|
|
7
|
+
/** Current schema version for CheckReport (.evalgate/last-report.json). Bump on breaking changes. */
|
|
8
|
+
exports.CHECK_REPORT_SCHEMA_VERSION = 1;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure gate evaluation. No console output.
|
|
3
|
+
* Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
|
|
4
|
+
*/
|
|
5
|
+
import type { QualityLatestData } from "./api";
|
|
6
|
+
import type { CheckArgs } from "./check";
|
|
7
|
+
/** Result returned by evaluateGate. */
export type GateResult = {
    exitCode: number;
    passed: boolean;
    reasonCode: string;
    /** Human-readable explanation; `null` when there is none. */
    reasonMessage: string | null;

    /** true when the gate was skipped (e.g. baseline missing + auto). */
    gateSkipped?: boolean;

    /** When policy failed: sub-check, remediation, snapshot for explain. */
    policyEvidence?: {
        failedCheck: string;
        remediation: string;
        snapshot?: Record<string, unknown>;
    };
};
|
|
21
|
+
/**
 * Evaluates the gate for a check invocation against the latest quality data.
 *
 * @param args Arguments of the check command.
 * @param quality Latest quality data for the evaluation.
 * @returns The gate result (exit code, pass flag, reason code and message).
 */
export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;
|